### First, parse the dataset

In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time
import math
import matplotlib.pyplot as plt

In [2]:
from bs4 import BeautifulSoup,SoupStrainer

In [3]:
# Read the raw Reuters SGML file. The context manager closes the handle as
# soon as the contents are in memory, even if the read raises.
with open('data/reut2-000.sgm', 'r') as f:
    data = f.read()

# No parser is named here, so the choice is left to BeautifulSoup.
soup = BeautifulSoup(data)

# Collect every <topics> tag and every <text> tag found in the document.
topics = list(soup.findAll('topics'))
text = list(soup.findAll('text'))

Keep only the first 250 entries so that the experiments below run quickly. The next cell can be commented out once the pipeline has been tested more rigorously.

In [4]:
# Upper bound on the number of entries kept; slicing is a no-op when fewer exist.
max_entries = 250
topics, text = topics[:max_entries], text[:max_entries]

In [5]:
print topics[11].text
print text[11].text

earnacq

OHIO MATTRESS <OMT> MAY HAVE LOWER 1ST QTR NET
    CLEVELAND, Feb 26 - Ohio Mattress Co said its first
quarter, ending February 28, profits may be below the 2.4 mln
dlrs, or 15 cts a share, earned in the first quarter of fiscal
1986.
    The company said any decline would be due to expenses
related to the acquisitions in the middle of the current
quarter of seven licensees of Sealy Inc, as well as 82 pct of
the outstanding capital stock of Sealy.
    Because of these acquisitions, it said, first quarter sales
will be substantially higher than last year's 67.1 mln dlrs.
    Noting that it typically reports first quarter results in
late march, said the report is likely to be issued in early
April this year.
    It said the delay is due to administrative considerations,
including conducting appraisals, in connection with the
acquisitions.
 Reuter



In [6]:
uniqueTopics = {}

for topicList in topics:
    for t in topicList:
        if t.text != '':
            uniqueTopics[t.text] = True

print len(uniqueTopics)
print uniqueTopics

46
{u'copper': True, u'copra-cake': True, u'livestock': True, u'money-fx': True, u'tea': True, u'sunseed': True, u'trade': True, u'cocoa': True, u'reserves': True, u'soybean': True, u'ship': True, u'cotton': True, u'red-bean': True, u'palm-oil': True, u'housing': True, u'rye': True, u'gnp': True, u'sugar': True, u'rubber': True, u'veg-oil': True, u'oat': True, u'barley': True, u'interest': True, u'crude': True, u'rice': True, u'palmkernel': True, u'plywood': True, u'coffee': True, u'soy-oil': True, u'wheat': True, u'meal-feed': True, u'corn': True, u'nat-gas': True, u'oilseed': True, u'linseed': True, u'lin-oil': True, u'earn': True, u'cpi': True, u'soy-meal': True, u'bop': True, u'sun-oil': True, u'money-supply': True, u'carcass': True, u'acq': True, u'grain': True, u'sorghum': True}


Build lookup tables that map each topic name to an integer index and back.

In [7]:
# Two-way lookup between topic names and consecutive integer indices.
topicsToInd = dict()
indToTopic = dict()

# Iterating the dict directly yields its keys in the same order as keys(),
# without rebuilding the key list twice per iteration (which was quadratic)
# and without indexing keys(), which is not possible on Python 3.
for i, topic in enumerate(uniqueTopics):
    topicsToInd[topic] = i
    indToTopic[i] = topic

Next, keep only the articles that have a non-null list of topics

In [8]:
valid = []

for ind in range(len(topics)):
    if len(topics[ind]) != 0:
        valid.append(ind)
        
print len(valid)
print [t.text for t in topics[valid[1]]]

153
[u'grain', u'wheat', u'corn', u'barley', u'oat', u'sorghum']


In [9]:
# Keep only the entries listed in `valid`. Each topic container becomes the
# plain list of its children's text, and each article becomes its text.
# NOTE: this overwrites `topics` and `text`, so the cell is meant to run once
# after the cells above.
topics = [[t.text for t in topics[x]] for x in valid]
text = [text[x].text for x in valid]

In [10]:
print topics[6]
print text[6]

[u'earn', u'acq']

OHIO MATTRESS <OMT> MAY HAVE LOWER 1ST QTR NET
    CLEVELAND, Feb 26 - Ohio Mattress Co said its first
quarter, ending February 28, profits may be below the 2.4 mln
dlrs, or 15 cts a share, earned in the first quarter of fiscal
1986.
    The company said any decline would be due to expenses
related to the acquisitions in the middle of the current
quarter of seven licensees of Sealy Inc, as well as 82 pct of
the outstanding capital stock of Sealy.
    Because of these acquisitions, it said, first quarter sales
will be substantially higher than last year's 67.1 mln dlrs.
    Noting that it typically reports first quarter results in
late march, said the report is likely to be issued in early
April this year.
    It said the delay is due to administrative considerations,
including conducting appraisals, in connection with the
acquisitions.
 Reuter



In [11]:
print len(topics), len(text), "   <---- these should be equal"

153 153    <---- these should be equal


Next, create a dictionary that maps each topic index to the list of indices of the articles tagged with that topic.

In [12]:
articles = dict()

for i in range(len(topics)):
    for t in topics[i]:
        if topicsToInd[t] in articles:
            articles[topicsToInd[t]].append(i)
        else:
            articles[topicsToInd[t]] = [i]

print articles[topicsToInd['wheat']]

[1, 2, 10, 59, 112, 144, 147]


### Now, to generate tf-idf scores

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

Set `ngram_range=(1, 3)` so that unigrams, bigrams and trigrams are all used as features, giving a more refined analysis.

In [14]:
# Word-level tf-idf over 1- to 3-grams with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary the previous integer value 0 selected; 0 is outside the
# documented integer range for min_df and is rejected by newer scikit-learn.
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')

In [15]:
# Fit the vectorizer on the article texts, then convert the result to dense
# form with todense(); row i of X belongs to article i.
tfidf = vectorizer.fit_transform(text)
X = tfidf.todense()

In [16]:
# Turn the result of todense() into a plain 2-D ndarray in one step.
# The previous per-row loop wrote each row back onto itself, so it changed
# nothing and X stayed an np.matrix. Rows of an ndarray still support the
# .reshape(1, -1) calls used in the cells below.
X = np.asarray(X)

### Can we compute the cosine between two different articles? Create a cosine similarity matrix.

In [17]:
# Cosine similarity between the tf-idf rows of articles 0 and 6.
# Each row is reshaped to 1 x d because cosine_similarity expects 2-D input.
row_a = X[0].reshape(1, -1)
row_b = X[6].reshape(1, -1)
cosine_similarity(row_a, row_b)

array([[ 0.02880969]])

In [18]:
# Number of articles that remain after dropping the ones without topics.
n = len(topics)

In [19]:
# Pairwise cosine similarity between all article rows in one vectorized call.
# With a single argument, cosine_similarity compares the rows of X with each
# other. This replaces n*n separate calls on single rows.
# The result stays a list of lists so csim[i][j] and any list handling in the
# algorithms below keep working.
csim = [list(row) for row in np.asarray(cosine_similarity(X))]

### Now, apply our algorithm with facility location as the submodular functions

In [20]:
n = len(topics)
m = len(uniqueTopics)
k = 5

print n, m, k

153 46 5


In [21]:
from localsearch import localsearch
from greedysum import greedysum
from greedymerge import greedymerge
from replacementGreedy import replacementGreedy

In [22]:
solution = []
runtime = []
calls = []

Lvalues = [5,7,10,14,17]
for l in range(20,80,10):
    Lvalues.append(l)

for l in Lvalues:
    rgTime = time.time()
    rgS, rgCost, rgEvals = replacementGreedy(n, m, l, k, csim, articles)
    rgTime = time.time() - rgTime
    
    lsTime = time.time()
    lsS, lsCost, lsEvals = localsearch(n, m, l, k, csim, articles, 0.2)
    lsTime = time.time() - lsTime
    
    gsTime = time.time()
    gsS, gsCost, gsEvals = greedysum(n, m, l, k, csim, articles)
    gsTime = time.time() - gsTime
    
    gmTime = time.time()
    gmS, gmCost, gmEvals = greedymerge(n, m, l, k, csim, articles)
    gmTime = time.time() - gmTime
    
    solution.append([rgCost, lsCost, gsCost, gmCost])
    runtime.append([rgTime, lsTime, gsTime, gmTime])
    calls.append([rgEvals, lsEvals, gsEvals, gmEvals])
    
    print 'Replacement greedy gives cost', rgCost
    
    print ""
    print "Done for l = ", l
    print "\n\n\n"

Local search value after initialization =  51.706115061
Intermediate cost at step  0  =  51.706115061
Local Search gives cost =  51.706115061
Greedy Sum gives cost =  51.706115061
Greedy Merge gives cost =  129.920337744
Size of S is  58
Replacement greedy gives cost 51.706115061

Done for l =  5




Local search value after initialization =  62.0249168378
Intermediate cost at step  0  =  62.0249168378
Local Search gives cost =  62.0249168378
Greedy Sum gives cost =  62.0249168378
Greedy Merge gives cost =  129.920337744
Size of S is  58
Replacement greedy gives cost 62.0249168378

Done for l =  7




Local search value after initialization =  72.1819885195
Intermediate cost at step  0  =  72.1819885195
Local Search gives cost =  72.1819885195
Greedy Sum gives cost =  71.5314408768
Greedy Merge gives cost =  129.920337744
Size of S is  58
Replacement greedy gives cost 72.1819885195

Done for l =  10




Local search value after initialization =  81.8428543583
Intermediate cost at step 

In [23]:
# store locally
# The context manager closes the file, which also flushes it; the handle was
# previously left open, so the data was not guaranteed to reach disk.
with open('../data/reuters-K.txt', 'w') as outfile:
    # Three sections, each a title line followed by one line per value of l.
    for title, rows in (('solution', solution), ('runtime', runtime), ('calls', calls)):
        outfile.write(title + '\n')
        for item in rows:
            # Four space-separated columns, in the order the rows were built.
            outfile.write('%s %s %s %s\n' % (item[0], item[1], item[2], item[3]))

In [24]:
# Plot the objective value reached by each algorithm as a function of l and
# save the figure for the writeup.
plt.clf()

ax = plt.subplot(111)

# Font size shared by axis labels, tick labels and the legend.
fs = 17

# http://matplotlib.org/users/text_intro.html
ax.set_xlabel('l', fontsize=fs)
ax.set_ylabel('Objective Value', fontsize=fs)

plt.ylim(50, 140)
plt.xlim(min(Lvalues), max(Lvalues))

# One tick at the smallest l, then ticks at multiples of 10 starting from 10.
xticks = [min(Lvalues)] + list(range(10, max(Lvalues) + 10, 10))
ax.set_xticks(xticks)

# Set the tick label size through tick_params instead of reaching into each
# tick's `label` attribute, which is deprecated in newer Matplotlib releases.
ax.tick_params(axis='both', which='major', labelsize=fs)

# Each entry is a colour letter followed by a marker symbol.
colors = ['ro', 'cs', 'b^', 'gv']
labelNames = ['Replacement Greedy', 'Local Search', 'Greedy Sum', 'Greedy Merge']

# Walk the columns backwards so that column 0 (Replacement Greedy) is drawn last.
for ind in reversed(range(len(colors))):
    vals = [row[ind] for row in solution]

    if ind != 0:
        plt.plot(Lvalues, vals, c=colors[ind][0], marker=colors[ind][1], linewidth=2, label=labelNames[ind])
    else:
        # Replacement Greedy is drawn as a thick dashed red line without markers.
        plt.plot(Lvalues, vals, 'r--', linewidth=4, label=labelNames[ind])

# http://matplotlib.org/1.3.0/examples/pylab_examples/legend_demo.html
legend = ax.legend(loc='lower right')

# Set the fontsize
for label in legend.get_texts():
    label.set_fontsize(fs)

plt.savefig("../../writeup/images/reuters-fixed-k")

plt.close()

In [25]:
# Plot log2 of the measured runtime of each algorithm as a function of l and
# save the figure for the writeup. `fs` (the font size) is the value set in
# the previous plotting cell.
plt.clf()

ax = plt.subplot(111)

# http://matplotlib.org/users/text_intro.html
ax.set_xlabel('l', fontsize=fs)
ax.set_ylabel('Log(runtime)', fontsize=fs)

plt.ylim(-10, 25)
plt.xlim(min(Lvalues), max(Lvalues))

# One tick at the smallest l, then ticks at multiples of 10 starting from 10.
xticks = [min(Lvalues)] + list(range(10, max(Lvalues) + 10, 10))
ax.set_xticks(xticks)

# Set the tick label size through tick_params instead of reaching into each
# tick's `label` attribute, which is deprecated in newer Matplotlib releases.
ax.tick_params(axis='both', which='major', labelsize=fs)

# Each entry is a colour letter followed by a marker symbol.
colors = ['ro', 'cs', 'b^', 'gv']
labelNames = ['Replacement Greedy', 'Local Search', 'Greedy Sum', 'Greedy Merge']

# Walk the columns backwards so that column 0 (Replacement Greedy) is drawn last.
for ind in reversed(range(len(colors))):
    # Base-2 logarithm of each timing; a timing of exactly 0 would make
    # math.log raise ValueError.
    vals = [math.log(row[ind], 2) for row in runtime]

    if ind != 0:
        plt.plot(Lvalues, vals, c=colors[ind][0], marker=colors[ind][1], linewidth=2, label=labelNames[ind])
    else:
        # Replacement Greedy is drawn as a thick dashed red line without markers.
        plt.plot(Lvalues, vals, 'r--', linewidth=4, label=labelNames[ind])

# http://matplotlib.org/1.3.0/examples/pylab_examples/legend_demo.html
legend = ax.legend(loc='upper right')

# Set the fontsize
for label in legend.get_texts():
    label.set_fontsize(fs)

plt.savefig("../../writeup/images/reuters-runtime-fixed-k")

plt.close()