In [1]:
# IPython magic: have the notebook front end autosave every 300 seconds.
%autosave 300

Autosaving every 300 seconds


In [3]:
import pandas as pd
import numpy as np
from Reuters import *

In [4]:
# the downloaded dataset: list the .sgm files under data/reuters21578
# (shell escape, so this needs a Unix-like `ls` on the PATH)
!ls -la data/reuters21578/*.sgm

-rw-r--r-- 1 canwill canwill 1324350 Dec  4  1996 data/reuters21578/reut2-000.sgm
-rw-r--r-- 1 canwill canwill 1254440 Dec  4  1996 data/reuters21578/reut2-001.sgm
-rw-r--r-- 1 canwill canwill 1217495 Dec  4  1996 data/reuters21578/reut2-002.sgm
-rw-r--r-- 1 canwill canwill 1298721 Dec  4  1996 data/reuters21578/reut2-003.sgm
-rw-r--r-- 1 canwill canwill 1321623 Dec  4  1996 data/reuters21578/reut2-004.sgm
-rw-r--r-- 1 canwill canwill 1388644 Dec  4  1996 data/reuters21578/reut2-005.sgm
-rw-r--r-- 1 canwill canwill 1254765 Dec  4  1996 data/reuters21578/reut2-006.sgm
-rw-r--r-- 1 canwill canwill 1256772 Dec  4  1996 data/reuters21578/reut2-007.sgm
-rw-r--r-- 1 canwill canwill 1410117 Dec  4  1996 data/reuters21578/reut2-008.sgm
-rw-r--r-- 1 canwill canwill 1338903 Dec  4  1996 data/reuters21578/reut2-009.sgm
-rw-r--r-- 1 canwill canwill 1371071 Dec  4  1996 data/reuters21578/reut2-010.sgm
-rw-r--r-- 1 canwill canwill 1304117 Dec  4  1996 data/reuters21578/reut2-011.sgm
-rw-

In [5]:
# Count the lines across all .sgm files that contain the literal text
# "<TOPICS><D>", i.e. a TOPICS element with at least one <D> entry
# (presumably one such line per document that has topics - TODO confirm).
!grep \<TOPICS\>\<D\> data/reuters21578/*.sgm | wc -l

11367


In [6]:
# Read and parse the Reuters files found under data/reuters21578.
# Per the original note, the reader downloads the data first if it is not
# yet available locally.
reuters_reader = ReutersStreamReader('data/reuters21578')
data_streamer = reuters_reader.iterdocs()
# Request a single batch of 50000 documents from the stream
# (presumably an upper bound on the batch size - TODO confirm).
data = get_minibatch(data_streamer, 50000)
data

Unnamed: 0,text,tags
0,SANDOZ PLANS WEEDKILLER JOINT VENTURE IN USSR\...,"[usa, ussr]"
1,TAIWAN REJECTS TEXTILE MAKERS EXCHANGE RATE PL...,"[usa, taiwan]"
2,NATIONAL FSI INC <NFSI> 4TH QTR LOSS\n\nShr lo...,"[earn, usa]"
3,OCCIDENTAL <OXY> OFFICIAL RESIGNS\n\nMidCon Co...,[usa]
4,ITALY'S BNL TO ISSUE 120 MLN DLR CONVERTIBLE B...,[italy]
5,GE <GE> SAYS AMR <AMR> ORDER WORTH 650 MLN DLR...,[usa]
6,<PRECAMBRIAN SHIELD RESOURCES LTD> YEAR LOSS\n...,"[earn, canada]"
7,U.K. MONEY MARKET GIVEN FURTHER 437 MLN STG HE...,"[money-fx, interest, uk]"
8,GREASE MONKEY HOLDING CORP <GMHC> YEAR NOV 30\...,"[earn, usa]"
9,ACCEPTANCE INSURANCE HOLDINGS INC <ACPT> YEAR\...,"[earn, usa]"


In [7]:
# Collapse each document's tag sequence into one comma-separated string
# (presumably so the column can be written to a flat text file).
# Values that are already strings are passed through unchanged, so running
# this cell a second time does not re-join the string character by character.
# (str, type(u'')) covers str on Python 3 and str/unicode on Python 2.
data['tags'] = data.tags.map(
    lambda x: x if isinstance(x, (str, type(u''))) else ','.join(x)
)

In [8]:
# Sanity check: the tags entry at index label 10 should now be a plain string.
type(data['tags'][10])

str

In [10]:
# Write the frame to disk as a tab-separated file.
file_name = 'data/data.csv'
data.to_csv(path_or_buf=file_name, sep='\t')

In [12]:
# Reload the saved table. index_col=0 restores the row index that to_csv
# wrote as the first column, instead of surfacing it as an extra
# "Unnamed: 0" data column.
redata = pd.read_csv(file_name, sep='\t', index_col=0)
# Turn the comma-separated tag strings back into lists. An empty tags field
# is read back as NaN, which has no .split(); treat it as "no tags" and
# produce an empty list rather than raising.
redata['tags'] = redata.tags.fillna('').map(
    lambda s: [tag for tag in s.split(',') if tag]
)

In [13]:
# Sanity check: after the round trip the tags entry is a list again.
type(redata['tags'][0])

list

In [20]:
# Inspect the reloaded tags of the row with index label 10.
redata['tags'][10]

['earn', 'usa']

In [27]:
# Display the comma-joined tags column of the in-memory frame.
data['tags']

0                             usa,ussr
1                           usa,taiwan
2                             earn,usa
3                                  usa
4                                italy
5                                  usa
6                          earn,canada
7                 money-fx,interest,uk
8                             earn,usa
9                             earn,usa
10                            earn,usa
11                         switzerland
12                         earn,canada
13                                  uk
14                            earn,usa
15                            earn,usa
16                            earn,usa
17                                 usa
18                             earn,uk
19                             acq,usa
20                                 usa
21                     trade,usa,china
22                             lei,usa
23                         earn,canada
24                          usa,nasdaq
25                  coffe

In [28]:
from sklearn.preprocessing import MultiLabelBinarizer


def _as_tag_list(tags):
    """Return one document's tags as a list.

    Accepts either a comma-joined string ("earn,usa") or an existing
    sequence of tags, so the cell works whichever form `data.tags` holds.
    For string input, empty pieces are dropped, so an empty string yields
    an empty list.
    """
    # (str, type(u'')) covers str on Python 3 and str/unicode on Python 2.
    if isinstance(tags, (str, type(u''))):
        return [tag for tag in tags.split(',') if tag]
    return list(tags)


# Binary-encode the tags: one indicator column per individual tag, with a 1
# for every tag a document carries. LabelBinarizer is not used because, given
# comma-joined strings, it treats every distinct tag combination
# ("earn,usa", "earn,canada", ...) as a single class of its own.
lb = MultiLabelBinarizer()
Y = lb.fit_transform(data.tags.map(_as_tag_list))
Y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the document text.
# NOTE(review): the vectorizer is fitted on every document, including the
# rows that are later held out as the test split.
vec = TfidfVectorizer(
    decode_error='ignore',
    sublinear_tf=True,
    min_df=2,
)
X = vec.fit_transform(data['text'])
X

<19716x25497 sparse matrix of type '<type 'numpy.float64'>'
	with 1509007 stored elements in Compressed Sparse Row format>

In [30]:
# Train/test split: the first 80% of rows are used for training and the
# remaining rows are held out for testing. Rows keep their existing order
# (no shuffling).
N = int(.8 * X.shape[0])
Xtr, Xte = X[:N, :], X[N:, :]
ytr, yte = Y[:N, :], Y[N:, :]

In [31]:
# there are warnings of ill-defined recall/precision etc.
# just ignore them for now
import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# GridSearchCV moved from sklearn.grid_search to sklearn.model_selection in
# scikit-learn 0.18, and the old module was removed in 0.20. From 0.18 on,
# the scorer name 'f1' also stopped meaning "weighted average" for
# indicator-matrix targets, so pick the name that means weighted F1 on the
# installed version.
try:
    from sklearn.model_selection import GridSearchCV
    f1_scoring = 'f1_weighted'
except ImportError:
    from sklearn.grid_search import GridSearchCV
    f1_scoring = 'f1'

# logistic regression parameter to optimise: 5 values of C, log-spaced
# between 10**1 and 10**1.5
params = {"estimator__C": np.logspace(1, 1.5, num=5)}
# OneVsRestClassifier fits one logistic regression per label column
model = OneVsRestClassifier(LogisticRegression())
# do a grid search on the params, with 5-fold cross-validation
# optimise for F1-score; n_jobs=-1 uses all available cores
clf = GridSearchCV(model, param_grid=params, scoring=f1_scoring, n_jobs=-1, cv=5)
clf.fit(Xtr, ytr)
clf.best_score_, clf.best_params_

In [None]:
from sklearn.metrics import f1_score

# compute predictions on test set
pred = clf.predict(Xte)
# compute F1-score on test set, averaged over labels weighted by support.
# The average is spelled out because current scikit-learn defaults to
# average='binary', which raises on a multi-column indicator target;
# 'weighted' is what older releases applied implicitly.
f1_score(yte, pred, average='weighted')

In [None]:
# a quick look into some example predictions
# compare with tags in test data
# Show at most 20 rows; bounded by the test-set size so a small test set
# does not raise IndexError. range() works on both Python 2 and Python 3.
n_examples = min(20, yte.shape[0])
tags = [
    # boolean masks pick the class names whose indicator column is 1
    (lb.classes_[yte[n] == 1], lb.classes_[pred[n] == 1])
    for n in range(n_examples)
]

pd.DataFrame(tags, columns=['actual tags', 'predicted tags'])