In [27]:
import numpy as np
import pandas as pd
import scipy
import nltk
import sklearn
import random
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA, RandomizedPCA

## reuters 21-578 modApte version
> a collection of 10,788 documents from the Reuters financial newswire service, partitioned into a training set with 7769 documents and a test set with 3019 documents

In [2]:
nltk.download('reuters')
nltk.download('punkt') # needed for tokenization

[nltk_data] Downloading package reuters to /home/felipe/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/felipe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
dataset = nltk.corpus.reuters
dataset.root

ZipFilePathPointer(u'/home/felipe/nltk_data/corpora/reuters.zip', u'reuters/')

In [4]:
# dataset.readme()

In [5]:
len(dataset.categories())

90

In [6]:
len(dataset.fileids())

10788

In [7]:
fileids = dataset.fileids()
sample_fileid = [ fileids[i] for i in sorted(random.sample(xrange(len(fileids)), 1)) ][0]
sample_fileid

'training/4679'

In [8]:
dataset.abspath(sample_fileid)

ZipFilePathPointer(u'/home/felipe/nltk_data/corpora/reuters.zip', u'reuters/training/4679')

In [9]:
len(dataset.words(sample_fileid))

240

In [10]:
dataset.words(sample_fileid)

[u'CHINA', u'TRYING', u'TO', u'INCREASE', u'COTTON', ...]

In [11]:
dataset.raw(sample_fileid)

u"CHINA TRYING TO INCREASE COTTON OUTPUT, PAPER SAYS\n  China's 1987 cotton output must rise\n  above the 1986 level of 3.54 mln tonnes or supply will fall\n  short of increasing demand, the China Daily said.\n      Demand in 1986 rose 10.9 pct over 1985.\n      Output in 1986 fell from 4.15 mln tonnes in 1985 and a\n  record 6.2 mln in 1984, official figures show. The China Daily\n  attributed the decline to several factors, including less\n  favorable weather conditions and new state measures to restrict\n  cotton production after the 1984 build-up of stocks.\n      According to Customs figures, cotton exports rose to\n  558,089 tonnes in calendar 1986 from 347,026 in 1985.\n      To increase output quickly, the state will raise by 10 pct\n  the price it pays for cotton produced above and beyond quota\n  levels, the newspaper said. Its official purchasing agencies\n  will buy cotton produced in excess of that originally\n  contracted for, it added.\n      The China Daily said all cot

In [12]:
dataset.words(sample_fileid)

[u'CHINA', u'TRYING', u'TO', u'INCREASE', u'COTTON', ...]

In [13]:
dataset.sents(sample_fileid)

[[u'CHINA', u'TRYING', u'TO', u'INCREASE', u'COTTON', u'OUTPUT', u',', u'PAPER', u'SAYS', u'China', u"'", u's', u'1987', u'cotton', u'output', u'must', u'rise', u'above', u'the', u'1986', u'level', u'of', u'3', u'.', u'54', u'mln', u'tonnes', u'or', u'supply', u'will', u'fall', u'short', u'of', u'increasing', u'demand', u',', u'the', u'China', u'Daily', u'said', u'.'], [u'Demand', u'in', u'1986', u'rose', u'10', u'.', u'9', u'pct', u'over', u'1985', u'.'], ...]

In [14]:
dataset.paras(sample_fileid)

[[[u'CHINA', u'TRYING', u'TO', u'INCREASE', u'COTTON', u'OUTPUT', u',', u'PAPER', u'SAYS', u'China', u"'", u's', u'1987', u'cotton', u'output', u'must', u'rise', u'above', u'the', u'1986', u'level', u'of', u'3', u'.', u'54', u'mln', u'tonnes', u'or', u'supply', u'will', u'fall', u'short', u'of', u'increasing', u'demand', u',', u'the', u'China', u'Daily', u'said', u'.'], [u'Demand', u'in', u'1986', u'rose', u'10', u'.', u'9', u'pct', u'over', u'1985', u'.'], [u'Output', u'in', u'1986', u'fell', u'from', u'4', u'.', u'15', u'mln', u'tonnes', u'in', u'1985', u'and', u'a', u'record', u'6', u'.', u'2', u'mln', u'in', u'1984', u',', u'official', u'figures', u'show', u'.'], [u'The', u'China', u'Daily', u'attributed', u'the', u'decline', u'to', u'several', u'factors', u',', u'including', u'less', u'favorable', u'weather', u'conditions', u'and', u'new', u'state', u'measures', u'to', u'restrict', u'cotton', u'production', u'after', u'the', u'1984', u'build', u'-', u'up', u'of', u'stocks', u'.'],

In [15]:
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
corpus_train = []
corpus_test = []
for fileid in dataset.fileids():
    document = dataset.raw(fileid)
    if re.match('training/',fileid):
        corpus_train.append(document)
    else:
        corpus_test.append(document)

In [16]:
len(corpus_train),len(corpus_test)

(7769, 3019)

In [17]:
def preprocessor(string):
    repl = re.sub('&lt;','',string)
    return repl.lower()

In [18]:
vectorizer = CountVectorizer(
                min_df=10, # tweaking this parameter reduces the length of the feature vector
                strip_accents='ascii',
                preprocessor=preprocessor,
                stop_words='english')

In [19]:
# need to use both corpuses for fitting because otherwise there may be words that only occur in the
# training set or in the test set
full_corpus = corpus_train + corpus_test
vectorizer.fit(full_corpus)

X_train_counts = vectorizer.transform(corpus_train)
X_test_counts = vectorizer.transform(corpus_test)
X_full_counts = vectorizer.transform(full_corpus)

X_train_counts.shape,X_test_counts.shape, X_full_counts.shape

((7769, 6462), (3019, 6462), (10788, 6462))

In [20]:
#uncomment these to see how the vectorizer is analyzing, tokenizing and preprocessing documents

#vectorizer.build_analyzer()(dataset.raw(fileid))
#vectorizer.build_tokenizer()("ADVANCED INSTITUTIONAL &lt;AIMS> CUTS WORKFORCE\n  Advanced Institutional ")
#vectorizer.build_preprocessor()("ADVANCED INSTITUTIONAL &lt;AIMS> CUTS WORKFORCE\n  Advanced Institutional ")

In [21]:
X_train_counts[0].toarray().ravel()

array([0, 0, 0, ..., 1, 0, 0])

In [22]:
X_test_counts[0].toarray().ravel()

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
transformer = TfidfTransformer()
# again, we need to fit the transformer to all documents (train and test)
transformer.fit(X_full_counts)

X_train_tfidf = transformer.transform(X_train_counts)
X_test_tfidf = transformer.transform(X_test_counts)
X_full_tfidf = transformer.transform(X_full_counts)

X_train_tfidf.shape, X_test_tfidf.shape, X_full_tfidf.shape

((7769, 6462), (3019, 6462), (10788, 6462))

In [24]:
X_train_tfidf[0].toarray().ravel()

array([ 0.       ,  0.       ,  0.       , ...,  0.0466051,  0.       ,  0.       ])

In [25]:
X_test_tfidf[0].toarray().ravel()

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [26]:
Y_train = []
Y_test = []

for (idx,fileid) in enumerate(dataset.fileids()):    
    categories = '*'.join(dataset.categories(fileid))

    if re.match('training/',fileid):
        Y_train.append(categories)
    else:
        Y_test.append(categories)

series_train = pd.Series(Y_train)
Y_train_df = series_train.str.get_dummies(sep='*')

series_test = pd.Series(Y_test)
Y_test_df = series_test.str.get_dummies(sep='*')

Y_train = Y_train_df.values
Y_test = Y_test_df.values

Y_train.shape,Y_test.shape

((7769, 90), (3019, 90))

In [41]:
clf = LogisticRegression()

meta_clf = OneVsRestClassifier(clf)

pca = RandomizedPCA(n_components=50)
pca.fit(X_train_tfidf.toarray())

X_train_reduced = pca.transform(X_train_tfidf.toarray())
X_test_reduced = pca.transform(X_test_tfidf.toarray())
X_train_reduced.shape

(7769, 50)

In [42]:
meta_clf.fit(X_train_reduced,Y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [43]:
Y_pred = meta_clf.predict(X_test_reduced)

In [44]:
f1_score(Y_test,Y_pred,average='micro')

0.72664015904572576