In [1]:
import numpy as np
import pandas as pd
import scipy
import nltk
import sklearn
import random
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, LinearRegression

## reuters 21-578 modApte version
> a collection of 10,788 documents from the Reuters financial newswire service, partitioned into a training set with 7769 documents and a test set with 3019 documents

In [2]:
nltk.download('reuters')
nltk.download('punkt') # needed for tokenization

[nltk_data] Downloading package reuters to /home/felipe/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/felipe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
dataset = nltk.corpus.reuters
dataset.root

ZipFilePathPointer(u'/home/felipe/nltk_data/corpora/reuters.zip', u'reuters/')

In [4]:
# dataset.readme()

In [5]:
len(dataset.categories())

90

In [6]:
len(dataset.fileids())

10788

In [7]:
fileids = dataset.fileids()
sample_fileid = [ fileids[i] for i in sorted(random.sample(xrange(len(fileids)), 1)) ][0]
sample_fileid

'training/5115'

In [8]:
dataset.abspath(sample_fileid)

ZipFilePathPointer(u'/home/felipe/nltk_data/corpora/reuters.zip', u'reuters/training/5115')

In [9]:
len(dataset.words(sample_fileid))

180

In [10]:
dataset.words(sample_fileid)

[u'U', u'.', u'S', u'.', u'SAYS', u'CANADA', ...]

In [11]:
dataset.raw(sample_fileid)

u'U.S. SAYS CANADA COMPLYING WITH LUMBER PACT\n  The Commerce Department said that\n  all Canadian firms had begun to pay an agreed to 15 pct\n  surcharge on softwood shipped to U.S. markets.\n      It made the statement after talks with Canadian officials\n  about press reports and speculation in Canada that some\n  exporters were not paying the charge.\n      Canada and the United States agreed last December to the 15\n  pct charge, ending a lengthy trade dispute over alleged\n  Canadian subsidies to Canada\'s softwood exporters.\n      Commerce officials would not say if they found any Canadian\n  companies had been evading the charge, but that following the\n  talks they were convinced all exporters were complying with the\n  agreement.\n      Undersecretary of Commerce Bruce Smart said "We are\n  gratified to learn that companies in Canada have begun paying\n  the export charge on lumber."\n      He added the agreement was important to the health of the\n  U.S. lumber industry and

In [12]:
dataset.words(sample_fileid)

[u'U', u'.', u'S', u'.', u'SAYS', u'CANADA', ...]

In [13]:
dataset.sents(sample_fileid)

[[u'U', u'.', u'S', u'.', u'SAYS', u'CANADA', u'COMPLYING', u'WITH', u'LUMBER', u'PACT', u'The', u'Commerce', u'Department', u'said', u'that', u'all', u'Canadian', u'firms', u'had', u'begun', u'to', u'pay', u'an', u'agreed', u'to', u'15', u'pct', u'surcharge', u'on', u'softwood', u'shipped', u'to', u'U', u'.', u'S', u'.', u'markets', u'.'], [u'It', u'made', u'the', u'statement', u'after', u'talks', u'with', u'Canadian', u'officials', u'about', u'press', u'reports', u'and', u'speculation', u'in', u'Canada', u'that', u'some', u'exporters', u'were', u'not', u'paying', u'the', u'charge', u'.'], ...]

In [14]:
dataset.paras(sample_fileid)

[[[u'U', u'.', u'S', u'.', u'SAYS', u'CANADA', u'COMPLYING', u'WITH', u'LUMBER', u'PACT', u'The', u'Commerce', u'Department', u'said', u'that', u'all', u'Canadian', u'firms', u'had', u'begun', u'to', u'pay', u'an', u'agreed', u'to', u'15', u'pct', u'surcharge', u'on', u'softwood', u'shipped', u'to', u'U', u'.', u'S', u'.', u'markets', u'.'], [u'It', u'made', u'the', u'statement', u'after', u'talks', u'with', u'Canadian', u'officials', u'about', u'press', u'reports', u'and', u'speculation', u'in', u'Canada', u'that', u'some', u'exporters', u'were', u'not', u'paying', u'the', u'charge', u'.'], [u'Canada', u'and', u'the', u'United', u'States', u'agreed', u'last', u'December', u'to', u'the', u'15', u'pct', u'charge', u',', u'ending', u'a', u'lengthy', u'trade', u'dispute', u'over', u'alleged', u'Canadian', u'subsidies', u'to', u'Canada', u"'", u's', u'softwood', u'exporters', u'.'], [u'Commerce', u'officials', u'would', u'not', u'say', u'if', u'they', u'found', u'any', u'Canadian', u'compa

In [15]:
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
corpus_train = []
corpus_test = []
for fileid in dataset.fileids():
    document = dataset.raw(fileid)
    if re.match('training/',fileid):
        corpus_train.append(document)
    else:
        corpus_test.append(document)

In [16]:
len(corpus_train),len(corpus_test)

(7769, 3019)

In [17]:
def preprocessor(string):
    repl = re.sub('&lt;','',string)
    return repl.lower()

In [18]:
vectorizer = CountVectorizer(
                min_df=10, # tweaking this parameter reduces the length of the feature vector
                strip_accents='ascii',
                preprocessor=preprocessor,
                stop_words='english')

In [19]:
# need to use both corpuses for fitting because otherwise there may be words that only occur in the
# training set or in the test set
full_corpus = corpus_train + corpus_test
vectorizer.fit(full_corpus)

X_train_counts = vectorizer.transform(corpus_train)
X_test_counts = vectorizer.transform(corpus_test)
X_full_counts = vectorizer.transform(full_corpus)

X_train_counts.shape,X_test_counts.shape, X_full_counts.shape

((7769, 6462), (3019, 6462), (10788, 6462))

In [20]:
#uncomment these to see how the vectorizer is analyzing, tokenizing and preprocessing documents

#vectorizer.build_analyzer()(dataset.raw(fileid))
#vectorizer.build_tokenizer()("ADVANCED INSTITUTIONAL &lt;AIMS> CUTS WORKFORCE\n  Advanced Institutional ")
#vectorizer.build_preprocessor()("ADVANCED INSTITUTIONAL &lt;AIMS> CUTS WORKFORCE\n  Advanced Institutional ")

In [21]:
X_train_counts[0].toarray().ravel()

array([0, 0, 0, ..., 1, 0, 0])

In [22]:
X_test_counts[0].toarray().ravel()

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
transformer = TfidfTransformer()
# again, we need to fit the transformer to all documents (train and test)
transformer.fit(X_full_counts)

X_train_tfidf = transformer.transform(X_train_counts)
X_test_tfidf = transformer.transform(X_test_counts)
X_full_tfidf = transformer.transform(X_full_counts)

X_train_tfidf.shape, X_test_tfidf.shape, X_full_tfidf.shape

((7769, 6462), (3019, 6462), (10788, 6462))

In [24]:
X_train_tfidf[0].toarray().ravel()

array([ 0.       ,  0.       ,  0.       , ...,  0.0466051,  0.       ,  0.       ])

In [25]:
X_test_tfidf[0].toarray().ravel()

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [26]:
Y_train = []
Y_test = []

for (idx,fileid) in enumerate(dataset.fileids()):    
    categories = '*'.join(dataset.categories(fileid))

    if re.match('training/',fileid):
        Y_train.append(categories)
    else:
        Y_test.append(categories)

series_train = pd.Series(Y_train)
Y_train_df = series_train.str.get_dummies(sep='*')

series_test = pd.Series(Y_test)
Y_test_df = series_test.str.get_dummies(sep='*')

Y_train = Y_train_df.values
Y_test = Y_test_df.values

Y_train.shape,Y_test.shape

((7769, 90), (3019, 90))

In [27]:
clf = LogisticRegression()
meta_clf = OneVsRestClassifier(clf)

meta_clf.fit(X_train_tfidf,Y_train)


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [28]:
Y_pred = meta_clf.predict(X_test_tfidf)

In [29]:
f1_score(Y_test,Y_pred,average='micro')

0.76201298701298703