In [1]:
import random
import re
import unicodedata

import nltk
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.preprocessing import OneHotEncoder

## Reuters-21578, ModApte ("ApteMod") version
> A collection of 10,788 documents from the Reuters financial newswire service, partitioned into a training set with 7,769 documents and a test set with 3,019 documents.

In [2]:
# Fetch the NLTK resources used by this notebook: the Reuters corpus itself
# and the 'punkt' tokenizer models.
nltk.download('reuters')
nltk.download('punkt') # needed for tokenization

[nltk_data] Downloading package reuters to /home/felipe/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/felipe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Handle to NLTK's Reuters corpus reader, reused by the cells below.
dataset = nltk.corpus.reuters
# Show the corpus README. (A bare `dataset.root` used to sit before this line;
# it was never displayed because only a cell's last expression is rendered.)
dataset.readme()

u'\n      The Reuters-21578 benchmark corpus, ApteMod version\n\nThis is a publically available version of the well-known Reuters-21578\n"ApteMod" corpus for text categorization.  It has been used in\npublications like these:\n\n * Yiming Yang and X. Liu. "A re-examination of text categorization\n   methods".  1999.  Proceedings of 22nd Annual International SIGIR.\n   http://citeseer.nj.nec.com/yang99reexamination.html\n\n * Thorsten Joachims. "Text categorization with support vector\n   machines: learning with many relevant features".  1998. Proceedings\n   of ECML-98, 10th European Conference on Machine Learning.\n   http://citeseer.nj.nec.com/joachims98text.html\n\nApteMod is a collection of 10,788 documents from the Reuters financial\nnewswire service, partitioned into a training set with 7769 documents\nand a test set with 3019 documents.  The total size of the corpus is\nabout 43 MB.  It is also available for download from\nhttp://kdd.ics.uci.edu/databases/reuters21578/reuters215

In [4]:
# Corpus README again (same call that ends the previous cell).
dataset.readme()

u'\n      The Reuters-21578 benchmark corpus, ApteMod version\n\nThis is a publically available version of the well-known Reuters-21578\n"ApteMod" corpus for text categorization.  It has been used in\npublications like these:\n\n * Yiming Yang and X. Liu. "A re-examination of text categorization\n   methods".  1999.  Proceedings of 22nd Annual International SIGIR.\n   http://citeseer.nj.nec.com/yang99reexamination.html\n\n * Thorsten Joachims. "Text categorization with support vector\n   machines: learning with many relevant features".  1998. Proceedings\n   of ECML-98, 10th European Conference on Machine Learning.\n   http://citeseer.nj.nec.com/joachims98text.html\n\nApteMod is a collection of 10,788 documents from the Reuters financial\nnewswire service, partitioned into a training set with 7769 documents\nand a test set with 3019 documents.  The total size of the corpus is\nabout 43 MB.  It is also available for download from\nhttp://kdd.ics.uci.edu/databases/reuters21578/reuters215

In [5]:
# Number of distinct categories defined in the corpus.
len(dataset.categories())

90

In [6]:
# Number of file ids in the corpus (training and test documents together).
len(dataset.fileids())

10788

In [7]:
# Pick one document id at random to serve as a running example for the
# exploration cells that follow.
# random.choice replaces the original xrange/sample/sorted/index chain, which
# only ran on Python 2 (xrange) and did the same job in a roundabout way.
# NOTE: no seed is set, so re-running this cell generally selects a different
# document than the one shown in the saved output.
fileids = dataset.fileids()
sample_fileid = random.choice(fileids)
sample_fileid

'training/3565'

In [8]:
# Location of the sampled document inside the downloaded corpus archive.
dataset.abspath(sample_fileid)

ZipFilePathPointer(u'/home/felipe/nltk_data/corpora/reuters.zip', u'reuters/training/3565')

In [9]:
# Number of tokens in the sampled document.
len(dataset.words(sample_fileid))

87

In [10]:
# Category count again (same call as cell 5).
len(dataset.categories())

90

In [11]:
# Tokenised view of the sampled document (a flat sequence of tokens).
dataset.words(sample_fileid)

[u'CPC', u'&', u'lt', u';', u'CPC', u'>', u'TO', ...]

In [12]:
# Unprocessed text of the sampled document; note the leftover '&lt;' HTML
# entity in front of ticker symbols.
dataset.raw(sample_fileid)

u'CPC &lt;CPC> TO SELL UNIT TO HI-PORT &lt;HIPT>\n  CPC International Inc\n  said it has agreed in principle to sell its Peterson/Puritain\n  Inc subsidiary to Hi-Port Industries Inc.\n      CPC said the sale is not expected to have a significant\n  impact on its earnings and is subject to approval by boards of\n  both companies.  Terms were not disclosed.\n      Peterson/Purittan is a contract packager of personal care\n  and household products.\n  \n\n'

In [13]:
# Tokenised view again (same call as cell 11).
dataset.words(sample_fileid)

[u'CPC', u'&', u'lt', u';', u'CPC', u'>', u'TO', ...]

In [14]:
# Sentence-level view of the sampled document: a list of token lists.
dataset.sents(sample_fileid)

[[u'CPC', u'&', u'lt', u';', u'CPC', u'>', u'TO', u'SELL', u'UNIT', u'TO', u'HI', u'-', u'PORT', u'&', u'lt', u';', u'HIPT', u'>', u'CPC', u'International', u'Inc', u'said', u'it', u'has', u'agreed', u'in', u'principle', u'to', u'sell', u'its', u'Peterson', u'/', u'Puritain', u'Inc', u'subsidiary', u'to', u'Hi', u'-', u'Port', u'Industries', u'Inc', u'.', u'CPC', u'said', u'the', u'sale', u'is', u'not', u'expected', u'to', u'have', u'a', u'significant', u'impact', u'on', u'its', u'earnings', u'and', u'is', u'subject', u'to', u'approval', u'by', u'boards', u'of', u'both', u'companies', u'.'], [u'Terms', u'were', u'not', u'disclosed', u'.'], ...]

In [15]:
# Paragraph-level view of the sampled document: paragraphs -> sentences -> tokens.
dataset.paras(sample_fileid)

[[[u'CPC', u'&', u'lt', u';', u'CPC', u'>', u'TO', u'SELL', u'UNIT', u'TO', u'HI', u'-', u'PORT', u'&', u'lt', u';', u'HIPT', u'>', u'CPC', u'International', u'Inc', u'said', u'it', u'has', u'agreed', u'in', u'principle', u'to', u'sell', u'its', u'Peterson', u'/', u'Puritain', u'Inc', u'subsidiary', u'to', u'Hi', u'-', u'Port', u'Industries', u'Inc', u'.', u'CPC', u'said', u'the', u'sale', u'is', u'not', u'expected', u'to', u'have', u'a', u'significant', u'impact', u'on', u'its', u'earnings', u'and', u'is', u'subject', u'to', u'approval', u'by', u'boards', u'of', u'both', u'companies', u'.'], [u'Terms', u'were', u'not', u'disclosed', u'.'], [u'Peterson', u'/', u'Purittan', u'is', u'a', u'contract', u'packager', u'of', u'personal', u'care', u'and', u'household', u'products', u'.']]]

In [16]:
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# Collect the raw text of every document, in dataset.fileids() order, as a
# plain list of strings to feed to the scikit-learn text vectorizer.
sklearn_corpus = [dataset.raw(doc_id) for doc_id in dataset.fileids()]

In [17]:
def preprocessor(string):
    """Clean one raw Reuters document before it is tokenised.

    Three things happen, in order: the leftover HTML entity ``&lt;`` (it
    prefixes ticker symbols such as ``&lt;CPC>``) is deleted, accented
    characters are folded to their closest ASCII form, and the text is
    lower-cased.

    The accent folding lives here because scikit-learn skips its own
    ``strip_accents`` / ``lowercase`` handling whenever a custom
    preprocessor is handed to the vectorizer.

    Parameters
    ----------
    string : str
        Raw document text. Byte strings are accepted and decoded as UTF-8
        (undecodable bytes are discarded).

    Returns
    -------
    str
        Lower-cased, ASCII-only text. Characters with no ASCII
        decomposition are dropped.
    """
    # Accept byte strings too: unicodedata.normalize only works on text.
    if isinstance(string, bytes):
        string = string.decode('utf-8', 'replace')
    without_entity = string.replace('&lt;', '')
    # NFKD splits an accented character into base letter + combining mark;
    # the ASCII encode/decode round trip then discards the marks.
    decomposed = unicodedata.normalize('NFKD', without_entity)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    return ascii_only.lower()

In [18]:
# Bag-of-words vectorizer for the Reuters documents.
#   preprocessor  - custom text clean-up applied to each document first
#   stop_words    - drop scikit-learn's built-in English stop-word list
#   min_df=2      - ignore terms that occur in fewer than two documents
# NOTE: scikit-learn ignores `strip_accents` (and `lowercase`) whenever a
# custom `preprocessor` is passed, so any accent folding has to happen inside
# `preprocessor` itself.
vectorizer = CountVectorizer(
    preprocessor=preprocessor,
    strip_accents='ascii',
    stop_words='english',
    min_df=2,
)

In [19]:
# Learn the vocabulary and build the document-term count matrix in one step.
# fit_transform yields the same matrix as fit() followed by transform(), but
# only tokenises the corpus once instead of twice.
X_counts = vectorizer.fit_transform(sklearn_corpus)

In [20]:
# vectorizer.build_analyzer()(dataset.raw(fileid))
#vectorizer.build_tokenizer()("ADVANCED INSTITUTIONAL &lt;AIMS> CUTS WORKFORCE\n  Advanced Institutional ")
#vectorizer.build_preprocessor()("ADVANCED INSTITUTIONAL &lt;AIMS> CUTS WORKFORCE\n  Advanced Institutional ")

In [21]:
# Term counts of the first document, densified and flattened to 1-D for a
# quick look (mostly zeros, as the output shows).
X_counts[0].toarray().ravel()

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
# Re-weight the raw term counts with tf-idf. The fitted transformer is kept
# in `transformer` so it can be reused on other count matrices.
transformer = TfidfTransformer()
X_tfidf = transformer.fit_transform(X_counts)

In [23]:
# tf-idf weights of the first document, densified and flattened to 1-D for a
# quick look.
X_tfidf[0].toarray().ravel()

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [26]:
# Build the multi-label target matrix: one row per document (in
# dataset.fileids() order, matching sklearn_corpus) and one 0/1 column per
# category.
#
# Each document's categories are packed into a single delimited string so
# that pandas' Series.str.get_dummies can expand them into indicator columns.
# NOTE(review): this relies on no category name containing the separator
# character - confirm against dataset.categories().
label_separator = '*'
sklearn_targets = [
    label_separator.join(dataset.categories(fileid))
    for fileid in dataset.fileids()
]

series = pd.Series(sklearn_targets)
targets_df = series.str.get_dummies(sep=label_separator)
targets_matrix = targets_df.values
targets_matrix.shape

(10788, 90)