In [1]:
import numpy as np
import scipy
import nltk
import sklearn
import random
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [2]:
# Fetch the NLTK data packages this notebook relies on. The value shown
# under the cell is the return value of the last download call.
nltk.download('reuters') # the Reuters corpus loaded in the next cell
nltk.download('punkt') # needed for tokenization

[nltk_data] Downloading package reuters to /home/felipe/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/felipe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Bind NLTK's Reuters corpus object to a short name; every later cell
# reads documents through `dataset`.
dataset = nltk.corpus.reuters
# Display the location the corpus is being read from.
dataset.root

ZipFilePathPointer(u'/home/felipe/nltk_data/corpora/reuters.zip', u'reuters/')

In [4]:
dataset.readme()

u'\n      The Reuters-21578 benchmark corpus, ApteMod version\n\nThis is a publically available version of the well-known Reuters-21578\n"ApteMod" corpus for text categorization.  It has been used in\npublications like these:\n\n * Yiming Yang and X. Liu. "A re-examination of text categorization\n   methods".  1999.  Proceedings of 22nd Annual International SIGIR.\n   http://citeseer.nj.nec.com/yang99reexamination.html\n\n * Thorsten Joachims. "Text categorization with support vector\n   machines: learning with many relevant features".  1998. Proceedings\n   of ECML-98, 10th European Conference on Machine Learning.\n   http://citeseer.nj.nec.com/joachims98text.html\n\nApteMod is a collection of 10,788 documents from the Reuters financial\nnewswire service, partitioned into a training set with 7769 documents\nand a test set with 3019 documents.  The total size of the corpus is\nabout 43 MB.  It is also available for download from\nhttp://kdd.ics.uci.edu/databases/reuters21578/reuters215

In [5]:
len(dataset.categories())

90

In [6]:
len(dataset.fileids())

10788

In [7]:
fileids = dataset.fileids()
# Pick a single document id uniformly at random to inspect in the cells
# below. random.choice does this directly (no index sampling, sorting or
# [0] unpacking) and avoids xrange, which only exists on Python 2.
# No seed is set in this notebook, so the chosen id changes between runs.
sample_fileid = random.choice(fileids)
sample_fileid

'training/356'

In [8]:
dataset.abspath(sample_fileid)

ZipFilePathPointer(u'/home/felipe/nltk_data/corpora/reuters.zip', u'reuters/training/356')

In [9]:
len(dataset.words(sample_fileid))

546

In [10]:
dataset.words(sample_fileid)

[u'HUTCHISON', u'SEES', u'HIGHER', u'PAYOUT', u',', ...]

In [11]:
dataset.raw(sample_fileid)

u"HUTCHISON SEES HIGHER PAYOUT, SATISFACTORY PROFITS\n  Hutchison Whampoa Ltd &lt;HWHH.HK>\n  expects satisfactory profits in 1987 and will pay a higher\n  dividend for the year, chairman Li Ka-shing said.\n      He did not make any specific projections for the company's\n  earnings this year but he said the firm will pay a dividend of\n  not less than 32.5 cents per share after a proposed\n  four-for-one stock split and a one-for-four bonus issue.\n      It paid total dividends of 1.30 dlrs per share last year,\n  equal to 26 cents per share, adjusting for the bonus and share\n  split.\n      Hutchison, which has operations ranging from trading to\n  property and container terminals, earlier reported after-tax\n  profits of 1.62 billion dlrs against 1.19 billion dlrs in 1985.\n      The 1986 total excluded extraordinary gains of 563 mln\n  dlrs, partly from the sale of some of its stake in the South\n  China Morning Post, the leading English language newspaper,\n  compared with 369 ml

In [12]:
dataset.words(sample_fileid)

[u'HUTCHISON', u'SEES', u'HIGHER', u'PAYOUT', u',', ...]

In [13]:
dataset.sents(sample_fileid)

[[u'HUTCHISON', u'SEES', u'HIGHER', u'PAYOUT', u',', u'SATISFACTORY', u'PROFITS', u'Hutchison', u'Whampoa', u'Ltd', u'&', u'lt', u';', u'HWHH', u'.', u'HK', u'>', u'expects', u'satisfactory', u'profits', u'in', u'1987', u'and', u'will', u'pay', u'a', u'higher', u'dividend', u'for', u'the', u'year', u',', u'chairman', u'Li', u'Ka', u'-', u'shing', u'said', u'.'], [u'He', u'did', u'not', u'make', u'any', u'specific', u'projections', u'for', u'the', u'company', u"'", u's', u'earnings', u'this', u'year', u'but', u'he', u'said', u'the', u'firm', u'will', u'pay', u'a', u'dividend', u'of', u'not', u'less', u'than', u'32', u'.', u'5', u'cents', u'per', u'share', u'after', u'a', u'proposed', u'four', u'-', u'for', u'-', u'one', u'stock', u'split', u'and', u'a', u'one', u'-', u'for', u'-', u'four', u'bonus', u'issue', u'.'], ...]

In [14]:
dataset.paras(sample_fileid)

[[[u'HUTCHISON', u'SEES', u'HIGHER', u'PAYOUT', u',', u'SATISFACTORY', u'PROFITS', u'Hutchison', u'Whampoa', u'Ltd', u'&', u'lt', u';', u'HWHH', u'.', u'HK', u'>', u'expects', u'satisfactory', u'profits', u'in', u'1987', u'and', u'will', u'pay', u'a', u'higher', u'dividend', u'for', u'the', u'year', u',', u'chairman', u'Li', u'Ka', u'-', u'shing', u'said', u'.'], [u'He', u'did', u'not', u'make', u'any', u'specific', u'projections', u'for', u'the', u'company', u"'", u's', u'earnings', u'this', u'year', u'but', u'he', u'said', u'the', u'firm', u'will', u'pay', u'a', u'dividend', u'of', u'not', u'less', u'than', u'32', u'.', u'5', u'cents', u'per', u'share', u'after', u'a', u'proposed', u'four', u'-', u'for', u'-', u'one', u'stock', u'split', u'and', u'a', u'one', u'-', u'for', u'-', u'four', u'bonus', u'issue', u'.'], [u'It', u'paid', u'total', u'dividends', u'of', u'1', u'.', u'30', u'dlrs', u'per', u'share', u'last', u'year', u',', u'equal', u'to', u'26', u'cents', u'per', u'share', u'

In [15]:
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# Build a plain Python list holding the raw text of every document, in the
# order returned by dataset.fileids(). This list is what the vectorizer is
# fitted on further down.
sklearn_corpus = []
for fileid in dataset.fileids():
    # dataset.raw() gives the unprocessed text of one document.
    document = dataset.raw(fileid)
    sklearn_corpus.append(document)

In [16]:
def preprocessor(string):
    """Normalise one raw document before it is tokenised.

    Steps, in order:
      1. remove the literal HTML entity ``&lt;`` (it precedes ticker
         symbols in the Reuters text, e.g. ``&lt;HWHH.HK>``);
      2. fold accented characters to their closest ASCII form;
      3. lower-case the result.

    Accent folding is done here because scikit-learn skips its own
    ``strip_accents`` / ``lowercase`` handling whenever a custom
    ``preprocessor`` callable is passed to the vectorizer.

    Parameters
    ----------
    string : str (or bytes)
        Raw document text. Byte strings are decoded as UTF-8 first.

    Returns
    -------
    str
        The cleaned, ASCII-only, lower-cased text.
    """
    import unicodedata  # local import keeps this cell self-contained

    # Accept byte strings too (e.g. a Python 2 str literal typed by hand);
    # unicodedata.normalize only works on text.
    if isinstance(string, bytes):
        string = string.decode('utf-8', 'replace')

    # '&lt;' is a fixed literal, so a plain replace is enough (no regex).
    without_entity = string.replace('&lt;', '')

    # NFKD splits accented characters into base letter + combining mark;
    # encoding to ASCII with 'ignore' then drops the marks.
    ascii_text = (unicodedata.normalize('NFKD', without_entity)
                  .encode('ascii', 'ignore')
                  .decode('ascii'))
    return ascii_text.lower()

In [17]:
# Bag-of-words vectorizer for the corpus.
# NOTE: scikit-learn documents that a custom `preprocessor` overrides its
# built-in strip_accents/lowercase stage, so strip_accents='ascii' has no
# effect on its own here; any accent folding has to happen inside
# `preprocessor`.
vectorizer = CountVectorizer(
    preprocessor=preprocessor,
    strip_accents='ascii',
    min_df=2,
)

In [18]:
# Learn the vocabulary and build the document-term count matrix in one
# pass. fit_transform gives the same result as fit() followed by
# transform() on the same data, without analysing every document twice.
X_counts = vectorizer.fit_transform(sklearn_corpus)

In [19]:
# vectorizer.build_analyzer()(dataset.raw(fileid))
#vectorizer.build_tokenizer()("ADVANCED INSTITUTIONAL &lt;AIMS> CUTS WORKFORCE\n  Advanced Institutional ")
#vectorizer.build_preprocessor()("ADVANCED INSTITUTIONAL &lt;AIMS> CUTS WORKFORCE\n  Advanced Institutional ")

In [20]:
# Print arrays in full instead of numpy's "..." summary. The threshold must
# be a number: the string 'nan' only worked by accident on Python 2 and is
# rejected by current numpy. np.inf means "never summarise".
# This is a session-wide setting and affects every later array display.
np.set_printoptions(threshold=np.inf)
# Term counts of the first document as a flat dense vector.
X_counts[0].toarray().ravel()

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [21]:
# Re-weight the raw term counts with TF-IDF. The fitted transformer is kept
# under its own name so it can be reused on other count matrices.
transformer = TfidfTransformer()
X_tfidf = transformer.fit_transform(X_counts)

In [22]:
X_tfidf[0].toarray().ravel()

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  