In [74]:
import numpy as np
import pandas as pd
import scipy
import nltk
import sklearn
import random
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.preprocessing import OneHotEncoder,scale, MinMaxScaler, binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn import svm
from sklearn.neural_network import BernoulliRBM
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

In [12]:
nltk.download('reuters')
nltk.download('punkt') # needed for tokenization

[nltk_data] Downloading package reuters to /home/felipe/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/felipe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
dataset = nltk.corpus.reuters

In [14]:
fileids = dataset.fileids()

In [15]:
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
corpus_train = []
corpus_test = []
for fileid in dataset.fileids():
    document = dataset.raw(fileid)
    if re.match('training/',fileid):
        corpus_train.append(document)
    else:
        corpus_test.append(document)

In [16]:
def preprocessor(string):
    repl = re.sub('&lt;','',string)
    return repl.lower()

In [7]:
vectorizer = CountVectorizer(
                min_df=10, # tweaking this parameter reduces the length of the feature vector
                strip_accents='ascii',
                preprocessor=preprocessor,
                stop_words='english')

In [8]:
# need to use both corpuses for fitting because otherwise there may be words that only occur in the
# training set or in the test set
full_corpus = corpus_train + corpus_test
vectorizer.fit(full_corpus)

X_train_counts = vectorizer.transform(corpus_train)
X_test_counts = vectorizer.transform(corpus_test)
X_full_counts = vectorizer.transform(full_corpus)

In [62]:
X_train_bin = binarize(X_train_counts)
X_test_bin = binarize(X_test_counts)
X_full_bin = binarize(X_full_counts)

In [63]:
transformer = TfidfTransformer()
# again, we need to fit the transformer to all documents (train and test)
transformer.fit(X_full_counts)

X_train_tfidf = transformer.transform(X_train_counts)
X_test_tfidf = transformer.transform(X_test_counts)
X_full_tfidf = transformer.transform(X_full_counts)
(X_train_tfidf.shape,X_test_tfidf.shape)

((7769, 6462), (3019, 6462))

In [64]:
%%time

Y_train = []
Y_test = []

for (idx,fileid) in enumerate(dataset.fileids()):    
    categories = '*'.join(dataset.categories(fileid))

    if re.match('training/',fileid):
        Y_train.append(categories)
    else:
        Y_test.append(categories)

series_train = pd.Series(Y_train)
Y_train_df = series_train.str.get_dummies(sep='*')

series_test = pd.Series(Y_test)
Y_test_df = series_test.str.get_dummies(sep='*')

Y_train = Y_train_df.values
Y_test = Y_test_df.values

CPU times: user 541 ms, sys: 15.5 ms, total: 557 ms
Wall time: 542 ms


In [20]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_tfidf.toarray())
X_test_scaled = scaler.transform(X_test_tfidf.toarray())

In [84]:
parameters = {
    'rbm__n_components':[2,5,10,25,30,50],
    'rbm__n_iter':[5,10,20,50,100], 
    'rbm__batch_size': [10,50,100,500],
    'rbm__learning_rate': [0.1,0.2,0.3,0.6]}

pipeline = Pipeline([
        ('rbm',BernoulliRBM()),
        ('svmovr',OneVsRestClassifier(svm.LinearSVC(penalty='l1', dual = False, multi_class='crammer_singer')))
    ])

model = GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=1)

model.fit(X_train_scaled)

ValueError: Expected array-like (array or non-string sequence), got None

In [69]:
X_train_rbm = model.transform(X_train_scaled)
X_test_rbm = model.transform(X_test_scaled)

In [73]:
X_train_rbm[0:5]

array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]])

In [33]:
clf = svm.LinearSVC(penalty='l1', dual = False, multi_class='crammer_singer')
meta_clf = OneVsRestClassifier(clf)
meta_clf

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='crammer_singer', penalty='l1', random_state=None,
     tol=1e-05, verbose=0),
          n_jobs=1)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='crammer_singer', penalty='l1', random_state=None,
     tol=1e-05, verbose=0),
          n_jobs=1)

In [35]:
Y_pred = meta_clf.predict(X_test_rbm)

In [36]:
f1_score(Y_test,Y_pred,average='micro')

  'precision', 'predicted', average, warn_for)


0.0