- [gridsearch-for-an-estimator-inside-a-onevsrestclassifier](http://stackoverflow.com/questions/12632992/gridsearch-for-an-estimator-inside-a-onevsrestclassifier)

- [sample pipeline for text feature extraction and evaluation](http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html)

- [parameter estimation using grid search with cross validation](http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_digits.html#example-model-selection-grid-search-digits-py)

- [gridsearch-searching-for-estimator-parameters](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)

- [gridsearch-for-multilabel-onevsrest-classifier](http://stackoverflow.com/questions/14225882/gridsearch-for-multilabel-onevsrestclassifier)


In [1]:
import numpy as np
import pandas as pd
import scipy
import nltk
import sklearn
import random
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.preprocessing import OneHotEncoder,scale
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# Fetch the NLTK resources this notebook relies on; the log output reports
# when a package is already up to date. The value of the last call is what the
# cell displays.
# NOTE(review): the cells visible here only use dataset.raw()/categories() and
# CountVectorizer's own tokenization -- confirm 'punkt' is still required.
nltk.download('reuters')
nltk.download('punkt') # needed for tokenization

[nltk_data] Downloading package reuters to /home/felipe/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/felipe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Short alias for the NLTK Reuters corpus reader used by the cells below.
dataset = nltk.corpus.reuters

In [4]:
# All file identifiers in the corpus.
# NOTE(review): the cells visible here call dataset.fileids() again instead of
# reusing this variable -- confirm whether it is still needed.
fileids = dataset.fileids()

In [5]:
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# Split the raw documents into train/test lists based on the file-id prefix:
# ids starting with 'training/' go to the training corpus, everything else to
# the test corpus. Corpus order is preserved within each list.
corpus_train, corpus_test = [], []
for fileid in dataset.fileids():
    bucket = corpus_train if fileid.startswith('training/') else corpus_test
    bucket.append(dataset.raw(fileid))

# Displayed as the cell output: (number of training docs, number of test docs).
(len(corpus_train), len(corpus_test))

(7769, 3019)

In [6]:
def preprocessor(string):
    """Normalise a raw document before it is tokenised.

    Every literal ``&lt;`` entity is removed (the match is case-sensitive and
    happens before lowercasing), then the remaining text is returned in lower
    case.
    """
    without_entity = string.replace('&lt;', '')
    return without_entity.lower()

In [7]:
# Bag-of-words feature extractor for the raw documents.
# A custom `preprocessor` replaces CountVectorizer's built-in preprocessing
# stage (accent stripping and lowercasing), so the strip_accents='ascii'
# argument that used to be passed here never took effect and has been dropped.
# Lowercasing still happens because preprocessor() does it itself.
vectorizer = CountVectorizer(
                min_df=10, # tweaking this parameter reduces the length of the feature vector
                preprocessor=preprocessor,
                stop_words='english')

In [8]:
# Stand-alone tf-idf transformer with default settings.
# NOTE(review): the pipeline cell visible further down builds its own
# TfidfTransformer() instead of using this instance -- confirm it is still needed.
transformer = TfidfTransformer()

In [9]:
%%time

# Build the multilabel indicator matrices for the train and test documents.
# Each document's categories are joined with '*' and expanded into one 0/1
# column per category with Series.str.get_dummies.
# NOTE(review): assumes no category name contains '*' -- confirm.
labels_train = []
labels_test = []

for fileid in dataset.fileids():
    categories = '*'.join(dataset.categories(fileid))

    if fileid.startswith('training/'):
        labels_train.append(categories)
    else:
        labels_test.append(categories)

series_train = pd.Series(labels_train)
Y_train_df = series_train.str.get_dummies(sep='*')

# Encoding the test labels on their own derives columns from whatever
# categories happen to occur in the test split. Re-index them onto the training
# columns so that column i means the same label in Y_train, Y_test and the
# classifier's predictions: labels absent from the test split become all-zero
# columns, and labels never seen in training are dropped.
series_test = pd.Series(labels_test)
Y_test_df = (
    series_test.str.get_dummies(sep='*')
    .reindex(columns=Y_train_df.columns, fill_value=0)
)

Y_train = Y_train_df.values
Y_test = Y_test_df.values

CPU times: user 8.7 s, sys: 104 ms, total: 8.81 s
Wall time: 8.86 s


In [10]:
# Per-label text classifier: raw documents -> token counts -> tf-idf -> linear SVM.
# NOTE(review): per the scikit-learn documentation, multi_class='crammer_singer'
# makes LinearSVC ignore `penalty` and `dual`, so the l1 / dual=False settings
# below may have no effect -- confirm which behaviour is intended.
linear_svc = svm.LinearSVC(
    tol=0.00001,
    multi_class='crammer_singer',
    penalty='l1',
    dual=False,
)

clf = Pipeline(steps=[
    ('vect', vectorizer),
    ('tfidf', TfidfTransformer()),
    ('clf', linear_svc),
])

# Wrap the whole pipeline so that it is applied one-vs-rest across the label columns.
meta_clf = OneVsRestClassifier(estimator=clf)

In [11]:
# Display the estimator's repr to inspect the configured hyperparameters.
meta_clf

OneVsRestClassifier(estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 1),
        preprocessor=<function preprocessor a...00,
     multi_class='crammer_singer', penalty='l1', random_state=None,
     tol=1e-05, verbose=0))]),
          n_jobs=1)

In [12]:
# Train on the raw training documents against the label indicator matrix;
# the fitted estimator is the cell's displayed value.
meta_clf.fit(corpus_train,Y_train)

OneVsRestClassifier(estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 1),
        preprocessor=<function preprocessor a...00,
     multi_class='crammer_singer', penalty='l1', random_state=None,
     tol=1e-05, verbose=0))]),
          n_jobs=1)

In [13]:
# Predict labels for the held-out test documents.
Y_pred = meta_clf.predict(corpus_test)

In [14]:
# Micro-averaged F1 of the predictions against the test labels.
# Y_test and Y_pred must use the same label column order for this to be meaningful.
f1_score(Y_test,Y_pred,average='micro')

0.87344842345555718