- [gridsearch-for-an-estimator-inside-a-onevsrestclassifier](http://stackoverflow.com/questions/12632992/gridsearch-for-an-estimator-inside-a-onevsrestclassifier)

- [sample pipeline for text feature extraction and evaluation](http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html)

- [parameter estimation using grid search with cross validation](http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_digits.html#example-model-selection-grid-search-digits-py)

- [gridsearch-searching-for-estimator-parameters](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)

- [gridsearch-for-multilabel-onevsrest-classifier](http://stackoverflow.com/questions/14225882/gridsearch-for-multilabel-onevsrestclassifier)


In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
import scipy
import nltk
import sklearn
import random
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.preprocessing import OneHotEncoder,scale
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn import svm
from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.pipeline import Pipeline

In [2]:
# Fetch the NLTK data packages this notebook reads: the Reuters corpus itself
# and the 'punkt' tokenizer models.
nltk.download('reuters')
nltk.download('punkt') # needed for tokenization

[nltk_data] Downloading package reuters to /home/felipe/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/felipe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Handle on the NLTK Reuters corpus reader; later cells query it for file ids,
# raw document text and per-document category labels.
dataset = nltk.corpus.reuters

In [4]:
# Every document id in the corpus.
# NOTE(review): the later cells visible here call dataset.fileids() again
# rather than reading this variable.
fileids = dataset.fileids()

In [5]:
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# Split the raw documents into a training and a test corpus. Membership is
# decided by the file id: ids starting with 'training/' go to the training
# corpus, every other id goes to the test corpus.
corpus_train, corpus_test = [], []
for fileid in dataset.fileids():
    bucket = corpus_train if fileid.startswith('training/') else corpus_test
    bucket.append(dataset.raw(fileid))

len(corpus_train), len(corpus_test)

(7769, 3019)

In [6]:
def preprocessor(string):
    """Normalise a raw document before tokenisation.

    Removes every occurrence of the literal HTML entity ``&lt;`` (matched
    case-sensitively, before lower-casing) and returns the remaining text
    in lower case.
    """
    # '&lt;' contains no regex metacharacters, so a plain substring
    # replacement is equivalent to a regex substitution here.
    return string.replace('&lt;', '').lower()

In [10]:
# vectorizer = CountVectorizer(
#                 min_df=10, # tweaking this parameter reduces the length of the feature vector
#                 strip_accents='ascii',
#                 preprocessor=preprocessor,
#                 stop_words='english')

In [11]:
# Stand-alone TF-IDF transformer instance.
# NOTE(review): not referenced by the later cells visible here; the pipeline
# defined below constructs its own TfidfTransformer.
transformer = TfidfTransformer()

In [12]:
# Build the multi-label indicator matrices for the training and test documents.
# A document's categories are joined with '*' so that Series.str.get_dummies
# can expand them into one 0/1 column per category.
labels_train = []
labels_test = []

for fileid in dataset.fileids():
    categories = '*'.join(dataset.categories(fileid))

    # Same split rule as the corpus cell: ids under 'training/' are training
    # documents, everything else is a test document.
    if fileid.startswith('training/'):
        labels_train.append(categories)
    else:
        labels_test.append(categories)

series_train = pd.Series(labels_train)
Y_train_df = series_train.str.get_dummies(sep='*')

# get_dummies derives its columns from the values it sees, so the test frame
# is not guaranteed to have the same categories, in the same order, as the
# training frame. Reindex it onto the training columns so that column j means
# the same category in Y_train, Y_test and the classifier's predictions.
# Categories missing from the test split become all-zero columns; categories
# that occur only in the test split are dropped (no classifier is trained for
# them, so they could never be predicted anyway).
series_test = pd.Series(labels_test)
Y_test_df = (
    series_test.str.get_dummies(sep='*')
    .reindex(columns=Y_train_df.columns, fill_value=0)
)

Y_train = Y_train_df.values
Y_test = Y_test_df.values

Y_train.shape

(7769, 90)

In [14]:
# Text-classification pipeline: raw text -> token counts -> TF-IDF -> linear SVM.
# The whole pipeline is the base estimator of a one-vs-rest wrapper, so each
# pipeline parameter is addressed as 'estimator__<step>__<param>'.
text_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm.LinearSVC()),
])
clf = OneVsRestClassifier(text_pipeline)

# Grid to explore. Single-element lists pin a setting; only min_df, max_df and
# ngram_range actually vary.
parameters = [
    {
        # classifier settings
        "estimator__clf__penalty": ["l1"],
        "estimator__clf__dual": [False],
        "estimator__clf__multi_class": ["crammer_singer"],
        "estimator__clf__tol": [0.001],
        # vectorizer settings
        "estimator__vect__min_df": [2, 3, 5, 10, 20, 30, 40],
        "estimator__vect__max_df": [1.0, 0.95, 0.9, 0.85, 0.8],
        "estimator__vect__preprocessor": [preprocessor],
        "estimator__vect__stop_words": ['english'],
        "estimator__vect__strip_accents": ['ascii'],
        "estimator__vect__ngram_range": [(1, 1), (1, 2)],
    },
]

# parameters = [
#     { 
#           "estimator__clf__penalty": ["l2"], # default value
#           "estimator__clf__C": [1,10,100,1000],
#           "estimator__clf__multi_class":["crammer_singer","ovr"],
#           "estimator__clf__tol": [0.1,0.01,0.001,0.00001]
#     }
#     ]
# clf.get_params()

In [None]:
# Track the best micro-averaged F1 seen so far and the grid that produced it.
# The starting score must be *below* any attainable value: starting at +inf
# made `current_score > best_score` permanently false, so best_score stayed
# inf and best_grid was never assigned.
best_score = float("-inf")
best_grid = None

# The grid is searched manually because the corpus comes with a fixed
# (mod-apte) train/test split, so randomly re-drawn cross-validation folds
# are not wanted here.
# NOTE(review): newer scikit-learn versions can express a fixed split to
# GridSearchCV through a predefined-split CV object -- confirm for the
# installed version before replacing this loop.
# NOTE(review): the grid is selected on the test set, so best_score is an
# optimistic estimate of generalisation performance.
for g in ParameterGrid(parameters):
    # Reconfigure the shared classifier in place, then refit from scratch.
    clf.set_params(**g)
    clf.fit(corpus_train, Y_train)

    Y_pred = clf.predict(corpus_test)

    current_score = f1_score(Y_test, Y_pred, average='micro')

    print("current_score was {} and the current grid was {}".format(current_score, g))

    # Strict '>' keeps the first grid that reaches a given score on ties.
    if current_score > best_score:
        best_score = current_score
        best_grid = g

current_score was 0.874052894925 and the current grid was {'estimator__clf__penalty': 'l1', 'estimator__vect__stop_words': 'english', 'estimator__clf__multi_class': 'crammer_singer', 'estimator__clf__tol': 0.001, 'estimator__vect__preprocessor': <function preprocessor at 0x7f8c6083b7d0>, 'estimator__vect__strip_accents': 'ascii', 'estimator__clf__dual': False, 'estimator__vect__min_df': 1, 'estimator__vect__ngram_range': (1, 1)}


In [None]:
best_score

In [None]:
best_grid