## Question 7: Pipeline and Grid Search

In [5]:
#import all packages
import numpy as np
import random
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
import matplotlib.pyplot as plt
from nltk.stem.wordnet import WordNetLemmatizer
#all sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline
# used to cache results
import time
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory
import pickle

#prepare two dataset (remove header v.s. not remove)
computer_technology = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
recreational_activity = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

#remove_header
comp_train_remove = fetch_20newsgroups(subset='train', categories=computer_technology, shuffle=True, random_state=42, remove=('headers', 'footers', 'quotes'))
rec_train_remove = fetch_20newsgroups(subset='train', categories=recreational_activity, shuffle=True, random_state=42, remove=('headers', 'footers', 'quotes'))
comp_test_remove = fetch_20newsgroups(subset='test', categories=computer_technology, shuffle=True, random_state=42, remove=('headers', 'footers', 'quotes'))
rec_test_remove = fetch_20newsgroups(subset='test', categories=recreational_activity, shuffle=True, random_state=42, remove=('headers', 'footers', 'quotes'))

#not remove
comp_train = fetch_20newsgroups(subset='train', categories=computer_technology, shuffle=True, random_state=42)
rec_train = fetch_20newsgroups(subset='train', categories=recreational_activity, shuffle=True, random_state=42)
comp_test = fetch_20newsgroups(subset='test', categories=computer_technology, shuffle=True, random_state=42)
rec_test = fetch_20newsgroups(subset='test', categories=recreational_activity, shuffle=True, random_state=42)

# map to binary classification
#remove_header
all_train_remove = comp_train_remove.data+rec_train_remove.data
all_test_remove = comp_test_remove.data+rec_test_remove.data
target_train_remove = [True] * len(comp_train_remove.data) + [False] * len(rec_train_remove.data)
target_test_remove = [True] * len(comp_test_remove.data) + [False] * len(rec_test_remove.data)

#not remove
all_train = comp_train.data+rec_train.data
all_test = comp_test.data+rec_test.data
target_train = [True] * len(comp_train.data) + [False] * len(rec_train.data)
target_test = [True] * len(comp_test.data) + [False] * len(rec_test.data)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [0]:
#Define functions for pipeline
#Feature Extraction
min_df = [3, 5]
#lemmatization_tokenizer
def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n'
    
def lemmatized_tokenizer(text):
    wnl = WordNetLemmatizer()
    clean_text = re.sub(r'[^A-Za-z]', " ", text)
    tokenized_text = nltk.word_tokenize(clean_text) 
    lemaitzed_text = [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) for word, tag in pos_tag(tokenized_text)]
    return lemaitzed_text

#general_tokenizer
def tokenizer(text):
    clean_text = re.sub(r'[^A-Za-z]', " ", text)
    tokenized_text = nltk.word_tokenize(clean_text)
    return tokenized_text
#---------------------------------------------------------------------#
#Dimensionality Reduction
#LSI
svd = TruncatedSVD(n_components=50, random_state=0)
#NMF
nmf = NMF(n_components=50, init='random', random_state=0)
#---------------------------------------------------------------------#
#Classifier
#Support Vector Machine
svm_clf = svm.SVC(probability=True,gamma=1) #best svm gamma = 1
#Logistic Regression (L1 and L2)
logistic_l1_clf = LogisticRegression(penalty='l1',C=10) #est l1 c=10
logistic_l2_clf = LogisticRegression(penalty='l2',C=100) #best l2 c=100
#Gaussian Naive Bayes
GaussianNB_clf = GaussianNB()

In [0]:
#Pipeline for remove header...etc
#traing and evaluate the pipeline
print("Cell started")
start = time.time()

cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)
print("Building Pipeline")
#remove_lemmatized
pipeline1 = Pipeline([
    ('vect', CountVectorizer(stop_words='english', tokenizer=lemmatized_tokenizer)),
    ('tfidf', TfidfTransformer()),
    ('reduce_dim', TruncatedSVD(n_components=50,random_state=0)),
    ('clf', GaussianNB()),
])
#remove_tokenizer
pipeline2 = Pipeline([
    ('vect', CountVectorizer(stop_words='english', tokenizer=tokenizer)),
    ('tfidf', TfidfTransformer()),
    ('reduce_dim', TruncatedSVD(n_components=50,random_state=0)),
    ('clf', GaussianNB()),
])
#not-remove_lemmatized
pipeline3 = Pipeline([
    ('vect', CountVectorizer(stop_words='english', tokenizer=lemmatized_tokenizer)),
    ('tfidf', TfidfTransformer()),
    ('reduce_dim', TruncatedSVD(n_components=50,random_state=0)),
    ('clf', GaussianNB()),
])
#not-remove_tokenizer
pipeline4 = Pipeline([
    ('vect', CountVectorizer(stop_words='english', tokenizer=tokenizer)),
    ('tfidf', TfidfTransformer()),
    ('reduce_dim', TruncatedSVD(n_components=50,random_state=0)),
    ('clf', GaussianNB()),
])
param_grid = [
     {
         'vect__min_df': min_df,
         'reduce_dim': [svd, nmf],
         'clf': [svm_clf, logistic_l1_clf, logistic_l2_clf, GaussianNB_clf]
     }
]

grid1 = GridSearchCV(pipeline1, cv=5, n_jobs=1, param_grid=param_grid, scoring='accuracy')
grid2 = GridSearchCV(pipeline2, cv=5, n_jobs=1, param_grid=param_grid, scoring='accuracy')
#grid3 = GridSearchCV(pipeline3, cv=5, n_jobs=1, param_grid=param_grid, scoring='accuracy')
#grid4 = GridSearchCV(pipeline4, cv=5, n_jobs=1, param_grid=param_grid, scoring='accuracy')

# Fit REMOVED HEADERS AND FOOTERS, LEMMITIZED
print("Fitting grid 1...")
t1 = time.time()
grid1.fit(all_train_remove, target_train_remove)
print("Fit grid 1 in %f sec" % (time.time()-t1))

print("Fitting grid 2...")
t1 = time.time()
# Fit REMOVED HEADERS AND FOOTERS, NOT LEMMITIZED
grid2.fit(all_train_remove, target_train_remove)
print("Fit grid 2 in %f sec" % (time.time()-t1))

print("Fitting grid 3...")
t1 = time.time()
# Fit KEPT HEADERS AND FOOTERS, LEMMITIZED
grid3.fit(all_train, target_train)
print("Fit grid 3 in %f sec" % (time.time()-t1))

print("Fitting grid 4...")
t1 = time.time()
# Fit KEPT HEADERS AND FOOTERS, NOT LEMMITIZED
grid4.fit(all_train, target_train)
print("Fit grid 4 in %f sec" % (time.time()-t1))

pickle.dump( grid1, open("grid1.pkl","wb"))
pickle.dump( grid2, open("grid2.pkl","wb"))
pickle.dump( grid3, open("grid3.pkl","wb"))
pickle.dump( grid4, open("grid4.pkl","wb"))

rmtree(cachedir)

end = time.time()
print(end - start)

Cell started
Building Pipeline
Fitting grid 1...


You provided "cachedir='/tmp/tmpo_oo2x6g'", use "location='/tmp/tmpo_oo2x6g'" instead.
  """
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, go

Fit grid 1 in 4079.723052 sec
Fitting grid 2...


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solv

Fit grid 2 in 825.662278 sec
Fitting grid 3...


  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only

KeyboardInterrupt: ignored

In [0]:
import pandas as pd
#remove header, lemmatized
pd.DataFrame(grid1.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_reduce_dim,param_vect__min_df,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,40.640022,0.898973,9.453645,0.992695,"SVC(C=1.0, break_ties=False, cache_size=200, c...","TruncatedSVD(algorithm='randomized', n_compone...",3,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.92397,0.951426,0.930233,0.942918,0.917548,0.933219,0.012374,4
1,40.390951,1.062882,9.457797,1.025638,"SVC(C=1.0, break_ties=False, cache_size=200, c...","TruncatedSVD(algorithm='randomized', n_compone...",5,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.927138,0.948258,0.929175,0.945032,0.920719,0.934064,0.010693,3
2,52.366115,2.147812,9.62698,1.061701,"SVC(C=1.0, break_ties=False, cache_size=200, c...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",3,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.885956,0.898627,0.887949,0.89852,0.861522,0.886515,0.013549,10
3,50.63254,0.802019,9.557514,1.048083,"SVC(C=1.0, break_ties=False, cache_size=200, c...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",5,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.899683,0.911299,0.896406,0.909091,0.891121,0.90152,0.007624,9
4,37.942688,1.089744,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","TruncatedSVD(algorithm='randomized', n_compone...",3,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,13
5,37.90011,1.126926,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","TruncatedSVD(algorithm='randomized', n_compone...",5,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,14
6,47.175756,2.060629,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",3,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,15
7,46.071314,0.696679,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",5,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,16
8,38.055257,1.133176,9.444636,1.073403,"LogisticRegression(C=100, class_weight=None, d...","TruncatedSVD(algorithm='randomized', n_compone...",3,"{'clf': LogisticRegression(C=100, class_weight...",0.932418,0.956705,0.933404,0.952431,0.935518,0.942095,0.010322,1
9,38.142144,1.00788,9.457181,1.011294,"LogisticRegression(C=100, class_weight=None, d...","TruncatedSVD(algorithm='randomized', n_compone...",5,"{'clf': LogisticRegression(C=100, class_weight...",0.930306,0.957761,0.934461,0.950317,0.936575,0.941884,0.010398,2


In [0]:
#remove header, not lemmatized
pd.DataFrame(grid2.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_reduce_dim,param_vect__min_df,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,6.61689,0.02971,0.930541,0.049903,"SVC(C=1.0, break_ties=False, cache_size=200, c...","TruncatedSVD(algorithm='randomized', n_compone...",3,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.918691,0.944034,0.918605,0.938689,0.914376,0.926879,0.012047,5
1,6.49145,0.051189,0.928205,0.047754,"SVC(C=1.0, break_ties=False, cache_size=200, c...","TruncatedSVD(algorithm='randomized', n_compone...",5,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.915523,0.94509,0.919662,0.936575,0.920719,0.927514,0.011337,4
2,19.766514,2.769808,1.078769,0.042517,"SVC(C=1.0, break_ties=False, cache_size=200, c...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",3,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.881732,0.903907,0.893235,0.900634,0.874207,0.890743,0.011243,10
3,17.126656,2.085552,1.042649,0.048023,"SVC(C=1.0, break_ties=False, cache_size=200, c...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",5,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.892291,0.931362,0.900634,0.915433,0.889006,0.905746,0.015732,9
4,3.80797,0.059128,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","TruncatedSVD(algorithm='randomized', n_compone...",3,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,13
5,3.767048,0.044233,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","TruncatedSVD(algorithm='randomized', n_compone...",5,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,14
6,14.029836,2.607554,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",3,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,15
7,12.553254,2.016797,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",5,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,16
8,3.925793,0.076674,0.879249,0.049232,"LogisticRegression(C=100, class_weight=None, d...","TruncatedSVD(algorithm='randomized', n_compone...",3,"{'clf': LogisticRegression(C=100, class_weight...",0.93453,0.949314,0.939746,0.938689,0.930233,0.938502,0.006366,1
9,3.844805,0.04667,0.872198,0.047547,"LogisticRegression(C=100, class_weight=None, d...","TruncatedSVD(algorithm='randomized', n_compone...",5,"{'clf': LogisticRegression(C=100, class_weight...",0.933474,0.95037,0.938689,0.935518,0.934461,0.938502,0.006187,1


In [7]:
#print("Cell started")
start = time.time()

cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)
print("Building Pipeline")

#not-remove_lemmatized
pipeline3 = Pipeline([
    ('vect', CountVectorizer(stop_words='english', tokenizer=lemmatized_tokenizer)),
    ('tfidf', TfidfTransformer()),
    ('reduce_dim', TruncatedSVD(n_components=50,random_state=0)),
    ('clf', GaussianNB()),
])
#not-remove_tokenizer
pipeline4 = Pipeline([
    ('vect', CountVectorizer(stop_words='english', tokenizer=tokenizer)),
    ('tfidf', TfidfTransformer()),
    ('reduce_dim', TruncatedSVD(n_components=50,random_state=0)),
    ('clf', GaussianNB()),
])
param_grid = [
     {
         'vect__min_df': min_df,
         'reduce_dim': [svd, nmf],
         'clf': [svm_clf, logistic_l1_clf, logistic_l2_clf, GaussianNB_clf]
     }
]


grid3 = GridSearchCV(pipeline3, cv=5, n_jobs=1, param_grid=param_grid, scoring='accuracy')
grid4 = GridSearchCV(pipeline4, cv=5, n_jobs=1, param_grid=param_grid, scoring='accuracy')

print("Fitting grid 3...")
t1 = time.time()
# Fit KEPT HEADERS AND FOOTERS, LEMMITIZED
grid3.fit(all_train, target_train)
print("Fit grid 3 in %f sec" % (time.time()-t1))

print("Fitting grid 4...")
t1 = time.time()
# Fit KEPT HEADERS AND FOOTERS, NOT LEMMITIZED
grid4.fit(all_train, target_train)
print("Fit grid 4 in %f sec" % (time.time()-t1))

pickle.dump( grid3, open("grid3.pkl","wb"))
pickle.dump( grid4, open("grid4.pkl","wb"))

rmtree(cachedir)

end = time.time()
print(end - start)

You provided "cachedir='/tmp/tmpf94uvl68'", use "location='/tmp/tmpf94uvl68'" instead.
  after removing the cwd from sys.path.
  'stop_words.' % sorted(inconsistent))


Building Pipeline
Fitting grid 3...


  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalt

Fit grid 3 in 6611.734051 sec
Fitting grid 4...


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solv

Fit grid 4 in 1290.070636 sec
7902.023711681366


In [8]:
import pandas as pd
#no remove header, lemmatized
pd.DataFrame(grid3.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_reduce_dim,param_vect__min_df,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,64.181182,0.920267,15.444153,1.095389,"SVC(C=1.0, break_ties=False, cache_size=200, c...","TruncatedSVD(algorithm='randomized', n_compone...",3,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.970433,0.971489,0.971459,0.96723,0.98203,0.972528,0.004999,3
1,63.533724,1.108783,15.341253,1.036949,"SVC(C=1.0, break_ties=False, cache_size=200, c...","TruncatedSVD(algorithm='randomized', n_compone...",5,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.971489,0.971489,0.971459,0.96723,0.978858,0.972105,0.003756,4
2,84.775296,1.483801,15.486677,1.058136,"SVC(C=1.0, break_ties=False, cache_size=200, c...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",3,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.93981,0.947202,0.935518,0.955603,0.94926,0.945478,0.007087,9
3,78.452185,2.158407,15.555238,1.053808,"SVC(C=1.0, break_ties=False, cache_size=200, c...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",5,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.93981,0.947202,0.947146,0.950317,0.960888,0.949073,0.006843,7
4,61.79689,1.043505,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","TruncatedSVD(algorithm='randomized', n_compone...",3,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,13
5,61.610356,1.110927,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","TruncatedSVD(algorithm='randomized', n_compone...",5,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,14
6,79.313933,1.255128,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",3,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,15
7,74.9266,2.193917,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",5,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,16
8,62.216436,0.994847,15.337849,1.080174,"LogisticRegression(C=100, class_weight=None, d...","TruncatedSVD(algorithm='randomized', n_compone...",3,"{'clf': LogisticRegression(C=100, class_weight...",0.969377,0.972545,0.972516,0.971459,0.983087,0.973797,0.004786,1
9,61.669585,1.066868,15.280463,1.086692,"LogisticRegression(C=100, class_weight=None, d...","TruncatedSVD(algorithm='randomized', n_compone...",5,"{'clf': LogisticRegression(C=100, class_weight...",0.970433,0.972545,0.971459,0.969345,0.980973,0.972951,0.004149,2


In [9]:
#mo remove header, no lemmatized
pd.DataFrame(grid4.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_reduce_dim,param_vect__min_df,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,8.546626,0.063307,1.425993,0.048927,"SVC(C=1.0, break_ties=False, cache_size=200, c...","TruncatedSVD(algorithm='randomized', n_compone...",3,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.968321,0.977825,0.968288,0.965116,0.97463,0.970836,0.004664,4
1,8.341619,0.065242,1.424892,0.049552,"SVC(C=1.0, break_ties=False, cache_size=200, c...","TruncatedSVD(algorithm='randomized', n_compone...",5,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.970433,0.976769,0.96723,0.966173,0.978858,0.971893,0.005077,2
2,30.626469,2.381523,1.670159,0.033657,"SVC(C=1.0, break_ties=False, cache_size=200, c...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",3,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.946146,0.948258,0.936575,0.942918,0.946089,0.943997,0.004084,10
3,24.574407,1.55689,1.600908,0.052012,"SVC(C=1.0, break_ties=False, cache_size=200, c...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",5,"{'clf': SVC(C=1.0, break_ties=False, cache_siz...",0.93453,0.953537,0.927061,0.948203,0.95666,0.943998,0.011362,9
4,6.241829,0.089456,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","TruncatedSVD(algorithm='randomized', n_compone...",3,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,13
5,6.135799,0.034844,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","TruncatedSVD(algorithm='randomized', n_compone...",5,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,14
6,24.226769,2.370256,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",3,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,15
7,19.760971,1.298014,0.0,0.0,"LogisticRegression(C=10, class_weight=None, du...","NMF(alpha=0.0, beta_loss='frobenius', init='ra...",5,"{'clf': LogisticRegression(C=10, class_weight=...",,,,,,,,16
8,6.273757,0.041735,1.388677,0.046304,"LogisticRegression(C=100, class_weight=None, d...","TruncatedSVD(algorithm='randomized', n_compone...",3,"{'clf': LogisticRegression(C=100, class_weight...",0.971489,0.977825,0.968288,0.971459,0.980973,0.974006,0.004661,1
9,6.190693,0.081876,1.394775,0.036613,"LogisticRegression(C=100, class_weight=None, d...","TruncatedSVD(algorithm='randomized', n_compone...",5,"{'clf': LogisticRegression(C=100, class_weight...",0.968321,0.974657,0.966173,0.968288,0.978858,0.971259,0.004746,3
