## Import libraries

In [47]:
from pprint import pprint
from time import time
import logging
import numpy as np

from gensim.models import Word2Vec, Doc2Vec, doc2vec, KeyedVectors
from gensim.utils import simple_preprocess, tokenize
from gensim.sklearn_api import W2VTransformer

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from sklearn.base import BaseEstimator, TransformerMixin

## Load news categories

In [2]:
# Load some categories from the training set
categories = [
    'rec.motorcycles',
    'sci.med',
    'sci.space',
    'talk.politics.guns'
]

In [3]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['rec.motorcycles', 'sci.med', 'sci.space', 'talk.politics.guns']


## Fetch documents for listed categories

In [4]:
train_data = fetch_20newsgroups(subset='train', categories=categories)
test_data = fetch_20newsgroups(subset='test', categories=categories)
print(f"{len(train_data.filenames)} train documents")
print(f"{len(test_data.filenames)} test documents")
print(f"{len(train_data.target_names)} categories")

2331 train documents
1552 test documents
4 categories


**For text feature extractor, we will be using 2 of them.**

1. HashingVectorizer
2. TfidfVectorizer (same as CountVectorizer + TfidfTransformer)
3. Word2Vec
4. Doc2Vec

**For Classifier, we will be using 4 of them**

1. Multinomial Naive Bayes
2. Logistic Regression
3. Support Vector Machine
4. Decision Tree

## Vectorizing available data for Word2Vec

In [38]:
path_to_model = "GoogleNews-vectors-negative300.bin"
w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
text_processed = [simple_preprocess(sent) for sent in train_data.data]

def embedding_features(list_of_lists):
    DIMENSION = 300
    zero_vector = np.zeros(DIMENSION)
    feats = []
    for tokens in list_of_lists:
        feat_for_this = np.zeros(DIMENSION)
        count_for_this = 0
        for token in tokens:
            if token in w2v_model:
                feat_for_this += w2v_model[token]
                count_for_this += 1
        feats.append(feat_for_this/count_for_this)
    return feats

w2v_train_vectors = embedding_features(text_processed)

## Vectorizing available data for Doc2Vec

In [55]:
d2v_train = [doc2vec.TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(text_processed)]

model = Doc2Vec(size=300, alpha=0.025, epochs=100)
model.build_vocab(d2v_train)
model.train(d2v_train, total_examples=model.corpus_count, epochs=model.epochs)

vocab = model.wv.vocab

def embedding_features(list_of_lists):
    DIMENSION = 300
    zero_vector = np.zeros(DIMENSION)
    feats = []
    for tokens in list_of_lists:
        feat_for_this = np.zeros(DIMENSION)
        count_for_this = 0
        for token in tokens:
            if token in vocab.keys():
                feat_for_this += model[token]
                count_for_this += 1
        feats.append(feat_for_this / count_for_this)
    return feats

d2v_train_vectors = embedding_features(text_processed)

## Using MultinomialNB with all 4 Vectorizer

### HashingVectorizer with MultinomialNavieBayes

In [76]:
# Creating pipleine
pipeline_hash_mnb = Pipeline([
    ("hash", HashingVectorizer(alternate_sign=False)),
    ("clf", MultinomialNB())
])

# Defining pipeline parameters
pipe_parameters = {
    'hash__norm': ('l1', 'l2'),
    'hash__ngram_range': ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_hash_mnb, pipe_parameters, cv=5, n_jobs=-1, verbose=1)

# Fitting data into GridSearch Instance
grid_search.fit(train_data.data, train_data.target)

# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best score: 0.965
Best parameters set:
	hash__ngram_range: (1, 2)
	hash__norm: 'l2'


### TfidfVectorizer with MultinomialNB

In [82]:
# Creating pipleine
pipeline_tfidf_mnb = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])

# Defining pipeline parameters
pipe_parameters = {
    'tfidf__norm': ('l1', 'l2'),
    'tfidf__ngram_range': ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
    'tfidf__min_df': (0, 0.1, 0.25, 0.5),
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_tfidf_mnb, pipe_parameters, cv=5, n_jobs=-1, verbose=1)

# Fitting data into GridSearch Instance
grid_search.fit(train_data.data, train_data.target)

# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best score: 0.976
Best parameters set:
	tfidf__min_df: 0
	tfidf__ngram_range: (1, 1)
	tfidf__norm: 'l2'


### Word2Vec with MultinomialNB - Can't train because of negative values in training vectors

In [97]:
# Creating pipleine
pipeline_w2v_mnb = Pipeline([
    ('scaler', MinMaxScaler()),
    ("clf", MultinomialNB())
])

# Defining pipeline parameters
pipe_parameters = {
    'clf__alpha': (0.25, 0.50, 0.75, 1),
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_w2v_mnb, pipe_parameters, cv=5, n_jobs=12, verbose=1, error_score="raise")

# Fitting data into GridSearch Instance
grid_search.fit(w2v_train_vectors, train_data.target)


# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best score: 0.846
Best parameters set:
	clf__alpha: 0.25


### Doc2Vec with MultinomialNB - Can't train because of negative values in training vectors

In [98]:
# Creating pipleine
pipeline_d2v_mnb = Pipeline([
    ('scaler', MinMaxScaler()),
    ("clf", MultinomialNB())
])

# Defining pipeline parameters
pipe_parameters = {
    'clf__alpha': (0.25, 0.50, 0.75, 1),
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_d2v_mnb, pipe_parameters, cv=5, n_jobs=12, verbose=1, error_score="raise")

# Fitting data into GridSearch Instance
grid_search.fit(d2v_train_vectors, train_data.target)


# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best score: 0.927
Best parameters set:
	clf__alpha: 0.25


## Using Logistic Regression with all 4 Vectorizer

### HashingVectorizer with LogisticRegression

In [7]:
# Creating pipleine
pipeline_hash_lr = Pipeline([
    ("hash", HashingVectorizer()),
    ("clf", LogisticRegression(random_state=42))
])

# Defining pipeline parameters
pipe_parameters = {
    'hash__norm': ('l1', 'l2'),
    'hash__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams,
    'clf__penalty': ('l2', 'none'),
    'clf__solver': ('newton-cg', 'lbfgs'),
    'clf__C': (0.0001, 0.001, 0.01)
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_hash_lr, pipe_parameters, cv=5, n_jobs=-1, verbose=1)

# Fitting data into GridSearch Instance
grid_search.fit(train_data.data, train_data.target)

# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 48 candidates, totalling 240 fits




Best score: 0.974
Best parameters set:
	clf__C: 0.0001
	clf__penalty: 'none'
	clf__solver: 'newton-cg'
	hash__ngram_range: (1, 2)
	hash__norm: 'l2'


### TfidfVectorizer with LogisticRegression

In [9]:
# Creating pipleine
pipeline_tfidf_lr = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(random_state=42))
])

# Defining pipeline parameters
pipe_parameters = {
    'tfidf__norm': ('l1', 'l2'),
    'tfidf__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__min_df': (0, 0.1, 0.25, 0.5),
    'clf__penalty': ('l2', 'none'),
    'clf__solver': ('newton-cg', 'lbfgs'),
    'clf__C': (0.0001, 0.001)
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_tfidf_lr, pipe_parameters, cv=2, n_jobs=12, verbose=3)

# Fitting data into GridSearch Instance
grid_search.fit(train_data.data, train_data.target)

# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 128 candidates, totalling 256 fits




Best score: 0.976
Best parameters set:
	clf__C: 0.0001
	clf__penalty: 'none'
	clf__solver: 'newton-cg'
	tfidf__min_df: 0
	tfidf__ngram_range: (1, 1)
	tfidf__norm: 'l1'


### Word2Vec with LogisticRegression

In [43]:
# Creating pipleine
pipeline_w2v_lr = Pipeline([
    ("clf", LogisticRegression())
])

# Defining pipeline parameters
pipe_parameters = {
    'clf__penalty': ('l2', 'none'),
    'clf__solver': ('newton-cg', 'lbfgs', 'sag', 'saga'),
    'clf__C': (0.0001, 0.001, 0.01, 1)
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_w2v_lr, pipe_parameters, cv=5, n_jobs=-1, verbose=1, error_score="raise")

# Fitting data into GridSearch Instance
grid_search.fit(w2v_train_vectors, train_data.target)


# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 32 candidates, totalling 160 fits




Best score: 0.957
Best parameters set:
	clf__C: 0.01
	clf__penalty: 'none'
	clf__solver: 'sag'




### Doc2Vec with LogisticRegression

In [94]:
# Creating pipleine
pipeline_d2v_lr = Pipeline([
    ("clf", LogisticRegression())
])

# Defining pipeline parameters
pipe_parameters = {
    'clf__penalty': ('l2', 'none'),
    'clf__solver': ('newton-cg', 'lbfgs', 'sag', 'saga'),
    'clf__C': (0.0001, 0.001, 0.01, 1)
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_d2v_lr, pipe_parameters, cv=5, n_jobs=-1, verbose=1, error_score="raise")

# Fitting data into GridSearch Instance
grid_search.fit(d2v_train_vectors, train_data.target)


# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best score: 0.961
Best parameters set:
	clf__C: 1
	clf__penalty: 'l2'
	clf__solver: 'newton-cg'


## Using Support Vector Machines with all 4 Vectorizer

### HashingVectorizer with SVM

In [15]:
# Creating pipleine
pipeline_hash_svm = Pipeline([
    ("hash", HashingVectorizer()),
    ("clf", SVR())
])

# Defining pipeline parameters
pipe_parameters = {
    'hash__norm': ('l1', 'l2'),
    'hash__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams,
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    'clf__gamma': ('scale', 'auto'),
    'clf__C': (0.001, 0.01, 1),
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_hash_svm, pipe_parameters, cv=3, n_jobs=12, verbose=3)

# Fitting data into GridSearch Instance
grid_search.fit(train_data.data, train_data.target)

# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 96 candidates, totalling 288 fits
Best score: 0.819
Best parameters set:
	clf__C: 1
	clf__gamma: 'scale'
	clf__kernel: 'linear'
	hash__ngram_range: (1, 2)
	hash__norm: 'l2'


### TfidfVectorizer with SVM

In [17]:
# Creating pipleine
pipeline_tfidf_svm = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", SVR())
])

# Defining pipeline parameters
pipe_parameters = {
    'tfidf__norm': ('l1', 'l2'),
    'tfidf__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__min_df': (0, 0.1, 0.25, 0.5),
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    'clf__gamma': ('scale', 'auto'),
    'clf__C': (0.001, 0.01, 1),
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_tfidf_svm, pipe_parameters, cv=2, n_jobs=12, verbose=3)

# Fitting data into GridSearch Instance
grid_search.fit(train_data.data, train_data.target)

# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 384 candidates, totalling 768 fits
Best score: 0.842
Best parameters set:
	clf__C: 1
	clf__gamma: 'scale'
	clf__kernel: 'sigmoid'
	tfidf__min_df: 0
	tfidf__ngram_range: (1, 1)
	tfidf__norm: 'l2'


### Word2Vecth SVM

In [44]:
# Creating pipleine
pipeline_w2v_svm = Pipeline([
    ("clf", SVR())
])

# Defining pipeline parameters
pipe_parameters = {
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    'clf__gamma': ('scale', 'auto'),
    'clf__C': (0.001, 0.01, 1),
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_w2v_svm, pipe_parameters, cv=5, n_jobs=-1, verbose=1, error_score="raise")

# Fitting data into GridSearch Instance
grid_search.fit(w2v_train_vectors, train_data.target)


# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best score: 0.764
Best parameters set:
	clf__C: 1
	clf__gamma: 'scale'
	clf__kernel: 'poly'


### Doc2Vec with SVM

In [95]:
# Creating pipleine
pipeline_d2v_svm = Pipeline([
    ("clf", SVR())
])

# Defining pipeline parameters
pipe_parameters = {
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    'clf__gamma': ('scale', 'auto'),
    'clf__C': (0.001, 0.01, 1),
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_d2v_svm, pipe_parameters, cv=5, n_jobs=-1, verbose=1, error_score="raise")

# Fitting data into GridSearch Instance
grid_search.fit(d2v_train_vectors, train_data.target)


# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best score: 0.830
Best parameters set:
	clf__C: 1
	clf__gamma: 'scale'
	clf__kernel: 'rbf'


## Using Decision Trees with all 4 vectorizers

### HashingVectorizer with Decision Tree

In [26]:
# Creating pipleine
pipeline_hash_dt = Pipeline([
    ("hash", HashingVectorizer()),
    ("clf", DecisionTreeClassifier(random_state=42))
])

# Defining pipeline parameters
pipe_parameters = {
    'hash__norm': ('l1', 'l2'),
    'hash__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams,
    'clf__criterion': ('gini', 'entropy', 'log_loss'),
    'clf__splitter': ('best', 'random'),
    'clf__max_depth': (25, 50, 100),
    'clf__max_features': ('sqrt', 'log2')
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_hash_dt, pipe_parameters, cv=3, n_jobs=12, verbose=3)

# Fitting data into GridSearch Instance
grid_search.fit(train_data.data, train_data.target)

# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best score: 0.650
Best parameters set:
	clf__criterion: 'gini'
	clf__max_depth: 100
	clf__max_features: 'sqrt'
	clf__splitter: 'best'
	hash__ngram_range: (1, 2)
	hash__norm: 'l2'


### TfidfVectorizer with Decision Tree

In [25]:
# Creating pipleine
pipeline_tfidf_dt = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", DecisionTreeClassifier(random_state=42))
])

# Defining pipeline parameters
pipe_parameters = {
    'tfidf__norm': ('l1', 'l2'),
    'tfidf__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__min_df': (0, 0.1, 0.25, 0.5),
    'clf__criterion': ('gini', 'entropy', 'log_loss'),
    'clf__splitter': ('best', 'random'),
    'clf__max_depth': (25, 50, 100),
    'clf__max_features': ('sqrt', 'log2')
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_tfidf_dt, pipe_parameters, cv=3, n_jobs=12, verbose=3)

# Fitting data into GridSearch Instance
grid_search.fit(train_data.data, train_data.target)

# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Best score: 0.689
Best parameters set:
	clf__criterion: 'entropy'
	clf__max_depth: 100
	clf__max_features: 'sqrt'
	clf__splitter: 'best'
	tfidf__min_df: 0
	tfidf__ngram_range: (1, 2)
	tfidf__norm: 'l1'


### Word2Vec with Decision Tree

In [45]:
# Creating pipleine
pipeline_w2v_dt = Pipeline([
    ("clf", DecisionTreeClassifier(random_state=42))
])

# Defining pipeline parameters
pipe_parameters = {
    'clf__criterion': ('gini', 'entropy', 'log_loss'),
    'clf__splitter': ('best', 'random'),
    'clf__max_depth': (25, 50, 100),
    'clf__max_features': ('sqrt', 'log2')
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_w2v_dt, pipe_parameters, cv=5, n_jobs=-1, verbose=1, error_score="raise")

# Fitting data into GridSearch Instance
grid_search.fit(w2v_train_vectors, train_data.target)


# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best score: 0.633
Best parameters set:
	clf__criterion: 'gini'
	clf__max_depth: 25
	clf__max_features: 'sqrt'
	clf__splitter: 'best'


### Doc2Vec with Decision Tree

In [96]:
# Creating pipleine
pipeline_d2v_dt = Pipeline([
    ("clf", DecisionTreeClassifier(random_state=42))
])

# Defining pipeline parameters
pipe_parameters = {
    'clf__criterion': ('gini', 'entropy', 'log_loss'),
    'clf__splitter': ('best', 'random'),
    'clf__max_depth': (25, 50, 100),
    'clf__max_features': ('sqrt', 'log2')
}

# Creating gridsearch instance
grid_search = GridSearchCV(pipeline_d2v_dt, pipe_parameters, cv=5, n_jobs=-1, verbose=1, error_score="raise")

# Fitting data into GridSearch Instance
grid_search.fit(d2v_train_vectors, train_data.target)


# Getting best score from instance
print("Best score: %0.3f" % grid_search.best_score_)

# Getting best parameters for classifiers and vectorizers
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(pipe_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best score: 0.618
Best parameters set:
	clf__criterion: 'entropy'
	clf__max_depth: 25
	clf__max_features: 'sqrt'
	clf__splitter: 'best'
