# Benchmarking different Sklearn classifiers

Inspired by the sklearn sample code here: http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

"This is an example showing how scikit-learn can be used to classify documents by topics using a bag-of-words approach. This example uses a scipy.sparse matrix to store the features and demonstrates various classifiers that can efficiently handle sparse matrices."

Benchmarking them side by side helps us choose the best-performing scikit-learn classifier for this problem.

In [None]:
import pandas as pd

In [None]:
# Read the training CSV into a pandas DataFrame
train = pd.read_csv(filepath_or_buffer="../input/train.csv")

In [None]:
# Preview the first rows. Printed explicitly: a notebook only auto-displays the
# last expression of a cell, so a bare `train.head()` here would show nothing.
print(train.head())

# Rows per author, computed once and reused for both the table and the chart
author_counts = train.groupby('author')['id'].count()
print(author_counts.sort_values(ascending=False).head())
author_counts.plot(kind='bar', figsize=(16, 4))

In [None]:
# convert to dictionary and split into 30% 70%
import random

In [None]:
# One dict per row of the training data
documents = train.to_dict('records')
# Distinct values of the author column
labels = list(train['author'].unique())
print(documents[0], labels)

In [None]:
train_set = []
test_set = []

# Stratified 70/30 split: each author's documents are shuffled and divided
# separately, so both sets keep the same author proportions and every
# document lands in exactly one of the two sets (no train/test overlap).
for label in labels:
    label_documents = [doc for doc in documents if doc['author'] == label]
    # randomly split this author's documents between test and training
    random.shuffle(label_documents)
    partition = int(len(label_documents) * 0.7)
    train_set += label_documents[:partition]
    test_set += label_documents[partition:]

print('training set:', len(train_set),'test set', len(test_set))

In [None]:
# benchmark multiple algos

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from time import time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Map each split to parallel lists of raw texts (X) and author labels (y)

opts = dict()
opts['n_features'] = 2 ** 16

X_train = [x['text'] for x in train_set]
y_train = [x['author'] for x in train_set]
X_test = [x['text'] for x in test_set]
y_test = [x['author'] for x in test_set]
# Sorted list, not a set: classification_report pairs these display names
# positionally with the sorted class labels, so the order must be
# deterministic and match that sort order.
target_names = sorted(set(y_train))

Do a little preprocessing: turn the text into bag-of-words count vectors (unigrams and bigrams), then reweight the counts with TF-IDF so that terms common to many documents carry less weight.

In [None]:
# Tokens are runs of ASCII letters only
token_pattern = r"[a-zA-Z]+"

# Count unigrams and bigrams, dropping English stop words and any term
# whose document frequency exceeds max_df
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_df=0.5,
    max_features=None,
    stop_words='english',
    token_pattern=token_pattern,
)

# Fit the vocabulary on the training texts only, then reuse it for the test texts
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

print(X_train.shape, X_test.shape)

In [None]:
%%time

# Fit the TF-IDF weighting on the training counts only, then apply it to both splits
tfidf = TfidfTransformer(norm='l2')
tfidf.fit(X_train)
X_train = tfidf.transform(X_train)
X_test = tfidf.transform(X_test)
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

In [None]:
def trim(s):
    """Return *s* cut to fit an 80-column display.

    Strings of up to 80 characters are returned unchanged; longer ones are
    shortened to their first 77 characters followed by "...".
    """
    if len(s) > 80:
        return s[:77] + "..."
    return s

Define the benchmarking function, which takes a classifier as input, trains and evaluates it, and prints the resulting metrics and confusion matrix.

In [None]:
def benchmark(clf):
    """Fit *clf* on the training data, evaluate it on the test data, and report.

    Prints the fit and predict timings, the accuracy, the coefficient
    dimensionality and density (only for estimators exposing ``coef_``),
    a classification report and a confusion matrix.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``target_names``.

    Returns:
        A tuple ``(clf_descr, score, train_time, test_time)`` where
        ``clf_descr`` is the part of ``str(clf)`` before the first ``(``,
        ``score`` is the accuracy and both times are in seconds.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    # Time the fit on its own
    fit_started = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_started
    print("train time: %0.3fs" % train_time)

    # Time the prediction on its own
    predict_started = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_started
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    # Only estimators with a coefficient matrix can report these two figures
    if hasattr(clf, 'coef_'):
        coefficients = clf.coef_
        print("dimensionality: %d" % coefficients.shape[1])
        print("density: %f" % density(coefficients))

    print("classification report:")
    report = metrics.classification_report(y_test, pred,
                                           target_names=target_names)
    print(report)

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    # Short label for the plot: the text of str(clf) before the first "("
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time




In [None]:
# Run every candidate classifier through benchmark() and collect the
# (description, accuracy, train time, test time) tuples in `results`.
#
# NOTE: the iterative linear models take `max_iter`; the older `n_iter`
# keyword was deprecated in scikit-learn 0.19 and later removed.
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,
                                       tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                           penalty=penalty)))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                       penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
                                                  tol=1e-3))),
  ('classification', LinearSVC(penalty="l2"))])))

## Plot the results

In [None]:
# Draw one horizontal bar chart comparing all benchmarked classifiers

indices = np.arange(len(results))

# Transpose the per-classifier tuples into one list per measured quantity
results = [list(column) for column in zip(*results)]
clf_names, score, training_time, test_time = results

# Express both timings as a fraction of the slowest run
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("Score")

# Three bars per classifier, offset vertically so they sit side by side
plt.barh(indices, score, height=.2, label="score", color='navy')
plt.barh(indices + .3, training_time, height=.2, label="training time",
         color='c')
plt.barh(indices + .6, test_time, height=.2, label="test time",
         color='darkorange')

plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25, top=.95, bottom=.05)

# Write each classifier's name to the left of its group of bars
for position, clf_name in zip(indices, clf_names):
    plt.text(-.3, position, clf_name)

plt.show()