# Model comparison

Compare LogisticRegression and LinearSVM models on RCV1 politics articles

In [19]:
import pandas as pd
import numpy as np
from cytoolz import identity
import spacy
from scipy.stats import wilcoxon

## Load data

In [2]:
df = pd.read_msgpack('http://bulba.sdsu.edu/rcv1_train.dat')

## Tokenize

In [3]:
nlp = spacy.load('en', disable=['tagger', 'ner', 'parser'])

In [4]:
%%time 

def tokenize(text):
    return [tok.orth_ for tok in nlp.tokenizer(text)]
df['tokens'] = df['text'].apply(tokenize)

CPU times: user 1min 31s, sys: 111 ms, total: 1min 31s
Wall time: 1min 31s


## Import sklearn

In [5]:
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import *

## Find best LogisticRegression hyperparameters

In [7]:
lr_model = make_pipeline(CountVectorizer(analyzer=identity), 
                         LogisticRegression())     

In [8]:
params = {'logisticregression__penalty': ['l2'],
          'logisticregression__C': [0.001, 0.01, 0.1, 1.0, 10.0],
          'countvectorizer__min_df': [1, 2, 5],
          'countvectorizer__max_df': [0.1, 0.25, 0.5]}
lr_grid_search = GridSearchCV(lr_model, params, n_jobs=3, verbose=1, 
                               return_train_score=True)
lr_grid_search.fit(df['tokens'], df['politics'])   

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  1.9min
[Parallel(n_jobs=3)]: Done 135 out of 135 | elapsed:  5.8min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer=<cyfunction identity at 0x7fce6f331a58>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), prepr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'logisticregression__penalty': ['l2'], 'logisticregression__C': [0.001, 0.01, 0.1, 1.0, 10.0], 'countvectorizer__min_df': [1, 2, 5], 'countvectorizer__max_df': [0.1, 0.25, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [11]:
lr_grid_search.best_score_, lr_grid_search.best_params_

(0.8913,
 {'countvectorizer__max_df': 0.2500,
  'countvectorizer__min_df': 1,
  'logisticregression__C': 0.0100,
  'logisticregression__penalty': 'l2'})

## Find best LinearSVC hyperparameters

In [13]:
svm_model = make_pipeline(CountVectorizer(analyzer=identity), LinearSVC())     

In [14]:
params = {'linearsvc__penalty': ['l2'],
          'linearsvc__C': [0.001, 0.01, 0.1, 1.0, 10.0],
          'countvectorizer__min_df': [1, 2, 5],
          'countvectorizer__max_df': [0.1, 0.25, 0.5]}
svm_grid_search = GridSearchCV(svm_model, params, n_jobs=3, verbose=1, 
                               return_train_score=True)
svm_grid_search.fit(df['tokens'], df['politics'])   

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 135 out of 135 | elapsed:  5.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer=<cyfunction identity at 0x7fce6f331a58>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), prepr...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'linearsvc__penalty': ['l2'], 'linearsvc__C': [0.001, 0.01, 0.1, 1.0, 10.0], 'countvectorizer__min_df': [1, 2, 5], 'countvectorizer__max_df': [0.1, 0.25, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [15]:
svm_grid_search.best_score_, svm_grid_search.best_params_

(0.8914,
 {'countvectorizer__max_df': 0.2500,
  'countvectorizer__min_df': 1,
  'linearsvc__C': 0.0010,
  'linearsvc__penalty': 'l2'})

## Evaluate models with best hyperparameters

Use 10-fold cross validation with the same "folds" for both models

In [16]:
folds = StratifiedKFold(shuffle=True, n_splits=10, random_state=10)

In [17]:
lr_model.set_params(**lr_grid_search.best_params_)
lr_score = cross_val_score(lr_model, df['tokens'], df['politics'], cv=folds, n_jobs=-1)

In [18]:
svm_model.set_params(**svm_grid_search.best_params_)
svm_score = cross_val_score(svm_model, df['tokens'], df['politics'], cv=folds, n_jobs=-1)

In [45]:
scores = pd.DataFrame({'svm':svm_score, 'lr':lr_score})
scores['diff'] = np.abs(scores['svm'] - scores['lr'])
scores['sign'] = np.sign(scores['svm'] - scores['lr'])
scores = scores.sort_values('diff')
scores.index = range(1,len(scores)+1)
scores

Unnamed: 0,lr,svm,diff,sign
1,0.879414,0.88008,0.000666,1.0
2,0.886,0.885333,0.000667,-1.0
3,0.898,0.899333,0.001333,1.0
4,0.902667,0.901333,0.001333,-1.0
5,0.888592,0.889927,0.001334,1.0
6,0.901935,0.899266,0.002668,-1.0
7,0.888,0.884667,0.003333,-1.0
8,0.896667,0.892667,0.004,-1.0
9,0.876,0.88,0.004,1.0
10,0.902732,0.898068,0.004664,-1.0


In [46]:
wilcoxon(lr_score, svm_score, correction=True)

WilcoxonResult(statistic=18.5, pvalue=0.38596207926442694)