# Text Classification

Improving text classifiction using . . .

In [1]:
import pandas as pd
from cytoolz import identity
import spacy

In [2]:
nlp = spacy.load('en', disable=['ner', 'parser'])

## Load data

In [5]:
df = pd.read_msgpack('http://bulba.sdsu.edu/rcv1_train.dat')
df = df.sample(5000)

In [7]:
df['tokens'] = df['text'].apply(lambda d: [tok.orth_ for tok in nlp(d)])

## Import sklearn

In [31]:
from sklearn.linear_model import *
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import *
from sklearn.model_selection import *
from sklearn.decomposition import *
from sklearn.preprocessing import *
from sklearn.neighbors import *
from sklearn.feature_selection import *
from sklearn.ensemble import *

## Logistic Regression

In [10]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [11]:
lr = make_pipeline(CountVectorizer(analyzer=identity), 
                   LogisticRegression())     
lr_score = cross_val_score(lr, df['tokens'], df['politics'], cv=folds, n_jobs=-1)
lr_score.mean(), lr_score.std()

(0.87799635159635159, 0.0069620288790832752)

Optimize hyperparameters

In [12]:
params = {'logisticregression__C': [0.01, 0.1, 1.0],
          'countvectorizer__min_df': [1, 2],
          'countvectorizer__max_df': [0.25, 0.5]}
grid_search = GridSearchCV(lr, params, n_jobs=-1, return_train_score=True)
grid_search.fit(df['tokens'], df['politics'])   

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer=<cyfunction identity at 0x7f4a8a4a0a58>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), prepr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'logisticregression__C': [0.01, 0.1, 1.0], 'countvectorizer__min_df': [1, 2], 'countvectorizer__max_df': [0.25, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [13]:
lr.set_params(**grid_search.best_params_)
score = cross_val_score(lr, df['tokens'], df['politics'], cv=folds, n_jobs=-1)
score.mean(), score.std()

(0.88120135220135221, 0.0022425946809482632)

## K Nearest Neighbors

In [14]:
knn = make_pipeline(CountVectorizer(analyzer=identity), 
                    KNeighborsClassifier(metric='cosine'))    
knn_score = cross_val_score(knn, df['tokens'], df['politics'], cv=folds, n_jobs=-1)
knn_score.mean(), knn_score.std()

(0.77080070120070121, 0.014931987277351011)

In [16]:
params = {'kneighborsclassifier__n_neighbors': [2, 5, 10, 25, 50],
          'countvectorizer__min_df': [1, 2],
          'countvectorizer__max_df': [0.25, 0.5]}
grid_search = GridSearchCV(knn, params, n_jobs=3, return_train_score=True)
grid_search.fit(df['tokens'], df['politics'])   

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer=<cyfunction identity at 0x7f4a8a4a0a58>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), prepr...osine',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'kneighborsclassifier__n_neighbors': [2, 5, 10, 25, 50], 'countvectorizer__min_df': [1, 2], 'countvectorizer__max_df': [0.25, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [17]:
knn.set_params(**grid_search.best_params_)
knn_score = cross_val_score(knn, df['tokens'], df['politics'], cv=folds, n_jobs=-1)
knn_score.mean(), knn_score.std()

(0.87439414819414818, 0.010516405030673007)

In [18]:
grid_search.best_params_

{'countvectorizer__max_df': 0.25,
 'countvectorizer__min_df': 1,
 'kneighborsclassifier__n_neighbors': 25}

## Feature selection

In [19]:
knn = make_pipeline(CountVectorizer(analyzer=identity), 
                    SelectKBest(mutual_info_classif),
                    KNeighborsClassifier(metric='cosine'))    

In [28]:
params = {'kneighborsclassifier__n_neighbors': [5, 10],
          'selectkbest__k':[1000, 5000, 10000],
          'countvectorizer__min_df': [1, 2],
          'countvectorizer__max_df': [0.25, 0.5]}
grid_search = GridSearchCV(knn, params, n_jobs=4, return_train_score=True)
grid_search.fit(df['tokens'], df['politics'])   

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer=<cyfunction identity at 0x7f4a8a4a0a58>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=0.25,
        max_features=None, min_df=2, ngram_range=(1, 1), prep...osine',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'kneighborsclassifier__n_neighbors': [5, 10, 20], 'selectkbest__k': [1000, 5000, 10000, 20000, 'all'], 'countvectorizer__min_df': [1, 2], 'countvectorizer__max_df': [0.25, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [30]:
grid_search.best_params_

{'countvectorizer__max_df': 0.25,
 'countvectorizer__min_df': 1,
 'kneighborsclassifier__n_neighbors': 5,
 'selectkbest__k': 5000}

In [29]:
knn.set_params(**grid_search.best_params_)
knn_score = cross_val_score(knn, df['tokens'], df['politics'], cv=folds, n_jobs=-1)
knn_score.mean(), knn_score.std()

(0.87499995079995085, 0.0045172840087310722)