# Text Classification

Improving text classifiction using LSI/NMF

In [1]:
import pandas as pd
from cytoolz import identity
import spacy

In [2]:
nlp = spacy.load('en', disable=['tagger', 'ner', 'parser'])

## Load data

In [3]:
df = pd.read_msgpack('http://bulba.sdsu.edu/rcv1_train.dat')

In [4]:
def tokenize(text):
    return [tok.orth_ for tok in nlp.tokenizer(text)]
df['tokens'] = df['text'].apply(tokenize)

## Import sklearn

In [5]:
from sklearn.linear_model import *
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import *
from sklearn.model_selection import *
from sklearn.decomposition import *
from sklearn.preprocessing import *

In [6]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

## Logistic Regression

In [7]:
lr = make_pipeline(CountVectorizer(analyzer=identity), 
                   LogisticRegression())     
lr_score = cross_val_score(lr, df['tokens'], df['politics'], cv=folds, n_jobs=-1)
lr_score.mean(), lr_score.std()

(0.8816664338814556, 0.0030277962191306404)

Optimize hyperparameters

In [8]:
params = {'logisticregression__C': [0.01, 0.1, 1.0],
          'countvectorizer__min_df': [1, 2],
          'countvectorizer__max_df': [0.25, 0.5]}
grid_search = GridSearchCV(lr, params, n_jobs=-1, return_train_score=True)
grid_search.fit(df['tokens'], df['politics'])   

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer=<cyfunction identity at 0x7f856b8e88e8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), prepr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'logisticregression__C': [0.01, 0.1, 1.0], 'countvectorizer__min_df': [1, 2], 'countvectorizer__max_df': [0.25, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [9]:
lr.set_params(**grid_search.best_params_)
score = cross_val_score(lr, df['tokens'], df['politics'], cv=folds, n_jobs=-1)
score.mean(), score.std()

(0.89053303462959654, 0.0025217873309385753)

## LR + Latent Semantic Indexing

In [10]:
lsa = make_pipeline(CountVectorizer(analyzer=identity), 
                    TfidfTransformer(norm='l2', use_idf=True),
                    TruncatedSVD(300, n_iter=25), 
                    LogisticRegression())     

LSI is computationally expensive, so this grid search can take a long time....

In [12]:
params = {'logisticregression__C': [0.01, 0.1, 1.0],
          'countvectorizer__min_df': [1, 2],
          'countvectorizer__max_df': [0.25, 0.5]}
grid_search = GridSearchCV(lsa, params, n_jobs=-1, verbose=1, return_train_score=True)
grid_search.fit(df['tokens'], df['politics'])   

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 11.9min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer=<cyfunction identity at 0x7f856b8e88e8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), prepr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'logisticregression__C': [0.01, 0.1, 1.0], 'countvectorizer__min_df': [1, 2], 'countvectorizer__max_df': [0.25, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [14]:
lsa.set_params(**grid_search.best_params_)
lsa_score = cross_val_score(lsa, df['tokens'], df['politics'], cv=folds, n_jobs=-1)
lsa_score.mean(), lsa_score.std()

(0.88920056786672974, 0.0028572041447832508)

-----

Compare TruncatedSVD vs NMF

In [15]:
lsa = make_pipeline(CountVectorizer(analyzer=identity, min_df=2, max_df=0.25), 
                    TfidfTransformer(norm='l2', use_idf=True),
                    TruncatedSVD(500, n_iter=25))

In [16]:
lsa.fit(df['tokens'], df['politics'])

Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer=<cyfunction identity at 0x7f856b8e88e8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=0.25,
        max_features=None, min_df=2, ngram_range=(1, 1), prep...uncatedSVD(algorithm='randomized', n_components=500, n_iter=25,
       random_state=None, tol=0.0))])

In [17]:
V = lsa.named_steps['countvectorizer'].get_feature_names()

In [18]:
for d in range(10):
    D = list(reversed(lsa.named_steps['truncatedsvd'].components_[d].argsort()))
    for i in D[:10]:
        print(V[i], end = ' ')
    print()

percent U.S. $ 1 new million Clinton state Minister 2 
0 1 2 6 3 4 7 5 : 8 
Israel Israeli Palestinian Netanyahu peace Arafat Palestinians Jerusalem Hebron East 
Kong Hong China Chinese Taiwan Beijing Zaire rebels military refugees 
Kong Hong China percent Chinese Israel Beijing tax Taiwan Palestinian 
NATO Yeltsin Russia 0 Russian Moscow 1 alliance summit Clinton 
6 7 beat 4 NATO Yeltsin Russia 5 U.S. Russian 
Zaire percent refugees 6 tax Iraq Mobutu Kabila budget rebels 
party election Labour percent Zaire opposition elections Party parliament Mobutu 
Zaire Mobutu Kabila refugees Rwanda South Rwandan Africa Zairean Clinton 


In [19]:
nmf = make_pipeline(CountVectorizer(analyzer=identity, min_df=2, max_df=0.25), 
                    TfidfTransformer(norm='l2', use_idf=True),
                    NMF(50))

In [20]:
nmf.fit(df['tokens'], df['politics'])

Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer=<cyfunction identity at 0x7f856b8e88e8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=0.25,
        max_features=None, min_df=2, ngram_range=(1, 1), prep...er=200,
  n_components=50, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0))])

In [21]:
for d in range(10):
    D = list(reversed(nmf.named_steps['nmf'].components_[d].argsort()))
    for i in D[:10]:
        print(V[i], end = ' ')
    print()

Labour Major Blair election Conservatives Britain Conservative party British Party 
0 1 2 3 4 5 b Results matches c 
Israel Palestinian Israeli Netanyahu Arafat peace Palestinians Jerusalem Hebron East 
China Chinese Beijing Deng Jiang rights human Xiaoping MFN trade 
Kong Hong China Tung British handover Chinese territory colony legislature 
NATO Russia alliance Moscow expansion enlargement Poland Europe summit Russian 
6 beat 7 4 3 2 5 tennis denotes Spain 
Cup season game second team points scored win goal match 
$ million company billion Inc money 1996 Corp pounds New 
refugees Zaire Rwanda Rwandan Hutu Zairean eastern U.N. UNHCR Tutsi 
