In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
# Per the original note, this file has to be decoded as cp1252
# (Windows Western European); the first CSV column becomes the index.
imdb_csv_path = '../kaggleData/imdb-review-dataset/imdb_master.csv'
df = pd.read_csv(imdb_csv_path, encoding='cp1252', index_col=0)

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,type,review,label,file
0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [5]:
# Split on the dataset's own 'type' column. Each boolean mask is built once and
# reused for both the review text and its label.
# NOTE(review): the 'train' split is three times the size of 'test'
# (75000 vs 25000 rows) - confirm whether it contains unlabeled rows before
# fitting a classifier on y_train.
is_train = df['type'] == 'train'
is_test = df['type'] == 'test'
x_train, y_train = df.loc[is_train, 'review'], df.loc[is_train, 'label']
x_test, y_test = df.loc[is_test, 'review'], df.loc[is_test, 'label']

In [6]:
# Sanity check: features and labels of each split must have matching lengths.
for reviews, labels in ((x_train, y_train), (x_test, y_test)):
    print(reviews.shape, labels.shape)

(75000,) (75000,)
(25000,) (25000,)


In [14]:
# Show the first training review (by position). A plain scalar position is
# used instead of the one-element tuple created by a stray trailing comma.
x_train.iloc[0]

"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."

## CountVectorizer

In [76]:
# Unigrams and bigrams, English stop words removed, and any term that appears
# in more than half of the fitted documents dropped.
vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_df=0.5,
)

In [77]:
# Fit on the first 10 training reviews only (a small sample for inspection).
# .iloc makes the positional intent explicit: x_train keeps the integer labels
# of the original frame, so a bare [0:10] is easy to misread as label-based.
countVects = vect.fit_transform(x_train.iloc[:10])

In [78]:
# (number of documents, number of vocabulary terms) of the count matrix.
countVects.shape

(10, 2006)

In [79]:
# Boolean mask of the terms that occur in the first review. Only that row is
# converted to dense form instead of densifying the whole matrix.
countVects[0].toarray().ravel() > 0

array([False, False, False, ..., False,  True,  True], dtype=bool)

In [80]:
# Terms present in the first review. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(); support both so the
# cell runs on old releases as well as current ones.
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, 'get_feature_names_out')
    else vect.get_feature_names()
)
# dtype=str keeps the displayed array a unicode array on every version;
# only the first row is densified to build the mask.
np.asarray(feature_names, dtype=str)[countVects[0].toarray().ravel() > 0]

array(['absurd', 'absurd comedy', 'absurd time', 'audience',
       'audience turned', 'better', 'better think', 'briefly', 'chantings',
       'chantings singers', 'cinematography', 'cinematography future',
       'comedy', 'comedy formal', 'crazy', 'crazy chantings', 'cryptic',
       'cryptic dialogue', 'dialogue', 'dialogue make', 'easy',
       'easy grader', 'era', 'era turned', 'eventually',
       'eventually making', 'example', 'example absurd', 'feelings',
       'feelings pig', 'formal', 'formal orchestra', 'forrest',
       'forrest seen', 'frederic', 'frederic forrest', 'future',
       'future great', 'future stars', 'general', 'general narrative',
       'good', 'good cinematography', 'grader', 'grader technical',
       'great', 'great vilmos', 'insane', 'insane violent', 'just putting',
       'kirkland', 'kirkland frederic', 'level', 'level better', 'make',
       'make shakespeare', 'making', 'making just', 'man', 'man unnatural',
       'mob', 'mob crazy', 'narrativ

## Tfidf Vectorizer

In [14]:
# TF-IDF vectorizer with default settings.
# Note: this rebinds `vect`, which previously held the CountVectorizer.
vect = TfidfVectorizer()

In [15]:
# Display the vectorizer to show the parameter values it was created with.
vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [16]:
# Fit TF-IDF on the first 10 training reviews. .iloc makes the positional
# slice explicit, since x_train carries the original frame's integer labels.
tfVects = vect.fit_transform(x_train.iloc[:10])

In [17]:
# TF-IDF weights of the first review; only that row is converted to dense form.
tfVects[0].toarray().ravel()

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.2215105 ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.05398864,  0.04485126,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

## 20 Newsgroups

In [79]:
from sklearn.datasets import fetch_20newsgroups

In [80]:
# Newsgroups to load; only these four categories are used below.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [81]:
# Training split of the selected newsgroups, shuffled with a fixed seed so the
# document order is reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [82]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [83]:
# Bag-of-words counts with default settings; the vocabulary is learned from
# the training posts. The last line displays (posts, vocabulary terms).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 35788)

In [84]:
# Vocabulary size; expected to equal the number of columns of X_train_counts.
len(count_vect.vocabulary_)

35788

In [85]:
# Shape check repeated: rows are posts, columns are vocabulary terms.
X_train_counts.shape

(2257, 35788)

In [86]:
# Length of one document vector. Read it from the sparse matrix's shape rather
# than materialising the full dense array just to measure a single row.
X_train_counts.shape[1]

35788

In [87]:
# One target value per training post.
twenty_train.target.shape

(2257,)

In [88]:
# Distinct class codes present in the training targets.
np.unique(twenty_train.target)

array([0, 1, 2, 3], dtype=int64)

We want to check whether the logistic regression classifier can be trained directly on string class labels, so we convert the integer-encoded y values back to their newsgroup names.

In [89]:
# First ten integer-encoded targets.
twenty_train.target[0:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [90]:
# Newsgroup names; the integer targets are used as indices into this list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [91]:
# Map every integer target to its newsgroup name in one vectorised lookup,
# then convert back to a plain list of Python strings.
y_labels = np.array(twenty_train.target_names)[twenty_train.target].tolist()

In [92]:
# First ten targets as newsgroup names.
y_labels[0:10]

['comp.graphics',
 'comp.graphics',
 'soc.religion.christian',
 'soc.religion.christian',
 'soc.religion.christian',
 'soc.religion.christian',
 'soc.religion.christian',
 'sci.med',
 'sci.med',
 'sci.med']

### Now try to fit the LogReg Classifier on this data

In [293]:
# Fit directly on the string labels (y_labels) rather than on the
# integer-encoded twenty_train.target, to check that LogisticRegression
# accepts string class labels without manual encoding.
clf = LogisticRegression(
    penalty='l2',
    C=1,
    solver='lbfgs',
    multi_class='ovr',
    n_jobs=-1,
)
clf.fit(X_train_counts, y_labels)


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

### Test performance

In [294]:
# Test split of the same newsgroups, shuffled with the same fixed seed.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [295]:
# Vectorise the test posts with the vocabulary already fitted on the training
# posts (transform only, no refit).
# Note: this rebinds `x_test`, which earlier held the IMDB test reviews.
x_test = count_vect.transform(twenty_test.data)

In [296]:
# Peek at the first 100 counts of the first test post. Slice the sparse matrix
# first so only those 100 values are densified, not the entire test matrix.
x_test[0, :100].toarray().ravel()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [297]:
# Predict a label for every vectorised test post, and keep a handle on the raw
# test posts alongside the predictions.
predicted = clf.predict(x_test)
docs_test = twenty_test.data

In [298]:
# Predictions come back as newsgroup-name strings, like the labels used to fit.
predicted

array(['sci.med', 'sci.med', 'sci.med', ..., 'sci.med', 'sci.med',
       'comp.graphics'],
      dtype='<U22')

In [299]:
# Class codes and their names in the test split, one per line.
print(np.unique(twenty_test.target), twenty_test.target_names, sep='\n')

[0 1 2 3]
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


In [300]:
# Convert each predicted newsgroup name back to its integer code, i.e. its
# position in target_names.
label_to_code = twenty_test.target_names.index
predicted_encoded = [label_to_code(name) for name in predicted]

In [301]:
# First ten predictions as integer codes.
predicted_encoded[0:10]

[2, 2, 2, 0, 3, 0, 1, 3, 0, 1]

In [302]:
# Accuracy as a fraction in [0, 1] (multiply by 100 for a percentage).
# The list is converted to an array first so the comparison is explicitly
# element-wise instead of relying on list-vs-ndarray coercion.
np.mean(np.asarray(predicted_encoded) == twenty_test.target)

0.89813581890812255

In [303]:
# Number of test predictions.
len(predicted_encoded)

1502

In [304]:
# Same accuracy computed by hand: matching predictions over total predictions.
n_correct = np.sum(predicted_encoded == twenty_test.target)
n_correct / len(predicted_encoded)

0.89813581890812255

In [305]:
# True integer targets for test posts at positions 10-19.
twenty_test.target[10:20]

array([1, 3, 2, 3, 1, 0, 1, 3, 0, 0], dtype=int64)

In [306]:
# Predicted integer codes at positions 10-19, for comparison with
# twenty_test.target[10:20].
np.array(predicted_encoded)[10:20]

array([1, 3, 0, 3, 1, 2, 1, 3, 0, 3])

In [308]:
import sklearn

In [311]:
# Record the scikit-learn version this notebook was run with.
sklearn.__version__

'0.19.1'

In [319]:
# Build the categorical Series once, then print its integer codes followed by
# the labels themselves for the first ten posts.
labels_cat = pd.Series(y_labels).astype('category')
print(labels_cat.cat.codes[0:10])
print(labels_cat[0:10])

0    1
1    1
2    3
3    3
4    3
5    3
6    3
7    2
8    2
9    2
dtype: int8
0             comp.graphics
1             comp.graphics
2    soc.religion.christian
3    soc.religion.christian
4    soc.religion.christian
5    soc.religion.christian
6    soc.religion.christian
7                   sci.med
8                   sci.med
9                   sci.med
dtype: category
Categories (4, object): [alt.atheism, comp.graphics, sci.med, soc.religion.christian]
