## Practice notebook: building a sentiment classifier on IMDB movie reviews

In [86]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.preprocessing import LabelEncoder

# models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the IMDB review dataset directly from the zipped CSV file.
data = pd.read_csv('data/imdb-dataset-of-50k-movie-reviews.zip', compression='zip')

In [3]:
# Row and column counts of the loaded frame.
data.shape

(50000, 2)

In [4]:
# Column labels available in the dataset.
data.columns

Index(['review', 'sentiment'], dtype='object')

In [5]:
# Distinct label values present in the sentiment column.
data['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [6]:
# Partition the reviews by label using boolean masks on the sentiment column.
positives = data.loc[data['sentiment'].eq('positive')]
negatives = data.loc[data['sentiment'].eq('negative')]

In [7]:
# Report how many rows landed in each class.
for class_frame in (positives, negatives):
    print(class_frame.shape)

(25000, 2)
(25000, 2)


#### Get a subset (imbalanced)

In [8]:
# Assemble a deliberately imbalanced sample: the first 9000 positive
# reviews and the first 1000 negative ones (9:1).
imb_positives = positives.iloc[:9000]
imb_negatives = negatives.iloc[:1000]
print(imb_positives.shape)
print(imb_negatives.shape)

# Stack both classes; the original row labels are kept (no index reset).
imb_reviews = pd.concat([imb_positives, imb_negatives])
print(imb_reviews.shape)

# Split the labels off so that only the review text remains in the frame.
imb_sentiments = imb_reviews['sentiment']
print(imb_sentiments.shape)
imb_reviews = imb_reviews.drop(columns=['sentiment'])
print(imb_reviews.shape)


(9000, 2)
(1000, 2)
(10000, 2)
(10000,)
(10000, 1)


In [9]:
# Display the review-only frame; its index still carries the row labels of the
# source frame because no index reset was applied after concatenation.
imb_reviews

Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...
4,"Petter Mattei's ""Love in the Time of Money"" is..."
5,"Probably my all-time favorite movie, a story o..."
...,...
2000,Stranded in Space (1972) MST3K version - a ver...
2005,"I happened to catch this supposed ""horror"" fli..."
2007,waste of 1h45 this nasty little film is one to...
2010,Warning: This could spoil your movie. Watch it...


In [10]:
# Class counts of the imbalanced sample.
Counter(imb_sentiments)

Counter({'positive': 9000, 'negative': 1000})

#### Using imblearn under_sampling to handle the imbalanced data

In [11]:
# Balance the two classes by random under-sampling (fixed seed).
rus = RandomUnderSampler(random_state=0)
reviews, sentiments = rus.fit_resample(imb_reviews, imb_sentiments)
print(Counter(sentiments))
print(f'review_shap: {reviews.shape}')

# Hold out 30% of the balanced data for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    reviews,
    sentiments,
    test_size=0.3,
    random_state=42,
)

# Print the shape of every split, keeping one labelled line per split.
split_report = (
    ('x_train.shape:', x_train),
    ('y_train: ', y_train),
    ('x_test: ', x_test),
    ('y_test: ', y_test),
)
for label, part in split_report:
    print(f'{label}{part.shape}')

Counter({'negative': 1000, 'positive': 1000})
review_shap: (2000, 1)
x_train.shape:(1400, 1)
y_train: (1400,)
x_test: (600, 1)
y_test: (600,)


#### Converting the reviews to numerical data to be fed into the classifier models

In [25]:
# Learn the TF-IDF vocabulary (English stop words removed) from the training
# reviews only, and encode those reviews as a sparse matrix.
tfidf = TfidfVectorizer(stop_words='english')
x_train_list = x_train.iloc[:, 0].tolist()  # the frame's single column holds the review text
x_train_vectorized = tfidf.fit_transform(x_train_list)
x_train_vectorized

<1400x21091 sparse matrix of type '<class 'numpy.float64'>'
	with 124311 stored elements in Compressed Sparse Row format>

In [26]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
# Label the sparse training matrix with the training row index and the
# vocabulary terms so it can be inspected as a table.
pd.DataFrame.sparse.from_spmatrix(x_train_vectorized, index=x_train.index, columns=feature_names)

Unnamed: 0,00,000,007,01pm,02,04,08,10,100,1000,...,zooming,zooms,zues,zzzzzzzzzzzzzzzzzz,æon,élan,émigré,ísnt,ïn,ünfaithful
836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031783,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Encode the held-out reviews with the vocabulary already fitted on the
# training reviews (transform only, no refit).
x_test_list = x_test.iloc[:, 0].tolist()  # the frame's single column holds the review text
x_test_vectorized = tfidf.transform(x_test_list)

In [28]:
# Shape of the encoded test matrix (rows, vocabulary terms).
x_test_vectorized.shape

(600, 21091)

In [29]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
# Label the sparse test matrix with the test row index and the vocabulary
# terms so it can be inspected as a table.
pd.DataFrame.sparse.from_spmatrix(x_test_vectorized, index=x_test.index, columns=feature_names)

Unnamed: 0,00,000,007,01pm,02,04,08,10,100,1000,...,zooming,zooms,zues,zzzzzzzzzzzzzzzzzz,æon,élan,émigré,ísnt,ïn,ünfaithful
1860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066569,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Sanity check: features and labels must have matching row counts per split.
for split_part in (x_train_vectorized, y_train, x_test_vectorized, y_test):
    print(split_part.shape)

(1400, 21091)
(1400,)
(600, 21091)
(600,)


In [64]:
# Encode the string labels as integers. The encoder is fitted on the training
# labels only and then reused (transform, not fit_transform) for the test
# labels, so both splits share the same class -> integer mapping and an
# unseen test label raises instead of being silently re-mapped.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

## Building models

In [84]:
# Candidate models to compare on the same TF-IDF features.
# Every estimator that draws random numbers gets a fixed seed so the
# comparison is reproducible across kernel restarts.
classifiers = [
    SVC(kernel='linear'),
    LogisticRegression(solver="sag", random_state=33, max_iter=100),
    DecisionTreeClassifier(max_depth=10, random_state=33),
    RandomForestClassifier(max_depth=100, n_estimators=500, random_state=33),
    AdaBoostClassifier(random_state=33),
    #GaussianNB(var_smoothing=0.02),
    KNeighborsClassifier()
]
# Display labels, listed in the same order as `classifiers`.
names = [
    'svc',
    'logistic_regression',
    'decision_tree',
    'random_forest',
    'adaboost',
    #'naive_bayes',
    'knn'
]
# zip() silently truncates to the shorter list, so fail loudly if the two
# lists ever drift apart.
assert len(names) == len(classifiers), 'names and classifiers must stay aligned'

In [87]:
# Fit each candidate on the training features and score it on the held-out set.
for name, classifier in zip(names, classifiers):
    # fit() returns the estimator itself, so `clf` is the same fitted object.
    clf = classifier.fit(x_train_vectorized, y_train_enc)
    preds = clf.predict(x_test_vectorized)

    # Confusion matrix, accuracy and F1, all computed against the encoded test labels.
    cf = confusion_matrix(y_true=y_test_enc, y_pred=preds)
    acc = accuracy_score(y_true=y_test_enc, y_pred=preds)
    fscore = f1_score(y_true=y_test_enc, y_pred=preds)

    print(f'{name}:\n {cf}, accuracy: {acc}, fscore: {fscore}')
    

scv:
 [[245  57]
 [ 43 255]], accuracy: 0.8333333333333334, fscore: 0.8360655737704918
logistic_regression:
 [[237  65]
 [ 41 257]], accuracy: 0.8233333333333334, fscore: 0.8290322580645162
decision_tree:
 [[192 110]
 [103 195]], accuracy: 0.645, fscore: 0.6467661691542289
random_forest:
 [[250  52]
 [ 60 238]], accuracy: 0.8133333333333334, fscore: 0.8095238095238095
adaboost:
 [[226  76]
 [ 74 224]], accuracy: 0.75, fscore: 0.7491638795986622
knn:
 [[245  57]
 [148 150]], accuracy: 0.6583333333333333, fscore: 0.594059405940594


### The SVM performs best here, so let's fine-tune it

In [89]:
# Search over the regularisation strength and kernel type of an SVC using
# 5-fold cross-validation on the training split.
parameters = dict(
    C=[1, 4, 8, 16, 32],
    kernel=['linear', 'rbf'],
)
svc = SVC()
svc_grid = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
svc_grid.fit(x_train_vectorized, y_train_enc)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 4, 8, 16, 32], 'kernel': ['linear', 'rbf']})

In [90]:
# Parameter combination selected by the grid search.
svc_grid.best_params_

{'C': 4, 'kernel': 'rbf'}

In [92]:
# Estimator instance selected by the grid search.
svc_grid.best_estimator_

SVC(C=4)

In [93]:
# Refit an SVC with the hyper-parameters selected by the grid search (rather
# than hand-copied values that can drift after a re-run) and evaluate it on
# the held-out test set.
final_svc = SVC(**svc_grid.best_params_)
final_svc.fit(x_train_vectorized, y_train_enc)
preds = final_svc.predict(x_test_vectorized)
cf = confusion_matrix(y_test_enc, preds)
acc = accuracy_score(y_test_enc, preds)
fscore = f1_score(y_test_enc, preds)

# Use an explicit label: the loop variable `name` left over from the model
# comparison cell would mislabel this result as 'knn'.
print(f'final_svc:\n {cf}, accuracy: {acc}, fscore: {fscore}')

knn:
 [[244  58]
 [ 37 261]], accuracy: 0.8416666666666667, fscore: 0.8460291734197731
