# IMDB review classifier using word embedding

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
with open("imdb_labelled.txt","r") as text_file:
    lines = text_file.read().split('\n')

In [3]:
lines[1:10]

['Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  \t0',
 'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  \t0',
 'Very little music or anything to speak of.  \t0',
 'The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  \t1',
 "The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.  \t0",
 'Wasted two hours.  \t0',
 'Saw the movie today and thought it was a good effort, good messages for kids.  \t1',
 'A bit predictable.  \t0',
 'Loved the casting of Jimmy Buffet as the science teacher.  \t1']

# split by tab and remove corrupted data if any or lines which are not tab seperated

In [4]:
 
lines = [line.split("\t") for line in lines if len(line.split("\t"))==2 and line.split("\t")[1]!='']
lines[1:10]

[['Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ',
  '0'],
 ['Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  ',
  '0'],
 ['Very little music or anything to speak of.  ', '0'],
 ['The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  ',
  '1'],
 ["The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.  ",
  '0'],
 ['Wasted two hours.  ', '0'],
 ['Saw the movie today and thought it was a good effort, good messages for kids.  ',
  '1'],
 ['A bit predictable.  ', '0'],
 ['Loved the casting of Jimmy Buffet as the science teacher.  ', '1']]

#  Seperating the reviews

In [5]:

train_doc = [line[0] for line in lines ]
train_doc

['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ',
 'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ',
 'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  ',
 'Very little music or anything to speak of.  ',
 'The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  ',
 "The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.  ",
 'Wasted two hours.  ',
 'Saw the movie today and thought it was a good effort, good messages for kids.  ',
 'A bit predictable.  ',
 'Loved the casting of Jimmy Buffet as the science teacher.  ',
 'And those baby owls were adorable.  ',
 "The movie showed a lot of Florida at it's best, made it look very appealing.  ",
 'The Son

# Seperating the sentiment

In [6]:
train_labels = [int(line[1]) for line in lines]
train_labels[1:10]

[0, 0, 0, 1, 0, 0, 1, 0, 1]

#  Importing CountVectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer


# Training the document

In [8]:
count_vectorizer = CountVectorizer(binary='true')
train_documents = count_vectorizer.fit_transform(train_doc)

In [9]:
train_documents

<1000x3047 sparse matrix of type '<type 'numpy.int64'>'
	with 12666 stored elements in Compressed Sparse Row format>

In [10]:
print(train_documents[0])

  (0, 1639)	1
  (0, 3037)	1
  (0, 786)	1
  (0, 748)	1
  (0, 37)	1
  (0, 1748)	1
  (0, 92)	1
  (0, 1750)	1
  (0, 2404)	1
  (0, 2871)	1


In [11]:
count_vectorizer.vocabulary_

{u'secondly': 2301,
 u'limited': 1571,
 u'child': 452,
 u'four': 1074,
 u'sleep': 2398,
 u'ridiculous': 2199,
 u'captain': 387,
 u'hate': 1225,
 u'poorly': 2003,
 u'relationships': 2151,
 u'buddy': 352,
 u'spew': 2456,
 u'contained': 559,
 u'presents': 2030,
 u'fingernails': 1022,
 u'edward': 818,
 u'sorrentino': 2435,
 u'whatsoever': 2954,
 u'under': 2789,
 u'rickman': 2198,
 u'lord': 1602,
 u'sorry': 2436,
 u'worth': 3011,
 u'sinking': 2384,
 u'rescue': 2177,
 u'rise': 2204,
 u'smack': 2406,
 u'every': 895,
 u'jack': 1433,
 u'bertolucci': 274,
 u'tickets': 2682,
 u'school': 2272,
 u'wooden': 3000,
 u'loneliness': 1593,
 u'upa': 2841,
 u'girolamo': 1147,
 u'haggis': 1203,
 u'falwell': 962,
 u'settings': 2330,
 u'ups': 2844,
 u'clothes': 486,
 u'enjoy': 863,
 u'force': 1055,
 u'monotonous': 1728,
 u'tired': 2691,
 u'tolerate': 2699,
 u'consistent': 554,
 u'japanese': 1437,
 u'elegant': 830,
 u'second': 2299,
 u'fabulous': 946,
 u'gorman': 1166,
 u'emptiness': 848,
 u'trashy': 2736,
 u'

In [12]:
x = train_documents
y = train_labels

# Splitting dataset

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test, Y_train,Y_test = train_test_split(x,y,test_size = 0.30,random_state = 7)


## Naive Bayes

### (1) Multinolmial Naive Bayes

In [14]:
from sklearn.naive_bayes import MultinomialNB
multi = MultinomialNB().fit(X_train,Y_train)
multi_pred = multi.predict(X_test)

In [15]:
multi_pred

array([1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [16]:
from sklearn import metrics
multi_acc = metrics.accuracy_score(Y_test,multi_pred)
multi_acc

0.7866666666666666

In [17]:
multi_f1 = metrics.f1_score(Y_test, multi_pred, average='weighted', labels=np.unique(multi_pred))
multi_f1

0.7857229247657774

In [18]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(Y_test,multi_pred)
print(matrix)


[[129  23]
 [ 41 107]]


### Bernoulli Naive Bayes

In [19]:
# Training Phase
from sklearn.naive_bayes import BernoulliNB
bern = BernoulliNB().fit(X_train,Y_train)
bern_pred = bern.predict(X_test)


In [20]:
from sklearn import metrics
bern_acc = metrics.accuracy_score(Y_test,bern_pred)
bern_acc

0.7633333333333333

In [21]:
bern_f1 = metrics.f1_score(Y_test, bern_pred, average='weighted', labels=np.unique(bern_pred))
bern_f1

0.7583699914748508

In [22]:
from sklearn.metrics import classification_report 

print(classification_report(Y_test,bern_pred))

             precision    recall  f1-score   support

          0       0.71      0.90      0.79       152
          1       0.86      0.62      0.72       148

avg / total       0.78      0.76      0.76       300



In [23]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(Y_test,bern_pred)
print(matrix)


[[137  15]
 [ 56  92]]


### (3)Gaussain Naive Bayes

In [24]:
from sklearn.naive_bayes import GaussianNB

In [25]:
gauss = GaussianNB()
gauss.fit(X_train.todense(),Y_train)

GaussianNB(priors=None)

In [26]:
gauss_pred =  gauss.predict(X_test.todense())

In [27]:
gauss_acc = metrics.accuracy_score(gauss_pred,Y_test)
gauss_acc

0.6666666666666666

In [28]:
print(classification_report(gauss_pred,Y_test))

             precision    recall  f1-score   support

          0       0.79      0.64      0.71       188
          1       0.54      0.71      0.62       112

avg / total       0.70      0.67      0.67       300



In [29]:
matrix = confusion_matrix(Y_test,gauss_pred)
print(matrix)


[[120  32]
 [ 68  80]]


In [30]:
gauss_f1 = metrics.f1_score(Y_test, gauss_pred, average='weighted', labels=np.unique(gauss_pred))
gauss_f1

0.6612368024132729

### KNeighborsClassifier

In [31]:
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier()
KNN.fit(X_train,Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [32]:
KNN_pred = KNN.predict(X_test)

In [33]:
KNN_acc = metrics.accuracy_score(Y_test,KNN_pred)
KNN_acc

0.59

In [34]:
KNN_mat = confusion_matrix(Y_test, KNN_pred)
KNN_mat

array([[108,  44],
       [ 79,  69]], dtype=int64)

In [35]:
print(classification_report(Y_test, KNN_pred))

             precision    recall  f1-score   support

          0       0.58      0.71      0.64       152
          1       0.61      0.47      0.53       148

avg / total       0.59      0.59      0.58       300



In [36]:
KNN_f1 = metrics.f1_score(Y_test, KNN_pred, average='weighted', labels=np.unique(KNN_pred))
KNN_f1

0.5836747702844743

## Support vector machine

In [37]:
from sklearn.svm import SVC

### (1)Linear SVC

In [38]:
lsvc =SVC(kernel = "linear",degree=8)

In [39]:
lsvc.fit(X_train,Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=8, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [40]:
lsvc_pred = lsvc.predict(X_test)

In [41]:
lsvc_acc = metrics.accuracy_score(Y_test,lsvc_pred)
lsvc_acc

0.7433333333333333

In [42]:
matrix = confusion_matrix(Y_test,lsvc_pred)
print(matrix)


[[113  39]
 [ 38 110]]


In [43]:
print(classification_report(Y_test,lsvc_pred))

             precision    recall  f1-score   support

          0       0.75      0.74      0.75       152
          1       0.74      0.74      0.74       148

avg / total       0.74      0.74      0.74       300



In [44]:
lsvc_f1 = metrics.f1_score(Y_test, lsvc_pred, average='weighted', labels=np.unique(lsvc_pred))
lsvc_f1

0.74334188974453

### (2)Poly SVC

In [45]:
psvc =SVC(kernel = "poly",degree= 8)

In [46]:
psvc.fit(X_train,Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=8, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [47]:
psvc_pred = psvc.predict(X_test)

In [48]:
psvc_acc = metrics.accuracy_score(Y_test,psvc_pred)
psvc_acc

0.49333333333333335

In [49]:
matrix = confusion_matrix(Y_test,psvc_pred)
print(matrix)


[[  0 152]
 [  0 148]]


In [50]:
print(classification_report(Y_test,psvc_pred))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       152
          1       0.49      1.00      0.66       148

avg / total       0.24      0.49      0.33       300



  'precision', 'predicted', average, warn_for)


In [51]:
psvc_f1 = metrics.f1_score(Y_test, psvc_pred, average='weighted', labels=np.unique(psvc_pred))
psvc_f1

0.6607142857142857

### (3)Gaussian SVC

In [52]:
gsvc = SVC(kernel = 'rbf')
gsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [53]:
gsvc_pred = gsvc.predict(X_test)

In [54]:
gsvc_acc = metrics.accuracy_score(gsvc_pred, Y_test)
gsvc_acc

0.49333333333333335

In [55]:
gsvc_mat = confusion_matrix(Y_test, gsvc_pred)
gsvc_mat

array([[  0, 152],
       [  0, 148]], dtype=int64)

In [56]:
print(classification_report(Y_test, gsvc_pred))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       152
          1       0.49      1.00      0.66       148

avg / total       0.24      0.49      0.33       300



In [57]:
gsvc_f1 =  metrics.f1_score(Y_test, gsvc_pred, average='weighted', labels=np.unique(gsvc_pred))
gsvc_f1

0.6607142857142857

## Decision Tree Classifier

In [58]:
from sklearn.tree import DecisionTreeClassifier

In [59]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [60]:
dtree_pred = dtree.predict(X_test)

In [61]:
dtree_acc = metrics.accuracy_score(Y_test, dtree_pred)
dtree_acc

0.67

In [62]:
dtree_mat = confusion_matrix(Y_test, dtree_pred)
dtree_mat

array([[ 97,  55],
       [ 44, 104]], dtype=int64)

In [63]:
print(classification_report(Y_test, dtree_pred))

             precision    recall  f1-score   support

          0       0.69      0.64      0.66       152
          1       0.65      0.70      0.68       148

avg / total       0.67      0.67      0.67       300



In [64]:
dtree_f1 = metrics.f1_score(Y_test, dtree_pred)
dtree_f1

0.6775244299674267

## Random Forest Classifier

In [65]:
from sklearn.ensemble import RandomForestClassifier
rand = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=7)

In [66]:
rand.fit(X_train , Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=7, verbose=0, warm_start=False)

In [67]:
rand_pred = rand.predict(X_test)

In [68]:
rand_acc = metrics.accuracy_score(Y_test, rand_pred)
rand_acc

0.7266666666666667

In [69]:
rand_mat = confusion_matrix(Y_test, rand_pred)
rand_mat

array([[116,  36],
       [ 46, 102]], dtype=int64)

In [70]:
print(classification_report(Y_test, rand_pred))

             precision    recall  f1-score   support

          0       0.72      0.76      0.74       152
          1       0.74      0.69      0.71       148

avg / total       0.73      0.73      0.73       300



In [71]:
rand_f1 = metrics.f1_score(Y_test, rand_pred)
rand_f1

0.7132867132867132

## Voting Classifier

In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

In [73]:
estimators =[]
model = LogisticRegression()
estimators.append(('logistic',model))
model1 = SVC(kernel = 'rbf')
estimators.append(('SVC',model1))
model2 = KMeans()
estimators.append(('KMeans',model2))
model3 = KNeighborsClassifier()
estimators.append(('KNN',model3))
voting = VotingClassifier(estimators)

In [74]:
voting.fit(X_train, Y_train)

VotingClassifier(estimators=[('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('SVC', ...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [75]:
voting_pred = voting.predict(X_test)

  if diff:


In [76]:
voting_acc = metrics.accuracy_score(Y_test,voting_pred)
voting_acc

0.6766666666666666

In [77]:
voting_mat = confusion_matrix(Y_test, voting_pred)
voting_mat

array([[ 97,  55],
       [ 42, 106]], dtype=int64)

In [78]:
print(classification_report(Y_test, voting_pred))

             precision    recall  f1-score   support

          0       0.70      0.64      0.67       152
          1       0.66      0.72      0.69       148

avg / total       0.68      0.68      0.68       300



In [79]:
voting_f1 = metrics.f1_score(Y_test, voting_pred)
voting_f1


0.686084142394822

In [80]:
Algo_name = ["Multinolmial Naive Bayes","Bernoulli Naive Bayes","Gaussain Naive Bayes","KNN","Linear SVC","Poly SVC",
             "Gaussian SVC","Decision Tree Classifier","Random Forest Classifier","Voting Classifier"]
f1_score = [multi_f1,bern_f1,gauss_f1,KNN_f1,lsvc_f1,psvc_f1,gsvc_f1,dtree_f1,rand_f1,voting_f1]

In [81]:
res = pd.DataFrame({"Algorithms":Algo_name, "F1_score":f1_score }, index = [1,2,3,4,5,6,7,8,9,10])
res

Unnamed: 0,Algorithms,F1_score
1,Multinolmial Naive Bayes,0.785723
2,Bernoulli Naive Bayes,0.75837
3,Gaussain Naive Bayes,0.661237
4,KNN,0.583675
5,Linear SVC,0.743342
6,Poly SVC,0.660714
7,Gaussian SVC,0.660714
8,Decision Tree Classifier,0.677524
9,Random Forest Classifier,0.713287
10,Voting Classifier,0.686084


In [82]:
Algo_name = ["Multinolmial Naive Bayes","Bernoulli Naive Bayes","Gaussain Naive Bayes","KNN","Linear SVC","Poly SVC",
             "Gaussian SVC","Decision Tree Classifier","Random Forest Classifier","Voting Classifier"]
accuracy = [multi_acc,bern_acc,gauss_acc,KNN_acc,lsvc_acc,psvc_acc,gsvc_acc,dtree_acc,rand_acc,voting_acc]

In [83]:
res2 = pd.DataFrame({"Algorithms":Algo_name, "Accuracy":accuracy}, index = [1,2,3,4,5,6,7,8,9,10])
res2

Unnamed: 0,Accuracy,Algorithms
1,0.786667,Multinolmial Naive Bayes
2,0.763333,Bernoulli Naive Bayes
3,0.666667,Gaussain Naive Bayes
4,0.59,KNN
5,0.743333,Linear SVC
6,0.493333,Poly SVC
7,0.493333,Gaussian SVC
8,0.67,Decision Tree Classifier
9,0.726667,Random Forest Classifier
10,0.676667,Voting Classifier
