In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [2]:
# Shortened version of the dataset from: http://ai.stanford.edu/~amaas/data/sentiment/
# The file is tab-separated, so the delimiter is given explicitly.
df = pd.read_csv('./moviereviews2.tsv', delimiter='\t')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...


In [3]:
# Count the missing values in each column (summing the boolean null mask down the rows).
df.isnull().sum(axis=0)

label      0
review    20
dtype: int64

In [5]:
# Number of rows in the frame at this point.
len(df.index)

6000

In [7]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis='index', how='any')

In [12]:
# Check for blank reviews: strings that are empty or contain only whitespace.
# dropna() does not remove these, so collect their index labels for inspection.
#
# Notes on the checks below:
#   * `not review.strip()` also flags the empty string '' -- `''.isspace()` is
#     False, so an isspace()-only test would silently miss it.
#   * Only the 'review' column is iterated, so the check keeps working if the
#     frame gains extra columns.
#   * Non-string cells are skipped instead of raising AttributeError.
blanks = [
    idx
    for idx, review in df['review'].items()
    if isinstance(review, str) and not review.strip()
]
blanks

[]

In [13]:
# Class balance: how many reviews carry each sentiment label.
df.loc[:, 'label'].value_counts()

pos    2990
neg    2990
Name: label, dtype: int64

In [14]:
# Features are the raw review texts; targets are the sentiment labels.
X, y = df['review'], df['label']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [23]:
# Custom stop-word list, grouped by initial letter (order is alphabetical).
# Line continuations are implicit inside the brackets, so no backslashes are needed.
stopWords = [
    'a', 'about', 'an', 'and', 'are', 'as', 'at',
    'be', 'been', 'but', 'by',
    'can',
    'even', 'ever',
    'for', 'from',
    'get',
    'had', 'has', 'have', 'he', 'her', 'hers', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'it', 'its',
    'just',
    'me', 'my',
    'of', 'on', 'or',
    'see', 'seen', 'she', 'so',
    'than', 'that', 'the', 'their', 'there', 'they', 'this', 'to',
    'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with',
    'you',
]

In [24]:
def _make_text_clf(classifier):
    """Build a two-step text-classification pipeline around ``classifier``.

    Step 'tfidf' turns raw text into TF-IDF features using the custom
    ``stopWords`` list; step 'clf' is the estimator passed in.

    A new TfidfVectorizer is created on every call so that separate pipelines
    never share a (fitted) vectorizer instance.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=stopWords)),
        ('clf', classifier),
    ])


# Naïve Bayes:
text_clf_nb = _make_text_clf(MultinomialNB())

# Linear SVC:
# The solver shuffles the data with a pseudo-random generator, so the seed is
# pinned to make the fitted model (and the reported metrics) reproducible.
text_clf_lsvc = _make_text_clf(LinearSVC(random_state=42))

Naive Bayes

In [25]:
# Train the Naive Bayes pipeline on the training reviews, then label the held-out reviews.
# Pipeline.fit returns the fitted pipeline itself, so the two steps can be chained.
predictions = text_clf_nb.fit(X_train, y_train).predict(X_test)

In [26]:
# Confusion matrix for the current predictions: rows are true labels, columns are predicted labels.
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[927  64]
 [121 862]]


In [27]:
# Per-class precision, recall, F1 score and support for the current predictions.
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         neg       0.88      0.94      0.91       991
         pos       0.93      0.88      0.90       983

   micro avg       0.91      0.91      0.91      1974
   macro avg       0.91      0.91      0.91      1974
weighted avg       0.91      0.91      0.91      1974



SVM

In [28]:
# Train the Linear SVC pipeline on the training reviews, then label the held-out reviews.
# Note: this overwrites `predictions`, so the metric cells that follow describe this model.
predictions = text_clf_lsvc.fit(X_train, y_train).predict(X_test)

In [29]:
# Confusion matrix for the current predictions: rows are true labels, columns are predicted labels.
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[884 107]
 [ 62 921]]


In [30]:
# Per-class precision, recall, F1 score and support for the current predictions.
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         neg       0.93      0.89      0.91       991
         pos       0.90      0.94      0.92       983

   micro avg       0.91      0.91      0.91      1974
   macro avg       0.92      0.91      0.91      1974
weighted avg       0.92      0.91      0.91      1974



Manual test on an unseen review (from Saibal Chatterjee, NDTV)

In [32]:
# Hand-picked review text used to spot-check both trained pipelines.
# Adjacent string literals are concatenated by the parser, so this is one single-line string.
myreview = (
    "The actors give the film all they have. "
    "Kartik Aryaan is quick to hit his straps, "
    "Bhumi Pednekar pulls off the sultry siren with as much ease as she does "
    "the settled-into-domesticity persona, "
    "and Ananya Panday is perfectly cast as the wide-eyed girl "
    "who sweeps the hero off his feet but holds her own ground."
)

In [33]:
# Classify the hand-picked review with the Naive Bayes pipeline.
# The review is wrapped in a one-element list, so one label comes back.
print(text_clf_nb.predict(X=[myreview]))

['pos']


In [34]:
# Classify the hand-picked review with the Linear SVC pipeline.
# The review is wrapped in a one-element list, so one label comes back.
print(text_clf_lsvc.predict(X=[myreview]))

['pos']
