Moview Review Sentiment Analysis

In [None]:
Data Set : https://www.kaggle.com/utathya/imdb-review-dataset

In [6]:
# Import libraries
import re
import sys
from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Autograding
import tests_lab4
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# train test split and cross validation
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [10]:
### Helper Function

def store_cross_val_results(model_name, scores, results_dict):
    """
    Stores mean scores from cross_validate in results_dict for
    the given model model_name.

    Parameters
    ----------
    model_name :
        scikit-learn classification model
    scores : dict
        object return by `cross_validate`
    results_dict: dict
        dictionary to store results

    Returns
    ----------
        None

    """
    results_dict[model_name] = {
        "mean_train_accuracy": "{:0.4f}".format(np.mean(scores["train_score"])),
        "mean_valid_accuracy": "{:0.4f}".format(np.mean(scores["test_score"])),
        "mean_fit_time (s)": "{:0.4f}".format(np.mean(scores["fit_time"])),
        "mean_score_time (s)": "{:0.4f}".format(np.mean(scores["score_time"])),
        "std_train_score": "{:0.4f}".format(scores["train_score"].std()),
        "std_valid_score": "{:0.4f}".format(scores["test_score"].std()),
    }


In [3]:
imdb_df = pd.read_csv("imdb_master.csv", encoding="ISO-8859-1", index_col="Unnamed: 0")
imdb_df = imdb_df.query('label == "neg" | label == "pos"')
train_df = imdb_df.query('type == "train"')
test_df = imdb_df.query('type == "test"')

In [4]:
imdb_df.head()

Unnamed: 0,type,review,label,file
0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [7]:
X_train, X_test, y_train, y_test = train_test_split(imdb_df["review"],imdb_df["label"],test_size=0.2, random_state=123)

In [12]:
results_dict ={}
pipeline = make_pipeline(CountVectorizer(binary = True), LogisticRegression(max_iter=2000))
scores = cross_validate(pipeline, X_train, y_train, cv=2, return_train_score=True)
store_cross_val_results("LogisticRegression(max_iter=2000)", scores, results_dict)
pd.DataFrame(results_dict).T

Unnamed: 0,mean_fit_time (s),mean_score_time (s),mean_train_accuracy,mean_valid_accuracy,std_train_score,std_valid_score
LogisticRegression(max_iter=2000),6.4462,2.7904,0.9991,0.8757,0.0,0.002


In [13]:
pipe_logistic = make_pipeline(CountVectorizer(binary = True), LogisticRegression(max_iter=1000))

In [14]:
param_grid = {"logisticregression__C": 10.0 ** np.arange(-3, 3),
              "countvectorizer__max_features": [10,20,30,100,1000,8000],
             }

In [15]:
random_search = RandomizedSearchCV(pipe_logistic, param_distributions = param_grid, cv = 5, n_jobs = -1, n_iter = 10, return_train_score = True)
random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('countvectorizer',
                                              CountVectorizer(binary=True)),
                                             ('logisticregression',
                                              LogisticRegression(max_iter=1000))]),
                   n_jobs=-1,
                   param_distributions={'countvectorizer__max_features': [10,
                                                                          20,
                                                                          30,
                                                                          100,
                                                                          1000,
                                                                          8000],
                                        'logisticregression__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])},
                   return_train_score=True)

In [16]:
print("Best Params: ",random_search.best_params_)
print("Best Validation Score: ",random_search.best_score_)
#pd.DataFrame(random_search.cv_results_).set_index("rank_test_score").sort_index()

Best Params:  {'logisticregression__C': 0.01, 'countvectorizer__max_features': 8000}
Best Validation Score:  0.8813749999999999


In [17]:
random_search.best_estimator_.fit(X_train,y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(binary=True, max_features=8000)),
                ('logisticregression',
                 LogisticRegression(C=0.01, max_iter=1000))])

In [18]:
weights_neg_20 = np.argsort(random_search.best_estimator_.named_steps['logisticregression'].coef_.flatten())[0:19:1]
weights_pos_20 = np.argsort(random_search.best_estimator_.named_steps['logisticregression'].coef_.flatten())[-20:]

neg = []
pos = []

words = random_search.best_estimator_.named_steps['countvectorizer'].get_feature_names()

for index in weights_neg_20:
    neg.append(words[index])
    
for index in weights_pos_20:
    pos.append(words[index])
    
print("words that are most indicative of a positive review:\n ", pos)
print("\nwords that are most indicative of a negative review:\n ", neg)

words that are most indicative of a positive review:
  ['awesome', 'fun', 'definitely', 'dvd', 'enjoyable', 'highly', 'favorite', 'enjoyed', 'hilarious', 'brilliant', 'fantastic', 'best', 'superb', 'today', 'loved', 'wonderful', 'amazing', 'perfect', 'great', 'excellent']

words that are most indicative of a negative review:
  ['worst', 'waste', 'awful', 'boring', 'bad', 'terrible', 'poor', 'dull', 'poorly', 'worse', 'horrible', 'stupid', 'fails', 'disappointment', 'nothing', 'disappointing', 'unfortunately', 'avoid', 'mess']


In [19]:
random_search.best_estimator_.fit(X_train,y_train)
scores = random_search.best_estimator_.score(X_test,y_test)
print("Accuracy Score of Best estimator in Logistic Regression: ",scores)

Accuracy Score of Best estimator in Logistic Regression:  0.8777


In [20]:
most_neg = np.where(random_search.best_estimator_.predict_proba(X_test)[:,0] == max(random_search.best_estimator_.predict_proba(X_test)[:,0]))

In [21]:
print("With associated probability score of: ",max(random_search.best_estimator_.predict_proba(X_test)[:,0]),
      "\nthe review with highest predicted probability of being negative is: \n\n",X_test.iloc[int(most_neg[0][0])])

With associated probability score of:  0.9999989498597482 
the review with highest predicted probability of being negative is: 

 Plankton, or Creatures from the Abyss as I'm positive it's more commonly known as & filmed under as the title Creatures from the Abyss appears over a moving image & in the same font type as the rest of the credits, starts with five 20 something kids, Mike (Clay Rogers) his girlfriend Margaret (Sharon Twomey), sisters Julie (Ann Wolf) & Dorothy (Loren DePalm) & an annoying idiot named Bobby (Michael Bon) whom decide to all fit into a small rubber boat & head out to sea, don't ask why as I don't know. Oh & the complete idiot Bobby left the petrol behind & never thought to tell anyone so it comes as no great surprise that they end up stranded out at sea without any petrol for the motor & to make matters worse they become trapped in a thunder storm & discover a dead body floating in the water. Shortly after their luck seems to change when they come across a yach

In [22]:
most_pos = np.where(random_search.best_estimator_.predict_proba(X_test)[:,1] == max(random_search.best_estimator_.predict_proba(X_test)[:,1]))

In [23]:
print("With associated probability score of: ",max(random_search.best_estimator_.predict_proba(X_test)[:,1]),
      "\nthe review with highest predicted probability of being positive is: \n\n",X_test.iloc[int(most_pos[0][0])])

With associated probability score of:  0.9999879729486912 
the review with highest predicted probability of being positive is: 

 I am beginning to see a very consistent pattern form in the identity of 2007's films. If 2004 was the year of the biographies and 2005 was the year of the political films, 2007 can be identified as a year featuring a wide plethora of morality tales, films that portray, test, challenge and question human morality and the motives that drive us to do certain things. Although this identification is rather broad, I think that there are a handful of films released this year, such as 3:10 To Yuma, Eastern Promises, American Gangster, No Country for Old Men and others that specifically question and study human morals and the motives that drive us to acts such as violence or treachery. Before the Devil Knows You're Dead is a deviously stylish morality tale, and quite a dark, bleak and depressing one at that. And even better is the fact that it comes from one of the g