In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from tqdm import tqdm
from bs4 import BeautifulSoup

ps = PorterStemmer()
lm = WordNetLemmatizer()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/praveen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('IMDB_Dataset.csv')
df.shape

(50000, 2)

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
def remove_html_tags(review):
    return BeautifulSoup(review, 'lxml').get_text()

def remove_url(review):
    return re.sub(r"http\S+", "", review)

def remove_alpha_numeric(review):
    return re.sub("\S*\d\S*", "", review).strip()

def remove_special_char(review):
    return re.sub('[^A-Za-z]+', ' ', review)

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [5]:
def denoise_text(text):
    text = remove_html_tags(text)
    text = remove_url(text)
    text = remove_alpha_numeric(text)
    text = remove_special_char(text)
    text = decontracted(text)
    return text

In [6]:
df['cleaned_review']=df['review'].apply(denoise_text)

In [7]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production The filming tech...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,Basically there s a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Mattei s Love in the Time of Money is a...


In [8]:
_stopwords=set(stopwords.words('english'))

def remove_stop_words(text):
    text = ' '.join(e.lower() for e in text.split() if e.lower() not in _stopwords)
    return text.strip()



In [9]:
df['cleaned_review']=df['cleaned_review'].apply(remove_stop_words)

In [10]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode ho...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake thinks zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...


In [11]:
replace_map = {'sentiment':{'positive':1,'negative':0}}
df.replace(replace_map,inplace=True)

In [12]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,one reviewers mentioned watching oz episode ho...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically family little boy jake thinks zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei love time money visually stunnin...


In [13]:
df.drop(['review'],axis=1,inplace=True)
df_train = df[:40000]
df_test = df[40000:]

### Converting Review text to numeric values

In [14]:
# 1. Using One Hot Encoding
# for each word a column is created , 
# 1 represents word present in review , 0 represents word does not present in review.
# This creates very large sparse matrix


cv = CountVectorizer(binary=True)
cv.fit(df_train['cleaned_review'])
X = cv.transform(df_train['cleaned_review'])
X_test = cv.transform(df_test['cleaned_review'])

In [15]:

X_train, X_val, y_train, y_val = train_test_split(X, df_train['sentiment'], train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C = c)
    lr.fit(X_train,y_train)
    print("Accuracy of model with C = {} is {}".format(c, accuracy_score(y_val, lr.predict(X_val))))
    



Accuracy of model with C = 0.01 is 0.8825
Accuracy of model with C = 0.05 is 0.8893
Accuracy of model with C = 0.25 is 0.8868
Accuracy of model with C = 0.5 is 0.8818
Accuracy of model with C = 1 is 0.8781


In [16]:
final_model = LogisticRegression(C=0.05)
final_model.fit(X, df_train['sentiment'])
print ("Final Accuracy: %s" 
       % accuracy_score(df_test['sentiment'], final_model.predict(X_test)))

Final Accuracy: 0.8891


### Using n-grams to convert text to vectors

In [17]:
# 2. Using n-grams
# along with considering 1 word , here we are consider 2 words together as one .
# this creates very large metrix 

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(df_train['cleaned_review'])
X = ngram_vectorizer.transform(df_train['cleaned_review'])
X_test = ngram_vectorizer.transform(df_test['cleaned_review'])

In [18]:
X_train, X_val, y_train, y_val = train_test_split(X, df_train['sentiment'], train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C = c)
    lr.fit(X_train,y_train)
    print("Accuracy of model with C = {} is {}".format(c, accuracy_score(y_val, lr.predict(X_val))))
    



Accuracy of model with C = 0.01 is 0.8913
Accuracy of model with C = 0.05 is 0.8995
Accuracy of model with C = 0.25 is 0.8996
Accuracy of model with C = 0.5 is 0.8995
Accuracy of model with C = 1 is 0.8996


In [19]:
final_model = LogisticRegression(C=0.05)
final_model.fit(X, df_train['sentiment'])
print ("Final Accuracy: %s" 
       % accuracy_score(df_test['sentiment'], final_model.predict(X_test)))

Final Accuracy: 0.8964


### TF IDF

In [20]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(df_train['cleaned_review'])
X = tfidf_vectorizer.transform(df_train['cleaned_review'])
X_test = tfidf_vectorizer.transform(df_test['cleaned_review'])


X_train, X_val, y_train, y_val = train_test_split(X, df_train['sentiment'], train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C = c)
    lr.fit(X_train,y_train)
    print("Accuracy of model with C = {} is {}".format(c, accuracy_score(y_val, lr.predict(X_val))))
    
    



Accuracy of model with C = 0.01 is 0.8245
Accuracy of model with C = 0.05 is 0.8534
Accuracy of model with C = 0.25 is 0.8818
Accuracy of model with C = 0.5 is 0.8892
Accuracy of model with C = 1 is 0.8941


In [25]:
final_model = LogisticRegression(C=0.75)
final_model.fit(X, df_train['sentiment'])
print ("Final Accuracy: %s" 
       % accuracy_score(df_test['sentiment'], final_model.predict(X_test)))



Final Accuracy: 0.9005


### SVM Model

In [28]:
from sklearn.svm import LinearSVC


ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(df_train['cleaned_review'])
X = ngram_vectorizer.transform(df_train['cleaned_review'])
X_test = ngram_vectorizer.transform(df_test['cleaned_review'])


X_train, X_val, y_train, y_val = train_test_split(X, df_train['sentiment'], train_size = 0.75)

for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    svm = LinearSVC(C = c)
    svm.fit(X_train,y_train)
    print("Accuracy of model with C = {} is {}".format(c, accuracy_score(y_val, svm.predict(X_val))))
  

Accuracy of model with C = 0.001 is 0.8854
Accuracy of model with C = 0.005 is 0.8953
Accuracy of model with C = 0.01 is 0.8947
Accuracy of model with C = 0.05 is 0.8929
Accuracy of model with C = 0.1 is 0.8919


In [31]:
final_model = LinearSVC(C=0.01)
final_model.fit(X, df_train['sentiment'])
print ("Final Accuracy: %s" 
       % accuracy_score(df_test['sentiment'], final_model.predict(X_test)))

Final Accuracy: 0.9008
