In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')


data = pd.read_csv("F:\cis660\IMDB Dataset.csv", encoding = 'latin1')

data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [2]:
#Checking for null values

data.isna().sum().sum()

0

In [3]:
df = data

In [5]:
data['sentiment'].describe()

count        50000
unique           2
top       positive
freq         25000
Name: sentiment, dtype: object

In [6]:
#Data preprocessing methods

import nltk
import re
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

#To tokenize words
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')


#Changing into lowercase
df['review'].str.lower()


#Removing the html strips 
def strip_html(x):
    soup = BeautifulSoup(x, "html.parser")
    return soup.get_text()

#To remove the brackets
def remove_brac(x):
    return re.sub('\[[^]]*\]', '', x)

#Remove special characters
def remove_sp_char(x):
    sp_char = r'[^a-zA-z0-9\s]'
    x = re.sub(sp_char, '', x)
    return x

#removing Stopwords
def remove_stopwords(x):
    tokens = tokenizer.tokenize(x)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopword_list]    
    filtered_text=' '.join(filtered_tokens)
    return filtered_text

#Stemming
def stemming(x):
    ps = nltk.porter.PorterStemmer()
    x=' '.join([ps.stem(word) for word in x.split()])
    return x


def remove_noise(x):
    x = strip_html(x)
    x = remove_brac(x)
    x = remove_sp_char(x)
    x = remove_stopwords(x)
    x = stemming(x)
    return x

#Removing noise from reviews
df['review'] = df['review'].apply(remove_noise)

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,positive
1,a wonder littl product the film techniqu unass...,positive
2,i thought wonder way spend time hot summer wee...,positive
3,basic there famili littl boy jake think there ...,negative
4,petter mattei love time money visual stun film...,positive


In [8]:
label_encoder = preprocessing.LabelEncoder()
df['sentiment']= label_encoder.fit_transform(df['sentiment'])

#Positive - 1
#Negative - 0


In [9]:
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,a wonder littl product the film techniqu unass...,1
2,i thought wonder way spend time hot summer wee...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


In [10]:
a = np.array(df['review'])
b = np.array(df['sentiment'])

# Tfidf

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv=TfidfVectorizer()
tv_review = tv.fit_transform(a)

In [12]:
from sklearn.model_selection import train_test_split

tv_X = tv_review
Y = b

tv_X_train, tv_X_test, tv_Y_train, tv_Y_test = train_test_split(tv_X, Y, random_state=42, train_size = 0.60)

# LinearSVC

In [13]:
from sklearn.svm import LinearSVC

tv_svm_clf = LinearSVC()

tv_svm_clf.fit(tv_X_train, tv_Y_train)

LinearSVC()

In [14]:
tv_svm_clf_pred = tv_svm_clf.predict(tv_X_test)


In [15]:
from sklearn import metrics
from sklearn.metrics import classification_report 
tv_svm_clf_accuracy_score = metrics.accuracy_score(tv_svm_clf_pred , tv_Y_test)

In [16]:
print ("The accuracy score of tfidf model of svm classifier is ",tv_svm_clf_accuracy_score)

The accuracy score of tfidf model of svm classifier is  0.8895


In [17]:
target_names = ['Negative', 'Positive']
print ("Classification report for tfidf model using svm classifier")
print (classification_report(tv_Y_test, tv_svm_clf_pred, target_names=target_names))

Classification report for tfidf model using svm classifier
              precision    recall  f1-score   support

    Negative       0.90      0.88      0.89      9989
    Positive       0.88      0.90      0.89     10011

    accuracy                           0.89     20000
   macro avg       0.89      0.89      0.89     20000
weighted avg       0.89      0.89      0.89     20000



In [19]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(tv_Y_test, tv_svm_clf_pred, labels=[0,1])

array([[8769, 1220],
       [ 990, 9021]], dtype=int64)

# Prediction of new reviews

In [20]:
custom_review=['This is a good movie the action scenes are amazing. the acting is god tier. must recommend']

custom_review2 = ['This movie is bad']

In [21]:
cr1 = tv.transform(custom_review)

cr2 = tv.transform(custom_review2)

In [22]:
#print (cr1)

In [23]:
print ('the sentiment of the custom review 1 is',tv_svm_clf.predict(cr1))
print ('the sentiment of the custom review 2 is',tv_svm_clf.predict(cr2))

the sentiment of the custom review 1 is [1]
the sentiment of the custom review 2 is [0]


# Multinomial NB

In [24]:
from sklearn.naive_bayes import MultinomialNB

tv_mnb = MultinomialNB()
tv_mnb.fit(tv_X_train, tv_Y_train)
y_pred = tv_mnb.predict(tv_X_test)

In [25]:
tv_mnb_pred = tv_mnb.predict(tv_X_test)


In [26]:
tv_mnb_accuracy_score = metrics.accuracy_score(tv_mnb_pred , tv_Y_test)

In [27]:
 print ("The accuracy score of tfidf model using multinomialNB is ",tv_mnb_accuracy_score)

The accuracy score of tfidf model using multinomialNB is  0.8592


In [28]:
print ("Classification report for tfidf model using multinomial NB")
print (classification_report(tv_Y_test, tv_mnb_pred, target_names=target_names))

Classification report for tfidf model using multinomial NB
              precision    recall  f1-score   support

    Negative       0.85      0.87      0.86      9989
    Positive       0.87      0.84      0.86     10011

    accuracy                           0.86     20000
   macro avg       0.86      0.86      0.86     20000
weighted avg       0.86      0.86      0.86     20000



In [30]:
print("The confusion matrix for Multinomial NB is")
confusion_matrix(tv_Y_test, tv_mnb_pred, labels=[0,1])

The confusion matrix for Multinomial NB is


array([[8729, 1260],
       [1556, 8455]], dtype=int64)

# Decision tree classifier

In [31]:
from sklearn.tree import DecisionTreeClassifier

tv_clf = DecisionTreeClassifier(random_state=0)

tv_clf.fit(tv_X_train, tv_Y_train)


DecisionTreeClassifier(random_state=0)

In [32]:
tv_clf_pred = tv_clf.predict(tv_X_test)


In [33]:
tv_clf_accuracy_score = metrics.accuracy_score(tv_clf_pred , tv_Y_test)

In [34]:
print ("The accuracy score of tfidf model of decision tree classifier is ",tv_clf_accuracy_score)

The accuracy score of tfidf model of decision tree classifier is  0.71835


In [35]:
print ("Classification report for tfidf model using Decision tree classifier")
print (classification_report(tv_Y_test, tv_clf_pred, target_names=target_names))

Classification report for tfidf model using Decision tree classifier
              precision    recall  f1-score   support

    Negative       0.72      0.72      0.72      9989
    Positive       0.72      0.72      0.72     10011

    accuracy                           0.72     20000
   macro avg       0.72      0.72      0.72     20000
weighted avg       0.72      0.72      0.72     20000



In [37]:
print("The confusion matrix for Decision tree classifier is")
confusion_matrix(tv_Y_test, tv_mnb_pred, labels=[0,1])

The confusion matrix for Decision tree classifier is


array([[8729, 1260],
       [1556, 8455]], dtype=int64)