## Movie Review Analysis using TF-IDF

In [1]:
import pandas as pd

# Read the labelled movie reviews; later cells use the 'text' and 'class' columns.
REVIEWS_CSV = 'movie-Review.csv'
df = pd.read_csv(REVIEWS_CSV)

In [2]:
import numpy as np

# Seed NumPy's global random generator.
# NOTE(review): presumably this is what makes the later train/test split and
# random forest repeatable, since neither sets its own random_state — confirm.
SEED = 500
np.random.seed(SEED)

In [3]:
# Remove every digit from the review text.
import re  # regular-expression support

# Raw string: '\d' inside a normal string literal is an invalid escape
# sequence (DeprecationWarning today, SyntaxWarning on Python 3.12+).
# Compiling once also avoids re-parsing the pattern for each review.
DIGIT_RE = re.compile(r'\d')
df['text'] = [DIGIT_RE.sub('', review) for review in df['text']]
df.head(10)

Unnamed: 0,class,text
0,Pos,films adapted from comic books have had plent...
1,Pos,every now and then a movie comes along from a...
2,Pos,you ve got mail works alot better than it des...
3,Pos,jaws is a rare film that grabs your atte...
4,Pos,moviemaking is a lot like being the general m...
5,Pos,on june a self taught idealistic ye...
6,Pos,apparently director tony kaye had a major b...
7,Pos,one of my colleagues was surprised when i tol...
8,Pos,after bloody clashes and independence won l...
9,Pos,the american action film has been slowly drow...


In [4]:
# Swap each punctuation character for a single space.
import string

# Character class built from string.punctuation; re.escape keeps characters
# such as ']' and '\' literal inside the brackets.
punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
df['text'] = [punctuation_re.sub(' ', review) for review in df['text']]
df.head(10)

Unnamed: 0,class,text
0,Pos,films adapted from comic books have had plent...
1,Pos,every now and then a movie comes along from a...
2,Pos,you ve got mail works alot better than it des...
3,Pos,jaws is a rare film that grabs your atte...
4,Pos,moviemaking is a lot like being the general m...
5,Pos,on june a self taught idealistic ye...
6,Pos,apparently director tony kaye had a major b...
7,Pos,one of my colleagues was surprised when i tol...
8,Pos,after bloody clashes and independence won l...
9,Pos,the american action film has been slowly drow...


In [5]:
# Normalise every review to lower case.
df['text'] = df['text'].map(lambda review: review.lower())

In [6]:
import pandas as pd
import nltk  # NLTK supplies the tokenizer used below
from nltk.tokenize import word_tokenize

# Word tokenization: turn each review into a list of tokens.
# First run only: the tokenizer's supporting files may need to be fetched
# with nltk.download('punkt').
df['text_wt'] = df['text'].map(word_tokenize)
df.head()

Unnamed: 0,class,text,text_wt
0,Pos,films adapted from comic books have had plent...,"[films, adapted, from, comic, books, have, had..."
1,Pos,every now and then a movie comes along from a...,"[every, now, and, then, a, movie, comes, along..."
2,Pos,you ve got mail works alot better than it des...,"[you, ve, got, mail, works, alot, better, than..."
3,Pos,jaws is a rare film that grabs your atte...,"[jaws, is, a, rare, film, that, grabs, your, a..."
4,Pos,moviemaking is a lot like being the general m...,"[moviemaking, is, a, lot, like, being, the, ge..."


In [7]:
# English stop-word list from NLTK.
# First run only: nltk.download('stopwords') fetches the list.
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not in the stop-word set.
df['text_SW'] = [
    [token for token in tokens if token not in stop_words]
    for tokens in df['text_wt']
]
df.head()

Unnamed: 0,class,text,text_wt,text_SW
0,Pos,films adapted from comic books have had plent...,"[films, adapted, from, comic, books, have, had...","[films, adapted, comic, books, plenty, success..."
1,Pos,every now and then a movie comes along from a...,"[every, now, and, then, a, movie, comes, along...","[every, movie, comes, along, suspect, studio, ..."
2,Pos,you ve got mail works alot better than it des...,"[you, ve, got, mail, works, alot, better, than...","[got, mail, works, alot, better, deserves, ord..."
3,Pos,jaws is a rare film that grabs your atte...,"[jaws, is, a, rare, film, that, grabs, your, a...","[jaws, rare, film, grabs, attention, shows, si..."
4,Pos,moviemaking is a lot like being the general m...,"[moviemaking, is, a, lot, like, being, the, ge...","[moviemaking, lot, like, general, manager, nfl..."


In [8]:
# One-off NLTK setup, if required:
#   nltk.download('tagsets'); nltk.help.upenn_tagset()  # tag-set documentation
#   nltk.download('wordnet')
from collections import defaultdict
from nltk.corpus import wordnet as wn  # WordNet corpus reader
from nltk.tag import pos_tag

# Look-up table from the first letter of a POS tag to the WordNet
# part-of-speech constant handed to the lemmatizer in the lemmatization cell.
# Any letter that is not listed falls back to noun.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)

In [9]:
# Lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the lemma of each token, choosing the WordNet class from its POS tag."""
    return [
        lemmatizer.lemmatize(token, tag_map[pos[0]])
        for token, pos in pos_tag(tokens)
    ]


# Lemmatize the stop-word-filtered tokens of every review.
df['lemma'] = [_lemmatize_tokens(tokens) for tokens in df['text_SW']]
df.head()

Unnamed: 0,class,text,text_wt,text_SW,lemma
0,Pos,films adapted from comic books have had plent...,"[films, adapted, from, comic, books, have, had...","[films, adapted, comic, books, plenty, success...","[film, adapt, comic, book, plenty, success, wh..."
1,Pos,every now and then a movie comes along from a...,"[every, now, and, then, a, movie, comes, along...","[every, movie, comes, along, suspect, studio, ...","[every, movie, come, along, suspect, studio, e..."
2,Pos,you ve got mail works alot better than it des...,"[you, ve, got, mail, works, alot, better, than...","[got, mail, works, alot, better, deserves, ord...","[get, mail, work, alot, good, deserves, order,..."
3,Pos,jaws is a rare film that grabs your atte...,"[jaws, is, a, rare, film, that, grabs, your, a...","[jaws, rare, film, grabs, attention, shows, si...","[jaw, rare, film, grab, attention, show, singl..."
4,Pos,moviemaking is a lot like being the general m...,"[moviemaking, is, a, lot, like, being, the, ge...","[moviemaking, lot, like, general, manager, nfl...","[moviemaking, lot, like, general, manager, nfl..."


In [10]:
 df['lemma2']= df['lemma'].apply(lambda x: ' '.join(x))

In [11]:
# Peek at the first five joined documents.
df['lemma2'].head(n=5)

0    film adapt comic book plenty success whether s...
1    every movie come along suspect studio every in...
2    get mail work alot good deserves order make fi...
3    jaw rare film grab attention show single image...
4    moviemaking lot like general manager nfl team ...
Name: lemma2, dtype: object

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF matrix over the lemmatised reviews, capped at 5000 features.
# NOTE(review): the vectorizer is fitted on every review before the
# train/test split, so held-out documents influence the vocabulary and IDF
# weights — consider fitting on the training portion only.
tf = TfidfVectorizer(max_features=5000)
tfidf_sparse = tf.fit_transform(df['lemma2'])
Tfidf = tfidf_sparse.toarray()  # dense array, one row per review

In [13]:
# Show the first rows of the TF-IDF matrix with the vocabulary as column names.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
feature_names = (
    tf.get_feature_names_out()
    if hasattr(tf, 'get_feature_names_out')
    else tf.get_feature_names()
)
pd.DataFrame(Tfidf, columns=feature_names).head()

Unnamed: 0,aaron,abandon,ability,able,aboard,abound,abraham,absence,absent,absolute,...,youth,zane,zany,zellweger,zero,zeta,zombie,zone,zoom,zwick
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.036021,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.preprocessing import LabelEncoder

# Map the labels in 'class' to integer codes stored in 'class2'.
Encoder = LabelEncoder()
Encoder.fit(df['class'])
df['class2'] = Encoder.transform(df['class'])


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation.
# random_state pins the shuffle to this call, so the split no longer depends
# on how much of NumPy's global random stream earlier cells have consumed
# (or on whether the seeding cell was run at all).
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
    Tfidf, df['class2'], test_size=0.2, random_state=500
)

In [16]:
# Show the first held-out rows with the vocabulary as column names.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
feature_names = (
    tf.get_feature_names_out()
    if hasattr(tf, 'get_feature_names_out')
    else tf.get_feature_names()
)
pd.DataFrame(Test_X, columns=feature_names).head()

Unnamed: 0,aaron,abandon,ability,able,aboard,abound,abraham,absence,absent,absolute,...,youth,zane,zany,zellweger,zero,zeta,zombie,zone,zoom,zwick
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057196,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029282,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.045214,0.0,0.067821,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

# Train a multinomial Naive Bayes classifier on the training split.
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X, Train_Y)

# Label the held-out rows and report the percentage predicted correctly.
predictions_NB = Naive.predict(Test_X)
nb_accuracy = accuracy_score(Test_Y, predictions_NB) * 100
print("Naive Bayes Accuracy Score -> ",round(nb_accuracy,2),"%")

Naive Bayes Accuracy Score ->  81.5 %


In [18]:
# Support-vector classifier with a linear kernel.
# NOTE(review): scikit-learn documents degree and gamma as applying to
# non-linear kernels only, so they are probably redundant here — confirm.
svc_params = {'C': 1.0, 'kernel': 'linear', 'degree': 3, 'gamma': 'auto'}
SVM = svm.SVC(**svc_params)
SVM.fit(Train_X, Train_Y)

# Label the held-out rows and report the percentage predicted correctly.
predictions_SVM = SVM.predict(Test_X)
svm_accuracy = accuracy_score(Test_Y, predictions_SVM) * 100
print("SVM Accuracy Score -> ",round(svm_accuracy,2),"%")

SVM Accuracy Score ->  85.0 %


In [19]:
# Random forest classifier fitted on the training split.
from sklearn.ensemble import RandomForestClassifier

# n_estimators is the number of trees in the forest; vary it to look for
# better results.
forest_params = dict(n_estimators=501, criterion='entropy')
model = RandomForestClassifier(**forest_params)
model.fit(Train_X, Train_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=501,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [20]:
# Random forest predictions for the held-out rows.
y_pred = model.predict(X=Test_X)


In [21]:
# Confusion matrix for the random forest; it is stored in `cm` but not
# displayed by this cell.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=Test_Y, y_pred=y_pred)

# Percentage of held-out rows the random forest labelled correctly.
rf_accuracy = accuracy_score(Test_Y, y_pred) * 100
print("Random forest Accuracy Score -> ",round(rf_accuracy,2),"%")

Random forest Accuracy Score ->  83.0 %


In [23]:
# 1. import
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# 2. instantiate a logistic regression model
logreg = LogisticRegression()
# 3. train the model using X_train_dtm
%time logreg.fit(Train_X, Train_Y)#4. make class predictions for Test_X
predictions_log = logreg.predict(Test_X)

print(" ------ Confusion Matrix-----[TN FP  FN TP]")

print(metrics.confusion_matrix(predictions_log, Test_Y))
print(metrics.classification_report(predictions_log, Test_Y))
# Use accuracy_score function to get the accuracy
print("Logistic Regression Accuracy Score -> ",round(accuracy_score(predictions_log, Test_Y)*100,2),"%")
print("Logistic Regression Area under curve -> ",round(metrics.roc_auc_score(predictions_log, Test_Y),2))

Wall time: 56.6 ms
 ------ Confusion Matrix-----[TN FP  FN TP]
[[169  27]
 [ 27 177]]
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       196
           1       0.87      0.87      0.87       204

    accuracy                           0.86       400
   macro avg       0.86      0.86      0.86       400
weighted avg       0.86      0.86      0.86       400

Logistic Regression Accuracy Score ->  86.5 %
Logistic Regression Area under curve ->  0.86


