In [2]:
#Importing all the important libraries
import numpy as np
import pandas as pd
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string
import regex

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saite\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saite\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saite\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
#On positive review
#encoding - latin-1 coverts text file as csv file
pos_rev = pd.read_csv('pos.txt', encoding='latin-1', header=None, sep='\n')
pos_rev

Unnamed: 0,0
0,the rock is destined to be the 21st century's ...
1,"the gorgeously elaborate continuation of "" the..."
2,effective but too-tepid biopic
3,if you sometimes like to go to the movies to h...
4,"emerges as something rare , an issue movie tha..."
...,...
5326,both exuberantly romantic and serenely melanch...
5327,mazel tov to a film about a family's joyous li...
5328,standing in the shadows of motown is the best ...
5329,it's nice to see piscopo again after all these...


In [34]:
#Creating a target variable
pos_rev['mood'] = 1
pos_rev.rename(columns={0: 'review'}, inplace = True)
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [35]:
#On negative review
#encoding - latin-1 coverts text file as csv file
neg_rev = pd.read_csv('negative.txt', encoding='latin-1', header=None, sep='\n')
#Creating a target variable
neg_rev['mood'] = 0
neg_rev.rename(columns={0: 'review'}, inplace = True)
neg_rev

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


In [5]:
#Convert to lower case
#Remove the punctuations
#Remove stopwords
#Lemmatization
#Join the cleaned data

In [40]:
lemma = WordNetLemmatizer()

In [77]:
#Cleaning the positive data

pos_rev['review'] = pos_rev.review.apply(lambda x: x.lower()) #Coverting to lowerCase
pos_rev['review'] = pos_rev.review.apply(lambda x: regex.sub("[^a-z ]+", "", x)) #Removing punctuations
pos_rev['review'] = pos_rev.review.apply(lambda x: ' '.join([lemma.lemmatize(word) for word in x.split() if word not in stopwords.words('english') and len(word)>2])) #Removing stopwords
pos_rev

Unnamed: 0,review,mood
0,rock destined century new conan going make spl...,1
1,gorgeously elaborate continuation lord ring tr...,1
2,effective tootepid biopic,1
3,sometimes like movie fun wasabi good place start,1
4,emerges something rare issue movie honest keen...,1
...,...,...
5326,exuberantly romantic serenely melancholy time ...,1
5327,mazel tov film family joyous life acting yiddi...,1
5328,standing shadow motown best kind documentary o...,1
5329,nice see piscopo year chaykin headly priceless,1


In [78]:
#Cleaning the negative data

neg_rev['review'] = neg_rev.review.apply(lambda x: x.lower()) #Coverting to lowerCase
neg_rev['review'] = neg_rev.review.apply(lambda x: regex.sub("[^a-z ]+", "", x)) #Removing punctuations
neg_rev['review'] = neg_rev.review.apply(lambda x: ' '.join([lemma.lemmatize(word) for word in x.split() if word not in stopwords.words('english') and len(word)>2])) #Removing stopwords
neg_rev

Unnamed: 0,review,mood
0,simplistic silly tedious,0
1,laddish juvenile teenage boy could possibly fi...,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0
...,...,...
5326,terrible movie people nevertheless find moving,0
5327,many definition time waster movie must surely one,0
5328,stand crocodile hunter hurried badly cobbled l...,0
5329,thing look like madeforhomevideo quickie,0


In [79]:
#Concatinate two data frames

com_rev = pd.concat([pos_rev, neg_rev], axis = 0).reset_index(drop= True)
com_rev

Unnamed: 0,review,mood
0,rock destined century new conan going make spl...,1
1,gorgeously elaborate continuation lord ring tr...,1
2,effective tootepid biopic,1
3,sometimes like movie fun wasabi good place start,1
4,emerges something rare issue movie honest keen...,1
...,...,...
10657,terrible movie people nevertheless find moving,0
10658,many definition time waster movie must surely one,0
10659,stand crocodile hunter hurried badly cobbled l...,0
10660,thing look like madeforhomevideo quickie,0


In [80]:
#Train test split

X_train, X_test, y_train, y_test = train_test_split(com_rev['review'].values, com_rev['mood'].values, test_size = 0.2, random_state = 101)

In [81]:
train_data = pd.DataFrame({'review': X_train, 'mood':y_train})
test_data = pd.DataFrame({'review':X_test, 'mood':y_test})

In [82]:
train_data

Unnamed: 0,review,mood
0,put washington honest working man john archiba...,0
1,poignant familiar story young person suspended...,1
2,timely director could ever dreamed quietly lyr...,1
3,film virtually choke selfconsciousness,0
4,film take inside rhythm subject experience watch,1
...,...,...
8524,branagh forceful nonshakespeare screen perform...,1
8525,movie friday fan critic damned already like so...,0
8526,perhaps heaviest joyless movie ever made giant...,0
8527,film rival live fine little amusebouche keep a...,1


In [83]:
test_data

Unnamed: 0,review,mood
0,important movie reminder power film move make ...,1
1,never seen heard anything quite like film reco...,1
2,ending leave unfulfilled performance enjoy mem...,1
3,surface loversontherun crime flick lot common ...,1
4,walk remember shrewd enough activate girlish t...,0
...,...,...
2128,bullock good job working natural likability,1
2129,result memorable least interesting,1
2130,apparently designed reverie memory regret thin...,0
2131,movie insecure capacity excite churn one two f...,0


In [84]:
#Coverting to vectors form
vectoriser = TfidfVectorizer()
train_vectors = vectoriser.fit_transform(train_data['review'])
test_vectors = vectoriser.transform(test_data['review'])

In [85]:
#Id dataset is large better to use SVM or Naive bayes
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [96]:
svc_classifier = SVC()
params = {'C': [0.01, 0.1, 1.0],
            'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
          'gamma' : ['scale', 'auto']
         }
rs = RandomizedSearchCV(svc_classifier, params,  cv=10, n_jobs=-1)
final = rs.fit(train_vectors, train_data['mood'])
final.best_params_

{'kernel': 'linear', 'gamma': 'scale', 'C': 1.0}

In [97]:
svc_classifier.set_params(kernel= 'linear', gamma= 'scale', C= 1.0)

SVC(kernel='linear')

In [98]:
svc_classifier.fit(train_vectors, train_data['mood'])

SVC(kernel='linear')

In [99]:
nb_classifier = GaussianNB()
nb_classifier.fit(train_vectors.toarray(), train_data['mood'])

GaussianNB()

In [100]:
svc_pred = svc_classifier.predict(test_vectors)

In [101]:
nb_pred = nb_classifier.predict(test_vectors.toarray())

In [102]:

print(classification_report(test_data['mood'] , svc_pred))

              precision    recall  f1-score   support

           0       0.78      0.76      0.77      1053
           1       0.77      0.79      0.78      1080

    accuracy                           0.78      2133
   macro avg       0.78      0.78      0.78      2133
weighted avg       0.78      0.78      0.78      2133



In [103]:
nb_report = classification_report(test_data['mood'] , nb_pred , output_dict=True)
print(f"positve {report['1']['recall']}")
print(f"neagtive {report['0']['recall']}")

positve 0.8009259259259259
neagtive 0.7692307692307693


In [104]:
#saving the model
import joblib
joblib.dump(vectoriser, 'tfidf_vector_model.pkl')
joblib.dump(svc_classifier, 'netflix_75.pkl')

['netflix_75.pkl']

In [105]:
#Prediction

#load the models before prediction
tfidf = joblib.load('tfidf_vector_model.pkl')
model = joblib.load('netflix_75.pkl')

data = ['bad movie']

vector = tfidf.transform(data).toarray()
my_pred = model.predict(vector)

print(my_pred)

if my_pred[0] == 1:
    print('Positive Review')
else:
    print('Negative Review')

[0]
Negative Review
