In [1]:
#import important libraries
import numpy as np
import pandas as pd
import string
import nltk
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
#Getting the List of all Stopwords
sw = stopwords.words('english')
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [3]:
#reading the positive data
pos_rev = pd.read_csv('netflix/pos.txt',sep='\n', header = None, encoding = 'latin-1')
#creating target column
pos_rev['mood'] = 1.0
pos_rev =pos_rev.rename(columns = {0:'review'})
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1.0
1,"the gorgeously elaborate continuation of "" the...",1.0
2,effective but too-tepid biopic,1.0
3,if you sometimes like to go to the movies to h...,1.0
4,"emerges as something rare , an issue movie tha...",1.0
...,...,...
5326,both exuberantly romantic and serenely melanch...,1.0
5327,mazel tov to a film about a family's joyous li...,1.0
5328,standing in the shadows of motown is the best ...,1.0
5329,it's nice to see piscopo again after all these...,1.0


In [4]:
#Reading the negative data
neg_rev = pd.read_csv('netflix/negative.txt', sep='\n', header = None, encoding = 'latin-1')
#adding a target column
neg_rev['mood']=0.0
neg_rev = neg_rev.rename(columns = {0:'review'})
neg_rev

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0.0
1,"it's so laddish and juvenile , only teenage bo...",0.0
2,exploitative and largely devoid of the depth o...,0.0
3,[garbus] discards the potential for pathologic...,0.0
4,a visually flashy but narratively opaque and e...,0.0
...,...,...
5326,a terrible movie that some people will neverth...,0.0
5327,there are many definitions of 'time waster' bu...,0.0
5328,"as it stands , crocodile hunter has the hurrie...",0.0
5329,the thing looks like a made-for-home-video qui...,0.0


In [5]:
#cleaning the data
'''
1. Lower
2. remove spaces
3. puctuation
4. stopwords
5. Lemmatization
'''

'\n1. Lower\n2. remove spaces\n3. puctuation\n4. stopwords\n5. Lemmatization\n'

In [6]:
#Cleaning Positive data
pos_rev.loc[:,'review'] = pos_rev.loc[:,'review'].apply(lambda x:x.lower())
pos_rev.loc[:,'review'] = pos_rev.loc[:,'review'].apply(lambda x: re.sub(r"@\S+","",x))
pos_rev.loc[:,'review'] = pos_rev.loc[:,'review'].apply(lambda x: x.translate(str.maketrans(dict.fromkeys(string.punctuation))))
pos_rev.loc[:,'review'] = pos_rev.loc[:,'review'].apply(lambda x: " ".join([word for word in x.split() if word not in (sw)]))

In [7]:
#Cleaning Negative data
neg_rev.loc[:,'review'] = neg_rev.loc[:,'review'].apply(lambda x:x.lower())
neg_rev.loc[:,'review'] = neg_rev.loc[:,'review'].apply(lambda x: re.sub(r"@\S+","",x))
neg_rev.loc[:,'review'] = neg_rev.loc[:,'review'].apply(lambda x: x.translate(str.maketrans(dict.fromkeys(string.punctuation))))
neg_rev.loc[:,'review'] = neg_rev.loc[:,'review'].apply(lambda x: " ".join([word for word in x.split() if word not in (sw)]))

In [9]:
#Row wise concatincation of negative and positive data
com_rev = pd.concat([pos_rev, neg_rev], axis=0).reset_index()
com_rev

Unnamed: 0,index,review,mood
0,0,rock destined 21st centurys new conan hes goin...,1.0
1,1,gorgeously elaborate continuation lord rings t...,1.0
2,2,effective tootepid biopic,1.0
3,3,sometimes like go movies fun wasabi good place...,1.0
4,4,emerges something rare issue movie thats hones...,1.0
...,...,...,...
10657,5326,terrible movie people nevertheless find moving,0.0
10658,5327,many definitions time waster movie must surely...,0.0
10659,5328,stands crocodile hunter hurried badly cobbled ...,0.0
10660,5329,thing looks like madeforhomevideo quickie,0.0


In [13]:
#Train_test_split
X_train, X_test, y_train, y_test = train_test_split(com_rev['review'].values, com_rev['mood'].values, test_size = 0.2, random_state =101) 

In [14]:
#Dataframes to compare test and train dataset (No necessary)
train_data = pd.DataFrame({'review': X_train,'mood': y_train})
test_data = pd.DataFrame({'review': X_test,'mood': y_test})

In [15]:
train_data

Unnamed: 0,review,mood
0,puts washington honest working man john q arch...,0.0
1,poignant familiar story young person suspended...,1.0
2,timely director could ever dreamed quietly lyr...,1.0
3,film virtually chokes selfconsciousness,0.0
4,film takes inside rhythms subject experience w...,1.0
...,...,...
8524,branagh forceful nonshakespeare screen perform...,1.0
8525,movie friday fans critics damned already like ...,0.0
8526,perhaps heaviest joyless movie ever made giant...,0.0
8527,film rival live fine little amusebouche keep a...,1.0


In [16]:
test_data

Unnamed: 0,review,mood
0,important movie reminder power film move us ma...,1.0
1,ive never seen heard anything quite like film ...,1.0
2,ending leave unfulfilled performances enjoy me...,1.0
3,surface loversontherun crime flick lot common ...,1.0
4,walk remember shrewd enough activate girlish t...,0.0
...,...,...
2128,bullock good job working natural likability,1.0
2129,results memorable least interesting,1.0
2130,apparently designed reverie memory regret thin...,0.0
2131,movie insecure capacity excite churns one two ...,0.0


In [25]:
#Convert the data to numerical form using TF-IDF
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(train_data['review'])
test_vector = vectorizer.transform(['review']) #Fit_transform is not used because we want our test data to be completely new
#and to scale the data

In [26]:
from sklearn import svm
from sklearn.metrics import classification_report

In [35]:
classifier = svm.SVC(kernel = 'linear')
classifier.fit(train_vector, train_data['mood'])

SVC(kernel='linear')

In [36]:
pred = classifier.predict(test_vector)

In [37]:
report = classification_report(test_data['mood'], pred, output_dict = True)
print(f"Positive {report['1.0']['recall']}")
print(f"Negative {report['0.0']['recall']}")

ValueError: Found input variables with inconsistent numbers of samples: [2133, 1]

In [41]:
a = input('Write the review: ')
vector = vectorizer.transform([a]).toarray()
mypred = classifier.predict(vector)
print(" ",mypred)

Write the review: What the hell is wrong
  [0.]


In [43]:
#Task
#1. Use Lemmatization
#2. use Navie Bayes
#3. use random forest

In [None]:
#Methods to save the model
'''
1. pickle
2. joblib
'''

In [48]:
#Saving the model using joblib
import joblib
model_file_name = 'netflix_svm_model.pkl'
vectorizer_file_name = 'netflix_vector.pkl'
joblib.dump(classifier, model_file_name)
joblib.dump(vectorizer, vectorizer_file_name)

['netflix_vector.pkl']

In [49]:
#Loading the model
clf = joblib.load('netflix_svm_model.pkl')
vect = joblib.load('netflix_vector.pkl')

In [None]:
'''
Pipeline to follow after the models have been loaded:
1. Take input from the user
2. Covert the input to a list
3. Provide the vectoriser the list as an input to convert it into vectorizer form
4. Prodive the vector as an input to the model
'''

In [50]:
a = input('Write the review: ')
vector = vect.transform([a]).toarray()
mypred = clf.predict(vector)
print("",mypred)

Write the review: Awesome movie
  [1.]
