In [2]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

import joblib

In [3]:
sw=stopwords.words('english')

In [4]:
# reading the review data

review=pd.read_csv('reviews.txt',sep='\t',names=['mood','review'])

In [5]:
review

Unnamed: 0,mood,review
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
...,...,...
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.


In [6]:
# Cleaning the Data

review.loc[:,'review']=review.loc[:,'review'].apply(lambda x:x.lower())
review.loc[:,'review']=review.loc[:,'review'].apply(lambda x:re.sub(r"@\S","",x))
review.loc[:,'review']=review.loc[:,'review'].apply(lambda x:x.translate(str.maketrans(dict.fromkeys(string.punctuation))))
review.loc[:,'review']=review.loc[:,'review'].apply(lambda x:" ".join([word for word in x.split() if word not in (sw)]))
review.loc[:,'review']=review.loc[:,'review'].apply(lambda x:WordNetLemmatizer().lemmatize(x,pos='v'))



In [7]:
review

Unnamed: 0,mood,review
0,1,da vinci code book awesome
1,1,first clive cussler ive ever read even books l...
2,1,liked da vinci code lot
3,1,liked da vinci code lot
4,1,liked da vinci code ultimatly didnt seem hold
...,...,...
6913,0,brokeback mountain boring
6914,0,brokeback mountain really depressing
6915,0,sit watching mtv movie awards reminded much de...
6916,0,ok brokeback mountain horrible movie


In [8]:
# splitting data into x and y ndarrays

X=review['review'].values
y=review['mood'].values


In [9]:
# train test split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=101)

In [10]:
# just to see the results in a dataframe

train_data=pd.DataFrame({'review':X_train, 'mood': y_train})
test_data=pd.DataFrame({'review':X_test, 'mood': y_test})

In [11]:
train_data

Unnamed: 0,review,mood
0,da vinci code sucks,0
1,want love harry potter really want place peopl...,1
2,like harry potter,1
3,da vinci code sucks,0
4,da vinci code awesome,1
...,...,...
5529,love luv lubb da vinci code,1
5530,harry potter movies really suck,0
5531,like mission impossible movies never know whos...,1
5532,mission impossible 2 rocks,1


In [12]:
test_data

Unnamed: 0,review,mood
0,love harry potter,1
1,said silent hill turned reality coz hella like...,1
2,people worth know much love da vinci code,1
3,hes likeyeah got acne love brokeback mountain,1
4,da vinci code sucked,0
...,...,...
1379,brokeback mountain awesome,1
1380,friday hung kelsie went saw da vinci code sucked,0
1381,hes likeyeah got acne love brokeback mountain,1
1382,gonna like watch mission impossible hoot,1


In [13]:
# Coverting the text data of review column into vectors

vectorizer=TfidfVectorizer()
train_vectors=vectorizer.fit_transform(train_data['review'])
test_vectors=vectorizer.transform(test_data['review'])

In [14]:
# Applying Naive Bayes Model for prediction

MNB=MultinomialNB()
MNB.fit(train_vectors,y_train)
predict=MNB.predict(test_vectors)

In [15]:
report=classification_report(y_test,predict,output_dict=True)
print(f"positive: {report['1']['recall']}")
print(f"negative: {report['0']['recall']}")

positive: 0.9974651457541192
negative: 0.9781512605042016


In [16]:
# saving the model

model_filename='NLP_model.pkl'
vectorizer_filename='NLP_vector.pkl'

joblib.dump(MNB,model_filename)
joblib.dump(vectorizer,vectorizer_filename)

['NLP_vector.pkl']