In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the review dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('datasets/review.csv')
# Show (rows, columns) as a quick check that the load worked.
df.shape

(50000, 2)

In [3]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


### Text Cleaning

In [38]:
# Work on a random sample of 2000 rows.
# random_state pins the sample so that every downstream result (vocabulary,
# train/test split, accuracy) is reproducible after a kernel restart.
df = df.sample(2000, random_state=42)
df.shape

(2000, 2)

In [39]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
# The mapped column is assigned back explicitly. Calling
# replace(..., inplace=True) on df['sentiment'] operates on an intermediate
# object, which pandas warns about and which does not update df under
# Copy-on-Write.
# NOTE: any value that is not a key of the mapping becomes NaN, so run this
# cell once on the raw string labels.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()

Unnamed: 0,review,sentiment
7994,the whole shorthand for supposedly being more ...,1
4440,patricia arquette plays american doctor laura ...,1
18717,this film is a summary of visconti's obssessio...,1
35073,"this eloquent, simple film makes a remarkably ...",1
14317,"stan as a bullfighter, and a good one, is quit...",1


In [40]:
# Show the complete text of the second row of the sample.
df.iloc[1].review

'patricia arquette plays american doctor laura bowman, who takes a holiday to burma in an attempt to heal her spirit after the murders of her husband and young son. she is left behind in rangoon during a military crackdown and leaves the city with an aging man who works as a "tour guide." but he is no simple tour guide; he is a professor who introduces her to the life outside of the tourist traps ... the two of them get caught up in the political upheaval and laura sees with her own eyes how the government betrays and oppresses its own people.this movie is one of my favorites because of its themes. first, it\'s informational (describing some of the injustices that are occurring in burma). secondly, it\'s about a woman\'s struggle to find meaning in life after an incredible loss. thirdly, it\'s about compassion and sacrifice, and people coming together - without even knowing each other - to endure pain and fear.just about every beautiful scene in this movie is important; nothing is wast

In [41]:
# Remove HTML tags from a single review as a trial run.
import re
# Non-greedy pattern: each '<...>' is matched on its own instead of one match
# running from the first '<' to the last '>'.
clean = re.compile('<.*?>')
re.sub(clean,'',df.iloc[1].review)

'patricia arquette plays american doctor laura bowman, who takes a holiday to burma in an attempt to heal her spirit after the murders of her husband and young son. she is left behind in rangoon during a military crackdown and leaves the city with an aging man who works as a "tour guide." but he is no simple tour guide; he is a professor who introduces her to the life outside of the tourist traps ... the two of them get caught up in the political upheaval and laura sees with her own eyes how the government betrays and oppresses its own people.this movie is one of my favorites because of its themes. first, it\'s informational (describing some of the injustices that are occurring in burma). secondly, it\'s about a woman\'s struggle to find meaning in life after an incredible loss. thirdly, it\'s about compassion and sacrifice, and people coming together - without even knowing each other - to endure pain and fear.just about every beautiful scene in this movie is important; nothing is wast

In [42]:
def clean_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    Tags are deleted outright (replaced by the empty string), so the text on
    either side of a tag ends up directly adjacent.
    """
    tag_pattern = re.compile('<.*?>')
    stripped = tag_pattern.sub('', text)
    return stripped

In [43]:
# Strip HTML tags from every review in the frame.
df['review'] = df['review'].apply(clean_html)

In [44]:
#convert to lowercase
def conv_low(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

In [45]:
# Lowercase every review.
df['review'] = df['review'].apply(conv_low)

In [46]:
#remove special characters
def rem_special(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Each character is replaced one-for-one, so the result has the same length
    as the input and runs of special characters become runs of spaces.

    Args:
        text: The string to clean.

    Returns:
        A new string containing only alphanumeric characters and spaces.
    """
    # A single join avoids rebuilding the string once per character.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

In [47]:
# Quick check: each special character (and each space) comes back as one space.
rem_special('t@he $$ king i&*n the north')

't he    king i  n the north'

In [48]:
# Replace special characters with spaces in every review.
df['review'] = df['review'].apply(rem_special)
# Take a real snapshot for the next stage. A plain `df1 = df` only adds a
# second name for the same object, so later edits to df1 would also change df.
df1 = df.copy()

In [49]:
#remove stop word (words like: and, a, if)
import nltk

In [50]:
from nltk.corpus import stopwords  # corpus reader object (not a class); stopwords.words('english') returns the word list

In [51]:
# Cache for the default NLTK stop-word set, filled on first use.
_STOPWORD_CACHE = {}


def rem_stopwords(text, stop_words=None):
    """Split ``text`` on whitespace and drop stop words.

    Args:
        text: String to tokenize with ``str.split()``.
        stop_words: Optional container of words to drop (anything supporting
            ``in``). When omitted, the NLTK English stop-word list is used.

    Returns:
        A new list holding the remaining tokens in their original order.
    """
    if stop_words is None:
        if 'english' not in _STOPWORD_CACHE:
            # Look the list up once instead of once per token, and keep it as
            # a frozenset so each membership test is a hash lookup rather
            # than a list scan.
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    return [token for token in text.split() if token not in stop_words]

In [52]:
# Tokenize each review and drop stop words, producing the next stage as a new
# frame. df1 is left untouched, so this cell can be re-run safely; overwriting
# df1['review'] with lists would make a second run fail, because
# rem_stopwords calls .split() on its input.
df2 = df1.assign(review=df1['review'].apply(rem_stopwords))
df2

Unnamed: 0,review,sentiment
7994,"[whole, shorthand, supposedly, aware, weird, t...",1
4440,"[patricia, arquette, plays, american, doctor, ...",1
18717,"[film, summary, visconti, obssessions, decaden...",1
35073,"[eloquent, simple, film, makes, remarkably, cl...",1
14317,"[stan, bullfighter, good, one, quite, surprise...",1
...,...,...
41252,"[fortunate, enough, see, checking, peter, falk...",1
37649,"[native, city, story, takes, place, buffalo, n...",1
48212,"[movie, remake, two, movies, lot, better, last...",1
23165,"[bronson, ireland, last, film, together, make,...",0


In [53]:
# Set up the stemmer and lemmatizer that stem_words uses to reduce inflected
# word forms to a common base form.
from nltk.stem import WordNetLemmatizer, PorterStemmer
ps = PorterStemmer()
wnl = WordNetLemmatizer()

In [54]:

def stem_words(text):
    """Reduce each word in ``text`` to a base form.

    For every word the WordNet lemma is computed first. If that lemma ends in
    'e' it is kept; otherwise the Porter stem of the original word is used.
    (Presumably this keeps readable forms such as 'aware' or 'simple' instead
    of stems with the final 'e' cut off - confirm the intent.)

    Args:
        text: Iterable of word strings (despite the name, not a single string).

    Returns:
        A new list with one normalized word per input word, in the same order.
    """
    normalized = []
    for word in text:
        # Lemmatize once and reuse the result for both the test and the value.
        lemma = wnl.lemmatize(word)
        normalized.append(lemma if lemma.endswith('e') else ps.stem(word))
    return normalized

In [55]:
# Quick check of stem_words on a small list of words.
lists =['i','loved','loving','it']
stem_words(lists)

['i', 'love', 'love', 'it']

In [56]:
# Normalize the tokens of every review, producing the next stage as a new
# frame. df2 keeps its un-stemmed tokens, so re-running this cell does not
# stem already-stemmed words a second time.
df3 = df2.assign(review=df2['review'].apply(stem_words))
df3

Unnamed: 0,review,sentiment
7994,"[whole, shorthand, supposedli, aware, weird, t...",1
4440,"[patricia, arquette, play, american, doctor, l...",1
18717,"[film, summari, visconti, obssess, decadence, ...",1
35073,"[eloqu, simple, film, make, remark, clear, sta...",1
14317,"[stan, bullfight, good, one, quite, surprise, ...",1
...,...,...
41252,"[fortunate, enough, see, check, peter, falk, p...",1
37649,"[native, citi, stori, take, place, buffalo, ny...",1
48212,"[movie, remake, two, movie, lot, better, last,...",1
23165,"[bronson, ireland, last, film, togeth, make, l...",0


In [57]:
# Join each token list back into one comma-separated string, producing the
# final stage as a new frame. df3 keeps its token lists: if its column were
# overwritten with strings, a second run would iterate over characters and
# silently produce 'w,h,o,l,e,...'.
df4 = df3.assign(
    review=df3['review'].apply(lambda tokens: ','.join(map(str, tokens)))
)
df4

Unnamed: 0,review,sentiment
7994,"whole,shorthand,supposedli,aware,weird,time,bl...",1
4440,"patricia,arquette,play,american,doctor,laura,b...",1
18717,"film,summari,visconti,obssess,decadence,nobil,...",1
35073,"eloqu,simple,film,make,remark,clear,statement,...",1
14317,"stan,bullfight,good,one,quite,surprise,usual,o...",1
...,...,...
41252,"fortunate,enough,see,check,peter,falk,phoenix,...",1
37649,"native,citi,stori,take,place,buffalo,ny,fun,se...",1
48212,"movie,remake,two,movie,lot,better,last,one,hea...",1
23165,"bronson,ireland,last,film,togeth,make,likable,...",0


### Create Bag of words

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

In [59]:
# Fit the vectorizer on the cleaned reviews and build the dense count matrix.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so its vocabulary also includes words from the test reviews.
X = cv.fit_transform(df4['review']).toarray()
# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed even if pickling fails. The file name (including its
# spelling) is kept unchanged in case other code loads it.
with open('tranform.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

In [60]:
# (number of reviews, vocabulary size)
X.shape 

(2000, 18851)

In [61]:
# Take the last column of df4 as the label vector. The selection is
# positional, so it relies on 'sentiment' being the final column.
y = df4.iloc[:,-1].values
y.shape

(2000,)

### Training and testing data

In [62]:
# Split X and y into a training set and a test set. test_size=0.2 holds out
# 20% of the rows for testing; random_state=42 makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [63]:
# Confirm the sizes of the four arrays produced by the split.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(1600, 18851) (400, 18851) (1600,) (400,)


In [64]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [65]:
# Two Naive Bayes variants to compare, both with default settings.
clf1 = GaussianNB()
clf2 = MultinomialNB()

In [66]:
# Fit both classifiers on the training split.
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)

# Predict labels for the held-out test split.
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)

In [67]:
# The true labels and both prediction arrays should have the same length.
y_test.shape, y_pred1.shape, y_pred2.shape

((400,), (400,), (400,))

### Testing accuracy

In [68]:
from sklearn.metrics import accuracy_score

In [69]:
# Report test-set accuracy for each classifier as a percentage.
print("GaussianNB has an accuracy of ",accuracy_score(y_test,y_pred1)*100,"%")
print("MultinomialNB has an accuracy of ",accuracy_score(y_test,y_pred2)*100,"%")

GaussianNB has an accuracy of  63.24999999999999 %
MultinomialNB has an accuracy of  82.75 %


In [70]:
# Persist the fitted MultinomialNB model (clf2).
filename = 'nlp_model.pkl'
# The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open() inside the call leaves the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(clf2, model_file)