In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the IMDB review dataset from CSV; the preview below shows a
# free-text 'review' column and a 'sentiment' label column.
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
# Report the dataset dimensions, then preview the first rows.
print(df.shape)
df.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# 1. Instead of all 50000 rows we will use a random sample of 20000 rows
# 2. Remove html tags
# 3. Remove special chars
# 4. Remove all stop words
# 5. Stemming (NLP)

# Stemming example: {"Play","Playing","Played"}
# becomes:          {"Play","Play","Play"}

In [5]:
# Work on a random 20000-row subset to keep preprocessing and training fast.
# A fixed random_state makes the subset (and therefore the vocabulary size,
# the trained model and the reported accuracy) reproducible across runs.
df = df.sample(20000, random_state=42)

In [6]:
import re

# Try the tag-stripping pattern on the first sampled review before wrapping
# it in a function: every non-greedy '<...>' run is replaced by nothing.
clean = re.compile('<.*?>')
clean.sub('', df.iloc[0].review)

'This is NOT your run-of-the mill police story where the characters were only secondary to the gun battles and car chases. The episodes, so far, are more realistic and intelligent.If you are looking for something with a lot of butt-kicking Rambo-style cop story, you will not find it here.I gave it a 9. I wish this series lives long. Morse is excellent!'

In [7]:
# Step 2: remove HTML tags
def clean_html(text):
    """Return `text` with every '<...>' tag removed.

    Tags are deleted outright (no space is inserted in their place), so words
    separated only by a tag end up joined together.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

In [8]:
# Strip HTML tags from every review; the 'review' column is overwritten.
df['review']=df['review'].apply(clean_html)

In [9]:
# Step 3: remove special characters
def remove_special(text):
    """Return `text` with every non-alphanumeric character replaced by a space.

    The output has the same length as the input: alphanumeric characters
    (per `str.isalnum`) are kept as-is and everything else, including
    punctuation and whitespace, becomes a single space.
    """
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

In [10]:
# Replace punctuation/special characters with spaces in every review;
# the 'review' column is overwritten.
df['review']=df['review'].apply(remove_special)

In [11]:
def convert_lower(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

In [12]:
# Lower-case every review; the 'review' column is overwritten.
df['review']=df['review'].apply(convert_lower)

In [13]:
import nltk
from nltk.corpus import stopwords

In [14]:
# 4. Remove all stop words
# Check how many English stop words NLTK provides.
len(stopwords.words('english'))

179

In [15]:
# Cache of stop-word sets keyed by language, so the NLTK corpus is read once
# instead of once per token.
_STOPWORD_CACHE = {}


def remove_stopwords(text, stop_words=None):
    """Split `text` on whitespace and drop the stop words.

    Parameters
    ----------
    text : str
        Text to tokenize; expected to be lower-cased already, because the
        comparison against the stop-word collection is case-sensitive.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which is
        loaded once and cached as a frozenset. A set/frozenset is recommended
        for fast membership tests.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        # Loading the corpus is comparatively expensive; the original code did
        # it (plus a linear list scan) for every single token.
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    return [token for token in text.split() if token not in stop_words]

In [16]:
# Tokenize each review and drop stop words. After this cell each entry of
# 'review' is a list of tokens rather than a string.
df['review']=df['review'].apply(remove_stopwords)

In [17]:
from nltk.stem.porter import PorterStemmer
# Single shared stemmer instance, used by stem_words below.
ps=PorterStemmer()

In [18]:
def stem_words(text):
    """Stem every token in `text` and join the stems into one string.

    `text` is iterated item by item (in this notebook it is the token list
    produced by remove_stopwords); each item is passed through the shared
    Porter stemmer `ps` and the results are joined with single spaces.
    """
    stems = [ps.stem(token) for token in text]
    return " ".join(stems)

In [19]:
# Stem the tokens of each review and join them back into a single
# space-separated string; the 'review' column is overwritten.
df['review']=df['review'].apply(stem_words)

In [20]:
# Preview the reviews after cleaning, stop-word removal and stemming.
df.head()

Unnamed: 0,review,sentiment
36109,run mill polic stori charact secondari gun bat...,positive
10413,show promis start sort opposit ocean 11 develo...,negative
3031,stori buster mistaken dead shot dan notori cri...,positive
438,watch six kind w c field around 10 minut one l...,positive
23341,beauti made film fine balanc fragil human stor...,positive


In [21]:
# Confirm the row/column count after sampling and preprocessing.
df.shape

(20000, 2)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings.
cv=CountVectorizer()

In [23]:
# Learn the vocabulary and build the document-term count matrix.
# The result is kept as the SciPy sparse matrix returned by fit_transform:
# densifying it with .toarray() would allocate rows x vocabulary int64 cells
# (several GB for 20000 reviews), while train_test_split and MultinomialNB
# both accept sparse input directly.
X = cv.fit_transform(df['review'])

In [24]:
# (number of reviews, vocabulary size)
X.shape

(20000, 48682)

In [25]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# The result is assigned back to the column explicitly. Calling
# replace(..., inplace=True) on df['sentiment'] operates on an intermediate
# object and does not reliably update df (it is a no-op under pandas
# Copy-on-Write). Any label other than the two mapped strings becomes NaN,
# so this cell is meant to run once on the raw string labels.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [26]:
# Target vector: the values of the last column of df as a NumPy array
# (the 'sentiment' column in the frame previewed above).
y=df.iloc[:,-1].values

In [27]:
# One label per review.
y.shape

(20000,)

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation. The fixed random_state makes
# the split, and therefore the accuracy reported below, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [29]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default hyperparameters.
clf=MultinomialNB()

In [30]:
# Train the classifier on the training split.
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [31]:
# Predict labels for the held-out test split.
y_pred=clf.predict(X_test)

In [32]:
from sklearn.metrics import accuracy_score
# Fraction of test reviews whose predicted label matches the true label.
accuracy_score(y_test,y_pred)

0.85575

In [33]:
# Vocabulary learned by the vectorizer, in feature-column order.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); use the new method when it exists and
# fall back to the old one on older releases. The result is always a list.
if hasattr(cv, 'get_feature_names_out'):
    words_used = list(cv.get_feature_names_out())
else:
    words_used = cv.get_feature_names()

# Sample review text (raw, not preprocessed) used to try the classifier by hand.
review="Dark is easily one of the best shows out there( if not the best). Just thinking back on all the events that happened, it makes my head ache, and it's a good kind of headache because it reminds just how the show was very well-written. The casting of characters are the best, it is unbelievable how they had different actors to portray one character of different ages and you would not even notice they're different actors (i literally believed that the 2019 ulrich and the old Ulrich were played by the same actor, the old one being played with makeup on). The soundtrack and cinematograpy are also top-tier."


In [38]:
# Build the feature vector for `review` exactly the way the training data was
# built. The previous approach called review.count(word) on the raw text,
# which counts substrings (e.g. "he" inside "the"), skips the lower-casing /
# stop-word / stemming steps applied during training, and hard-coded the
# vocabulary size. Running the same preprocessing and letting the fitted
# vectorizer do the counting fixes all three.
cleaned_review = stem_words(
    remove_stopwords(convert_lower(remove_special(clean_html(review))))
)
input_review = cv.transform([cleaned_review]).toarray()  # shape (1, vocabulary size)
count_words = input_review[0].tolist()  # kept as a plain list of per-word counts

In [39]:
def predict_sentiment(review):
    """Predict the sentiment label of a raw review string.

    The review goes through the same preprocessing used for the training
    data (HTML stripping, special-character removal, lower-casing, stop-word
    removal, stemming) and is then vectorized with the fitted CountVectorizer
    `cv`, so the features match what the classifier `clf` was trained on.

    Counting with ``review.count(word)`` on the raw text, as done previously,
    matched substrings rather than tokens and ignored the preprocessing, and
    the feature width was hard-coded to one particular vocabulary size.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    The first (only) element of ``clf.predict`` for this review, i.e. a label
    of the same kind the classifier was trained with.
    """
    cleaned = stem_words(
        remove_stopwords(convert_lower(remove_special(clean_html(review))))
    )
    # transform() yields a 1 x vocabulary sparse row, which predict() accepts.
    features = cv.transform([cleaned])
    return clf.predict(features)[0]

In [40]:
import pickle

# Persist the fitted vectorizer and the trained classifier. The files are
# opened with context managers so they are flushed and closed even if
# pickling fails; the bare open(...) calls used before never closed them.
# NOTE: only unpickle these files from a trusted location, since
# pickle.load can execute arbitrary code.
with open('word_list.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [37]:
#import sys
#!{sys.executable} -m pip install numpy