In [201]:
import numpy as np 
import pandas as pd 
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pickle

In [197]:
#reading Data
data = pd.read_csv('IMDB_Dataset.csv')[:5000]
data.shape

(5000, 2)

In [198]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [199]:
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,not good horrible worst Not happy Not user-f...,negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [200]:
pd.get_dummies(data.sentiment)

Unnamed: 0,negative,positive
0,0,1
1,0,1
2,0,1
3,1,0
4,0,1
...,...,...
4995,1,0
4996,0,1
4997,0,1
4998,1,0


In [160]:
data['sentiment'] = pd.get_dummies(data.sentiment).drop('negative',axis=1)
# 1 implies positive
# 0 implies negative


In [161]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [162]:
def removeApostrophe(review):
    phrase = re.sub(r"won't", "will not", review)
    phrase = re.sub(r"can\'t", "can not", review)
    phrase = re.sub(r"n\'t", " not", review)
    phrase = re.sub(r"\'re", " are", review)
    phrase = re.sub(r"\'s", " is", review)
    phrase = re.sub(r"\'d", " would", review)
    phrase = re.sub(r"\'ll", " will", review)
    phrase = re.sub(r"\'t", " not", review)
    phrase = re.sub(r"\'ve", " have", review)
    phrase = re.sub(r"\'m", " am", review)
    return phrase

In [163]:
def cleaning(df):
    all_reviews = list()
    lines = df["review"].values.tolist()
    for text in lines:
        text = text.lower() # converting the text to lower case
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text) # removes URL'S
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text) #removes punctuation
        text = removeApostrophe(text)
        tokens = word_tokenize(text) #tokenizing
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()] #filtering only text data
        stop_words = set(stopwords.words("english"))
        stop_words.discard("not") #removing "not" from stopwords as it is sentimental analysis
        PS = PorterStemmer()
        words = [PS.stem(w) for w in words if not w in stop_words] #stemming and removing stopwords
        words = ' '.join(words) #joining strings 
        all_reviews.append(words)
    return all_reviews

reviews = cleaning(data)


In [164]:
data['cleaned_reviews'] = reviews

In [168]:
from sklearn.model_selection import train_test_split

X = data['cleaned_reviews']
y = data["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
# from sklearn

tvec = TfidfVectorizer()
clf2 = LogisticRegression(solver = "lbfgs")


from sklearn.pipeline import Pipeline
model = Pipeline([('vectorizer',tvec),('classifier',clf2)])

model.fit(X_train, y_train)


from sklearn.metrics import confusion_matrix

predictions = model.predict(X_test)

confusion_matrix(predictions, y_test)

array([[450,  60],
       [ 80, 410]], dtype=int64)

In [169]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(predictions, y_test))
print("Precision : ", precision_score(predictions, y_test, average = 'weighted'))
print("Recall : ", recall_score(predictions, y_test, average = 'weighted'))

Accuracy :  0.86
Precision :  0.8604656764351667
Recall :  0.86


In [195]:
filename = 'model_imdb.pkl'
pickle.dump(model, open(filename, 'wb'))
 
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.86


In [190]:
def predict(review):
    l = []
    l.append(review)
    print(l)
    sent = model.predict(l)
    if sent[0]== 0:
        return "Negative"
    else:
        return "Positive"

In [193]:
# predict("cheap")
np.array('cheap')

array('cheap', dtype='<U5')

In [172]:
df = data.drop('review',axis=1)

In [173]:
df.head()

Unnamed: 0,sentiment,cleaned_reviews
0,1,one review mention watch oz episod hook right ...
1,1,wonder littl product br br film techniqu unass...
2,1,thought wonder way spend time hot summer weeke...
3,0,basic famili littl boy jake think zombi closet...
4,1,petter mattei love time money visual stun film...
