In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re #RegEx
# NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
# BOW
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score
# pkl
import pickle

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the IMDB reviews CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
print(data.shape)  # (rows, columns) of the loaded frame
data.head(20)  # preview the first 20 rows

In [None]:
# Summary of the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Shape of the frame as a cell output (same value as printed in the load cell).
data.shape

In [None]:
data['review'][0]  # inspect one raw review to see what cleaning is needed

**So a review contains HTML tags, special characters and mixed-case words.**

# **We need to clean the reviews**
# Steps involved are:
* **Remove HTML tags**
* **Remove special characters**
* **Convert everything to lowercase**
* **Remove stopwords**
* **Stemming**

# 1. Removing html tags using regex

In [None]:
def clean(text):
    """Strip HTML-style tags from ``text``.

    Every span that starts at a ``<`` and ends at the nearest following ``>``
    on the same line is deleted; all other characters are returned unchanged.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

# Strip tags from every review; this overwrites the `review` column.
data.review = data.review.apply(clean)
data.review[0]  # first review after tag removal

**We can see that the HTML tags have been removed.**

# 2. Removing special characters

In [None]:
def is_special(text):
    """Return ``text`` with every non-alphanumeric character replaced by a space.

    Despite its name this is not a predicate: it builds and returns a new
    string of the same length in which characters for which ``isalnum()`` is
    false (punctuation, whitespace, symbols) become a single space each.
    """
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

# Replace non-alphanumeric characters in every review; overwrites `review`.
data.review = data.review.apply(is_special)
data.review[0]  # first review after special-character removal

**So the special characters have also been removed (each one is replaced by a space).**

# 3. Converting the mixed case review to lower case

In [None]:
def to_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Lower-case every review; overwrites `review`.
data.review = data.review.apply(to_lower)
data.review[0]  # first review after lower-casing

 **We have now removed special characters and HTML tags, and converted the mixed-case reviews to lower case.**
 
# 4. Removing stopwords using NLTK

In [None]:
def rem_stopwords(text):
    """Tokenize ``text`` and drop English stop words.

    Parameters
    ----------
    text : str
        Review text. Matching against the stop-word list is exact, so the
        text should already be lower-cased for the filter to be effective.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize(text)`` that are not in NLTK's
        English stop-word list, in their original order.
    """
    # This function is applied once per review. Re-loading the stop-word list
    # from the NLTK corpus on every call is loop-invariant work, so the set is
    # built on first use and cached on the function object.
    stop_words = getattr(rem_stopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        rem_stopwords._stop_words = stop_words
    return [w for w in word_tokenize(text) if w not in stop_words]

# Tokenize each review and drop stop words. After this step every entry of
# `review` is a list of tokens (rem_stopwords returns a list), not a string.
data.review = data.review.apply(rem_stopwords)
data.review[0]  # token list for the first review

 **What are these English stopwords?**

In [None]:
# Display NLTK's English stop-word list (the list rem_stopwords filters against).
stopwords.words('english')

# 5. Stemming words

**What is stemming?**

**Stemming reduces each word to its root form (its stem).**

**Example - 'play', 'playing', 'played': all three words are reduced to the same basic form, 'play'. This process is called stemming.**

In [None]:
def stem_txt(text):
    """Stem every token with the English Snowball stemmer and join with spaces.

    ``text`` is iterated item by item, so it is expected to be a sequence of
    tokens (such as the list returned by ``rem_stopwords``); a plain string
    would be stemmed character by character.
    """
    stemmer = SnowballStemmer('english')
    stemmed_tokens = map(stemmer.stem, text)
    return " ".join(stemmed_tokens)

# Stem every token and join the stems back into one space-separated string
# per review, so `review` holds strings again after this step.
data.review = data.review.apply(stem_txt)
data.review[0]  # first review after stemming

**We are done with text pre-processing. My reviews are now cleaned**

# Now we will be creating our model.

# 1. Creating bag of words

In [None]:
# Labels: the `sentiment` column as a NumPy array.
y = np.array(data.sentiment.values)

# Bag of words over the cleaned reviews. max_features=1000 caps the vocabulary
# at 1000 terms, so X has one count column per retained term. The sparse
# result is converted to a dense array, which is what the models below are
# fit on.
# (The earlier `X = np.array(data.iloc[:,0].values)` was overwritten here
# before ever being read, so that dead assignment has been dropped.)
cv = CountVectorizer(max_features = 1000)
X = cv.fit_transform(data.review).toarray()
print("X.shape = ",X.shape)
print("y.shape = ",y.shape)

In [None]:
# Show the bag-of-words count matrix.
X

In [None]:
# Show the sentiment labels.
y

# 2. Train test split

In [None]:
# Hold out 20% of the rows for testing; random_state=9 fixes the shuffle so
# the split is the same on every run.
trainx,testx,trainy,testy = train_test_split(X,y,test_size=0.2,random_state=9)
print("Train shapes : X = {}, y = {}".format(trainx.shape,trainy.shape))
print("Test shapes : X = {}, y = {}".format(testx.shape,testy.shape))

# 3. Fitting several Naive Bayes algorithms to the training data

In [None]:
# Three Naive Bayes variants to compare on the same features.
gnb = GaussianNB()
mnb = MultinomialNB(alpha=1.0, fit_prior=True)
bnb = BernoulliNB(alpha=1.0, fit_prior=True)

# Fit each model on the training split. The last call is left as the cell's
# final expression so the fitted estimator is shown as the cell output.
gnb.fit(trainx, trainy)
mnb.fit(trainx, trainy)
bnb.fit(trainx, trainy)

# 4. Prediction and testing: checking the accuracy of each model (in order to choose the best model for this kind of analysis)

In [None]:
# Predict sentiment labels for the held-out test features with each model.
ypg = gnb.predict(testx)
ypm = mnb.predict(testx)
ypb = bnb.predict(testx)

# Fraction of test reviews each model labels correctly.
print("Gaussian = ",accuracy_score(testy,ypg))
print("Multinomial = ",accuracy_score(testy,ypm))
print("Bernoulli = ",accuracy_score(testy,ypb))

**Bernoulli Naive Bayes gives the best accuracy of the three models on this test split, so it is the model used for the prediction below.**

In [None]:
# A sample review of the film "3 Idiots", used below to try the trained model
# on text that is not part of the dataset.
revs = "When I was looking through IMDb's Top 250 movie list and saw a movie called 3 Idiots,I was surprised. Why would a movie with such a bad title be voted so highly? I went in thinking this would be India's version of Animal House or American Pie. But after watching the film, I was in love! It is so much more than the usual college story of young guys getting drunk, flunking classes, and getting back at their superiors. It is an extremely well-made film about doing what you love and facing your fears. The acting is incredible by the ensemble cast. The script is funny and poignant at the same time. Even the scenery is breathtaking. Although the length of the film is pretty long and has some quirky musical numbers, 3 Idiots is a delight. It is worth the watch!"

In [None]:
# Run the sample review through the same cleaning steps, in the same order,
# that were applied to the training reviews.
f1 = clean(revs)
f2 = is_special(f1)
f3 = to_lower(f2)
f4 = rem_stopwords(f3)
f5 = stem_txt(f4)

# Occurrence count of each token within the sample review, one entry per
# token in token order. NOTE: this list is NOT aligned with the
# CountVectorizer columns, so it cannot be fed to the models as-is.
words = word_tokenize(f5)
bow = [words.count(word) for word in words]

# Persist the fitted vocabulary (term -> column index mapping).
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically; the original passed a bare open() that was never closed.
word_dict = cv.vocabulary_
with open('bow1.pkl', 'wb') as vocab_file:
    pickle.dump(word_dict, vocab_file)

In [None]:
# Encode the cleaned sample review with the *fitted* CountVectorizer so that
# the feature vector has exactly the columns, in exactly the order, that the
# models were trained on.
#
# The previous hand-rolled loop appended `f5.count(i[0])` for every vocabulary
# term `i`. `i[0]` is only the first character of the term, so it counted
# single letters in the review string. It also followed the dict's iteration
# order rather than each term's column index, and hard-coded a width of 1000.
inp = cv.transform([f5]).toarray()
y_pred = bnb.predict(inp)

In [None]:
# Persist the fitted Bernoulli Naive Bayes model. The file is opened in a
# `with` block so the handle is flushed and closed as soon as the dump
# finishes; the original passed a bare open() that was never closed.
with open('model1.pkl', 'wb') as model_file:
    pickle.dump(bnb, model_file)