In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
df = pd.read_csv('/kaggle/input/imdb-dataset/IMDB Dataset.csv')


In [4]:
df = df.iloc[0:40000, :]


In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df.shape

(40000, 2)

In [7]:
df['sentiment'].value_counts()
#balanced target

negative    20007
positive    19993
Name: sentiment, dtype: int64

In [8]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [9]:
df['sentiment'].value_counts()
#balanced classes

negative    20007
positive    19993
Name: sentiment, dtype: int64

In [10]:
#lower case
df['review'] = df['review'].str.lower()

In [11]:
df['review'][0]

"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.<br /><br />it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />i would say the main appeal of the show is due to the fa

In [12]:
#remove html texts
def html_tag(text):
  seq = re.compile('<.*?>')
  return seq.sub(r'', text)


In [13]:
df['review'] = df['review'].apply(html_tag)

In [14]:
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [15]:
# #correct spelling errors
# from textblob import TextBlob

# def correct_spell(text):
#   txt = TextBlob(text)
#   return txt.correct().string


# #df['review'] = df['review'].apply(correct_spell)

#------since the dataset is big and this tooks much time, skipping this step

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
#stop words removal
from nltk.corpus import stopwords

def stop_word(text):
  new_sent = []
  for word in text.split():
    if word in stopwords.words('english'):
      new_sent.append("")
    else:
      new_sent.append(word)

  return " ".join(new_sent)


df['review'] = df['review'].apply(stop_word)


In [18]:
df['review'][2]

' thought    wonderful way  spend time    hot summer weekend, sitting   air conditioned theater  watching  light-hearted comedy.  plot  simplistic,   dialogue  witty   characters  likable (even  well bread suspected serial killer).   may  disappointed   realize    match point 2: risk addiction,  thought   proof  woody allen  still fully  control   style many  us  grown  love.this    i\'d laughed  one  woody\'s comedies  years (dare  say  decade?).  i\'ve never  impressed  scarlet johanson,    managed  tone   "sexy" image  jumped right   average,  spirited young woman.this may    crown jewel   career,    wittier  "devil wears prada"   interesting  "superman"  great comedy  go see  friends.'

In [19]:
#Remove punctuation
import string
exclude  =  string.punctuation

def remove_punc(text):
  for char in exclude:
    text = text.replace(char, "")
  return text


df['review'] = df['review'].apply(remove_punc)

In [20]:

  import nltk
  nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
#tokenization
from nltk.tokenize import word_tokenize

def token(text):
  return word_tokenize(text)


In [22]:
df['review'] = df['review'].apply(token)

In [23]:
#stemming
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem_word(text):
  return " ".join([ ps.stem(word) for word in text])

In [24]:
df['review'] = df['review'].apply(stem_word)

In [25]:
df['review'][2]

'thought wonder way spend time hot summer weekend sit air condit theater watch lightheart comedi plot simplist dialogu witti charact likabl even well bread suspect serial killer may disappoint realiz match point 2 risk addict thought proof woodi allen still fulli control style mani us grown lovethi id laugh one woodi comedi year dare say decad ive never impress scarlet johanson manag tone sexi imag jump right averag spirit young womanthi may crown jewel career wittier devil wear prada interest superman great comedi go see friend'

After all the basic pre-processing steps, now move on to vectorization of the reviews

In [26]:
X = df.iloc[: ,0:1]
y = df['sentiment']

In [27]:
print(X.shape)
print(y.shape)

(40000, 1)
(40000,)


In [28]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

y = encoder.fit_transform(y)

In [29]:
y
#positive = 1
#negative =0

array([1, 1, 1, ..., 1, 0, 0])

In [30]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [31]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(32000, 1)
(8000, 1)
(32000,)
(8000,)


In [32]:
X_train

Unnamed: 0,review
21721,visual delight much better renaiss would live ...
15576,ani gun play 1967 direct enzo g castellari goo...
28716,hate movi never seen utter complet trash life ...
204,one say warn read review both user extern like...
36677,american famili move countrysid spain live iso...
...,...
7813,well say the film relat clubdjelectron music r...
32511,okay havepenelop keith miss herringbonetwe bbe...
5192,odd will skew biopic dyan thoma hear littl doz...
12172,basic structur stori begin middl endsometim st...


# **Tf-Idf**


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(max_features=3000)

In [34]:
X_train_tf = tf.fit_transform(X_train['review']).toarray()
X_test_tf = tf.transform(X_test['review'])

In [35]:
X_train_tf.shape

(32000, 3000)

In [36]:
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB

#GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_tf,y_train)
y_pred = gnb.predict(X_test_tf.toarray())
accuracy_score(y_test,y_pred)

0.804375

In [37]:
#MultinomialNB
mnb = GaussianNB()

mnb.fit(X_train_tf,y_train)
y_pred = mnb.predict(X_test_tf.toarray())
accuracy_score(y_test,y_pred)

0.804375

In [38]:
#BernoulliNB
bnb = GaussianNB()

bnb.fit(X_train_tf,y_train)
y_pred = bnb.predict(X_test_tf.toarray())
accuracy_score(y_test,y_pred)

0.804375

In [39]:
#random forrest classifier

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_tf,y_train)
y_pred = rf.predict(X_test_tf)
accuracy_score(y_test,y_pred)

0.84325

In [40]:
#to test on new sentiment

def transform_text(text):
    text = text.lower()
    seq = re.compile('<.*?>')
    text =  seq.sub(r'', text)
    

            
    y = word_tokenize(text)
    
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))
        
    
    
            
    y_new =  " ".join(y)
    
    
    return y_new

In [41]:
#testing on a random sentence
transformed_text = transform_text('One of the weakest entries in the J-horror remake sweepstakes, One Missed Call is undone by bland performances and shopworn shocks.')
vectorize = tf.transform([transformed_text])

In [42]:
vectorize

<1x3000 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [43]:
rf.predict(vectorize)

array([1])

In [45]:
import pickle

pickle.dump(rf,open('model.pkl','wb')) #randomforest model

pickle.dump(tf,open('vectorizer.pkl','wb')) #tfidf vectorizer
