# Introduction

IMDB dataset having 50K movie reviews for natural language processing or Text analytics. This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing. So, predict the number of positive and negative reviews using either classification or deep learning algorithms.

# Import Package 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer

import re, string, unicodedata
from bs4 import BeautifulSoup

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Read Dataset

In [3]:
df = pd.read_csv("Datasets/IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Text Normalization

In [4]:
# Tokenization of Text
tokenization = ToktokTokenizer()

# Setting English Stopwords
stopwords = stopwords.words('english')

### Remove HTML Tags

In [5]:
# Removing the noisy text
def noiseremoval_text(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = re.sub('\[[^]*/]', '', text)
    return text

In [6]:
# Apply function on review column
df['review'] = df["review"].apply(noiseremoval_text)



In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Stemming

In [8]:
def stemmer(text):
    ps = PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [9]:
# Apply function on review column
df['review'] = df["review"].apply(stemmer)

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl production. the film techniqu i...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive


### Removing Stropwords

In [11]:
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [12]:
# Removing Stopwords
def removing_stopwords(text, is_lower_case=False):
    # Settings English Stopwords
    tokens = tokenization.tokenize(text)
    tokens = [i.strip() for i in tokens]
    if is_lower_case:
        filtokens = [i for i in tokens if token not in stopwords]
    else:
        filtokens = [i for i in tokens if i.lower() not in stopwords]
    filtered_text = " ".join(filtokens)
    return filtered_text

In [13]:
# Apply function on review column
df['review'] = df["review"].apply(removing_stopwords)

In [14]:
df.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 oz episod ' hook...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive


# Dataset Splitting

In [15]:
X = df.review
y = df.sentiment.values

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [17]:
pipeline_bow = Pipeline([
    ('bow', CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1,3))),
    ('algo', LogisticRegression(max_iter=500, n_jobs=-1, random_state=42))
])

In [18]:
pipeline_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1,3))),
    ('algo', LogisticRegression(max_iter=500, n_jobs=-1, random_state=42))
])

# Hyperparameter Tunning

In [19]:
parameters = {
    "algo__fit_intercept" : [True, False],
    "algo__C" : range(1,3,1)
}
model_bow = GridSearchCV(pipeline_bow,parameters, cv=3, n_jobs=-1, verbose=1)
model_tfidf = GridSearchCV(pipeline_tfidf,parameters, cv=3, n_jobs=-1, verbose=1)

# Training

#### Bag of Words

In [20]:
# Training model Bag of Words
model_bow.fit(X_train,y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('bow',
                                        CountVectorizer(max_df=1, min_df=0,
                                                        ngram_range=(1, 3))),
                                       ('algo',
                                        LogisticRegression(max_iter=500,
                                                           n_jobs=-1,
                                                           random_state=42))]),
             n_jobs=-1,
             param_grid={'algo__C': range(1, 3),
                         'algo__fit_intercept': [True, False]},
             verbose=1)

In [22]:
model_bow.score(X_train, y_train), model_bow.best_score_, model_bow.score

(0.99656,
 0.7310133333333333,
 <bound method BaseSearchCV.score of GridSearchCV(cv=3,
              estimator=Pipeline(steps=[('bow',
                                         CountVectorizer(max_df=1, min_df=0,
                                                         ngram_range=(1, 3))),
                                        ('algo',
                                         LogisticRegression(max_iter=500,
                                                            n_jobs=-1,
                                                            random_state=42))]),
              n_jobs=-1,
              param_grid={'algo__C': range(1, 3),
                          'algo__fit_intercept': [True, False]},
              verbose=1)>)

#### TF-IDF

In [23]:
# Training model Bag of Words
model_tfidf.fit(X_train,y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(max_df=1, min_df=0,
                                                        ngram_range=(1, 3))),
                                       ('algo',
                                        LogisticRegression(max_iter=500,
                                                           n_jobs=-1,
                                                           random_state=42))]),
             n_jobs=-1,
             param_grid={'algo__C': range(1, 3),
                         'algo__fit_intercept': [True, False]},
             verbose=1)

In [24]:
model_tfidf.score(X_train, y_train), model_tfidf.best_score_, model_tfidf.score

(0.99656,
 0.73816,
 <bound method BaseSearchCV.score of GridSearchCV(cv=3,
              estimator=Pipeline(steps=[('tfidf',
                                         TfidfVectorizer(max_df=1, min_df=0,
                                                         ngram_range=(1, 3))),
                                        ('algo',
                                         LogisticRegression(max_iter=500,
                                                            n_jobs=-1,
                                                            random_state=42))]),
              n_jobs=-1,
              param_grid={'algo__C': range(1, 3),
                          'algo__fit_intercept': [True, False]},
              verbose=1)>)

# Predict

In [25]:
model_bow_predict = model_bow.predict(y_test)
model_bow_predict

array(['negative', 'negative', 'negative', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [26]:
model_tfidf_predict = model_tfidf.predict(y_test)
model_tfidf_predict

array(['negative', 'negative', 'negative', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [27]:
data_predict = [
    ["I rarely stop watching a movie although how crappy it is. Well for this one I made an exception since its beyond boring. I cant leave long reviews so this is it.. SKIP IT"],
    ["I watch a lot of movies and I like to give them all a chance just in case there is something interesting or exciting to warrant a viewing Unfortunately this movie has none of these features it is pointless and offers nothing in the way of story line,acting or direction The plot is non-existent with the actors just going through the motions and the dialogue is sooo boring its embarrassing. I wish the previous reviewers had posted earlier as this would have saved me 95 mins of my time"],
    ["This is a MUST-MUST-MUST-MUST see for any student, teacher, parent, politician concerned about the future. It is one the best, challenging, uplifting and encouraging movie I have ever seen. Just wanted to say AGAIN that is a MUST-MUST -MUST see!” Randy Lukasiewicz, Omaha, Nebaska"],
    ["“WOW. This film changed my life, period. As an educator and the son of an educator, I am forever deepened! Thank you for bringing these children into conversation with THEIR world so beautifully.” Nate Adams, Denver CO"]
]

In [28]:
for predict in data_predict:
    print(model_bow.predict(predict))

['negative']
['negative']
['positive']
['positive']


In [29]:
for predict in data_predict:
    print(model_tfidf.predict(predict))

['negative']
['negative']
['positive']
['positive']
