In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
imdb = pd.read_csv('IMDB Dataset.csv')

In [3]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
imdb.columns

Index(['review', 'sentiment'], dtype='object')

In [6]:
print(imdb.iloc[0,0])

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

In [7]:
sentiment = pd.get_dummies(imdb['sentiment'],drop_first=True)

In [8]:
imdb = pd.concat([imdb,sentiment],axis=1)

In [9]:
imdb.head()

Unnamed: 0,review,sentiment,positive
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


Data Preparation

In [10]:
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import PorterStemmer
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rishik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rishik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Text processing method which makes everything lower case, removes punctuation, removes the linebreaks, removes stopwords and stems

In [11]:
def text_process(review):
    #make everything lower case
    review = review.lower()
    #remove punctuation
    review = [char for char in review if char not in string.punctuation]
    review = ''.join(review)
    #remove br because it is used to show line break
    review = [word for word in review.split() if word != 'br']
    #remove stopwords
    review = [word for word in review if word not in stopwords.words('english')]
     #stemming
    stemming = PorterStemmer()
    review = [stemming.stem(word) for word in review]
    return review
   

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import sys

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X = imdb['review']
y = imdb['positive']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

First try to fit with a Naive-Bayes classifier to see if it works better

In [16]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [17]:
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x000001EE8819F288>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

Vectorising each review

In [18]:
predictions_NB = pipeline.predict(X_test)

In [19]:
print(classification_report(predictions_NB, y_test))
print(confusion_matrix(predictions_NB,y_test))

              precision    recall  f1-score   support

           0       0.88      0.85      0.86      8463
           1       0.85      0.87      0.86      8037

    accuracy                           0.86     16500
   macro avg       0.86      0.86      0.86     16500
weighted avg       0.86      0.86      0.86     16500

[[7182 1281]
 [1026 7011]]


Now try with logistic regression model to see which one is better

In [20]:
pipeline_log = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', LogisticRegression()),  # train on TF-IDF vectors w/ logistic regression model
])

In [21]:
pipeline_log.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x000001EE8819F288>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                            

In [22]:
predictions_log = pipeline_log.predict(X_test)

In [23]:
print(classification_report(predictions_log, y_test))
print(confusion_matrix(predictions_log,y_test))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      7985
           1       0.91      0.88      0.90      8515

    accuracy                           0.89     16500
   macro avg       0.89      0.89      0.89     16500
weighted avg       0.89      0.89      0.89     16500

[[7215  770]
 [ 993 7522]]


The logistic regression model performs better!

Now try the text processing method with lemmatizing to see which one produces a better model

In [24]:
from nltk.stem import WordNetLemmatizer

In [25]:
def text_process2(review):
    #make everything lower case
    review = review.lower()
    #remove punctuation
    review = [char for char in review if char not in string.punctuation]
    review = ''.join(review)
    #remove br because it is used to show line break
    review = [word for word in review.split() if word != 'br']
    #remove stopwords
    review = [word for word in review if word not in stopwords.words('english')]
     #stemming
    lemmatizer = WordNetLemmatizer()
    review = [lemmatizer.lemmatize(word) for word in review]
    return review

In [26]:
pipeline_lem = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process2)),  # strings to token integer counts using lemmatizing processor
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', LogisticRegression()),  # train on TF-IDF vectors w/ logistic regression model
])

In [27]:
pipeline_lem.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process2 at 0x000001EE881B0B88>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                            

In [28]:
predictions_lem = pipeline_lem.predict(X_test)

In [29]:
print(classification_report(predictions_lem,y_test))
print(confusion_matrix(predictions_lem,y_test))

              precision    recall  f1-score   support

           0       0.88      0.91      0.89      7959
           1       0.91      0.88      0.90      8541

    accuracy                           0.90     16500
   macro avg       0.90      0.90      0.90     16500
weighted avg       0.90      0.90      0.90     16500

[[7219  740]
 [ 989 7552]]


Lemmatizing takes a lot longer to process but delivered a slightly better model