## Setting up

#### Importing libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

import warnings
warnings.filterwarnings('ignore')

#### Importing the dataset

In [2]:
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.shape

(50000, 2)

In [4]:
df["sentiment"].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

## Text Preprocessing

#### Removing punctuation

In [6]:
#function to replace punctuation with a space

def punct(review):
    for word in review:
        if word in string.punctuation:
            review = review.replace(word, " ")
            
    return review

In [7]:
df["punctuation"] = df["review"].apply(punct)

#### Normalization

In [8]:
#converting to lower case
df["lower"] = df["punctuation"].apply(lambda x: x.lower())

#### Tokenization

In [11]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
df["tokenize"] = df["lower"].apply(lambda x: word_tokenize(x))

#### Stop word removal

In [13]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
#function to remove stopwords

def stopword(review):
    stop = []
    stop_words = set(stopwords.words("english"))
    for word in review:
        if word not in stop_words:
            stop.append(word)
            
    return stop

In [15]:
df["stopwords"] = df["tokenize"].apply(stopword)

#### Lemmatization

In [16]:
nltk.download("averaged_perception_tagger")
nltk.download("wordnet")

[nltk_data] Error loading averaged_perception_tagger: Package
[nltk_data]     'averaged_perception_tagger' not found in index
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
#function to tag Part of Speect (POS)

def pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    
    return tag_dict.get(tag, wordnet.NOUN)

In [24]:
lemmatizer = WordNetLemmatizer()

df["lemmatize"] = df["stopwords"].apply(
    lambda words:[lemmatizer.lemmatize(x, pos(x)) for x in words]
)

#### Joining back the pre-processed review

In [25]:
df["final"] = df["lemmatize"].apply(lambda x: " ".join(x))

## Feature Extraction

In [27]:
count_vectorizer = CountVectorizer(max_features = 1000)
count_vector = count_vectorizer.fit_transform(df["final"].to_list()).toarray()

count_vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Machine Learning

In [28]:
#preparing the dataset

x = count_vector
y = df["sentiment"]

In [29]:
#split into training and testing dataset

x_train, x_test, y_train, y_test = train_test_split(x, y)

### Gaussian Naive Bayes

In [31]:
nb = GaussianNB()
nb.fit(x_train, y_train)

predict_nb = nb.predict(x_test)
accuracy_nb = accuracy_score(y_test, predict_nb)
print(accuracy_nb)

0.79464


### Multinomial Naive Bayes

In [32]:
mnb = MultinomialNB()
mnb.fit(x_train, y_train)

predict_mnb = mnb.predict(x_test)
accuracy_mnb = accuracy_score(y_test, predict_mnb)
print(accuracy_mnb)

0.83048


### Logistic Regression

In [33]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

predict_lr = lr.predict(x_test)
accuracy_lr = accuracy_score(y_test, predict_lr)
print(accuracy_lr)

0.86208


### Support Vector Machine

In [34]:
svm = LinearSVC()
svm.fit(x_train, y_train)

predict_svm = svm.predict(x_test)
accuracy_svm = accuracy_score(y_test, predict_svm)
print(accuracy_svm)

0.86088
