In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Perform necessary imports

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Read data

In [None]:
# Load the movie-review CSV from the Kaggle input directory into a dataframe.
df = pd.read_csv(
    filepath_or_buffer='../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv',
)

In [None]:
df.head()

# Check for null values

In [None]:
df.isnull().sum()

**No null values are present. Hence we are good to go.**

# Check the distribution of positive and negative reviews

In [None]:
sns.countplot(x=df['sentiment'])

In [None]:
df['sentiment'].value_counts()

**We have an equal number of positive and negative reviews.**

# Count the length of reviews

In [None]:
# Word count per review: split each review on whitespace and count the tokens.
review_length = list(map(lambda text: len(text.split()), df['review']))

In [None]:
df["review_len"] = review_length

In [None]:
df.head(10)

In [None]:
# Positive reviews: summary-statistics table (left) beside a review-length histogram (right).
positive_mask = df["sentiment"] == "positive"
fig = plt.figure(figsize=(12, 10))

# Right-hand panel: distribution of review lengths with a KDE overlay.
ax1 = fig.add_subplot(1, 2, 2)
sns.histplot(
    data=df[positive_mask],
    x="review_len",
    hue="sentiment",
    bins=50,
    kde=True,
    ax=ax1,
)

# Left-hand panel: describe() output drawn as a table that fills the whole axes.
describe = df.review_len[positive_mask].describe().to_frame().round(2)
ax2 = fig.add_subplot(1, 2, 1)
ax2.axis("off")
bbox = [0, 0, 1, 1]
table = ax2.table(
    cellText=describe.values,
    rowLabels=describe.index,
    colLabels=describe.columns,
    bbox=bbox,
)
plt.show()

In [None]:
# Negative reviews: summary-statistics table (left) beside a review-length histogram (right).
negative_mask = df["sentiment"] == "negative"
fig = plt.figure(figsize=(12, 10))

# Right-hand panel: distribution of review lengths with a KDE overlay.
ax1 = fig.add_subplot(1, 2, 2)
sns.histplot(
    data=df[negative_mask],
    x="review_len",
    hue="sentiment",
    bins=50,
    kde=True,
    ax=ax1,
)

# Left-hand panel: describe() output drawn as a table that fills the whole axes.
describe = df.review_len[negative_mask].describe().to_frame().round(2)
ax2 = fig.add_subplot(1, 2, 1)
ax2.axis("off")
bbox = [0, 0, 1, 1]
table = ax2.table(
    cellText=describe.values,
    rowLabels=describe.index,
    colLabels=describe.columns,
    bbox=bbox,
)
plt.show()

# Wordcloud

In [None]:
from wordcloud import WordCloud

## Wordcloud for positive sentiments

In [None]:
# Join every positive review into one string and render its most frequent words.
positive_text = " ".join(df.loc[df["sentiment"] == "positive", "review"])
plt.figure(figsize=(20, 20))
wc = WordCloud(width=1600, height=800, max_words=2000).generate(positive_text)
plt.imshow(wc, interpolation='bilinear')

## Wordcloud for negative sentiments

In [None]:
# Word cloud built from the concatenation of all negative reviews.
# max_words is 2000 so this cloud is drawn with the same word budget as the
# positive-review cloud; with different limits the two clouds are not comparable.
plt.figure(figsize=(20,20))
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800).generate(" ".join(df[df.sentiment == 'negative'].review))
plt.imshow(wc, interpolation='bilinear')

We see tokens like "br" (left over from HTML `<br />` line-break tags) appearing in the word cloud. This shows that our dataset needs preprocessing.

# Feature Engineering

## Data cleaning

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

* Remove characters other than alphabets
* Lemmatize the words so that all the words get reduced to their root words
* Change the words to lowercase so that 'Girl' and 'girl' are not considered as two unique words.

In [None]:
# NOTE: disabled single-pass cleaning loop (strip non-letters, lowercase, drop
# stopwords, lemmatize). Nothing in this cell is executed; the reviews are
# cleaned further down with separate functions applied to the dataframe.
# lemmatizer = WordNetLemmatizer()

# doc = []

# for i in range(len(df)):
#     clean_text = re.sub('[^a-zA-Z]', ' ', df['review'][i] )
#     clean_text = clean_text.lower()
#     clean_text = clean_text.split()
#     clean_text = [lemmatizer.lemmatize(word) for word in clean_text if word not in set(stopwords.words('english'))]
#     clean_text = ' '.join(clean_text)
#     doc.append(clean_text)

## Sentiment Mapping
Machine-learning models need numeric targets, so we map the 'positive' and 'negative' sentiment labels to integers:

* positive: 0
* negative: 1

In [None]:
# Numeric encoding of the target column: positive -> 0, negative -> 1.
label_sentiment = dict(positive=0, negative=1)

y = df['sentiment'].map(label_sentiment)

We define 3 functions for carrying out three different tasks:
1. Remove HTML tags from the reviews
2. Remove special characters other than alphabets from the review
3. Remove stopwords from the review

In [None]:
def clean_html(text):
    """Strip HTML tags (e.g. ``<br />``) from ``text``.

    Each tag is matched on its own with ``<[^>]+>``. A greedy ``<.*>`` would
    instead swallow everything between the first ``<`` and the last ``>`` on
    a line, deleting the review text that sits between two tags.

    Every tag is replaced by a single space rather than an empty string so
    that the words on either side of a tag are not glued together.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The text with each tag replaced by one space.
    """
    return re.sub(r'<[^>]+>', ' ', text)
    
def clean_spcl_chars(text):
    """Return ``text`` with every character outside a-z / A-Z replaced by a space."""
    non_letters = re.compile('[^a-zA-Z]')
    return non_letters.sub(' ', text)

# Cache for the stopword set so the NLTK corpus is read once, not once per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def remove_stopwords(text, stop_words=None):
    """Drop stopwords from ``text`` and re-join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Text to filter; it is tokenized by splitting on whitespace.
    stop_words : collection of str, optional
        Lower-case words to remove. Defaults to NLTK's English stopword list,
        which is loaded once and cached.

    Returns
    -------
    str
        The kept tokens, in their original order and original casing.

    Notes
    -----
    Tokens are lower-cased for the membership test only, so capitalized
    stopwords such as "The" are removed as well.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)
    

Remove the HTML tags

In [None]:
# New column holding each raw review after it has been passed through clean_html.
df["clean_review"] = df["review"].map(clean_html)

In [None]:
df.head()

Remove special characters

In [None]:
# Overwrite clean_review with the result of clean_spcl_chars for every row.
df["clean_review"] = df["clean_review"].map(clean_spcl_chars)
df.head()

Remove stopwords

In [None]:
# Overwrite clean_review with the result of remove_stopwords for every row.
df["clean_review"] = df["clean_review"].map(remove_stopwords)
df.head()

## Lemmatize the words

In [None]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` with the module-level
    ``lemmatizer`` and re-join the results with single spaces."""
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

In [None]:
# New column holding the lemmatized form of every cleaned review.
df["lemmatized_review"] = df["clean_review"].map(lemmatize_text)
df.head()

## Save the cleaned review dataframe as a pickle file, so that I don't have to rerun the cleaning process every time I open this notebook

In [None]:
df.to_pickle('cleaned_df.pkl')

# Vectorize review
We will use term frequency–inverse document frequency (TF-IDF) to vectorize the reviews.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
X = df['lemmatized_review']
#X = TfidfVectorizer().fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

## Training Pipeline
We will create a pipeline for each of the different classifiers.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss

In [None]:
# One TF-IDF + classifier pipeline per model. Each pipeline gets its own
# vectorizer, which is fitted on whatever data the pipeline is fitted on.
# LinearSVC and RandomForestClassifier involve randomness, so their seed is
# fixed (same value as the train/test split) to make the reported scores
# reproducible across runs.
pipe1 = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])
pipe2 = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC(random_state=42))])
pipe3 = Pipeline([('tfidf', TfidfVectorizer()), ('clf', RandomForestClassifier(random_state=42))])

In [None]:
# Fit every pipeline on the training split and evaluate it on the test split.
pipes = [pipe1, pipe2, pipe3]
log = []  # one (pipeline name, accuracy rounded to 2 decimals) tuple per pipeline
for i, pipe in enumerate(pipes):
    name = f'pipe_{i}'
    pipe.fit(X_train, y=y_train)
    y_pred = pipe.predict(X_test)

    # accuracy_score is only documented to return a float, which is not
    # guaranteed to have a .round() method; np.round accepts plain and NumPy
    # floats alike and yields a NumPy scalar.
    test_accuracy = accuracy_score(y_test, y_pred)
    acc = np.round(test_accuracy, 2)
    log.append((name, acc))

    print('*****RESULTS*****')
    print(classification_report(y_test, y_pred))
    # For a classifier, pipe.score(X_test, y_test) is the accuracy of
    # pipe.predict(X_test); reuse the value computed above instead of
    # predicting the whole test set a second time.
    print(test_accuracy)

In [None]:
log

In [None]:
# Display names for the bar chart, in the same order as the fitted pipelines.
clf_list = ['MultinomialNB', 'LinearSVC', 'RandomForest']

In [None]:
# Pull the accuracy out of each (name, accuracy) entry in the log. The built-in
# round() works for plain Python floats and NumPy scalars alike, so this does
# not depend on the logged values having a .round() method.
accuracy = [round(acc, 2) for _name, acc in log]

In [None]:
accuracy

In [None]:
sns.barplot(x=clf_list, y = accuracy)

LinearSVC has the highest accuracy of 86%

**If you have any doubt or suggestion, please feel free to comment**

**If you found my notebook helpful, please do upvote. Thank you :)**