# IMDb Sentiment Analysis

In [1]:
# Importing relevant libraries:

import numpy as np
import pandas as pd
import spacy
import re, string, unicodedata

import seaborn as sns
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer

from textblob import TextBlob
from textblob import Word

from wordcloud import WordCloud,STOPWORDS

In [2]:
# Reading the IMDb data

imdb = pd.read_csv(r"C:\Users\sando\OneDrive\Escritorio\Personal Projects\IMDB Sentiment Analysis\dataset\imdb_reviews.csv", encoding = 'UTF-8')

# Exploratory Data Analysis (EDA)

In [3]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
imdb.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
# Checking if our data is balanced
imdb['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# Splitting the Data Set

In [6]:
# Splitting the dataset using sklearn

from sklearn.model_selection import train_test_split

X = imdb['review'] # features
y = imdb['sentiment'] # target labels

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size= 0.3)


# Text Normalization

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# # Some stuff we have to download
# nltk.download('punkt')
# nltk.download('stopwords')

# Getting English stop words from nltk
stop_words = set(stopwords.words('english'))

tokenizer = ToktokTokenizer()

# Removing the stopwords
def remove_stopwords(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stop_words]
    filtered_text = ''.join(filtered_tokens)
    return filtered_text

# Applying our function:
imdb['review'] = imdb['review'].apply(remove_stopwords)

# Performing a sanity check:
sanity_check = imdb['review'].isin(stop_words)
sanity_check.value_counts()

review
False    50000
Name: count, dtype: int64

In [8]:
# Exploring what we have so far:
imdb['review'][2]

'Ithoughtwonderfulwayspendtimehotsummerweekend,sittingairconditionedtheaterwatchinglight-heartedcomedy.Theplotsimplistic,dialoguewittycharacterslikable(evenwellbreadsuspectedserialkiller).WhilemaydisappointedrealizeMatchPoint2:RiskAddiction,IthoughtproofWoodyAllenstillfullycontrolstylemanyusgrownlove.<br/><br/>ThisI\'laughedoneWoody\'comediesyears(dareIsaydecade?).WhileI\'neverimpressedScarletJohanson,managedtone"sexy"imagejumpedrightaverage,spiritedyoungwoman.<br/><br/>Thismaycrownjewelcareer,wittier"DevilWearsPrada"interesting"Superman"greatcomedygoseefriends.'

As we can see our entries no longer have stop words in them but we can see that they are littered with html tags, so let's go ahead and take those out.

# Removing HTML Strips and Noise Text

In [13]:
# Removing the html strips:
def strip_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

#? What text is in between brackets?
# Removing the square brackets:
def remove_between_square_brackets(text):
    return re.sub(pattern = '\[[^]]*\]', repl = '', string = text)

# Removing the noisy text:
def denoise_text(text):
    text  = strip_html_tags(text)
    text = remove_between_square_brackets(text)
    return text

# Applying our function:
imdb['review'] = imdb['review'].apply(denoise_text)

#TODO: Write a sanity check to ensure that there are no more html tags in the text

  soup = BeautifulSoup(text, 'html.parser')


# Removing Special Characters