# IMDb Sentiment Analysis

In [230]:
# Importing relevant libraries:

import numpy as np
import pandas as pd
import spacy
import re, string, unicodedata

import seaborn as sns
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer

from textblob import TextBlob
from textblob import Word

from wordcloud import WordCloud,STOPWORDS

In [231]:
# Reading the IMDb data

imdb = pd.read_csv(r"C:\Users\sando\OneDrive\Escritorio\Personal Projects\IMDB Sentiment Analysis\dataset\imdb_reviews.csv", encoding = 'UTF-8')

# Exploratory Data Analysis (EDA)

In [232]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [233]:
# Converting the columsn to string
imdb['review'] = imdb['review'].astype(str)
imdb['sentiment'] = imdb['sentiment'].astype(str)

In [234]:
# Checking if our data is balanced
imdb['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# Splitting the Data Set

In [235]:
# Splitting the dataset using sklearn

from sklearn.model_selection import train_test_split

X = imdb['review'] # features
y = imdb['sentiment'] # target labels

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size= 0.3)


# Text Normalization

In [236]:
# Checking for NAN values:

imdb['review'].isna().value_counts()

review
False    50000
Name: count, dtype: int64

# Removing HTML Tags

In [237]:
# Removing the html strips:
def strip_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

# Applying our function:
imdb['review'] = imdb['review'].apply(strip_html_tags)
imdb['review']

  soup = BeautifulSoup(text, 'html.parser')


0        One of the other reviewers has mentioned that ...
1        A wonderful little production. The filming tec...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [238]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

stop_words = stopwords.words('english')

# Cleaning up the text:
def clean_text(text):
    # Tokenizing the text
    words = word_tokenize(text)
    # Removing stop words, punctuation, and numbers 
    processed_words = [w for w in words if w.lower() not in stop_words and w not in punctuation and not w.isdigit()]
    return processed_words 

imdb['review'] = imdb['review'].apply(tokenize_text)
print(imdb['review']) 

0        [One, reviewers, mentioned, watching, Oz, epis...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [Basically, 's, family, little, boy, Jake, thi...
4        [Petter, Mattei, 's, ``, Love, Time, Money, ''...
                               ...                        
49995    [thought, movie, right, good, job, n't, creati...
49996    [Bad, plot, bad, dialogue, bad, acting, idioti...
49997    [Catholic, taught, parochial, elementary, scho...
49998    ['m, going, disagree, previous, comment, side,...
49999    [one, expects, Star, Trek, movies, high, art, ...
Name: review, Length: 50000, dtype: object


In [239]:
imdb['review'][0]

['One',
 'reviewers',
 'mentioned',
 'watching',
 'Oz',
 'episode',
 "'ll",
 'hooked',
 'right',
 'exactly',
 'happened',
 'me.The',
 'first',
 'thing',
 'struck',
 'Oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'GO',
 'Trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word.It',
 'called',
 'OZ',
 'nickname',
 'given',
 'Oswald',
 'Maximum',
 'Security',
 'State',
 'Penitentary',
 'focuses',
 'mainly',
 'Emerald',
 'City',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'Em',
 'City',
 'home',
 'many',
 '..',
 'Aryans',
 'Muslims',
 'gangstas',
 'Latinos',
 'Christians',
 'Italians',
 'Irish',
 '....',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'never',
 'far',
 'away.I',
 'would',
 'say',
 'main',
 'appeal',
 'show',
 'due'

# Removing Special Charcaters

In [244]:
def remove_special_characters(text, remove_digits = False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

imdb['review'] = imdb['review'].astype(str).apply(remove_special_characters) 
imdb['review']

0        One reviewers mentioned watching Oz episode ll...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        Basically s family little boy Jake thinks s zo...
4        Petter Mattei s  Love Time Money  visually stu...
                               ...                        
49995    thought movie right good job nt creative origi...
49996    Bad plot bad dialogue bad acting idiotic direc...
49997    Catholic taught parochial elementary schools n...
49998    m going disagree previous comment side Maltin ...
49999    one expects Star Trek movies high art fans exp...
Name: review, Length: 50000, dtype: object

In [245]:
imdb['review'][0]

'One reviewers mentioned watching Oz episode ll hooked right exactly happened meThe first thing struck Oz brutality unflinching scenes violence set right word GO Trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use wordIt called OZ nickname given Oswald Maximum Security State Penitentary focuses mainly Emerald City experimental section prison cells glass fronts face inwards privacy high agenda Em City home many  Aryans Muslims gangstas Latinos Christians Italians Irish  scuffles death stares dodgy dealings shady agreements never far awayI would say main appeal show due fact goes shows would nt dare Forget pretty pictures painted mainstream audiences forget charm forget romance  OZ nt mess around first episode ever saw struck nasty surreal could nt say ready watched developed taste Oz got accustomed high levels graphic violence violence injustice crooked guards ll sold nickel inmates ll kill order get away well mannered middle class inmates t

# LIMIT

In [None]:
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize

# # # Some stuff we have to download
# # nltk.download('punkt')
# # nltk.download('stopwords')

# # Getting English stop words from nltk
# stop_words = set(stopwords.words('english'))

# tokenizer = ToktokTokenizer()

# # Removing the stopwords
# def remove_stopwords(text):
#     tokens = tokenizer.tokenize(text)
#     tokens = [token.strip() for token in tokens]
#     filtered_tokens = [token for token in tokens if token not in stop_words]
#     filtered_text = ' '.join(filtered_tokens)
#     return filtered_text

# # Applying our function:
# imdb['review'] = imdb['review'].apply(remove_stopwords)

# # Performing a sanity check:
# sanity_check = imdb['review'].isin(stop_words)
# sanity_check.value_counts()

review
False    50000
Name: count, dtype: int64

As we can see our entries no longer have stop words in them but we can see that they are littered with html tags, so let's go ahead and take those out.

# Removing Special Characters

# Text Stemming:

# Bag of Words Model:

In [None]:
cv = CountVectorizer(min_df= 0, max_df= 1, binary= False, ngram_range= (1,3))