# IMDb Sentiment Analysis

In [332]:
# Importing relevant libraries:

import pandas as pd
import re

from bs4 import BeautifulSoup
from string import punctuation

# Sci-Kit Learn:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# NLTK:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [333]:
# Reading the IMDb data
imdb = pd.read_csv(r"C:\Users\sando\OneDrive\Escritorio\Personal Projects\IMDB Sentiment Analysis\dataset\imdb_reviews.csv", encoding = 'UTF-8')

# Exploratory Data Analysis (EDA)

In [334]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [335]:
# Converting the columsn to string
imdb['review'] = imdb['review'].astype(str)
imdb['sentiment'] = imdb['sentiment'].astype(str)

In [336]:
# Checking if our data is balanced
imdb['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# Splitting the Data Set

In [337]:
# Splitting the dataset using sklearn
X = imdb['review'] # features
y = imdb['sentiment'] # target labels

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size= 0.3)


# Text Normalization

In [338]:
# Checking for NAN values:
imdb['review'].isna().value_counts()

review
False    50000
Name: count, dtype: int64

# Removing HTML Tags

In [339]:
# Removing the html strips:
def strip_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

# Applying our function:
imdb['review'] = imdb['review'].apply(strip_html_tags)
imdb['review']

  soup = BeautifulSoup(text, 'html.parser')


0        One of the other reviewers has mentioned that ...
1        A wonderful little production. The filming tec...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [340]:
stop_words = stopwords.words('english')

# Cleaning up the text:
def clean_text(text):
    # Tokenizing the text
    words = word_tokenize(text)
    # Removing stop words, punctuation, and numbers 
    processed_words = [w for w in words if w.lower() not in stop_words and w not in punctuation and not w.isdigit()]
    return processed_words 

imdb['review'] = imdb['review'].apply(clean_text)
print(imdb['review']) 

0        [One, reviewers, mentioned, watching, Oz, epis...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [Basically, 's, family, little, boy, Jake, thi...
4        [Petter, Mattei, 's, ``, Love, Time, Money, ''...
                               ...                        
49995    [thought, movie, right, good, job, n't, creati...
49996    [Bad, plot, bad, dialogue, bad, acting, idioti...
49997    [Catholic, taught, parochial, elementary, scho...
49998    ['m, going, disagree, previous, comment, side,...
49999    [one, expects, Star, Trek, movies, high, art, ...
Name: review, Length: 50000, dtype: object


# Removing Special Charcaters

In [341]:
# Removing special characters:
def remove_special_characters(text, remove_digits = False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

imdb['review'] = imdb['review'].astype(str).apply(remove_special_characters) 
imdb['review']

0        One reviewers mentioned watching Oz episode ll...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        Basically s family little boy Jake thinks s zo...
4        Petter Mattei s  Love Time Money  visually stu...
                               ...                        
49995    thought movie right good job nt creative origi...
49996    Bad plot bad dialogue bad acting idiotic direc...
49997    Catholic taught parochial elementary schools n...
49998    m going disagree previous comment side Maltin ...
49999    one expects Star Trek movies high art fans exp...
Name: review, Length: 50000, dtype: object

# Removing Redundant Whitespace

In [342]:
# Removing the redundant whitespaces:
def remove_redundant_whitespaces(text):
    text = re.sub(pattern= r'\s+', repl= ' ', string= text)
    return text.strip()

imdb['review'] = imdb['review'].apply(remove_redundant_whitespaces)
print(imdb['review'])

0        One reviewers mentioned watching Oz episode ll...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        Basically s family little boy Jake thinks s zo...
4        Petter Mattei s Love Time Money visually stunn...
                               ...                        
49995    thought movie right good job nt creative origi...
49996    Bad plot bad dialogue bad acting idiotic direc...
49997    Catholic taught parochial elementary schools n...
49998    m going disagree previous comment side Maltin ...
49999    one expects Star Trek movies high art fans exp...
Name: review, Length: 50000, dtype: object


# Text Stemming

In [343]:
# Stemming the text:
def simple_stemmer(text):
    stemmer = PorterStemmer()
    words = text.split()
    stemmed_text = ' '.join([stemmer.stem(word) for word in words])
    return stemmed_text

# Appyling the function:
imdb['review'] = imdb['review'].apply(simple_stemmer)
print(imdb['review'])

0        one review mention watch oz episod ll hook rig...
1        wonder littl product film techniqu unassum old...
2        thought wonder way spend time hot summer weeke...
3        basic s famili littl boy jake think s zombi cl...
4        petter mattei s love time money visual stun fi...
                               ...                        
49995    thought movi right good job nt creativ origin ...
49996    bad plot bad dialogu bad act idiot direct anno...
49997    cathol taught parochi elementari school nun ta...
49998    m go disagre previou comment side maltin one s...
49999    one expect star trek movi high art fan expect ...
Name: review, Length: 50000, dtype: object


# Combining Things Together:

In [344]:
def normalize_text(text):
    # Removing the HTML tags:
    normalized_text = strip_html_tags(text)
    print('HTML tag extraction complete.')
    print(type(normalized_text))

    # Removing stop words, punctuation, and numbers
    normalized_text = clean_text(normalized_text)
    print('Stop words, punctuation, and numbers removed.')
    print(type(normalized_text))

    #! For some reason the type here is now a list
    # Removing special characters:
    normalized_text = remove_special_characters(normalized_text, remove_digits = False)
    print('Special characters deleted.')
    print(type(normalized_text))

    # Removing the redundant whitespaces:
    normalized_text = remove_redundant_whitespaces(normalized_text)
    print('Redundant whitespaces have been quashed!')
    print(type(normalized_text))

    # Stemmatization:
    normalized_text = simple_stemmer(normalized_text)
    print('Stemmy complete.')
    print(type(normalized_text))
    
    return normalized_text

imdb['review'] = imdb['review'].apply(normalize_text)

HTML tag extraction complete.
<class 'str'>
Stop words, punctuation, and numbers removed.
<class 'list'>


TypeError: expected string or bytes-like object

# Bag of Words Model:

In [None]:
cv = CountVectorizer(min_df= 0, max_df= 1, binary= False, ngram_range= (1,3))