# IMDb Sentiment Analysis

In [369]:
# Importing relevant libraries:

import pandas as pd
import re

from bs4 import BeautifulSoup
from string import punctuation

# Sci-Kit Learn:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# NLTK:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [370]:
# Reading the IMDb data.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs elsewhere if the CSV is placed at the same location.
IMDB_CSV_PATH = r"C:\Users\sando\OneDrive\Escritorio\Personal Projects\IMDB Sentiment Analysis\dataset\imdb_reviews.csv"

# Load the reviews into a DataFrame, decoding the file as UTF-8.
imdb = pd.read_csv(IMDB_CSV_PATH, encoding='UTF-8')

# Exploratory Data Analysis (EDA)

In [371]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [372]:
# Cast both columns to str so that every later text-processing step
# receives string values.
for column in ('review', 'sentiment'):
    imdb[column] = imdb[column].astype(str)

In [373]:
# Checking if our data is balanced: count how many reviews carry each
# distinct sentiment label.
imdb['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# Splitting the Data Set

In [374]:
# Splitting the dataset using sklearn.
# The reviews are taken exactly as they are stored in `imdb` at this point.
X = imdb['review']     # features
y = imdb['sentiment']  # target labels

# Hold out 30% of the rows for testing.
# - random_state pins the shuffle so Restart & Run All reproduces the same split.
# - stratify=y keeps the positive/negative ratio identical in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


# Text Normalization

In [375]:
# Checking for NaN values: tally how many reviews are missing (True)
# versus present (False).
# NOTE(review): 'review' was already cast with astype(str) earlier in the
# notebook, which presumably turns missing values into the literal string
# 'nan' — confirm; if so, this check can no longer detect missing rows.
imdb['review'].isna().value_counts()

review
False    50000
Name: count, dtype: int64

# Removing HTML Tags

In [376]:
# Removing the html strips:
def strip_html_tags(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's built-in 'html.parser' backend
    and the document's text is extracted via ``get_text()``.
    """
    return BeautifulSoup(text, 'html.parser').get_text()

# # Applying our function:
# imdb['review'] = imdb['review'].apply(strip_html_tags)
# imdb['review']

In [377]:
stop_words = stopwords.words('english')

# Frozen copy used only for membership tests: a set lookup is O(1), whereas
# scanning the list above for every token of every review is O(len(list)).
_STOP_WORD_SET = frozenset(stop_words)


def _is_punctuation(token):
    """Return True when every character of ``token`` is ASCII punctuation."""
    # `token in punctuation` would be a *substring* test against one long
    # string, so multi-character tokens such as "..." or "''" would slip
    # through; checking character by character catches them.
    return all(ch in punctuation for ch in token)


# Cleaning up the text:
def clean_text(text):
    """Tokenize ``text`` and drop stop words, punctuation and pure numbers.

    Parameters
    ----------
    text : str
        Raw (HTML-free) review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, in their original
        order and original casing.
    """
    # Tokenizing the text
    words = word_tokenize(text)
    # Removing stop words (case-insensitively), punctuation-only tokens,
    # and tokens made up solely of digits.
    processed_words = [
        w for w in words
        if w.lower() not in _STOP_WORD_SET
        and not _is_punctuation(w)
        and not w.isdigit()
    ]
    return ' '.join(processed_words)

# imdb['review'] = imdb['review'].apply(clean_text)
# print(imdb['review']) 

# Removing Special Characters

In [378]:
# Removing special characters:
def remove_special_characters(text, remove_digits = False):
    """Delete every character that is not a letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, default False
        When True, digits are removed as well, leaving only ASCII letters
        and whitespace.

    Returns
    -------
    str
        ``text`` with the disallowed characters removed (nothing is
        inserted in their place).
    """
    # Both patterns are *negated* character classes: they match whatever is
    # outside the allowed set. The previous remove_digits pattern lacked the
    # leading '^' and therefore stripped the letters and whitespace instead.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# imdb['review'] = imdb['review'].apply(remove_special_characters) 
# imdb['review']

# Removing Redundant Whitespace

In [379]:
# Removing the redundant whitespaces:
def remove_redundant_whitespaces(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# imdb['review'] = imdb['review'].apply(remove_redundant_whitespaces)
# print(imdb['review'])

# Text Stemming

In [380]:
# Stemming the text:
def simple_stemmer(text):
    """Stem every whitespace-separated word of ``text`` with NLTK's Porter stemmer.

    Returns the stemmed words joined by single spaces.
    """
    porter = PorterStemmer()
    stems = map(porter.stem, text.split())
    return ' '.join(stems)

# imdb['review'] = imdb['review'].apply(simple_stemmer)
# print(imdb['review'])

# Combining Things Together:

In [385]:
def normalize_text(text):
    """Run one review through every cleaning stage, in a fixed order.

    Stages:
      1. strip HTML tags
      2. drop stop words, punctuation tokens and pure-digit tokens
      3. remove special characters (digits are kept)
      4. collapse redundant whitespace
      5. stem each remaining word

    Returns the fully normalized text as a single string.
    """
    without_html = strip_html_tags(text)
    filtered = clean_text(without_html)
    alnum_only = remove_special_characters(filtered, remove_digits = False)
    compact = remove_redundant_whitespaces(alnum_only)
    return simple_stemmer(compact)

# Normalize every review in the full frame.
imdb['review'] = imdb['review'].apply(normalize_text)

# x_train / x_test were sliced off before this step and therefore still hold
# the raw, un-normalized reviews. Re-select the same rows (by index label)
# from the normalized column so the split and the cleaned text stay in sync.
x_train = imdb.loc[x_train.index, 'review']
x_test = imdb.loc[x_test.index, 'review']

print(imdb['review'])

0        one review mention watch oz episod hook right ...
1        wonder littl product film techniqu unassum old...
2        thought wonder way spend time hot summer weeke...
3        basic famili littl boy jake think zombi closet...
4        petter mattei love time money visual stun film...
                               ...                        
49995    thought movi right good job nt creativ origin ...
49996    bad plot bad dialogu bad act idiot direct anno...
49997    cathol taught parochi elementari school nun ta...
49998    go disagr previou comment side maltin one seco...
49999    one expect star trek movi high art fan expect ...
Name: review, Length: 50000, dtype: object


# Bag of Words Model:

In [None]:
# Bag-of-words vectorizer over unigrams, bigrams and trigrams, storing raw
# counts (binary=False).
# - max_df=1.0 (float) means "appearing in at most 100% of documents", i.e. no
#   upper cut-off. The integer 1 is an absolute document count and would
#   discard every term that occurs in more than one review.
# - min_df=1 keeps every term that occurs at least once; integer 0 has no
#   additional meaning and is rejected by recent scikit-learn parameter
#   validation.
cv = CountVectorizer(min_df=1, max_df=1.0, binary=False, ngram_range=(1, 3))