# Importing Libraries

In [1]:
import pandas as pd

# Loading Dataset

In [2]:
# Read the training and test review CSVs from the current working directory.
# NOTE(review): the preview of df_train shows an "Unnamed: 0" column, which
# looks like an index saved by an earlier export — confirm whether it is
# needed, or read it as the index instead.
df_train = pd.read_csv("imdb_reviews_train.csv")
df_test = pd.read_csv("imdb_reviews_test.csv")

In [3]:
# Preview the first 10 rows of the raw training data.
df_train.head(10)

Unnamed: 0,review,sentiment
0,In Panic In The Streets Richard Widmark plays ...,1
1,If you ask me the first one was really better ...,0
2,I am a big fan a Faerie Tale Theatre and I've ...,1
3,I just finished reading a book about Dillinger...,0
4,Greg Davis and Bryan Daly take some crazed sta...,0
5,This really is an incredible film. Not only do...,1
6,"If you lived through the 60s, this film can be...",1
7,As a writer I find films this bad making it in...,0
8,I'm 14 years old and I love this cartoon. Burt...,1
9,This film would usually classify as the worst ...,1


# Making Copies of Both Datasets to Work On

In [4]:
# Work on copies so the frames loaded from disk (df_train / df_test) are not
# modified by the column assignments that follow.
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()

# Text Preprocessing

In [5]:
# Convert a review to lower case
def text_lower(text):
    """Return ``text`` converted to lower case.

    Parameters
    ----------
    text : str
        The review text to normalise.

    Returns
    -------
    str
        The result of ``text.lower()``.
    """
    return text.lower()

In [6]:
# Store a lower-cased version of every review in a new column, keeping the
# original "review" text untouched.
df_train_copy["clean_lowercase"] = df_train_copy["review"].apply(text_lower)

In [7]:
# Check the new "clean_lowercase" column on the first rows.
df_train_copy.head()

Unnamed: 0,review,sentiment,clean_lowercase
0,In Panic In The Streets Richard Widmark plays ...,1,in panic in the streets richard widmark plays ...
1,If you ask me the first one was really better ...,0,if you ask me the first one was really better ...
2,I am a big fan a Faerie Tale Theatre and I've ...,1,i am a big fan a faerie tale theatre and i've ...
3,I just finished reading a book about Dillinger...,0,i just finished reading a book about dillinger...
4,Greg Davis and Bryan Daly take some crazed sta...,0,greg davis and bryan daly take some crazed sta...


In [8]:
# Removing punctuation and special symbols
import re
def remove_p_ss(text):
    """Strip HTML tags, then drop every character that is not a letter or whitespace.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` with each ``<...>`` tag replaced by a single space and all
        characters outside ``a-z``, ``A-Z`` and whitespace removed. Runs of
        whitespace are not collapsed.
    """
    # Replace each tag with a space instead of deleting it: a tag that sits
    # directly between two words (e.g. "good.<br />bad") would otherwise glue
    # them into a single token ("goodbad").
    text = re.sub(r'<.*?>', ' ', text)
    # Remove punctuation, digits and any other non-letter symbols.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

In [9]:
# Apply the tag / punctuation cleanup to the lower-cased text and keep the
# result in its own column.
df_train_copy["clean_lowercase_re"] = df_train_copy["clean_lowercase"].apply(remove_p_ss)

In [10]:
# Check the new "clean_lowercase_re" column on the first rows.
df_train_copy.head()

Unnamed: 0,review,sentiment,clean_lowercase,clean_lowercase_re
0,In Panic In The Streets Richard Widmark plays ...,1,in panic in the streets richard widmark plays ...,in panic in the streets richard widmark plays ...
1,If you ask me the first one was really better ...,0,if you ask me the first one was really better ...,if you ask me the first one was really better ...
2,I am a big fan a Faerie Tale Theatre and I've ...,1,i am a big fan a faerie tale theatre and i've ...,i am a big fan a faerie tale theatre and ive s...
3,I just finished reading a book about Dillinger...,0,i just finished reading a book about dillinger...,i just finished reading a book about dillinger...
4,Greg Davis and Bryan Daly take some crazed sta...,0,greg davis and bryan daly take some crazed sta...,greg davis and bryan daly take some crazed sta...


In [11]:
# Stopwords Removal
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\ABIR
[nltk_data]     GHOSH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# NLTK's English stop-word list, held in a set because remove_stopwords
# tests membership once per token.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Remove stop words from a whitespace-separated string.

    Parameters
    ----------
    text : str
        The text to filter.

    Returns
    -------
    str
        The words of ``text`` whose lower-cased form is not in the
        module-level ``stop_words`` collection, joined by single spaces.
    """
    kept_words = []
    for candidate in text.split():
        # Compare case-insensitively, but keep the word exactly as written.
        if candidate.lower() in stop_words:
            continue
        kept_words.append(candidate)
    return ' '.join(kept_words)

# Filter stop words out of the punctuation-free text.
# NOTE(review): apostrophes were already stripped in the previous step
# (e.g. "i've" became "ive"), so any stop-word entries that contain an
# apostrophe can no longer match here — confirm this ordering is intended.
df_train_copy['no_stopwords'] = df_train_copy['clean_lowercase_re'].apply(remove_stopwords)

In [13]:
# Check the new "no_stopwords" column on the first rows.
df_train_copy.head()

Unnamed: 0,review,sentiment,clean_lowercase,clean_lowercase_re,no_stopwords
0,In Panic In The Streets Richard Widmark plays ...,1,in panic in the streets richard widmark plays ...,in panic in the streets richard widmark plays ...,panic streets richard widmark plays us navy do...
1,If you ask me the first one was really better ...,0,if you ask me the first one was really better ...,if you ask me the first one was really better ...,ask first one really better one look sarah g r...
2,I am a big fan a Faerie Tale Theatre and I've ...,1,i am a big fan a faerie tale theatre and i've ...,i am a big fan a faerie tale theatre and ive s...,big fan faerie tale theatre ive seen one best ...
3,I just finished reading a book about Dillinger...,0,i just finished reading a book about dillinger...,i just finished reading a book about dillinger...,finished reading book dillinger movie horribly...
4,Greg Davis and Bryan Daly take some crazed sta...,0,greg davis and bryan daly take some crazed sta...,greg davis and bryan daly take some crazed sta...,greg davis bryan daly take crazed statements t...


In [14]:
# Tokenization
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to C:\Users\ABIR
[nltk_data]     GHOSH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\ABIR
[nltk_data]     GHOSH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [15]:
# Split the stop-word-free text into a list of tokens with NLTK's word_tokenize.
df_train_copy['tokens'] = df_train_copy['no_stopwords'].apply(word_tokenize)

In [16]:
# Check the new "tokens" column on the first rows.
df_train_copy.head()

Unnamed: 0,review,sentiment,clean_lowercase,clean_lowercase_re,no_stopwords,tokens
0,In Panic In The Streets Richard Widmark plays ...,1,in panic in the streets richard widmark plays ...,in panic in the streets richard widmark plays ...,panic streets richard widmark plays us navy do...,"[panic, streets, richard, widmark, plays, us, ..."
1,If you ask me the first one was really better ...,0,if you ask me the first one was really better ...,if you ask me the first one was really better ...,ask first one really better one look sarah g r...,"[ask, first, one, really, better, one, look, s..."
2,I am a big fan a Faerie Tale Theatre and I've ...,1,i am a big fan a faerie tale theatre and i've ...,i am a big fan a faerie tale theatre and ive s...,big fan faerie tale theatre ive seen one best ...,"[big, fan, faerie, tale, theatre, ive, seen, o..."
3,I just finished reading a book about Dillinger...,0,i just finished reading a book about dillinger...,i just finished reading a book about dillinger...,finished reading book dillinger movie horribly...,"[finished, reading, book, dillinger, movie, ho..."
4,Greg Davis and Bryan Daly take some crazed sta...,0,greg davis and bryan daly take some crazed sta...,greg davis and bryan daly take some crazed sta...,greg davis bryan daly take crazed statements t...,"[greg, davis, bryan, daly, take, crazed, state..."


In [17]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to C:\Users\ABIR
[nltk_data]     GHOSH\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\ABIR
[nltk_data]     GHOSH\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [18]:
# One WordNet lemmatizer instance, created once and reused by nltk_lemmatize
# for every row.
lemmatizer = WordNetLemmatizer()

def nltk_lemmatize(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Parameters
    ----------
    text : str
        The text to lemmatize.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.

    Notes
    -----
    Uses the module-level ``lemmatizer``. No part of speech is passed to
    ``lemmatize``, so the lemmatizer's default applies to every token
    (NOTE(review): the displayed output shows "us" turning into "u" —
    confirm the default is what is wanted).
    """
    words = text.split()
    lemmas = map(lemmatizer.lemmatize, words)
    return ' '.join(lemmas)
# Lemmatize the stop-word-free text (the string column, not the "tokens"
# lists) and store the result as a new string column.
df_train_copy['lemmatized'] = df_train_copy['no_stopwords'].apply(nltk_lemmatize)

In [19]:
# Check the new "lemmatized" column on the first rows.
df_train_copy.head()

Unnamed: 0,review,sentiment,clean_lowercase,clean_lowercase_re,no_stopwords,tokens,lemmatized
0,In Panic In The Streets Richard Widmark plays ...,1,in panic in the streets richard widmark plays ...,in panic in the streets richard widmark plays ...,panic streets richard widmark plays us navy do...,"[panic, streets, richard, widmark, plays, us, ...",panic street richard widmark play u navy docto...
1,If you ask me the first one was really better ...,0,if you ask me the first one was really better ...,if you ask me the first one was really better ...,ask first one really better one look sarah g r...,"[ask, first, one, really, better, one, look, s...",ask first one really better one look sarah g r...
2,I am a big fan a Faerie Tale Theatre and I've ...,1,i am a big fan a faerie tale theatre and i've ...,i am a big fan a faerie tale theatre and ive s...,big fan faerie tale theatre ive seen one best ...,"[big, fan, faerie, tale, theatre, ive, seen, o...",big fan faerie tale theatre ive seen one best ...
3,I just finished reading a book about Dillinger...,0,i just finished reading a book about dillinger...,i just finished reading a book about dillinger...,finished reading book dillinger movie horribly...,"[finished, reading, book, dillinger, movie, ho...",finished reading book dillinger movie horribly...
4,Greg Davis and Bryan Daly take some crazed sta...,0,greg davis and bryan daly take some crazed sta...,greg davis and bryan daly take some crazed sta...,greg davis bryan daly take crazed statements t...,"[greg, davis, bryan, daly, take, crazed, state...",greg davis bryan daly take crazed statement te...


In [20]:
# Tokenize the lemmatized text so there is a token list that matches the
# "lemmatized" column (the earlier "tokens" column holds the un-lemmatized words).
df_train_copy['revised_tokens'] = df_train_copy['lemmatized'].apply(word_tokenize)

In [21]:
# Display the processed training frame with all intermediate columns.
# NOTE(review): this renders the whole frame (truncated by pandas);
# a .head() preview would keep the saved notebook output smaller.
df_train_copy

Unnamed: 0,review,sentiment,clean_lowercase,clean_lowercase_re,no_stopwords,tokens,lemmatized,revised_tokens
0,In Panic In The Streets Richard Widmark plays ...,1,in panic in the streets richard widmark plays ...,in panic in the streets richard widmark plays ...,panic streets richard widmark plays us navy do...,"[panic, streets, richard, widmark, plays, us, ...",panic street richard widmark play u navy docto...,"[panic, street, richard, widmark, play, u, nav..."
1,If you ask me the first one was really better ...,0,if you ask me the first one was really better ...,if you ask me the first one was really better ...,ask first one really better one look sarah g r...,"[ask, first, one, really, better, one, look, s...",ask first one really better one look sarah g r...,"[ask, first, one, really, better, one, look, s..."
2,I am a big fan a Faerie Tale Theatre and I've ...,1,i am a big fan a faerie tale theatre and i've ...,i am a big fan a faerie tale theatre and ive s...,big fan faerie tale theatre ive seen one best ...,"[big, fan, faerie, tale, theatre, ive, seen, o...",big fan faerie tale theatre ive seen one best ...,"[big, fan, faerie, tale, theatre, ive, seen, o..."
3,I just finished reading a book about Dillinger...,0,i just finished reading a book about dillinger...,i just finished reading a book about dillinger...,finished reading book dillinger movie horribly...,"[finished, reading, book, dillinger, movie, ho...",finished reading book dillinger movie horribly...,"[finished, reading, book, dillinger, movie, ho..."
4,Greg Davis and Bryan Daly take some crazed sta...,0,greg davis and bryan daly take some crazed sta...,greg davis and bryan daly take some crazed sta...,greg davis bryan daly take crazed statements t...,"[greg, davis, bryan, daly, take, crazed, state...",greg davis bryan daly take crazed statement te...,"[greg, davis, bryan, daly, take, crazed, state..."
...,...,...,...,...,...,...,...,...
24995,My roommates & I nearly shorted out our TV fro...,0,my roommates & i nearly shorted out our tv fro...,my roommates i nearly shorted out our tv from...,roommates nearly shorted tv numerous spittakes...,"[roommates, nearly, shorted, tv, numerous, spi...",roommate nearly shorted tv numerous spittakes ...,"[roommate, nearly, shorted, tv, numerous, spit..."
24996,Michelle Rodriguez is the defining actress who...,1,michelle rodriguez is the defining actress who...,michelle rodriguez is the defining actress who...,michelle rodriguez defining actress could char...,"[michelle, rodriguez, defining, actress, could...",michelle rodriguez defining actress could char...,"[michelle, rodriguez, defining, actress, could..."
24997,Nice movie with a great soundtrack which spans...,1,nice movie with a great soundtrack which spans...,nice movie with a great soundtrack which spans...,nice movie great soundtrack spans rock landsca...,"[nice, movie, great, soundtrack, spans, rock, ...",nice movie great soundtrack span rock landscap...,"[nice, movie, great, soundtrack, span, rock, l..."
24998,"Even though this was a made-for-TV production,...",0,"even though this was a made-for-tv production,...",even though this was a madefortv production th...,even though madefortv production theres absolu...,"[even, though, madefortv, production, theres, ...",even though madefortv production there absolut...,"[even, though, madefortv, production, there, a..."


In [22]:
# Write the processed training frame, including every intermediate column,
# to a UTF-8 CSV in the working directory without the DataFrame index.
# NOTE(review): "tokens" and "revised_tokens" hold Python lists; presumably
# they are written as their string form and will need parsing when the CSV
# is read back — verify against the consumer of this file.
df_train_copy.to_csv("training_processed_dataset.csv",index=False,encoding="utf-8")

# Repeat the Same Steps for the Test Dataset

In [23]:
# Test set, step 1: lower-case every review.
df_test_copy["clean_lowercase"] = df_test_copy["review"].apply(text_lower)

In [24]:
# Test set, step 2: strip HTML tags, punctuation and special symbols.
df_test_copy["clean_lowercase_re"] = df_test_copy["clean_lowercase"].apply(remove_p_ss)

In [25]:
# Test set, step 3: remove stop words from the cleaned text.
df_test_copy['no_stopwords'] = df_test_copy['clean_lowercase_re'].apply(remove_stopwords)

In [26]:
# Test set, step 4: tokenize the stop-word-free text.
df_test_copy['tokens'] = df_test_copy['no_stopwords'].apply(word_tokenize)

In [27]:
# Test set, step 5: lemmatize the stop-word-free text (string column).
df_test_copy['lemmatized'] = df_test_copy['no_stopwords'].apply(nltk_lemmatize)

In [30]:
# Test set, step 6: tokenize the lemmatized text.
# NOTE(review): the execution count jumps from 27 to 30 at this cell —
# re-run with Restart & Run All to confirm no removed cell is still needed.
df_test_copy['revised_tokens'] = df_test_copy['lemmatized'].apply(word_tokenize)

In [31]:
# Write the processed test frame, including every intermediate column, to a
# UTF-8 CSV in the working directory without the DataFrame index.
# NOTE(review): the list-valued "tokens" / "revised_tokens" columns are
# presumably written as their string form — verify how they are read back.
df_test_copy.to_csv("testing_processed_dataset.csv",index=False,encoding="utf-8")