In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset=pd.read_csv("IMDB Dataset.csv")

In [3]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
dataset["review"] = dataset["review"].astype(str)  # Convert object to string


In [5]:
dataset["review"].replace("", np.nan, inplace=True)  # Convert empty strings to NaN
dataset["review"].fillna("No review", inplace=True)  # Replace NaN with a default text

In [6]:
dataset.shape

(50000, 2)

In [7]:
dataset["review"][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

for applying in whole dataset

In [8]:
dataset["review"]=dataset["review"].str.lower().head()

In [9]:
dataset.head(2)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive


Removing HTML Tags

In [10]:
#This function can be found on internet
import re

def remove_html_tags(text):
    # Ensure the text is a string
    if isinstance(text, str):
        pattern = re.compile("<.*?>")
        return pattern.sub(r"", text)
    else:
        return text  # return as-is if it's not a string (like NaN or float)


In [11]:
dataset["review"]=dataset["review"].apply(remove_html_tags)

In [12]:
dataset["review"][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

Removing Punctuation

In [13]:
import string

In [14]:
exclude=string.punctuation

In [15]:
def remove_punc(text):
    for char in exclude:
        text=text.replace(char,"")
    return text

In [16]:
text="Hi, What is your name?"

In [17]:
remove_punc(text)

'Hi What is your name'

In [18]:
#The way upp is very slow so we use another method which is faster
def remove_punc1(text):
    exclude = string.punctuation  # Punctuation to remove
    return str(text).translate(str.maketrans("", "", exclude))

In [19]:
remove_punc1(text)

'Hi What is your name'

In [20]:
#now in dataset
dataset["review"].apply(remove_punc1)

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995                                                  nan
49996                                                  nan
49997                                                  nan
49998                                                  nan
49999                                                  nan
Name: review, Length: 50000, dtype: object

In [21]:
dataset["review"][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

Removing stop words

In [22]:
from nltk.corpus import stopwords

In [23]:
stop_words=stopwords.words("english")

In [24]:
def remove_stopwords(text):
   
    clear_text=[]
    impure_text=[]
    if not isinstance(text, str):  # Check if text is not a string
        return text  # Return the original value (useful for NaNs or numbers)

    for word in text.split():
        if word not in stop_words:
            clear_text.append(word)
        else:
            impure_text.append(word)
    return clear_text

In [25]:
x="a the politics"
remove_stopwords(x)

['politics']

In [26]:
dataset["review"].apply(remove_stopwords)

0        [one, reviewers, mentioned, watching, 1, oz, e...
1        [wonderful, little, production., filming, tech...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, there's, family, little, boy, (jak...
4        [petter, mattei's, "love, time, money", visual...
                               ...                        
49995                                                  NaN
49996                                                  NaN
49997                                                  NaN
49998                                                  NaN
49999                                                  NaN
Name: review, Length: 50000, dtype: object

Tokenization

In [27]:
from nltk.tokenize import word_tokenize

In [28]:
def tokenize(text):
    if pd.isna(text):  # Check if text is NaN or None
        return []  # Return an empty list instead of processing NaN
    if not isinstance(text, str):  # Check if text is not a string
        return word_tokenize(text)  # Return the original value (useful for NaNs or numbers)
    for word in text:
        if word in text:
            return word_tokenize(text)
        else:
            return text

     # Return the original value (useful for NaNs or numbers)
    


In [29]:
tokenize("I am a human being")

['I', 'am', 'a', 'human', 'being']

In [30]:
dataset["review"].apply(tokenize)

0        [one, of, the, other, reviewers, has, mentione...
1        [a, wonderful, little, production, ., the, fil...
2        [i, thought, this, was, a, wonderful, way, to,...
3        [basically, there, 's, a, family, where, a, li...
4        [petter, mattei, 's, ``, love, in, the, time, ...
                               ...                        
49995                                                   []
49996                                                   []
49997                                                   []
49998                                                   []
49999                                                   []
Name: review, Length: 50000, dtype: object