In [2]:
import pandas as pd

In [3]:
# Read the IMDB reviews CSV from the notebook's working directory.
df = pd.read_csv("IMDB Dataset.csv")
# Column-level summary (count / unique / top / freq for the text columns);
# df.head() or df.info() are alternative quick looks.
column_summary = df.describe()
print(column_summary)

                                                   review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000


In [4]:
# df[['review','sentiment']]

In [5]:
# Distinct label values present in the sentiment column.
print(pd.unique(df['sentiment']))

['positive' 'negative']


In [6]:
# Rows per label; equal counts mean the classes are balanced.
class_counts = df['sentiment'].value_counts()
print(class_counts)

sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [7]:
# Number of non-null entries in each column.
non_null_counts = df.count()
print(non_null_counts)

review       50000
sentiment    50000
dtype: int64


In [8]:
# Missing values per column; all zeros means the dataset has no missing values.
df.isna().sum()

review       0
sentiment    0
dtype: int64

Convert the sentiment labels to 1 and 0:
- positive → 1
- negative → 0

In [9]:
# Encode the labels numerically: positive -> 1, negative -> 0.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run: with only
# the two string keys, a second execution would find no match for the
# already-encoded values and replace the whole column with NaN.
label_to_int = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
df['sentiment'] = df['sentiment'].map(label_to_int)

In [10]:
# Confirm the column now holds only the numeric codes.
encoded_labels = df['sentiment'].unique()
print(encoded_labels)

[1 0]


In [11]:
# Per-class row counts after encoding.
encoded_counts = df['sentiment'].value_counts()
print(encoded_counts)

sentiment
1    25000
0    25000
Name: count, dtype: int64


In [12]:
# First five rows, showing the review text alongside its numeric label.
preview = df.head(n=5)
print(preview)

                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1


# Text Preprocessing (Cleaning the Reviews)

In [13]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: the stop-word list, the tokenizer
# models, and the WordNet data behind the lemmatizer.
# NOTE(review): recent NLTK releases may look the tokenizer up under
# 'punkt_tab' instead of 'punkt' — confirm against the installed version.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Built once here and reused by preprocess_text for every review.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# preprocess text
def preprocess_text(text):
    """Clean one raw review into a space-separated string of lemmatized tokens.

    Steps, in order: strip HTML tags, delete every character that is not an
    ASCII letter or whitespace, lowercase, tokenize, drop English stop words,
    and lemmatize the remaining tokens.

    Args:
        text: Raw review string; may contain markup such as ``<br />``.

    Returns:
        The surviving lemmas joined by single spaces (empty string if none).
    """
    # Replace tags with a space, not nothing: in "great.<br /><br />The" an
    # empty replacement fuses the neighbours into the bogus token "greatthe".
    text = re.sub(r'<.*?>', ' ', text)

    # NOTE(review): punctuation is deleted rather than spaced, so contractions
    # collapse (e.g. "there's" -> "theres") — confirm this is intended.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Lowercase before filtering so the stop-word membership test can match.
    text = text.lower()

    words = word_tokenize(text)

    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(words)

# Clean every review and store the result next to the original text.
cleaned_reviews = df['review'].apply(preprocess_text)
df['filtered_review'] = cleaned_reviews

# Preview raw vs. cleaned text for the first rows.
print(df.loc[:, ['review', 'filtered_review']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                     filtered_review  
0  one reviewer mentioned watching oz episode you...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically there family little boy jake think t...  
4  petter matteis love time money visually stunni...  


In [14]:
# Save the frame (review, numeric sentiment, cleaned text) to CSV without the row index.
output_csv = 'modified_dataset_for_ML.csv'
df.to_csv(output_csv, index=False)

In [15]:
# Show the first five rows of the final frame.
df.head(n=5)

Unnamed: 0,review,sentiment,filtered_review
0,One of the other reviewers has mentioned that ...,1,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stunni...
