# 1. Data Exploration and Preprocessing

**A. Load Data and basic exploration**

In [14]:
# Import necessary libraries

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# load the data

df = pd.read_csv("/content/Imdb - data_imdb.csv")

# ccheck shape and first rows

print(df.shape)
print(df.head())





(50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [9]:
# CHecking for missing values
print(df.isnull().sum())

#check class balance
print(df['sentiment'].value_counts())

#check review lengths
df['review_length'] = df['review'].apply(len)
df['review_length'].describe()

review       0
sentiment    0
dtype: int64
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


Unnamed: 0,review_length
count,50000.0
mean,1309.36772
std,989.759532
min,7.0
25%,699.0
50%,970.0
75%,1590.0
max,13704.0


**2. Data Cleaning and Text Preprocessing**

**Steps:**

*    Lowercasing
*    Remove punctutation,numbers,and special characters
*    Remove stop words
*    Tokenization
*    Lemmatization


In [10]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [15]:
def clean_text (text):
  text = text.lower()
  text = re.sub(r'[^a-z\s]',"",text)
  tokens = nltk.word_tokenize(text)
  tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
  return " ".join(tokens)

#apply clean text on df

df['clean_review'] = df['review'].apply(clean_text)


**3.Feature Engineering**

Transforming text into vectors using TF-IDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer



In [17]:
tfidf = TfidfVectorizer(max_features=1000)
x_tfidf = tfidf.fit_transform(df['clean_review'])

**Additional Textual Feature:**

*     Word Count
*     Character Count
*     Average word length




In [21]:
df['word_count'] = df['clean_review'].apply(lambda x: len(x.split()))

df['char_count'] = df['clean_review'].apply(len)

df['avg_word_length'] = df['char_count'] / df['word_count']

print("Word Count:")
print(df['word_count'].describe())

print("\nCharacter Count:")
print(df['char_count'].describe())

print("\nAverage Word Length:")
print(df['avg_word_length'].describe())

Word Count:
count    50000.000000
mean       121.531960
std         91.573864
min          1.000000
25%         65.000000
50%         90.000000
75%        148.000000
max       1440.000000
Name: word_count, dtype: float64

Character Count:
count    50000.000000
mean       827.739820
std        638.456818
min          5.000000
25%        433.000000
50%        608.000000
75%       1008.000000
max       9243.000000
Name: char_count, dtype: float64

Average Word Length:
count    50000.000000
mean         6.762380
std          0.448175
min          5.000000
25%          6.461538
50%          6.746988
75%          7.045455
max         16.131579
Name: avg_word_length, dtype: float64


**3. Model Development**

Label Encoding



In [22]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y= le.fit_transform(df['sentiment'])  #0 = negative,1= positive




Train - Test Split

In [23]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x_tfidf,y,test_size=0.21,random_state=42)

**Models to try:**

*    Logistic Regression
*    Naive Bayes
*    Support Vector Machine
*    Random Forest


**Logistic Regression:**

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression(max_iter=1000)

lr.fit(x_train,y_train)

y_pred_lr = lr.predict(x_test)
print(classification_report(y_test,y_pred_lr))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86      5201
           1       0.86      0.87      0.86      5299

    accuracy                           0.86     10500
   macro avg       0.86      0.86      0.86     10500
weighted avg       0.86      0.86      0.86     10500



**Naive Bayes:**

In [26]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

nb.fit(x_train,y_train)

y_pred_nb = nb.predict(x_test)
print(classification_report(y_test,y_pred_nb))

              precision    recall  f1-score   support

           0       0.84      0.82      0.83      5201
           1       0.83      0.84      0.84      5299

    accuracy                           0.83     10500
   macro avg       0.83      0.83      0.83     10500
weighted avg       0.83      0.83      0.83     10500



**Support Vector Machine:**

In [27]:
from sklearn.svm import LinearSVC
svc = LinearSVC()

svc.fit(x_train,y_train)

y_pred_svm = svc.predict(x_test)

print(classification_report(y_test,y_pred_svm))

              precision    recall  f1-score   support

           0       0.86      0.85      0.86      5201
           1       0.86      0.87      0.86      5299

    accuracy                           0.86     10500
   macro avg       0.86      0.86      0.86     10500
weighted avg       0.86      0.86      0.86     10500



**Random Forest:**

In [29]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200)

rf.fit(x_train,y_train)

y_pred_rf = rf.predict(x_test)

print(classification_report(y_test,y_pred_rf))


              precision    recall  f1-score   support

           0       0.83      0.84      0.84      5201
           1       0.84      0.84      0.84      5299

    accuracy                           0.84     10500
   macro avg       0.84      0.84      0.84     10500
weighted avg       0.84      0.84      0.84     10500

