In [3]:
import numpy as np
import pandas as pd

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pradyumnsrivast/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Data Ingestion**

In [4]:
!pwd

/mnt/c/Users/pradyumn.srivast/Desktop/AI/GenAI/genai-bert


In [6]:
data = pd.read_csv("./data/IMDB Dataset.csv")
data = data[:10000]
print(data.shape)

(10000, 2)


In [7]:
data.sample(3)

Unnamed: 0,review,sentiment
4821,"Sammi, Curr a metal rock god, they tried to st...",positive
7238,Just picked this up on DVD and watched it agai...,positive
5103,How can a movie be both controversial and gent...,positive


In [8]:
data['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [9]:
data['sentiment'].value_counts(normalize = True)

sentiment
positive    0.5028
negative    0.4972
Name: proportion, dtype: float64

In [10]:
data.duplicated().any()

np.True_

In [11]:
data = data.drop_duplicates()
data.duplicated().any()

np.False_

In [12]:
data.shape

(9983, 2)

**Cleaning the reviews**

In [13]:
data_org = data.copy()

In [14]:
def RemoveUrl(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', str(text))

data['review'] = data['review'].map(lambda x: RemoveUrl(x))

In [15]:
stop = stopwords.words('english')
data['review'] = data['review'].apply(lambda x: ' '.join(x.lower() for x in x.split() if x not in stop))

In [16]:
def RemoveOneTwoLetter(text):
    txt1 = re.sub(r'\b\w{1,2}\b', '', text)
    txt2 = re.sub(' +', ' ', txt1)
    return txt2

def RemoveDigits(text):
    txt1 = re.sub(r'\d+', '', text)
    txt2 = re.sub(' +', ' ', txt1)
    return txt2    

def RemoveTags(text):
    txt1 = re.sub(re.compile('<.*?>'), '', text)
    return txt1

def RemoveSpecialCharacters(text):
    sp = re.compile(r'''[.,:;"'\(\)\{\}\[\]\-\_]''')
    txt1 = re.sub(sp, '', text)
    return txt1

def RemoveExtraWhitespace(text):
    return re.sub(r'\s+', ' ', text)

In [17]:
data['review'] = data['review'].map(lambda x: RemoveDigits(x))
data['review'] = data['review'].map(lambda x: RemoveOneTwoLetter(x))
data['review'] = data['review'].map(lambda x: RemoveTags(x))
data['review'] = data['review'].map(lambda x: RemoveSpecialCharacters(x))
data['review'] = data['review'].map(lambda x: RemoveExtraWhitespace(x))

In [18]:
data_org['review'][100]

"This short film that inspired the soon-to-be full length feature - Spatula Madness - is a hilarious piece that contends against similar cartoons yielding multiple writers. The short film stars Edward the Spatula who after being fired from his job, joins in the fight against the evil spoons. This premise allows for some funny content near the beginning, but is barely present for the remainder of the feature. This film's 15-minute running time is absorbed by some odd-ball comedy and a small musical number. Unfortunately not much else lies below it. The plot that is set up doesn't really have time to show. But it's surely follows it plot better than many high-budget Hollywood films. This film is worth watching at least a few times. Take it for what it is, and don't expect a deep story."

In [19]:
data['review'][100]

'this short film inspired soon full length feature spatula madness hilarious piece contends similar cartoons yielding multiple writers the short film stars edward spatula fired job joins fight evil spoons this premise allows funny content near beginning barely present remainder feature this film minute running time absorbed oddball comedy small musical number unfortunately much else lies the plot set really time show but surely follows plot better many highbudget hollywood films this film worth watching least times take expect deep story'

In [20]:
data['review'] = data['review'].apply(lambda x:x.lower())

**Split the data**

In [21]:
X = data.iloc[:,0:1]
y = data['sentiment']

In [22]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2025)

In [24]:
print(X_train.shape, X_test.shape)

(7986, 1) (1997, 1)


In [25]:
sum(y_train), sum(y_test)

(np.int64(4043), np.int64(980))

In [27]:
4043/7986, 980/1997

(0.5062609566741798, 0.49073610415623437)

**Vector Embeddings (sparse vector)**

**1. CountVectorizer (BOW)**

In [28]:
cv = CountVectorizer(max_features = 2500, analyzer = 'word')
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [29]:
X_train_bow.shape, X_test_bow.shape # (rows, vocabulary)

((7986, 2500), (1997, 2500))

In [30]:
# cv.vocabulary_

In [31]:
print(len(X_train_bow[0]))

2500


In [33]:
print(sum(X_train_bow[0])) # only 351 out of 2500 dimensions have 1 (very sparse embeddings)

351


**Model Building**

In [93]:
LR = LogisticRegression(random_state = 2025, solver = 'liblinear', verbose = 2, C = 0.01)
LR

In [94]:
LR.fit(X_train_bow,y_train)

[LibLinear]iter  1 act 2.112e+01 pre 1.873e+01 delta 2.703e+00 f 5.535e+01 |g| 2.487e+01 CG   8
iter  2 act 2.442e+00 pre 2.144e+00 delta 2.703e+00 f 3.423e+01 |g| 6.725e+00 CG   7
iter  3 act 1.806e-01 pre 1.732e-01 delta 2.703e+00 f 3.179e+01 |g| 1.679e+00 CG   7
iter  4 act 2.715e-03 pre 2.711e-03 delta 2.703e+00 f 3.161e+01 |g| 1.933e-01 CG   8
iter  5 act 2.012e-05 pre 2.011e-05 delta 2.703e+00 f 3.161e+01 |g| 1.291e-02 CG   9


In [95]:
y_pred_train = LR.predict(X_train_bow)
accuracy_score(y_train,y_pred_train)

0.8989481592787378

In [96]:
y_pred_test = LR.predict(X_test_bow)
accuracy_score(y_test,y_pred_test)

0.8602904356534802

**2. TF-IDF**

In [97]:
tfidf = TfidfVectorizer(max_features = 3000, analyzer = 'word', ngram_range = (1,2))
X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review']).toarray()

In [98]:
X_train_tfidf.shape, X_test_tfidf.shape # (rows, vocabulary (including ngrams))

((7986, 3000), (1997, 3000))

**Model Building**

In [101]:
LR = LogisticRegression(random_state = 2025, solver = 'liblinear', verbose = 2, C = 0.01)
LR

In [102]:
LR.fit(X_train_tfidf,y_train)

[LibLinear]iter  1 act 1.488e+00 pre 1.488e+00 delta 1.655e+00 f 5.535e+01 |g| 1.851e+00 CG   2
iter  2 act 1.508e-03 pre 1.508e-03 delta 1.655e+00 f 5.387e+01 |g| 5.752e-02 CG   2
iter  3 act 4.347e-06 pre 4.347e-06 delta 1.655e+00 f 5.386e+01 |g| 3.123e-03 CG   2
iter  4 act 1.457e-08 pre 1.457e-08 delta 1.655e+00 f 5.386e+01 |g| 1.783e-04 CG   2


In [103]:
y_pred_train = LR.predict(X_train_tfidf)
accuracy_score(y_train,y_pred_train)

0.8293263210618582

In [104]:
y_pred_test = LR.predict(X_test_tfidf)
accuracy_score(y_test,y_pred_test)

0.8057085628442664