In [23]:
import pandas as pd

dataset = pd.read_csv('IMDB Dataset.csv')
df = dataset
positive = df[df["sentiment"] == "positive"].head(50)
negative = df[df["sentiment"] == "negative"].head(50)
df = pd.concat([positive, negative], ignore_index=True)

testing = dataset.drop(positive.index.union(negative.index))
print(f"000TRAINING000:\n{df}\n\n")
print(f"000TESTING000:\n{testing}")

000TRAINING000:
                                               review sentiment
0   One of the other reviewers has mentioned that ...  positive
1   A wonderful little production. <br /><br />The...  positive
2   I thought this was a wonderful way to spend ti...  positive
3   Petter Mattei's "Love in the Time of Money" is...  positive
4   Probably my all-time favorite movie, a story o...  positive
..                                                ...       ...
95  Oh God, I must have seen this when I was only ...  negative
96  "Fate" leads Walter Sparrow to come in possess...  negative
97  We brought this film as a joke for a friend, a...  negative
98  This was probably the worst movie i have ever ...  negative
99  This is a typical Steele novel production in t...  negative

[100 rows x 2 columns]


000TESTING000:
                                                  review sentiment
87     Oh noes one of these attack of the Japanese gh...  negative
88     Nicholas Walker is Paul, the local

In [4]:
df["sentiment_label"] = df["sentiment"].map({"positive": 1, "negative": 0})
df

Unnamed: 0,review,sentiment,sentiment_label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
4,"Probably my all-time favorite movie, a story o...",positive,1
...,...,...,...
95,"Oh God, I must have seen this when I was only ...",negative,0
96,"""Fate"" leads Walter Sparrow to come in possess...",negative,0
97,"We brought this film as a joke for a friend, a...",negative,0
98,This was probably the worst movie i have ever ...,negative,0


In [5]:
import nltk
from nltk.corpus import stopwords

nltk.download('opinion_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

def preprocess_text(text: str) -> tuple[str, int]:
    text = text.lower()

    tokens = nltk.word_tokenize(text=text)

    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]

    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    cleaned_text = " ".join(tokens)
    return cleaned_text, len(tokens)

df[["clean_review", "num_tokens"]] = pd.DataFrame(
    df["review"].apply(preprocess_text).to_list(),
    index=df.index,
)

df

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /home/shoto/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/shoto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/shoto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/shoto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/shoto/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,review,sentiment,sentiment_label,clean_review,num_tokens
0,One of the other reviewers has mentioned that ...,positive,1,one reviewer mentioned watching oz episode hoo...,168
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production br br filming tech...,86
2,I thought this was a wonderful way to spend ti...,positive,1,thought wonderful way spend time hot summer we...,84
3,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter mattei love time money visually stunnin...,127
4,"Probably my all-time favorite movie, a story o...",positive,1,probably favorite movie story selflessness sac...,52
...,...,...,...,...,...
95,"Oh God, I must have seen this when I was only ...",negative,0,oh god must seen twelve ask may young stupid a...,136
96,"""Fate"" leads Walter Sparrow to come in possess...",negative,0,fate lead walter sparrow come possession myste...,218
97,"We brought this film as a joke for a friend, a...",negative,0,brought film joke friend could worst joke play...,35
98,This was probably the worst movie i have ever ...,negative,0,probably worst movie ever seen life stupid plo...,66


In [6]:
from nltk.corpus import opinion_lexicon

positive_words = opinion_lexicon.positive()
negative_words = opinion_lexicon.negative()

print(f"Lexicon with {len(positive_words)} positive and {len(negative_words)} negative words")

Lexicon with 2006 positive and 4783 negative words


In [7]:
def count_pos_neg(text: str) -> pd.Series:
    words = text.split()
    nwords = [w for w in words if w in negative_words]
    pwords = [w for w in words if w in positive_words]
    return pd.Series({
        "pwords": pwords,
        "nwords": nwords,
        "pwords_count": len(pwords),
        "nwords_count": len(nwords),
    })

df[["pwords", "nwords", "pwords_count", "nwords_count"]] = df["clean_review"].apply(count_pos_neg)

In [8]:
df

Unnamed: 0,review,sentiment,sentiment_label,clean_review,num_tokens,pwords,nwords,pwords_count,nwords_count
0,One of the other reviewers has mentioned that ...,positive,1,one reviewer mentioned watching oz episode hoo...,168,"[right, right, trust, regard, classic, appeal,...","[struck, brutality, faint, timid, punch, priso...",13,20
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production br br filming tech...,86,"[wonderful, comforting, well, seamless, well, ...",[terribly],11,1
2,I thought this was a wonderful way to spend ti...,positive,1,thought wonderful way spend time hot summer we...,84,"[wonderful, hot, witty, likable, well, impress...","[plot, simplistic, killer, disappointed, risk,...",11,6
3,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter mattei love time money visually stunnin...,127,"[love, stunning, vivid, success, stylishly, so...","[loneliness, anxiously]",15,2
4,"Probably my all-time favorite movie, a story o...",positive,1,probably favorite movie story selflessness sac...,52,"[favorite, noble, delight, like, fun, believable]","[boring, midget, slow, startling]",6,4
...,...,...,...,...,...,...,...,...,...
95,"Oh God, I must have seen this when I was only ...",negative,0,oh god must seen twelve ask may young stupid a...,136,"[impressive, better, convincing, ambitious, be...","[stupid, bad, nasty, gross, silly, shark, dang...",7,15
96,"""Fate"" leads Walter Sparrow to come in possess...",negative,0,fate lead walter sparrow come possession myste...,218,"[lead, intriguing, pretty, like, good, pretty,...","[mysterious, fiction, undone, weak, failed, di...",16,29
97,"We brought this film as a joke for a friend, a...",negative,0,brought film joke friend could worst joke play...,35,[],"[joke, worst, joke, dire, worst, substandard, ...",0,12
98,This was probably the worst movie i have ever ...,negative,0,probably worst movie ever seen life stupid plo...,66,"[good, hot, like]","[worst, stupid, plot, ridiculous, bad, sick, t...",3,9


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.hidden = nn.Linear(2, 2)
        self.output = nn.Linear(2, 1)

    
    def forward(self, x):
        x = F.relu(self.hidden(x))
        x = F.sigmoid(self.output(x))
        return x

model = MLP()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:

X = df[['pwords_count', 'nwords_count']].values
Y = df['sentiment_label'].values

X = torch.tensor(X, dtype=torch.float32)
Y = torch.tensor(Y, dtype=torch.float32).unsqueeze(1)

In [11]:
def train_model(model, criterion, optimizer, X, Y, epochs=10000):
    for epoch in range(epochs):
        output = model(X)
        loss = criterion(output, Y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (epoch + 1) % int(epochs / 10) == 0:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

train_model(model, criterion, optimizer, X, Y)

Epoch 1000/10000, Loss: 0.5380
Epoch 2000/10000, Loss: 0.5299
Epoch 3000/10000, Loss: 0.5263
Epoch 4000/10000, Loss: 0.5247
Epoch 5000/10000, Loss: 0.5232
Epoch 6000/10000, Loss: 0.5217
Epoch 7000/10000, Loss: 0.5204
Epoch 8000/10000, Loss: 0.5192
Epoch 9000/10000, Loss: 0.5181
Epoch 10000/10000, Loss: 0.5173


In [12]:
testing = testing.head(100)
testing["sentiment_label"] = testing["sentiment"].map({"positive": 1, "negative": 0})

testing[["clean_review","num_tokens"]] = pd.DataFrame(
    testing["review"].apply(preprocess_text).to_list(),
    index=testing.index
)

In [13]:
testing

Unnamed: 0,review,sentiment,sentiment_label,clean_review,num_tokens
87,Oh noes one of these attack of the Japanese gh...,negative,0,oh no one attack japanese ghost girl movie eve...,65
88,"Nicholas Walker is Paul, the local town Revera...",negative,0,nicholas walker paul local town reverand marri...,110
89,Hollywood movie industry is the laziest one in...,negative,0,hollywood movie industry laziest one entire wo...,194
91,"If you came here, it's because you've already ...",negative,0,came already seen film curious others say br b...,81
94,"I watched this series out of curiosity,wanting...",negative,0,watched series curiosity wanting see could pos...,72
...,...,...,...,...,...
195,Phantasm ....Class. Phantasm II.....awesome. P...,negative,0,phantasm class phantasm ii awesome phantasm ii...,97
196,Ludicrous. Angelic 9-year-old Annakin turns in...,negative,0,ludicrous angelic annakin turn whiny brat anna...,107
197,"Scotty (Grant Cramer, who would go on to star ...",negative,0,scotty grant cramer would go star great killer...,95
198,If you keep rigid historical perspective out o...,positive,1,keep rigid historical perspective film actuall...,262


In [14]:
testing[["pwords", "nwords", "pwords_count", "nwords_count"]] = testing["clean_review"].apply(count_pos_neg)

In [15]:
testing

Unnamed: 0,review,sentiment,sentiment_label,clean_review,num_tokens,pwords,nwords,pwords_count,nwords_count
87,Oh noes one of these attack of the Japanese gh...,negative,0,oh no one attack japanese ghost girl movie eve...,65,[],"[attack, scary, sucked, lacked, worse]",0,5
88,"Nicholas Walker is Paul, the local town Revera...",negative,0,nicholas walker paul local town reverand marri...,110,"[good, pretty, good, like]","[womanizer, fake, death, crazy, misery, sad, w...",4,12
89,Hollywood movie industry is the laziest one in...,negative,0,hollywood movie industry laziest one entire wo...,194,"[fine, like, good, nice, glad, good, like, eas...","[crap, worst, dark, pointless, grudge, bad, sc...",8,25
91,"If you came here, it's because you've already ...",negative,0,came already seen film curious others say br b...,81,"[cure, like, good, positive]","[bleeding, stupid, ripoff, stupid, distraction...",4,7
94,"I watched this series out of curiosity,wanting...",negative,0,watched series curiosity wanting see could pos...,72,"[modern, classic, correct, spectacular, blockb...","[burning, drowning, trouble]",6,3
...,...,...,...,...,...,...,...,...,...
195,Phantasm ....Class. Phantasm II.....awesome. P...,negative,0,phantasm class phantasm ii awesome phantasm ii...,97,"[awesome, love, work, good, like]","[bad, pointless, crack, funny, dead, zombie, h...",5,11
196,Ludicrous. Angelic 9-year-old Annakin turns in...,negative,0,ludicrous angelic annakin turn whiny brat anna...,107,"[angelic, appealing, hero, precious, like, fan...","[ludicrous, whiny, brat, bad, crazy, die, ridi...",12,13
197,"Scotty (Grant Cramer, who would go on to star ...",negative,0,scotty grant cramer would go star great killer...,95,"[great, good, likable, lead, well, pretty]","[killer, bad, bad, bad, bad, limited, fat, fat...",6,9
198,If you keep rigid historical perspective out o...,positive,1,keep rigid historical perspective film actuall...,262,"[entertaining, lead, enjoy, work, prominence, ...","[rigid, fiction, hard, unresolved, quandary, j...",19,13


In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

X_t = testing[['pwords_count', 'nwords_count']].values
Y_t = testing['sentiment_label'].values

X_t = torch.tensor(X_t, dtype=torch.float32)
Y_t = torch.tensor(Y_t, dtype=torch.float32).unsqueeze(1)

model.eval()
with torch.no_grad():
    Y_pred = model(X)

for i, pred in enumerate(Y_pred):
    print(f"predicted {round(float(pred))} actual {int(Y_t[i])}")

y_true = Y_t.numpy().flatten()
y_pred = torch.round(Y_pred).numpy().flatten()

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
cm = confusion_matrix(y_true, y_pred)

print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("\nConfusion Matrix:")
print(cm)

predicted 0 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 0 actual 1
predicted 0 actual 1
predicted 0 actual 1
predicted 0 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 1
predicted 1 actual 1
predicted 1 actual 0
predicted 0 actual 0
predicted 0 actual 1
predicted 0 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 1
predicted 1 actual 1
predicted 0 actual 1
predicted 0 actual 1
predicted 1 actual 0
predicted 1 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 1 actual 1
predicted 1 actual 0
predicted 1 actual 1
predicted 1 actual 0
predicted 0 actual 0
predicted 1 actual 0
predicted 0 actual 0
predicted 1 actual 1
predicted 0 actual 0
predicted 1 actual 1
predicted 1 actual 1
predicted 0 a

In [25]:
model2 = MLP()
criterion2 = nn.BCEWithLogitsLoss()
optimizer2 = torch.optim.Adam(model2.parameters(), lr=0.001)

train_model(model2, criterion2, optimizer2, X, Y, epochs=100000)

Epoch 10000/100000, Loss: 0.6162
Epoch 20000/100000, Loss: 0.6130
Epoch 30000/100000, Loss: 0.6130
Epoch 40000/100000, Loss: 0.6130
Epoch 50000/100000, Loss: 0.6130
Epoch 60000/100000, Loss: 0.6130
Epoch 70000/100000, Loss: 0.6130
Epoch 80000/100000, Loss: 0.6130
Epoch 90000/100000, Loss: 0.6130
Epoch 100000/100000, Loss: 0.6130


In [27]:
model2.eval()
with torch.no_grad():
    Y_pred = model2(X_t)

for i, pred in enumerate(Y_pred):
    print(f"predicted {round(float(pred))} actual {int(Y_t[i])}")

y_true = Y_t.numpy().flatten()
y_pred = torch.round(Y_pred).numpy().flatten()

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
cm = confusion_matrix(y_true, y_pred)

print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("\nConfusion Matrix:")
print(cm)

predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 1
predicted 0 actual 1
predicted 1 actual 1
predicted 0 actual 0
predicted 0 actual 0
predicted 1 actual 0
predicted 0 actual 1
predicted 1 actual 1
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 1
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 1 actual 1
predicted 0 actual 1
predicted 1 actual 1
predicted 0 actual 1
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 1 actual 0
predicted 0 actual 1
predicted 0 actual 0
predicted 1 actual 1
predicted 1 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 1 actual 1
predicted 0 actual 0
predicted 1 actual 1
predicted 0 actual 1
predicted 1 a

In [29]:
model3 = MLP()
criterion3 = nn.BCEWithLogitsLoss()
optimizer3 = torch.optim.SGD(model3.parameters(), lr=0.001)

train_model(model3, criterion3, optimizer3, X, Y)

model3.eval()
with torch.no_grad():
    Y_pred = model3(X_t)

for i, pred in enumerate(Y_pred):
    print(f"predicted {round(float(pred))} actual {int(Y_t[i])}")

y_true = Y_t.numpy().flatten()
y_pred = torch.round(Y_pred).numpy().flatten()

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
cm = confusion_matrix(y_true, y_pred)

print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("\nConfusion Matrix:")
print(cm)

Epoch 1000/10000, Loss: 0.7337
Epoch 2000/10000, Loss: 0.7255
Epoch 3000/10000, Loss: 0.6926
Epoch 4000/10000, Loss: 0.6754
Epoch 5000/10000, Loss: 0.6678
Epoch 6000/10000, Loss: 0.6645
Epoch 7000/10000, Loss: 0.6625
Epoch 8000/10000, Loss: 0.6612
Epoch 9000/10000, Loss: 0.6601
Epoch 10000/10000, Loss: 0.6593
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 1
predicted 0 actual 1
predicted 0 actual 1
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 1
predicted 0 actual 1
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 1
predicted 0 actual 0
predicted 0 actual 0
predicted 0 actual 0
predicted 1 actual 1
predicted 0 actual 1
predicted 1 actual 1
predicted 0 actual 1
predicted 0 actua

In [30]:
model3.eval()
with torch.no_grad():
    Y_pred = model3(X_t)

y_true = Y_t.numpy().flatten()
y_pred = torch.round(Y_pred).numpy().flatten()

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
cm = confusion_matrix(y_true, y_pred)

print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("\nConfusion Matrix:")
print(cm)


Accuracy:  0.7200
Precision: 0.8889
Recall:    0.2286
F1 Score:  0.3636

Confusion Matrix:
[[64  1]
 [27  8]]
