In [12]:
import pandas as pd

df = pd.read_csv('IMDB Dataset.csv')
positive = df[df["sentiment"] == "positive"].head(50)
negative = df[df["sentiment"] == "negative"].head(50)
df = pd.concat([positive, negative], ignore_index=True)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,"Petter Mattei's ""Love in the Time of Money"" is...",positive
4,"Probably my all-time favorite movie, a story o...",positive
...,...,...
95,"Oh God, I must have seen this when I was only ...",negative
96,"""Fate"" leads Walter Sparrow to come in possess...",negative
97,"We brought this film as a joke for a friend, a...",negative
98,This was probably the worst movie i have ever ...,negative


In [13]:
df["sentiment_label"] = df["sentiment"].map({"positive": 1, "negative": 0})
df

Unnamed: 0,review,sentiment,sentiment_label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
4,"Probably my all-time favorite movie, a story o...",positive,1
...,...,...,...
95,"Oh God, I must have seen this when I was only ...",negative,0
96,"""Fate"" leads Walter Sparrow to come in possess...",negative,0
97,"We brought this film as a joke for a friend, a...",negative,0
98,This was probably the worst movie i have ever ...,negative,0


In [16]:
import nltk
from nltk.corpus import stopwords

nltk.download('opinion_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

def preprocess_text(text: str) -> tuple[str, int]:
    text = text.lower()

    tokens = nltk.word_tokenize(text=text)

    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]

    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    cleaned_text = " ".join(tokens)
    return cleaned_text, len(tokens)

df[["clean_review", "num_tokens"]] = pd.DataFrame(
    df["review"].apply(preprocess_text).to_list(),
    index=df.index,
) 

df

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /home/shoto/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/shoto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/shoto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/shoto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/shoto/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,review,sentiment,sentiment_label,clean_review,num_tokens
0,One of the other reviewers has mentioned that ...,positive,1,one reviewer mentioned watching oz episode hoo...,168
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production br br filming tech...,86
2,I thought this was a wonderful way to spend ti...,positive,1,thought wonderful way spend time hot summer we...,84
3,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter mattei love time money visually stunnin...,127
4,"Probably my all-time favorite movie, a story o...",positive,1,probably favorite movie story selflessness sac...,52
...,...,...,...,...,...
95,"Oh God, I must have seen this when I was only ...",negative,0,oh god must seen twelve ask may young stupid a...,136
96,"""Fate"" leads Walter Sparrow to come in possess...",negative,0,fate lead walter sparrow come possession myste...,218
97,"We brought this film as a joke for a friend, a...",negative,0,brought film joke friend could worst joke play...,35
98,This was probably the worst movie i have ever ...,negative,0,probably worst movie ever seen life stupid plo...,66


In [6]:
from nltk.corpus import opinion_lexicon

positive_words = opinion_lexicon.positive()
negative_words = opinion_lexicon.negative()

print(f"Lexicon with {len(positive_words)} positive and {len(negative_words)} negative words")

Lexicon with 2006 positive and 4783 negative words


In [26]:
def count_pos_neg(text: str) -> pd.Series:
    words = text.split()
    nwords = [w for w in words if w in negative_words]
    pwords = [w for w in words if w in positive_words]
    return pd.Series({
        "pwords": pwords,
        "nwords": nwords,
        "pwords_count": len(pwords),
        "nwords_count": len(nwords),
    })

df[["pwords", "nwords", "pwords_count", "nwords_count"]] = df["clean_review"].apply(count_pos_neg)

In [29]:
df

Unnamed: 0,review,sentiment,sentiment_label,clean_review,num_tokens,pwords,nwords,pwords_count,nwords_count
0,One of the other reviewers has mentioned that ...,positive,1,one reviewer mentioned watching oz episode hoo...,168,"[right, right, trust, regard, classic, appeal,...","[struck, brutality, faint, timid, punch, priso...",13,20
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production br br filming tech...,86,"[wonderful, comforting, well, seamless, well, ...",[terribly],11,1
2,I thought this was a wonderful way to spend ti...,positive,1,thought wonderful way spend time hot summer we...,84,"[wonderful, hot, witty, likable, well, impress...","[plot, simplistic, killer, disappointed, risk,...",11,6
3,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter mattei love time money visually stunnin...,127,"[love, stunning, vivid, success, stylishly, so...","[loneliness, anxiously]",15,2
4,"Probably my all-time favorite movie, a story o...",positive,1,probably favorite movie story selflessness sac...,52,"[favorite, noble, delight, like, fun, believable]","[boring, midget, slow, startling]",6,4
...,...,...,...,...,...,...,...,...,...
95,"Oh God, I must have seen this when I was only ...",negative,0,oh god must seen twelve ask may young stupid a...,136,"[impressive, better, convincing, ambitious, be...","[stupid, bad, nasty, gross, silly, shark, dang...",7,15
96,"""Fate"" leads Walter Sparrow to come in possess...",negative,0,fate lead walter sparrow come possession myste...,218,"[lead, intriguing, pretty, like, good, pretty,...","[mysterious, fiction, undone, weak, failed, di...",16,29
97,"We brought this film as a joke for a friend, a...",negative,0,brought film joke friend could worst joke play...,35,[],"[joke, worst, joke, dire, worst, substandard, ...",0,12
98,This was probably the worst movie i have ever ...,negative,0,probably worst movie ever seen life stupid plo...,66,"[good, hot, like]","[worst, stupid, plot, ridiculous, bad, sick, t...",3,9


In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(2, 2)
        self.output = nn.Linear(2, 1)

    def forward(self, x):
        x = F.relu(self.hidden(x))
        x = torch.sigmoid(self.output(x))
        return x


model = SimpleMLP()

print(model)

SimpleMLP(
  (hidden): Linear(in_features=2, out_features=2, bias=True)
  (output): Linear(in_features=2, out_features=1, bias=True)
)


In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df[["pwords_count", "nwords_count"]].values
y = df["sentiment_label"].values.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

epochs = 50
for epoch in range(epochs):
    optimizer.zero_grad()
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

with torch.no_grad():
    y_pred_test = model(X_test)
    y_pred_labels = (y_pred_test >= 0.5).float()
    accuracy = (y_pred_labels == y_test).sum().item() / len(y_test)
    print(f"\nTest Accuracy: {accuracy * 100:.2f}%")

new_data = torch.tensor(scaler.transform([[3.0, 1.0]]), dtype=torch.float32)
with torch.no_grad():
    prediction = model(new_data).item()
    sentiment = "Positive" if prediction >= 0.5 else "Negative"
print(f"Predicted Sentiment: {sentiment} (score = {prediction:.2f})")


Epoch 10/50, Loss: 0.6059
Epoch 20/50, Loss: 0.6059
Epoch 30/50, Loss: 0.6059
Epoch 40/50, Loss: 0.6058
Epoch 50/50, Loss: 0.6058

Test Accuracy: 50.00%
Predicted Sentiment: Negative (score = 0.45)
