In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
import re, string
import random

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

df = pd.read_csv("reviews_data.csv")#https://www.kaggle.com/datasets/harshalhonde/starbucks-reviews-dataset

TEXT_COL = "Review"
RATING_COL = "Rating"

df = df.dropna(subset=[TEXT_COL, RATING_COL])

df["label"] = (df[RATING_COL] >= 4).astype(int)

print(df[[TEXT_COL, "label"]].head())
print("Class distribution:")
print(df["label"].value_counts())

def tokenize(text):
    text = text.lower()                                       # до нижнього регістру
    text = re.sub(r"@\S+", " ", text)                         # прибрати згадки @user
    text = re.sub(r"http\S+", " ", text)                      # прибрати посилання
    text = re.sub(r"<.*?>", " ", text)                        # прибрати HTML-теги
    text = re.sub(r"[^a-z\s]", " ", text)                     # залишити лише букви (прибрати цифри, спецсимволи)
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)  # прибрати пунктуацію
    text = re.sub(r"\s+", " ", text)                          # замінити багато пробілів на один
    return text

tokenized_texts = df[TEXT_COL].apply(tokenize)


MAX_VOCAB = 20000
counter = Counter()

for tokens in tokenized_texts:
    counter.update(tokens)

vocab = {"<PAD>": 0, "<UNK>": 1}
for word, _ in counter.most_common(MAX_VOCAB - 2):
    vocab[word] = len(vocab)

def encode(tokens):
    return [vocab.get(t, vocab["<UNK>"]) for t in tokens]

encoded_texts = tokenized_texts.apply(encode)

MAX_LEN = 100

def pad_sequence(seq):
    if len(seq) < MAX_LEN:
        return seq + [0] * (MAX_LEN - len(seq))
    return seq[:MAX_LEN]

X = torch.tensor(
    [pad_sequence(seq) for seq in encoded_texts],
    dtype=torch.long
)

y = torch.tensor(df["label"].values, dtype=torch.float)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=SEED,
    stratify=y
)

class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(TextDataset(X_train, y_train), batch_size=64, shuffle=True)

test_loader  = DataLoader(TextDataset(X_test, y_test), batch_size=64)

class LSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim=100, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        emb = self.embedding(x)
        _, (h, _) = self.lstm(emb)
        out = self.fc(h[-1])
        return out.squeeze(1)

def train_and_evaluate(model, epochs=20):
    model.to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, loss = {total_loss:.4f}")

    # Evaluation
    model.eval()
    preds, true = [], []

    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            logits = model(xb)
            probs = torch.sigmoid(logits)
            preds.extend((probs > 0.5).cpu().numpy())
            true.extend(yb.numpy())

    return accuracy_score(true, preds)

model_A = LSTM(len(vocab))
acc_A = train_and_evaluate(model_A)


def load_glove_embeddings(path, vocab, emb_dim=100):
    embeddings = np.random.normal(scale=0.6, size=(len(vocab), emb_dim))

    with open(path, encoding="utf8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            word = parts[0]
            if word in vocab:
                embeddings[vocab[word]] = np.array(parts[1:], dtype=np.float32)

    return torch.tensor(embeddings, dtype=torch.float)

glove_weights = load_glove_embeddings("glove.6B.100d.txt", vocab)

class LSTM_GloVe(nn.Module):
    def __init__(self, emb_weights, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(
            emb_weights, freeze=False
        )
        self.lstm = nn.LSTM(emb_weights.size(1), hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        emb = self.embedding(x)
        _, (h, _) = self.lstm(emb)
        return self.fc(h[-1]).squeeze(1)

model_B = LSTM_GloVe(glove_weights)
acc_B = train_and_evaluate(model_B)


print(f"LSTM: {acc_A:.4f}")
print(f"GloVe: {acc_B:.4f}")


Device: cpu
                                              Review  label
0  Amber and LaDonna at the Starbucks on Southwes...      1
1  ** at the Starbucks by the fire station on 436...      1
2  I just wanted to go out of my way to recognize...      1
3  Me and my friend were at Starbucks and my card...      1
4  I’m on this kick of drinking 5 cups of warm wa...      1
Class distribution:
label
0    583
1    122
Name: count, dtype: int64
Epoch 1/20, loss = 5.4995
Epoch 2/20, loss = 4.2335
Epoch 3/20, loss = 4.0829
Epoch 4/20, loss = 4.0479
Epoch 5/20, loss = 3.9752
Epoch 6/20, loss = 3.9580
Epoch 7/20, loss = 3.9600
Epoch 8/20, loss = 3.8531
Epoch 9/20, loss = 3.7999
Epoch 10/20, loss = 3.6955
Epoch 11/20, loss = 3.6113
Epoch 12/20, loss = 3.6159
Epoch 13/20, loss = 3.3657
Epoch 14/20, loss = 3.2385
Epoch 15/20, loss = 3.1271
Epoch 16/20, loss = 2.9618
Epoch 17/20, loss = 2.7284
Epoch 18/20, loss = 2.6504
Epoch 19/20, loss = 2.4564
Epoch 20/20, loss = 2.1709
Epoch 1/20, loss = 5.1729
E