In [2]:
import torch
from torch import nn, optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import json
import re
from collections import Counter
from tqdm import tqdm

import nltk
# Fetch the tokenizer data that word_tokenize needs at runtime.
# NOTE(review): presumably "punkt_tab" is what newer NLTK releases look up and
# "punkt" what older ones use — confirm against the installed NLTK version.
nltk.download("punkt")
nltk.download("punkt_tab")

from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\neera\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\neera\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:

# Download tokenizers
nltk.download("punkt", quiet=True)

# Plain device string; tensors and the model are moved with `.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Seed the random number generators so weight initialisation and DataLoader
# shuffling are repeatable from one Restart & Run All to the next.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

MAX_ROWS = 200_000   # cap on the number of rows load_data reads from each file
MAX_FEATURES = 5000  # vocabulary size, including the <PAD> and <UNK> entries
BATCH_SIZE = 128     # mini-batch size for both DataLoaders
EPOCHS = 3           # default number of passes over the training set

# Pass 1: every non-word character (punctuation, whitespace, symbols) becomes a space.
NON_ALPHANUM = re.compile(r"[\W]")
# Pass 2: anything still outside lowercase ASCII letters, digits and whitespace is deleted.
NON_ASCII = re.compile(r"[^a-z0-9\s]")


def normalize_text(text):
    """Lowercase ``text`` and reduce it to ASCII letters, digits and spaces.

    Non-word characters are replaced by a single space each (so runs of
    punctuation leave runs of spaces). Characters that count as word
    characters but are not ``a-z``/``0-9`` — underscores and non-ASCII
    letters — are removed without leaving a space. Leading and trailing
    whitespace is trimmed from the result.
    """
    lowered = text.lower()
    spaced = NON_ALPHANUM.sub(" ", lowered)
    ascii_only = NON_ASCII.sub("", spaced)
    return ascii_only.strip()


Using device: cpu


In [5]:

# DATA LOADER

def load_data(filepath, max_rows=None):
    """Read a ``__label__N <text>`` review file into parallel lists.

    Args:
        filepath: Path of a UTF-8 text file with one labelled review per line.
        max_rows: Maximum number of samples to return. ``None`` (the default)
            falls back to the notebook-level ``MAX_ROWS``.

    Returns:
        ``(labels, texts)`` where ``labels`` holds 1 for ``__label__2``
        (positive) and 0 for ``__label__1`` (negative), and ``texts`` holds
        the matching reviews after ``normalize_text``.

    Blank lines, lines without any text after the label, and lines whose
    first token is not one of the two known labels are skipped. They do not
    count towards ``max_rows``.
    """
    limit = MAX_ROWS if max_rows is None else max_rows
    # Exact lookup instead of a substring test, so a malformed first token is
    # rejected rather than silently mapped to a class.
    label_ids = {"__label__1": 0, "__label__2": 1}

    labels, texts = [], []
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            if len(labels) >= limit:
                break

            # Split on the first run of any whitespace (space or tab); blank
            # and label-only lines yield fewer than two parts.
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                continue

            label_str, text = parts
            label = label_ids.get(label_str)
            if label is None:
                continue

            labels.append(label)
            texts.append(normalize_text(text))
    return labels, texts

#Use your paths here
# Forward slashes keep these relative paths portable: they resolve on Windows
# as well as on Linux/macOS.
train_path = "reviews-dataset/train.ft.txt"
test_path = "reviews-dataset/test.ft.txt"

train_labels, train_texts = load_data(train_path)
test_labels, test_texts = load_data(test_path)

print(f"Loaded {len(train_texts)} training samples, {len(test_texts)} testing samples.")


# TOKENIZATION & VOCAB

def tokenize_text(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def make_vocab(texts):
    """Build a token -> integer id mapping from an iterable of texts.

    The ``MAX_FEATURES - 2`` most frequent tokens receive ids 2, 3, ... in
    descending frequency order. Ids 0 and 1 are reserved for ``<PAD>`` and
    ``<UNK>``, which are inserted last.
    """
    frequencies = Counter()
    for text in texts:
        frequencies.update(tokenize_text(text))

    vocab = {}
    for token_id, (token, _count) in enumerate(
        frequencies.most_common(MAX_FEATURES - 2), start=2
    ):
        vocab[token] = token_id

    vocab["<PAD>"] = 0
    vocab["<UNK>"] = 1
    return vocab

# The vocabulary is fitted on the training texts only; encode_text maps any
# token missing from it to the <UNK> id.
vocab = make_vocab(texts=train_texts)
print(f"Vocab size: {len(vocab)}")

def encode_text(text):
    """Convert ``text`` into a list of vocabulary ids.

    The text is tokenized with ``tokenize_text`` and each token is looked up
    in the notebook-level ``vocab``. Tokens that are not present map to 1,
    the id ``make_vocab`` assigns to ``<UNK>``.
    """
    token_ids = []
    for token in tokenize_text(text):
        token_ids.append(vocab.get(token, 1))
    return token_ids



Loaded 200000 training samples, 200000 testing samples.
Vocab size: 5000


In [6]:

# DATASETS

class ReviewsDataset(Dataset):
    """In-memory dataset of encoded reviews paired with float labels.

    All texts are encoded once in the constructor, so indexing only returns
    tensors that already exist.
    """

    def __init__(self, texts, labels):
        """Encode every text with ``encode_text`` and store labels as float32."""
        encoded = []
        for review in texts:
            encoded.append(torch.tensor(encode_text(review), dtype=torch.long))
        self.texts = encoded
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, idx):
        """Return ``(token_id_tensor, label)`` for position ``idx``."""
        sequence = self.texts[idx]
        target = self.labels[idx]
        return sequence, target

def collate_fn(batch):
    """Merge ``(sequence, label)`` samples into one padded mini-batch.

    Sequences are right-padded with zeros to the length of the longest one
    in the batch and stacked batch-first. Labels are gathered into a float32
    tensor.
    """
    sequences, targets = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0.0)
    label_tensor = torch.tensor(targets, dtype=torch.float32)
    return padded, label_tensor

# Encode both splits up front; ReviewsDataset tokenizes every review in its constructor.
train_ds = ReviewsDataset(texts=train_texts, labels=train_labels)
test_ds = ReviewsDataset(texts=test_texts, labels=test_labels)
# Only the training loader shuffles; the test loader keeps file order.
train_dl = DataLoader(dataset=train_ds, collate_fn=collate_fn, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(dataset=test_ds, collate_fn=collate_fn, batch_size=BATCH_SIZE, shuffle=False)


# CNN MODEL

class CNNClassifier(nn.Module):
    """1-D convolutional binary classifier over sequences of token ids.

    Pipeline: embedding -> three Conv1d+ReLU stages (the first two followed
    by MaxPool1d(2)) -> global max pool -> two linear layers -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Embedding width.
        padding_idx: Token id used for padding. Its embedding row is held at
            zero and excluded from gradient updates. The default 0 matches
            the ``<PAD>`` id and the zero fill used when batching.
    """

    def __init__(self, vocab_size, emb_dim=64, padding_idx=0):
        super().__init__()
        # The two MaxPool1d(2) stages divide the sequence length by four, so
        # at least four positions are needed for one to reach conv3.
        self.min_len = 4
        self.padding_idx = padding_idx
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
        self.conv1 = nn.Conv1d(emb_dim, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(64, 64, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool1d(2)
        self.conv3 = nn.Conv1d(64, 64, kernel_size=3, padding=1)
        self.globpool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(64, 100)
        self.fc2 = nn.Linear(100, 1)

    def forward(self, x):
        """Return one probability in [0, 1] per row of ``x``.

        ``x`` is an integer tensor of token ids laid out ``(batch, seq_len)``.
        Inputs shorter than ``min_len`` are right-padded with ``padding_idx``
        so the pooling stages never receive an empty sequence.
        """
        shortfall = self.min_len - x.size(1)
        if shortfall > 0:
            filler = x.new_full([x.size(0), shortfall], self.padding_idx)
            x = torch.cat([x, filler], dim=1)

        # Conv1d expects channels first: (batch, emb_dim, seq_len).
        x = self.embed(x).transpose(1, 2)
        x = self.pool1(torch.relu(self.conv1(x)))
        x = self.pool2(torch.relu(self.conv2(x)))
        x = torch.relu(self.conv3(x))
        # Collapse the remaining sequence axis to a single max per channel.
        x = self.globpool(x).squeeze(2)
        x = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(x)).squeeze(1)



In [5]:

# TRAINING & EVALUATION

def evaluate(model, loader, threshold=0.5):
    """Score ``model`` on every batch of ``loader`` and report metrics.

    Args:
        model: Module whose forward pass returns one probability per sample.
        loader: Iterable of ``(inputs, labels)`` batches.
        threshold: Probability above which a sample is predicted positive.

    Returns:
        Dict with keys ``"accuracy"``, ``"f1"`` and ``"auc"``. The same
        values are printed on one line. ``"auc"`` is NaN when the labels
        contain a single class, since ROC AUC is undefined in that case.

    Raises:
        ValueError: If ``loader`` yields no batches.

    The model is switched to eval mode and left there. Inputs are moved to
    the notebook-level ``device``.
    """
    model.eval()
    score_chunks, label_chunks = [], []
    with torch.no_grad():
        for xb, yb in loader:
            scores = model(xb.to(device))
            score_chunks.append(scores.cpu().numpy())
            # .cpu() keeps this working when the loader yields labels that
            # are not already on the CPU.
            label_chunks.append(yb.cpu().numpy())

    if not score_chunks:
        raise ValueError("loader yielded no batches to evaluate")

    preds = np.concatenate(score_chunks)
    ys = np.concatenate(label_chunks)
    predicted_positive = preds > threshold

    acc = accuracy_score(ys, predicted_positive)
    f1 = f1_score(ys, predicted_positive)
    # roc_auc_score raises when only one class is present in the labels.
    auc = roc_auc_score(ys, preds) if np.unique(ys).size > 1 else float("nan")
    print(f"Accuracy: {acc:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")
    return {"accuracy": acc, "f1": f1, "auc": auc}

def train_model(model, epochs=EPOCHS, train_loader=None, eval_loader=None, lr=1e-3):
    """Train ``model`` with Adam and binary cross-entropy, evaluating each epoch.

    Args:
        model: Module returning one probability per sample.
        epochs: Number of passes over the training loader.
        train_loader: Batches to train on; ``None`` uses the notebook-level
            ``train_dl``.
        eval_loader: Batches passed to ``evaluate`` after each epoch; ``None``
            uses the notebook-level ``test_dl``.
        lr: Adam learning rate.

    Returns:
        List with the mean per-sample training loss of each epoch.
    """
    if train_loader is None:
        train_loader = train_dl
    if eval_loader is None:
        eval_loader = test_dl

    optimizer = optim.Adam(model.parameters(), lr=lr)
    # BCELoss (not the logits variant) because the model already applies a sigmoid.
    criterion = nn.BCELoss()

    history = []
    for epoch in range(epochs):
        model.train()
        loss_sum, n_seen = 0.0, 0
        for xb, yb in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()
            # Weight each batch mean by its size so a shorter final batch
            # does not skew the epoch average.
            batch_n = yb.size(0)
            loss_sum += loss.item() * batch_n
            n_seen += batch_n

        epoch_loss = loss_sum / max(n_seen, 1)
        history.append(epoch_loss)
        print(f"Epoch {epoch+1} - Loss: {epoch_loss:.4f}")
        evaluate(model, eval_loader)
    return history


# RUN TRAINING

model = CNNClassifier(len(vocab)).to(device)
train_model(model, EPOCHS)

# Export in eval mode so the serialized module is in inference configuration.
model.eval()
torch.jit.save(torch.jit.script(model), "amazon_reviews_sentiment.pt")
# The context manager closes (and therefore flushes) the vocabulary file
# instead of leaving the handle open.
with open("amazon_vocab.json", "w", encoding="utf-8") as vocab_file:
    json.dump(vocab, vocab_file)
print("Model and vocabulary saved successfully.")







Epoch 1/3: 100%|██████████| 1563/1563 [02:49<00:00,  9.24it/s]


Epoch 1 - Loss: 0.3229
Accuracy: 0.8864 | F1: 0.8787 | AUC: 0.9661


Epoch 2/3: 100%|██████████| 1563/1563 [02:33<00:00, 10.17it/s]


Epoch 2 - Loss: 0.2103
Accuracy: 0.9153 | F1: 0.9164 | AUC: 0.9731


Epoch 3/3: 100%|██████████| 1563/1563 [02:37<00:00,  9.92it/s]


Epoch 3 - Loss: 0.1821
Accuracy: 0.9204 | F1: 0.9209 | AUC: 0.9754
Model and vocabulary saved successfully.
