In [1]:
import pandas as pd
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [2]:
train.head()

Unnamed: 0,3,more like funchuck,"Gave this to my dad for a gag gift after directing ""Nunsense,"" he got a reall kick out of it!"
0,5,Inspiring,I hope a lot of people hear this cd. We need m...
1,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
3,5,Too good to be true,Probably the greatest soundtrack in history! U...
4,5,There's a reason for the price,"There's a reason this CD is so expensive, even..."


In [3]:
import re
import pandas as pd

def clean_text(text: str) -> str:
    """
    Clean review text for transformer fine-tuning.
    Keeps semantic content intact.
    """
    # 1. Lowercase (optional for uncased models)
    text = text.lower()

    # 2. Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)

    # 3. Remove URLs
    text = re.sub(r'http\S+|www\S+', ' ', text)

    # 4. Remove email addresses
    text = re.sub(r'\S+@\S+', ' ', text)

    # 5. Remove special characters and digits (keep punctuation)
    text = re.sub(r"[^a-zA-Z0-9.,!?;:'\"()\[\] ]", " ", text)

    # 6. Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # 7. Handle long repeated characters (like “cooooool”)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)

    return text


In [4]:
train.columns = ['rating', 'short_text', 'long_text']
test.columns = ['rating', 'short_text', 'long_text']

In [5]:
train['long_text'] = train['long_text'].astype(str).apply(clean_text)
test['long_text'] = test['long_text'].astype(str).apply(clean_text)
train['short_text'] = train['short_text'].astype(str).apply(clean_text)
test['short_text'] = test['short_text'].astype(str).apply(clean_text)


In [6]:
train['rating']=train['rating'].astype(int)-1
test['rating']=test['rating'].astype(int)-1

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score


  torch.utils._pytree._register_pytree_node(


In [8]:
from sklearn.model_selection import train_test_split

# Assume your dataframe is called df and the label column is named 'label'

# Step 1: Take a balanced sample of 30,000 rows
sample_df = train.groupby('rating', group_keys=False).apply(
    lambda x: x.sample(n=int(40000 * len(x) / len(train)), random_state=42)
).sample(frac=1, random_state=42).reset_index(drop=True)

# Step 2: Split into train and validation (e.g., 80/20 split) with stratified labels
train_df, val_df = train_test_split(
    sample_df,
    test_size=0.2,
    stratify=sample_df['rating'],
    random_state=42
)

print("Train shape:", train_df.shape)
print("Val shape:", val_df.shape)
print(train_df['rating'].value_counts(normalize=True).head())
print(val_df['rating'].value_counts(normalize=True).head())


Train shape: (31999, 3)
Val shape: (8000, 3)
rating
1    0.200006
4    0.200006
3    0.200006
0    0.200006
2    0.199975
Name: proportion, dtype: float64
rating
3    0.2
0    0.2
1    0.2
2    0.2
4    0.2
Name: proportion, dtype: float64


In [9]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score

class AmazonDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.texts = df['long_text'].tolist()
        self.labels = df['rating'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encodings = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            'input_ids': encodings['input_ids'].squeeze(0),
            'attention_mask': encodings['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [10]:
# Load tokenizer and model base
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Use Longformer-base (you can also try 'allenai/longformer-large-4096' if you have GPU memory)
model_name = "allenai/longformer-base-4096"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Number of labels (e.g., 1–5 star ratings)
num_labels = len(train_df['rating'].unique())

# Model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Longformer model and tokenizer loaded on", device)


Downloading config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading vocab.json: 0.00B [00:00, ?B/s]

Downloading merges.txt: 0.00B [00:00, ?B/s]

Downloading tokenizer.json: 0.00B [00:00, ?B/s]

  torch.utils._pytree._register_pytree_node(


Downloading pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Longformer model and tokenizer loaded on cuda


In [11]:
train_dataset = AmazonDataset(train_df, tokenizer)
val_dataset   = AmazonDataset(val_df, tokenizer)
test_dataset  = AmazonDataset(test, tokenizer)


In [12]:
# Assuming your Dataset stores labels as a list or Series
print("Min label:", min(train_dataset.labels))
print("Max label:", max(train_dataset.labels))


Min label: 0
Max label: 4


In [13]:
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
test_loader = DataLoader(test_dataset, batch_size=4)



In [14]:
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 5

num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [15]:
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score
import os

save_dir = "./longformer_model_checkpoints"
os.makedirs(save_dir, exist_ok=True)

for epoch in range(num_epochs):
    # -------- Training --------
    model.train()
    train_losses = []

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1} Training", leave=True)
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        train_losses.append(loss.item())
        loop.set_postfix(loss=loss.item(), avg_loss=np.mean(train_losses))

    avg_train_loss = np.mean(train_losses)

    # -------- Validation --------
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            val_preds.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            val_labels.extend(batch['labels'].cpu().numpy())

    val_acc = accuracy_score(val_labels, val_preds)
    print(f"Epoch {epoch+1} | Avg Train Loss: {avg_train_loss:.4f} | Validation Accuracy: {val_acc:.4f}")

    # -------- Save model after each epoch --------
    epoch_save_path = os.path.join(save_dir, f"epoch_{epoch+1}")
    model.save_pretrained(epoch_save_path)
    tokenizer.save_pretrained(epoch_save_path)
    print(f"Model and tokenizer saved at {epoch_save_path}")


Epoch 1 Training: 100%|██████████| 8000/8000 [44:40<00:00,  2.98it/s, avg_loss=1.06, loss=1.09] 
Epoch 2 Training:  21%|██        | 1692/8000 [09:26<35:11,  2.99it/s, avg_loss=0.875, loss=1.14] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Epoch 2 Training: 100%|██████████| 8000/8000 [44:38<00:00,  2.99it/s, avg_loss=0.881, loss=0.602]


Epoch 2 | Avg Train Loss: 0.8809 | Validation Accuracy: 0.5958
Model and tokenizer saved at ./longformer_model_checkpoints/epoch_2


Epoch 3 Training: 100%|██████████| 8000/8000 [44:38<00:00,  2.99it/s, avg_loss=0.718, loss=0.239] 


Epoch 3 | Avg Train Loss: 0.7184 | Validation Accuracy: 0.6010
Model and tokenizer saved at ./longformer_model_checkpoints/epoch_3


Epoch 4 Training: 100%|██████████| 8000/8000 [44:38<00:00,  2.99it/s, avg_loss=0.536, loss=0.849] 


Epoch 4 | Avg Train Loss: 0.5355 | Validation Accuracy: 0.6010
Model and tokenizer saved at ./longformer_model_checkpoints/epoch_4


Epoch 5 Training: 100%|██████████| 8000/8000 [44:38<00:00,  2.99it/s, avg_loss=0.378, loss=0.203] 


Epoch 5 | Avg Train Loss: 0.3778 | Validation Accuracy: 0.5988
Model and tokenizer saved at ./longformer_model_checkpoints/epoch_5


In [16]:
batch = next(iter(train_loader))
print("input_ids shape:", batch['input_ids'].shape)
print("attention_mask shape:", batch['attention_mask'].shape)
print("labels shape:", batch['labels'].shape)
print("labels min/max:", batch['labels'].min(), batch['labels'].max())
print("labels dtype:", batch['labels'].dtype)

input_ids shape: torch.Size([4, 512])
attention_mask shape: torch.Size([4, 512])
labels shape: torch.Size([4])
labels min/max: tensor(0) tensor(4)
labels dtype: torch.int64
