In [1]:
import os
import re
import html
import json
import math
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from datasets import load_dataset
from datasets import load_dataset

In [2]:


# ---------------------------
# Cleaning Functions
# ---------------------------
def clean_text(raw_text):
    """
    Clean raw text: join lists, remove HTML entities, strip newlines/quotes,
    and ensure ending with a period.
    """
    if isinstance(raw_text, list):
        cleaned_text = ' '.join(raw_text)
    elif isinstance(raw_text, dict):
        cleaned_text = str(raw_text)
    else:
        cleaned_text = str(raw_text)
    cleaned_text = html.unescape(cleaned_text)
    cleaned_text = re.sub(r'[\n\r]+', ' ', cleaned_text)
    cleaned_text = re.sub(r'["]+', '', cleaned_text)
    cleaned_text = cleaned_text.strip()
    if not cleaned_text.endswith('.'):
        cleaned_text += '.'
    return cleaned_text

def clean_review(example):
    """
    Clean review by combining title and text fields.
    """
    title = example.get('title', '')
    text = example.get('text', '')
    combined = (title + ". " + text).strip() if title else text
    example['cleaned_review'] = clean_text(combined)
    return example

def clean_metadata(example):
    """
    Clean metadata by combining fields such as title and description.
    """
    title = example.get('title', '')
    description = example.get('description', [])
    if isinstance(description, list):
        description = ' '.join(description)
    meta_combined = (title + ". " + description).strip() if title else description
    example['cleaned_metadata'] = clean_text(meta_combined)
    return example

# ---------------------------
# Streaming Preprocessing Functions
# ---------------------------
def stream_clean_jsonl(input_path, output_csv, mode="review"):
    """
    Stream-process a JSONL file (either reviews or metadata) and write cleaned CSV.
    mode: "review" or "metadata" determines which cleaning function and fields to use.
    """
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
    with open(input_path, 'r', encoding='utf-8') as f_in, open(output_csv, "w", encoding="utf-8") as f_out:
        # Write header based on mode
        if mode == "review":
            header = "asin,cleaned_review,rating\n"
        elif mode == "metadata":
            header = "parent_asin,cleaned_metadata,price\n"
        else:
            raise ValueError("Mode must be 'review' or 'metadata'")
        f_out.write(header)
        
        for i, line in enumerate(tqdm(f_in, desc=f"Processing {mode} lines")):
            if max_lines and i >= max_lines:
                break
            try:
                data = json.loads(line.strip())
                if mode == "review":
                    data = clean_review(data)
                    # Ensure required fields exist
                    asin = data.get("asin", "").strip()
                    cleaned_review = data.get("cleaned_review", "").strip()
                    rating = data.get("rating", None)
                    if asin and cleaned_review and rating is not None:
                        # Wrap text fields in quotes to avoid CSV issues
                        row = f'"{asin}","{cleaned_review}",{rating}\n'
                        f_out.write(row)
                elif mode == "metadata":
                    data = clean_metadata(data)
                    parent_asin = data.get("parent_asin", "").strip()
                    cleaned_metadata = data.get("cleaned_metadata", "").strip()
                    price = data.get("price", None)
                    if parent_asin and cleaned_metadata:
                        # Price may be null; write as empty string if so.
                        price_str = str(price) if price not in [None, "None"] else ""
                        row = f'"{parent_asin}","{cleaned_metadata}",{price_str}\n'
                        f_out.write(row)
            except Exception as e:
                # Skip problematic lines
                print(f"Skipping line {i} due to error: {e}")
    print(f"Finished processing {mode} data; output written to {output_csv}")

# ---------------------------
# Chunked Merge Function
# ---------------------------
def merge_reviews_with_metadata(review_csv, meta_csv, output_csv, chunk_size=500000):
    """
    Merge large reviews CSV (loaded in chunks) with metadata CSV.
    Metadata is loaded fully (assumed to be smaller).
    Writes merged chunks to output CSV incrementally.
    """
    print("Loading metadata into memory...")
    meta_df = pd.read_csv(meta_csv)
    meta_df = meta_df.drop_duplicates(subset=["parent_asin"])
    meta_df.set_index("parent_asin", inplace=True)
    
    # Prepare output file
    if os.path.exists(output_csv):
        os.remove(output_csv)
    header_written = False
    
    print("Starting chunked merge of reviews with metadata...")
    for chunk in pd.read_csv(review_csv, chunksize=chunk_size):
        merged_chunk = chunk.merge(meta_df, left_on="asin", right_index=True, how="inner")
        # Keep only necessary columns
        merged_chunk = merged_chunk[["asin", "cleaned_review", "cleaned_metadata", "rating", "price"]]
        # Convert price to numeric and fill missing
        merged_chunk["price"] = pd.to_numeric(merged_chunk["price"], errors="coerce")
        merged_chunk["price"] = merged_chunk["price"].fillna(merged_chunk["price"].median())

        
        # Write header only once
        if not header_written:
            merged_chunk.to_csv(output_csv, index=False, mode="w")
            header_written = True
        else:
            merged_chunk.to_csv(output_csv, index=False, header=False, mode="a")
    print(f"Merged data written to {output_csv}")

# ---------------------------
# Data Splitting Function
# ---------------------------
def split_dataset_into_splits(merged_csv, output_dir="./preprocessed_data/splits", train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=42):
    """
    Load the merged CSV and split it into train/val/test sets, then save them.
    """
    os.makedirs(output_dir, exist_ok=True)
    # If the merged file is still too large, consider using chunking or Dask.
    # Here, we assume the merged CSV fits into memory.
    df = pd.read_csv(merged_csv)
    print(f"Merged data loaded: {df.shape}")
    
    train_df, temp_df = train_test_split(df, test_size=(1 - train_ratio), random_state=seed)
    val_df, test_df = train_test_split(temp_df, test_size=test_ratio/(val_ratio+test_ratio), random_state=seed)
    
    train_df.to_csv(os.path.join(output_dir, "electronics_train.csv"), index=False)
    val_df.to_csv(os.path.join(output_dir, "electronics_val.csv"), index=False)
    test_df.to_csv(os.path.join(output_dir, "electronics_test.csv"), index=False)
    
    print("Data split complete:")
    print("Train shape:", train_df.shape)
    print("Val shape:", val_df.shape)
    print("Test shape:", test_df.shape)


In [14]:

# ---------------------------
# Main Execution
# ---------------------------

# Set file paths for your raw JSONL files
reviews_jsonl = "/Users/sanamoin/Documents/sites/gadgets/backend/data/amazon_2023/Electronics.jsonl"         # 22.6GB file
meta_jsonl = "/Users/sanamoin/Documents/sites/gadgets/backend/data/amazon_2023/meta_Electronics.jsonl"         # 5.25GB file

In [15]:
# Set output paths for cleaned CSV files (you can process a subset first by setting max_lines)
# cleaned_reviews_csv = "./preprocessed_data/electronics_reviews_cleaned.csv"
# cleaned_meta_csv = "./preprocessed_data/electronics_meta_cleaned.csv"

In [16]:
# Process raw JSONL files in streaming mode
# Uncomment these lines to run the cleaning; you might start with max_lines for testing
# stream_clean_jsonl(reviews_jsonl, cleaned_reviews_csv, mode="review")

Processing review lines: 43886944it [08:53, 82190.42it/s]

Finished processing review data; output written to ./preprocessed_data/electronics_reviews_cleaned.csv





In [17]:
# stream_clean_jsonl(meta_jsonl, cleaned_meta_csv, mode="metadata")

Processing metadata lines: 1610012it [00:49, 32505.94it/s]

Finished processing metadata data; output written to ./preprocessed_data/electronics_meta_cleaned.csv





In [19]:
# Once cleaned CSVs are generated, merge them in chunks.
# merged_csv = "./preprocessed_data/merged_electronics.csv"

In [20]:
# merge_reviews_with_metadata(cleaned_reviews_csv, cleaned_meta_csv, merged_csv, chunk_size=500000)

Loading metadata into memory...
Starting chunked merge of reviews with metadata...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_chunk["price"].fillna(merged_chunk["price"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_chunk["price"].fillna(merged_chunk["price"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work becaus

Merged data written to ./preprocessed_data/merged_electronics.csv


In [21]:
#
# merged_csv = '/Users/sanamoin/Documents/sites/gadgets/backend/experiments/preprocessed_data/merged_filtered_reviews_asin_keywordmatch.csv'
# Finally, split the merged CSV into train, validation, and test sets.
# split_dataset_into_splits(merged_csv, output_dir="./preprocessed_data/filtered_splits", train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=42)

Merged data loaded: (24211976, 5)
Data split complete:
Train shape: (16948383, 5)
Val shape: (3631796, 5)
Test shape: (3631797, 5)


In [3]:
#
merged_csv = '/Users/sanamoin/Documents/sites/gadgets/backend/experiments/preprocessed_data/merged_filtered_reviews_asin_keywordmatch.csv'
# Finally, split the merged CSV into train, validation, and test sets.
split_dataset_into_splits(merged_csv, output_dir="./preprocessed_data/filtered_splits", train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=42)

Merged data loaded: (2845751, 5)
Data split complete:
Train shape: (1992025, 5)
Val shape: (426863, 5)
Test shape: (426863, 5)


In [4]:

# ===========================
# 3. PyTorch Dataset Class
# ===========================
class ElectronicsDataset(Dataset):
    def __init__(self, csv_file, tokenizer: BertTokenizer, max_length=256):
        """
        Load CSV file and prepare data for training.
        """
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data["cleaned_review"] = self.data["cleaned_review"].astype(str)
        self.data["cleaned_metadata"] = self.data["cleaned_metadata"].astype(str)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        review_text = row["cleaned_review"]
        meta_text = row["cleaned_metadata"]
        price = float(row["price"])
        rating = float(row["rating"])
        
        # Tokenize review text
        encoding = self.tokenizer.encode_plus(
            review_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)
        
        # Tokenize metadata text
        meta_encoding = self.tokenizer.encode_plus(
            meta_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        meta_input_ids = meta_encoding["input_ids"].squeeze(0)
        meta_attention_mask = meta_encoding["attention_mask"].squeeze(0)
        
        sample = {
            "review_input_ids": input_ids,
            "review_attention_mask": attention_mask,
            "meta_input_ids": meta_input_ids,
            "meta_attention_mask": meta_attention_mask,
            "price": torch.tensor(price, dtype=torch.float),
            "rating": torch.tensor(rating, dtype=torch.float)
        }
        return sample


In [5]:

# ===========================
# 4. Model Architecture
# ===========================
class JointReviewMetadataModel(nn.Module):
    def __init__(self, 
                 bert_model_name="bert-base-uncased",
                 review_emb_dim=256,
                 meta_token_emb_dim=128,
                 meta_cnn_out_channels=128,
                 meta_price_emb_dim=64,
                 meta_final_dim=256,
                 fused_dim=256,
                 dropout=0.1,
                 tokenizer_vocab_size=30522):
        super(JointReviewMetadataModel, self).__init__()
        
        # Review Branch: Pretrained BERT
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.review_fc = nn.Linear(self.bert.config.hidden_size, review_emb_dim)
        self.review_dropout = nn.Dropout(dropout)
        
        # Metadata Branch: Train-from-scratch embedding and CNN
        self.meta_embedding = nn.Embedding(num_embeddings=tokenizer_vocab_size,
                                           embedding_dim=meta_token_emb_dim,
                                           padding_idx=0)
        self.meta_conv = nn.Conv1d(in_channels=meta_token_emb_dim,
                                   out_channels=meta_cnn_out_channels,
                                   kernel_size=3,
                                   padding=1)
        self.meta_pool = nn.AdaptiveMaxPool1d(1)
        self.meta_dropout = nn.Dropout(dropout)
        
        # Price processing: MLP to project numeric price to an embedding
        self.price_fc = nn.Sequential(
            nn.Linear(1, meta_price_emb_dim),
            nn.ReLU()
        )
        
        # Fuse metadata CNN features with price embedding
        self.meta_fuse = nn.Sequential(
            nn.Linear(meta_cnn_out_channels + meta_price_emb_dim, meta_final_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )
        
        # Fusion Module: combine review and metadata embeddings
        self.fusion_fc = nn.Sequential(
            nn.Linear(review_emb_dim + meta_final_dim, fused_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )
        
        # Prediction Head: Regression output (e.g., predicting rating)
        self.output_fc = nn.Linear(fused_dim, 1)
        
    def forward(self, review_input_ids, review_attention_mask,
                meta_input_ids, meta_attention_mask, price):
        # ---- Review Branch ----
        bert_outputs = self.bert(input_ids=review_input_ids, attention_mask=review_attention_mask)
        cls_output = bert_outputs.last_hidden_state[:, 0, :]  # (batch, 768)
        review_emb = self.review_dropout(self.review_fc(cls_output))  # (batch, review_emb_dim)
        
        # ---- Metadata Branch ----
        meta_embedded = self.meta_embedding(meta_input_ids)  # (batch, meta_seq_len, meta_token_emb_dim)
        meta_embedded = meta_embedded.transpose(1, 2)  # (batch, meta_token_emb_dim, meta_seq_len)
        meta_conv_out = F.relu(self.meta_conv(meta_embedded))  # (batch, meta_cnn_out_channels, meta_seq_len)
        meta_pooled = self.meta_pool(meta_conv_out).squeeze(-1)  # (batch, meta_cnn_out_channels)
        meta_pooled = self.meta_dropout(meta_pooled)
        
        # Process price feature
        price = price.view(-1, 1)  # (batch, 1)
        price_emb = self.price_fc(price)  # (batch, meta_price_emb_dim)
        
        # Concatenate metadata CNN output with price embedding
        meta_concat = torch.cat([meta_pooled, price_emb], dim=1)  # (batch, meta_cnn_out_channels + meta_price_emb_dim)
        meta_emb = self.meta_fuse(meta_concat)  # (batch, meta_final_dim)
        
        # ---- Fusion ----
        fused = torch.cat([review_emb, meta_emb], dim=1)  # (batch, review_emb_dim + meta_final_dim)
        fused_emb = self.fusion_fc(fused)  # (batch, fused_dim)
        
        # ---- Prediction Head ----
        output = self.output_fc(fused_emb)  # (batch, 1)
        
        return output, fused_emb

In [6]:

# ===========================
# 5. Training and Evaluation Setup
# ===========================
# ---------------------------
# Hyperparameters and Device Setup
# ---------------------------
BATCH_SIZE = 32
NUM_EPOCHS = 5
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 1e-2
MAX_GRAD_NORM = 1.0
device = torch.device("cuda" if torch.cuda.is_available() else "mps")

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [7]:
device

device(type='mps')

In [8]:

# ---------------------------
# Assume we already have CSV splits from preprocessing.
# Here we load the train, validation, and test splits.
# ---------------------------
DATA_DIR = "./preprocessed_data/filtered_splits"
train_csv = os.path.join(DATA_DIR, "electronics_train.csv")
val_csv = os.path.join(DATA_DIR, "electronics_val.csv")
test_csv = os.path.join(DATA_DIR, "electronics_test.csv")




In [9]:
# Load CSV splits as Hugging Face datasets (which use memory mapping)
train_dataset = load_dataset("csv", data_files={"train": train_csv}, split="train")
val_dataset   = load_dataset("csv", data_files={"val": val_csv}, split="val")
test_dataset  = load_dataset("csv", data_files={"test": test_csv}, split="test")


Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [10]:
# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
# val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
# test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

In [11]:
def tokenize_function(batch):
    # Tokenize review and metadata texts
    review_enc = tokenizer(batch["cleaned_review"], truncation=True, padding="max_length", max_length=256)
    meta_enc = tokenizer(batch["cleaned_metadata"], truncation=True, padding="max_length", max_length=256)

    prices = []
    ratings = []

    for price_raw, rating_raw in zip(batch["price"], batch["rating"]):
        try:
            price_val = float(price_raw)
        except:
            price_val = 0.0
        prices.append(price_val)

        try:
            rating_val = float(rating_raw)
        except:
            rating_val = 0.0
        ratings.append(rating_val)

    return {
        "review_input_ids": review_enc["input_ids"],
        "review_attention_mask": review_enc["attention_mask"],
        "meta_input_ids": meta_enc["input_ids"],
        "meta_attention_mask": meta_enc["attention_mask"],
        "price": prices,
        "rating": ratings
    }


In [12]:
# Apply tokenization in batched mode (this is efficient and uses memory mapping)
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
val_dataset   = val_dataset.map(tokenize_function, batched=True, remove_columns=val_dataset.column_names)
test_dataset  = test_dataset.map(tokenize_function, batched=True, remove_columns=test_dataset.column_names)


Map:   0%|          | 0/1992025 [00:00<?, ? examples/s]

Map:   0%|          | 0/426863 [00:00<?, ? examples/s]

Map:   0%|          | 0/426863 [00:00<?, ? examples/s]

In [13]:
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [14]:
# # Set format to PyTorch tensors for necessary columns
columns = ["review_input_ids", "review_attention_mask", "meta_input_ids", "meta_attention_mask", "price", "rating"]
train_dataset.set_format(type="torch", columns=columns)
val_dataset.set_format(type="torch", columns=columns)
test_dataset.set_format(type="torch", columns=columns)


In [15]:

# Instantiate the model and move to device
model = JointReviewMetadataModel(tokenizer_vocab_size=tokenizer.vocab_size)
model.to(device)


JointReviewMetadataModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [16]:
# Loss function: MSE for regression (predicting rating)
criterion = nn.MSELoss()

# Optimizer: AdamW
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Total training steps and scheduler
total_steps = NUM_EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=int(0.1 * total_steps),
                                            num_training_steps=total_steps)


In [17]:
# ---------------
# Training Function
# ---------------
def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    running_loss = 0.0
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        review_input_ids = batch["review_input_ids"].to(device)
        review_attention_mask = batch["review_attention_mask"].to(device)
        meta_input_ids = batch["meta_input_ids"].to(device)
        meta_attention_mask = batch["meta_attention_mask"].to(device)
        price = batch["price"].to(device)
        rating = batch["rating"].to(device)
        
        output, _ = model(review_input_ids, review_attention_mask,
                          meta_input_ids, meta_attention_mask, price)
        output = output.squeeze(-1)
        loss = criterion(output, rating)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
        optimizer.step()
        scheduler.step()
        
        running_loss += loss.item() * review_input_ids.size(0)
    
    epoch_loss = running_loss / len(dataloader.dataset)
    return epoch_loss


In [18]:
# ---------------
# Evaluation Function
# ---------------
def evaluate_model(model, dataloader, device):
    model.eval()
    running_loss = 0.0
    all_preds = []
    all_targets = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            review_input_ids = batch["review_input_ids"].to(device)
            review_attention_mask = batch["review_attention_mask"].to(device)
            meta_input_ids = batch["meta_input_ids"].to(device)
            meta_attention_mask = batch["meta_attention_mask"].to(device)
            price = batch["price"].to(device)
            rating = batch["rating"].to(device)
            
            output, _ = model(review_input_ids, review_attention_mask,
                              meta_input_ids, meta_attention_mask, price)
            output = output.squeeze(-1)
            loss = criterion(output, rating)
            running_loss += loss.item() * review_input_ids.size(0)
            
            all_preds.extend(output.cpu().numpy())
            all_targets.extend(rating.cpu().numpy())
    
    epoch_loss = running_loss / len(dataloader.dataset)
    rmse = np.sqrt(np.mean((np.array(all_preds) - np.array(all_targets)) ** 2))
    return epoch_loss, rmse


In [19]:
best_val_loss = float('inf')
for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_loss, val_rmse = evaluate_model(model, val_loader, device)
    
    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f} | RMSE: {val_rmse:.4f}")
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pth")
        print("Best model saved.")




Epoch 1/5


Training: 100%|██████████| 62251/62251 [17:30:14<00:00,  1.01s/it]  
Evaluating: 100%|██████████| 13340/13340 [1:18:51<00:00,  2.82it/s]


Training Loss: 0.5978
Validation Loss: 0.2795 | RMSE: 0.5287
Best model saved.

Epoch 2/5


Training: 100%|██████████| 62251/62251 [17:20:24<00:00,  1.00s/it]  
Evaluating: 100%|██████████| 13340/13340 [1:22:55<00:00,  2.68it/s]


Training Loss: 0.2749
Validation Loss: 0.2643 | RMSE: 0.5141
Best model saved.

Epoch 3/5


Training: 100%|██████████| 62251/62251 [17:50:27<00:00,  1.03s/it]  
Evaluating: 100%|██████████| 13340/13340 [1:25:11<00:00,  2.61it/s]


Training Loss: 0.2422
Validation Loss: 0.2679 | RMSE: 0.5176

Epoch 4/5


Training: 100%|██████████| 62251/62251 [17:34:41<00:00,  1.02s/it]  
Evaluating: 100%|██████████| 13340/13340 [1:15:44<00:00,  2.94it/s]


Training Loss: 0.2112
Validation Loss: 0.2635 | RMSE: 0.5133
Best model saved.

Epoch 5/5


Training: 100%|██████████| 62251/62251 [17:17:23<00:00,  1.00it/s]  
Evaluating: 100%|██████████| 13340/13340 [1:23:05<00:00,  2.68it/s]

Training Loss: 0.1845
Validation Loss: 0.2683 | RMSE: 0.5180





In [20]:
test_loss, test_rmse = evaluate_model(model, test_loader, device)
print(f"\nTest Loss: {test_loss:.4f} | Test RMSE: {test_rmse:.4f}")

Evaluating: 100%|██████████| 13340/13340 [1:22:27<00:00,  2.70it/s]


Test Loss: 0.2680 | Test RMSE: 0.5177



