#Baseline for Rating Prediction

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise.accuracy import rmse, mae
from surprise import accuracy

In [None]:
#MEAN RATING PREDICTION
# Item-mean baseline: every test interaction is predicted as the mean training
# rating of its item; items with no training rating fall back to the global
# training mean. Predictions are rounded to whole stars before scoring.

train=pd.read_csv('/kaggle/input/diss-train/data.train.csv') #Entire dataset of nine domains
test=pd.read_csv('/kaggle/input/diss-train/data.test.csv')

item_mean = train.groupby('parent_asin')['rating'].mean()
overall_mean = train['rating'].mean()

# Map each test item to its training mean and fill the gaps in one expression.
# Assigning the result back avoids the chained `inplace=True` fillna, which
# warns on pandas 2.x and silently does nothing under copy-on-write.
test['predicted_rating'] = test['parent_asin'].map(item_mean).fillna(overall_mean)

# Ensure predicted ratings are integers
test['predicted_rating'] = test['predicted_rating'].round().astype(int)

# RMSE is taken as the square root of the MSE because the `squared=False`
# keyword of mean_squared_error is deprecated/removed in recent scikit-learn.
# The results get their own names so they do not overwrite the `rmse`/`mae`
# functions imported from surprise.accuracy.
baseline_rmse = mean_squared_error(test['rating'], test['predicted_rating']) ** 0.5
baseline_mae = mean_absolute_error(test['rating'], test['predicted_rating'])

print(f'Baseline Model RMSE: {baseline_rmse:.4f}')
print(f'Baseline Model MAE: {baseline_mae:.4f}')

In [None]:
#PMF Method
# Trains a matrix-factorisation baseline on the `train` frame and scores it on
# the `test` frame, both loaded by the mean-rating cell.

# Import Surprise's Dataset under an explicit alias: a later cell imports
# torch.utils.data.Dataset under the bare name `Dataset`, so re-running this
# cell afterwards would otherwise pick up the wrong class.
from surprise import Dataset as SurpriseDataset

# Define the reader with the appropriate rating scale
# NOTE(review): assumes ratings lie on a 1-5 scale - confirm against the data.
reader = Reader(rating_scale=(1, 5))

# Load the data from the DataFrame
rating_columns = ['user_id', 'parent_asin', 'rating']
train_data = SurpriseDataset.load_from_df(train[rating_columns], reader)
test_data = SurpriseDataset.load_from_df(test[rating_columns], reader)

# Fit on every training rating; evaluate on the held-out test ratings.
trainset = train_data.build_full_trainset()
testset = test_data.construct_testset(test_data.raw_ratings)

# Surprise's SVD defaults to biased=True (baseline terms included). Per the
# Surprise documentation, Probabilistic Matrix Factorization is the unbiased
# variant, so biased=False is required for the "PMF" label to be accurate.
pmf = SVD(biased=False)

pmf.fit(trainset)

predictions = pmf.test(testset)

# verbose=False: the metrics are printed once below, not also by Surprise.
pmf_rmse = accuracy.rmse(predictions, verbose=False)
pmf_mae = accuracy.mae(predictions, verbose=False)

print(f'PMF Model RMSE: {pmf_rmse:.4f}')
print(f'PMF Model MAE: {pmf_mae:.4f}')

#ST5 + MLP Implementation for Rating Prediction

In [None]:
#MAKING EMBEDDINGS: user embedding concatenated with item embedding

import os

# Point the Hugging Face caches at scratch storage BEFORE importing any
# Hugging Face library: those libraries resolve their cache locations when
# they are first imported, so setting these variables afterwards is too late
# to affect where models are downloaded and looked up.
os.environ['TRANSFORMERS_CACHE'] = '/exports/eddie/scratch/s2550585/huggingface_cache/transformers'
os.environ['HF_DATASETS_CACHE'] = '/exports/eddie/scratch/s2550585/huggingface_cache/datasets'
os.environ['HF_HOME'] = '/exports/eddie/scratch/s2550585/huggingface_cache'

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
import pandas as pd
import numpy as np
from collections import Counter

def load_data(split, output_dir, domain, seed=36):
    """Read one split's CSV and return its rows in a seeded, shuffled order.

    The file ``<output_dir>/<domain>.<split>.csv`` is loaded, rows are first
    shuffled inside each ``category`` group and then the whole frame is
    shuffled again; both shuffles use ``seed``. The returned frame has a fresh
    0..n-1 index.

    Args:
        split: Split name used in the file name (and in the progress message).
        output_dir: Directory containing the CSV files.
        domain: File-name prefix of the dataset.
        seed: Random state passed to both shuffles.

    Returns:
        The shuffled ``pandas.DataFrame``.
    """
    print(f"Processing {split} split...")
    csv_path = os.path.join(output_dir, f'{domain}.{split}.csv')
    frame = pd.read_csv(csv_path)

    def _shuffle_rows(group):
        # Full-size sample == permutation of the group's rows.
        return group.sample(frac=1, random_state=seed)

    within_category = frame.groupby('category', group_keys=False).apply(_shuffle_rows)
    fully_shuffled = within_category.sample(frac=1, random_state=seed)
    return fully_shuffled.reset_index(drop=True)

def load_and_process_data(df, st_model, device):
    """Encode user and item texts and return them joined, plus the ratings.

    Args:
        df: Frame with ``history_text`` (user text), ``target_item_text``
            (item text) and ``rating`` columns.
        st_model: Encoder exposing ``encode(texts, convert_to_tensor=...,
            show_progress_bar=...)``.
        device: Device the concatenated embeddings are moved to.

    Returns:
        ``(combined_emb, ratings)`` where ``combined_emb`` is the user
        embedding concatenated with the item embedding along dim 1 (on
        ``device``) and ``ratings`` is a float tensor built from the
        ``rating`` column (left on the CPU).
    """
    history_texts = list(df['history_text'])      # user text
    item_texts = list(df['target_item_text'])     # item text
    # Float targets so they can be used directly as regression labels.
    rating_tensor = torch.tensor(df['rating'].values, dtype=torch.float)

    print("Encoding user and item data...")
    # User texts are encoded first, then item texts.
    encoded_parts = [
        st_model.encode(texts, convert_to_tensor=True, show_progress_bar=True)
        for texts in (history_texts, item_texts)
    ]

    joined = torch.cat(encoded_parts, dim=1)
    return joined.to(device), rating_tensor

# Encode every split with two sentence encoders (the pretrained ST5 base model
# and the fine-tuned "ST5-Final" checkpoint) and write the combined embeddings
# and ratings to `saving_path`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

data_path = '/exports/eddie/scratch/s2550585/diss/dataset3/dataset'
domain = 'data'
train_df = load_data("train", data_path, domain)
test_df = load_data("test", data_path, domain)
valid_df = load_data("valid", data_path, domain)

# Create the output directory up front (before the expensive encoding) and in
# one atomic call rather than an exists-then-create pair.
saving_path = '/exports/eddie/scratch/s2550585/rating/data'
os.makedirs(saving_path, exist_ok=True)


def _encode_and_save_splits(st_model, suffix=''):
    """Encode train/valid/test with ``st_model`` and save the results.

    For each split, writes ``<split>_combined_emb<suffix>.pt`` and
    ``<split>_ratings<suffix>.pt`` under ``saving_path``.

    Args:
        st_model: Sentence encoder passed to ``load_and_process_data``.
        suffix: Text appended to the file stem to tell model variants apart.

    Returns:
        ``(train_emb, train_ratings, valid_emb, valid_ratings, test_emb,
        test_ratings)`` as returned by ``load_and_process_data``.
    """
    outputs = []
    for split_name, split_df in (('train', train_df), ('valid', valid_df), ('test', test_df)):
        combined_emb, ratings = load_and_process_data(split_df, st_model, device)
        # Save a CPU copy so the file can be loaded on a machine without a
        # GPU; a tensor saved from CUDA needs `map_location` to load there.
        torch.save(combined_emb.cpu(), f'{saving_path}/{split_name}_combined_emb{suffix}.pt')
        torch.save(ratings, f'{saving_path}/{split_name}_ratings{suffix}.pt')
        outputs += [combined_emb, ratings]
    print("Embeddings and ratings saved successfully!")
    return tuple(outputs)


print("Loading SentenceTransformer model...")
st_model = SentenceTransformer('sentence-transformers/sentence-t5-base', device=device)

(train_combined_emb, train_ratings,
 valid_combined_emb, valid_ratings,
 test_combined_emb, test_ratings) = _encode_and_save_splits(st_model)

#New Model
print("loading ST5-Final Model...")
st_model = SentenceTransformer('/exports/eddie/scratch/s2550585/diss/dataset/data/seq-item-eddie/best_model-seq-item-eddie', device=device)

# Same splits, re-encoded with the fine-tuned model; files get a `_new` suffix.
(train_combined_emb, train_ratings,
 valid_combined_emb, valid_ratings,
 test_combined_emb, test_ratings) = _encode_and_save_splits(st_model, suffix='_new')



In [None]:
#Rating Prediction using embeddings

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
import numpy as np
import os


class RatingModel(nn.Module):
    """Feed-forward regressor that maps an embedding to a single score.

    The network is a stack of ``Linear -> ReLU -> BatchNorm1d`` blocks: three
    blocks that keep the width at ``input_size``, then blocks narrowing to
    768, 512, 256 and 128 units, followed by a final ``Linear(128, 1)``.
    Linear weights use Kaiming-uniform initialisation (ReLU gain) and linear
    biases start at zero.

    Args:
        input_size: Number of features in each input embedding.
    """

    def __init__(self, input_size):
        super().__init__()
        # Consecutive entries are the (in, out) widths of each hidden block.
        widths = [input_size] * 4 + [768, 512, 256, 128]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.BatchNorm1d(fan_out)]
        # Output head: one scalar per sample, no activation.
        stack.append(nn.Linear(widths[-1], 1))
        self.model = nn.Sequential(*stack)
        self._initialize_weights()

    def _initialize_weights(self):
        """Re-initialise every Linear layer (Kaiming-uniform weight, zero bias)."""
        for layer in self.modules():
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu')
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, embeddings):
        """Return one prediction per row of ``embeddings``.

        The network output has a trailing dimension of size 1, which is
        squeezed away so the result lines up with a 1-D target tensor.
        """
        scores = self.model(embeddings)
        return scores.squeeze(1)
    

class UserItemDataset(Dataset):
    """Dataset yielding ``(embedding, rating)`` pairs by position.

    Args:
        embeddings: Indexable collection of per-sample embeddings.
        ratings: Tensor of targets; stored as float32 for regression.
    """

    def __init__(self, embeddings, ratings):
        # Cast once up front so every sample's target is already float.
        self.ratings = ratings.to(torch.float32)
        self.embeddings = embeddings

    def __len__(self):
        """Number of samples, taken from the ratings tensor."""
        n_samples = len(self.ratings)
        return n_samples

    def __getitem__(self, idx):
        """Return the embedding and rating stored at position ``idx``."""
        sample = self.embeddings[idx]
        target = self.ratings[idx]
        return sample, target
    

# Train RatingModel on the saved ST5 embeddings, evaluate on the validation
# split after every epoch, and keep the checkpoint with the lowest RMSE.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

num_epochs = 100
metric_to_monitor = 'rmse'
batch_size = 128

saving_path = '/exports/eddie/scratch/s2550585/rating/data'

# Load the embedding for the variant of ST5 Model  
print("ST5 MODEL") 

# map_location=device lets files that were saved from a GPU be loaded on a
# CPU-only machine (and keeps them on the GPU when one is available).
train_combined_emb = torch.load(os.path.join(saving_path, 'train_combined_emb.pt'), map_location=device)
train_ratings = torch.load(os.path.join(saving_path, 'train_ratings.pt'), map_location=device)
valid_combined_emb = torch.load(os.path.join(saving_path, 'valid_combined_emb.pt'), map_location=device)
valid_ratings = torch.load(os.path.join(saving_path, 'valid_ratings.pt'), map_location=device)

train_dataset = UserItemDataset(train_combined_emb, train_ratings)
val_dataset = UserItemDataset(valid_combined_emb, valid_ratings)

# BatchNorm1d raises in training mode when a batch holds a single sample, so
# the last training batch is dropped only in the case where it would have
# exactly one element. Every other dataset size trains on all samples.
drop_single_tail = len(train_dataset) % batch_size == 1
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              drop_last=drop_single_tail)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

input_size = train_combined_emb.shape[1]
model = RatingModel(input_size).to(device)

criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
# NOTE(review): T_max=10 is much shorter than num_epochs=100, so the cosine
# schedule will presumably rise and fall repeatedly rather than decay once -
# confirm this is intended.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

best_score = float('inf')
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for embeddings, labels in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        embeddings = embeddings.to(device)
        labels = labels.to(device)

        outputs = model(embeddings)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses.
    avg_loss = total_loss / len(train_dataloader)

    # ---- validation pass ----
    model.eval()
    true_batches = []
    predicted_batches = []

    with torch.no_grad():
        for embeddings, labels in val_dataloader:
            outputs = model(embeddings.to(device))

            # Collect whole batches on the CPU; concatenated once below
            # instead of extending Python lists element by element.
            true_batches.append(labels.cpu())
            predicted_batches.append(outputs.cpu())

    true_ratings = torch.cat(true_batches).numpy()
    predicted_ratings = torch.cat(predicted_batches).numpy()

    rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
    mae = mean_absolute_error(true_ratings, predicted_ratings)
    # One scheduler step per epoch, after that epoch's optimizer steps.
    scheduler.step()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}")

    # Checkpoint whenever the validation RMSE improves on the best so far.
    current_score = rmse
    if current_score < best_score:
        best_score = current_score
        torch.save(model.state_dict(), os.path.join(saving_path, 'st5_model.pt'))
        print(f"New best model saved with {metric_to_monitor.upper()}: {best_score:.4f}")
