In [1]:
import torch.nn as nn
import torch
from utils import *
from model import AutoRecItemBased
from torch.optim import lr_scheduler
from train import train_model_item_based

# Exploratory Data Analysis

In [2]:
DATA_DIR = "data/ml-100k"
# Read the ratings table plus the user and item counts from DATA_DIR.
data, num_users, num_items = read_data_ml100k(DATA_DIR)
# One minus (rows in `data` / number of user-item pairs). This is a ratio,
# so it is a float (the original `int` annotation was incorrect).
sparsity: float = 1 - len(data) / (num_users * num_items)
print(f'number of users: {num_users}, number of items: {num_items}')
print(f'matrix sparsity: {sparsity:f}')
print(data.head(10))


number of users: 943, number of items: 1682
matrix sparsity: 0.936953
   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
5      298      474       4  884182806
6      115      265       2  881171488
7      253      465       5  891628467
8      305      451       3  886324817
9        6       86       3  883603013


In [3]:
# Split dataset.
# NOTE(review): with split_mode='seq-aware' the recorded run yields 943 test
# rows (= num_users), so this presumably holds out one interaction per user —
# confirm against split_data_ml100k.
train_data, test_data = split_data_ml100k(data, num_users, split_mode='seq-aware')
print("Length of train data:",len(train_data))
print("Length of test data",len(test_data))

# Convert into user-item interaction matrix.
# The recorded shape is (num_items, num_users): one row per item, one column
# per user, with 0 where no rating is present.
users_train, items_train, _, train_inter_mat = load_data_ml100k(train_data, num_users,num_items)
print("user interaction matrix is a 2D matrix where rows represent items and columns represent users and the value of each element is the rating.")
print()
print("user-item interaction matrix training set shape:",train_inter_mat.shape)
print("user-item interaction matrix training set:\n",train_inter_mat)

users_test, items_test, _, test_inter_mat = load_data_ml100k(test_data, num_users,num_items)
print("user-item interaction matrix test set shape:",test_inter_mat.shape)
print("user-item interaction matrix test set:\n",test_inter_mat)
# Convert to PyTorch tensors
train_inter_mat_tensor = torch.tensor(train_inter_mat, dtype=torch.float32)
test_inter_mat_tensor = torch.tensor(test_inter_mat, dtype=torch.float32)

# Create TensorDataset (each sample is one row of the interaction matrix)
train_inter_dataset = TensorDataset(train_inter_mat_tensor)
test_inter_dataset = TensorDataset(test_inter_mat_tensor)

# Create DataLoader: only the training rows are shuffled; evaluation order is
# kept fixed.
train_iter = DataLoader(train_inter_dataset, shuffle=True, batch_size=512)
test_iter = DataLoader(test_inter_dataset, batch_size=512)

Lenght of train data: 99057
Length of test data 943
user interaction matrix is a 2D matrix where rows represent items and columns represent users and the value of each element is the ranking.

user-item interaction matrix training set shape: (1682, 943)
user-item interaction matrix training set:
 [[5. 4. 0. ... 5. 0. 0.]
 [3. 0. 0. ... 0. 0. 5.]
 [4. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
user-item interaction matrix test set shape: (1682, 943)
user-item interaction matrix test set:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Create model and train it

In [5]:
# Pick the compute device. Later cells reference `mps_device` and `model`
# unconditionally, so both must exist even when MPS cannot be used; in that
# case we explain why and fall back to the CPU instead of skipping training.
# The name `mps_device` is kept for compatibility with those later cells.
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
else:
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
    print("Falling back to CPU.")
    mps_device = torch.device("cpu")

dataloaders = {'train':train_iter, 'val':test_iter}

dataset_sizes = {'train':len(train_iter.dataset),'val':len(test_iter.dataset)}

model = AutoRecItemBased(100,num_users,dropout=0.005).to(mps_device)

learning_rate = 5e-2
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
# Decay LR by a factor of 1e-5 every 50 epochs (step_size=50).
# NOTE(review): with gamma=1e-5 the learning rate drops from 5e-2 to 5e-7
# after the first 50 epochs, which all but freezes the remaining 450 epochs —
# confirm this schedule is intended.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=50, gamma=1e-5)
model = train_model_item_based(model,dataloaders,dataset_sizes,mps_device,optimizer,exp_lr_scheduler,num_epochs=500)


Training complete in 0m 13s
Best loss: 2.052986


# Inference testing and analysis

In [8]:
# Build the interaction matrix from the complete ratings table (`data`), i.e.
# training and held-out interactions together.
users, items, _, all_inter_mat = load_data_ml100k(data, num_users,num_items)
print("user interaction matrix is a 2D matrix where rows represent items and columns represent users and the value of each element is the rating.")
print()
# These labels describe the full dataset; the matrix is not the training split.
print("user-item interaction matrix full dataset shape:",all_inter_mat.shape)
print("user-item interaction matrix full dataset:\n",all_inter_mat)
print()
# Compare the same slice (first row, columns 100-139) across the three
# matrices to see which entries ended up in the train and test splits.
print("First row from all interaction matrix:\n",all_inter_mat[0][100:140])
print("First row from train interaction matrix:\n",train_inter_mat[0][100:140])
print("First row from test interaction matrix:\n",test_inter_mat[0][100:140])
# Convert to PyTorch tensors
all_inter_mat_tensor = torch.tensor(all_inter_mat, dtype=torch.float32,device=mps_device)
# Flattened row by row so it lines up element-wise with the flattened
# per-row predictions produced in the inference cell.
actual_rating = all_inter_mat_tensor.flatten().cpu().numpy()

user interaction matrix is a 2D matrix where rows represent items and columns represent users and the value of each element is the ranking.

user-item interaction matrix training set shape: (1682, 943)
user-item interaction matrix training set:
 [[5. 4. 0. ... 5. 0. 0.]
 [3. 0. 0. ... 0. 0. 5.]
 [4. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

First row from all interaction matrix:
 [3. 3. 0. 0. 0. 4. 0. 4. 4. 0. 0. 0. 0. 0. 0. 0. 4. 0. 0. 4. 4. 0. 0. 3.
 4. 0. 0. 4. 0. 5. 4. 0. 0. 5. 0. 0. 3. 4. 0. 0.]
First row from train interaction matrix:
 [3. 3. 0. 0. 0. 4. 0. 4. 4. 0. 0. 0. 0. 0. 0. 0. 4. 0. 0. 4. 4. 0. 0. 3.
 4. 0. 0. 4. 0. 5. 4. 0. 0. 0. 0. 0. 3. 4. 0. 0.]
First row from test interaction matrix:
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 5. 0. 0. 0. 0. 0. 0.]


In [9]:
# Run the trained model over every row of the full interaction matrix and keep
# only the predictions for entries that have an observed rating.
# NOTE(review): the rows fed to the model come from the full matrix, which
# includes the held-out test ratings — confirm this is the intended setup.
model.eval()
row_predictions = []
with torch.no_grad():
    for user_item_inter_mat in all_inter_mat_tensor:
        # sign() is 1 where a rating exists and 0 elsewhere.
        mask = torch.sign(user_item_inter_mat)
        output = model(user_item_inter_mat)
        masked_output = mask * output
        row_predictions.append(masked_output.cpu().numpy().ravel())

# Collect rows in a list and concatenate once: calling np.append inside the
# loop recopies the whole accumulated array on every iteration.
# The float64 cast keeps the dtype the original np.append-based code produced.
if row_predictions:
    predicted_rating = np.concatenate(row_predictions).astype(np.float64)
else:
    predicted_rating = np.array([])

In [10]:
# Snap the continuous predictions to whole-number ratings, then show the first
# 50 entries next to the ground truth for a quick visual comparison.
predicted_rating_rounded = predicted_rating.round()
for heading, ratings in (
    ("Actual rating:\n", actual_rating),
    ("Predicted rating:\n", predicted_rating_rounded),
):
    print(heading, ratings[:50])

Actual rating:
 [5. 4. 0. 0. 4. 4. 0. 0. 0. 4. 0. 0. 3. 0. 1. 5. 4. 5. 0. 3. 5. 0. 5. 0.
 5. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 5. 0. 0. 4. 5. 5. 4. 5. 0. 0. 0.
 2. 0.]
Predicted rating:
 [4. 4. 0. 0. 4. 3. 0. 0. 0. 4. 0. 0. 3. 0. 3. 4. 0. 4. 0. 3. 0. 0. 4. 0.
 4. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 0. 0. 4. 4. 5. 4. 4. 0. 0. 0.
 2. 0.]


In [11]:
# A prediction is correct when its rounded value equals the observed rating.
# Entries with no observed rating (0) are excluded. The comparison is done as
# one vectorised NumPy expression instead of a Python-level loop over every
# element of the flattened matrix.
number_of_correct_prediction = (
    (predicted_rating_rounded == actual_rating) & (actual_rating != 0)
).sum()
print("Number of correct rating prediction:", number_of_correct_prediction)
# The denominator is the number of rows in the ratings table.
print("Percentage of correct prediction:", number_of_correct_prediction/len(data) *100)

Number of correct rating prediction: 33102
Percentage of correct prediction: 33.102


# Appendix

## Illustration of how to evaluate predictions in the inference analysis

In [None]:
# Toy ground-truth ratings; 0 marks an unobserved entry.
# Named `toy_ratings` / `observed_mask` so this illustration does not overwrite
# the notebook's `test_data` split or the `mask` used during inference.
toy_ratings = torch.tensor([
    [5, 0, 3],
    [0, 4, 0],
    [2, 0, 0]
], dtype=torch.float32)
print("Test data:")
print(toy_ratings)

# Model's predictions (a dense output: one value for every entry)
predictions = torch.tensor([
    [4.8, 2.1, 3.2],
    [1.5, 3.9, 2.0],
    [2.1, 1.0, 0.5]
], dtype=torch.float32)
print("Prediction:")
print(predictions)

# Mask using torch.sign: 1 where a rating was observed, 0 elsewhere
observed_mask = torch.sign(toy_ratings)
print("mask")
print(observed_mask)

# Masked predictions: zero out predictions for unobserved entries so they
# contribute nothing to the error
masked_predictions = predictions * observed_mask
print("Masked prediction")
print(masked_predictions)

# Calculate RMSE only for observed entries: sum the squared errors, then
# divide by the number of observed ratings rather than by the matrix size
mse_loss = nn.MSELoss(reduction='sum')
squared_diff = mse_loss(toy_ratings * observed_mask, masked_predictions)
rmse = torch.sqrt(squared_diff / torch.sum(observed_mask))
print(f'RMSE: {rmse:.4f}')


