In [6]:
from sklearn.model_selection import train_test_split
import pymc as pm
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [7]:
# Rating files are inside the project
ratings_path = 'u.data'

# Loading the dataset
ratings = pd.read_csv(ratings_path, sep='\t', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])

In [8]:
# Creating table to construct the user-item matrix
all_user_item_matrix = ratings.pivot_table(index='user_id', columns='movie_id', values='rating', fill_value=0)

In [9]:
selected_users = all_user_item_matrix.index[:15]
selected_items = all_user_item_matrix.columns[:]
user_item_matrix = all_user_item_matrix.loc[selected_users, selected_items]

In [10]:
user_item_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,5.0,0.0,0.0,5.0,5.0,5.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
from sklearn.model_selection import train_test_split
import pymc as pm
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from itertools import product

# Rating files are inside the project
ratings_path = 'u.data'

# Loading the dataset
ratings = pd.read_csv(ratings_path, sep='\t', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])

# Creating table to construct the user-item matrix
all_user_item_matrix = ratings.pivot_table(index='user_id', columns='movie_id', values='rating', fill_value=0)

selected_users = all_user_item_matrix.index[:15]
selected_items = all_user_item_matrix.columns[:15]
user_item_matrix = all_user_item_matrix.loc[selected_users, selected_items]

# Split into training and testing sets
X_train, X_test = train_test_split(user_item_matrix, test_size=0.3, random_state=42)

n_users, n_items = X_train.shape

# Hyperparameter ranges to tune
n_factors_list = [5, 10, 20]
alpha_list = [0.1, 0.3, 0.5]

best_hyperparams = None
best_mse = float('inf')

# Grid search over the hyperparameters
for n_factors, alpha in product(n_factors_list, alpha_list):
    print(f"Training with n_factors={n_factors}, alpha={alpha}")
    
    with pm.Model() as model:
        # User and item latent feature matrices
        user_features = pm.Gamma('user_features', alpha=alpha, beta=alpha, shape=(n_users, n_factors))
        item_features = pm.Gamma('item_features', alpha=alpha, beta=alpha, shape=(n_items, n_factors))

        # Dot product of user and item features to get the rate parameter lambda for the Poisson distribution
        rate = pm.math.dot(user_features, item_features.T)

        # We use Poisson distribution for observations
        ratings = pm.Poisson('ratings', mu=rate, observed=X_train)

        trace = pm.sample(1000, chains=3, tune=1000, target_accept=0.95, progressbar=False)

    # Extracting the predicted ratings from the posterior distribution
    posterior_user_features = trace.posterior['user_features'].mean(dim=('chain', 'draw'))
    posterior_item_features = trace.posterior['item_features'].mean(dim=('chain', 'draw'))

    # Predicted ratings on the validation set
    predicted_ratings = np.dot(posterior_user_features, posterior_item_features.T)
    predicted_ratings_flat = predicted_ratings.flatten()

    # Calculate MSE on the validation set
    true_ratings = X_train.values.flatten()
    mse = mean_squared_error(true_ratings, predicted_ratings_flat)

    print(f"Validation MSE: {mse}")

    # Update the best hyperparameters if the current model is better
    if mse < best_mse:
        best_mse = mse
        best_hyperparams = {'n_factors': n_factors, 'alpha': alpha}

print(f"Best hyperparameters: {best_hyperparams} with MSE: {best_mse}")

# You can use the best hyperparameters to retrain the model on the full dataset
n_factors = best_hyperparams['n_factors']
alpha = best_hyperparams['alpha']

with pm.Model() as best_model:
    # User and item latent feature matrices with best hyperparameters
    user_features = pm.Gamma('user_features', alpha=alpha, beta=alpha, shape=(n_users, n_factors))
    item_features = pm.Gamma('item_features', alpha=alpha, beta=alpha, shape=(n_items, n_factors))

    # Dot product of user and item features
    rate = pm.math.dot(user_features, item_features.T)

    # Poisson distribution for observations
    ratings = pm.Poisson('ratings', mu=rate, observed=X_train)

    trace = pm.sample(1000, chains=3, tune=1000, target_accept=0.95)

# Model is now trained with the best hyperparameters and you can further use it for predictions on test data


Training with n_factors=5, alpha=0.1


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 3 jobs)
NUTS: [user_features, item_features]
Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 25 seconds.
There were 28 divergences after tuning. Increase `target_accept` or reparameterize.
We recommend running at least 4 chains for robust computation of convergence diagnostics
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


Validation MSE: 2.1039052203673183
Training with n_factors=5, alpha=0.3


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 3 jobs)
NUTS: [user_features, item_features]
Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 26 seconds.
There were 6 divergences after tuning. Increase `target_accept` or reparameterize.
We recommend running at least 4 chains for robust computation of convergence diagnostics
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


Validation MSE: 2.0770526577829833
Training with n_factors=5, alpha=0.5


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 3 jobs)
NUTS: [user_features, item_features]
Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 15 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


Validation MSE: 2.027215714321588
Training with n_factors=10, alpha=0.1


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 3 jobs)
NUTS: [user_features, item_features]
Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 61 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


Validation MSE: 3.6845835471909374
Training with n_factors=10, alpha=0.3


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 3 jobs)
NUTS: [user_features, item_features]
Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 22 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details


Validation MSE: 2.2466707036214295
Training with n_factors=10, alpha=0.5


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 3 jobs)
NUTS: [user_features, item_features]
Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 14 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


Validation MSE: 2.205512492044414
Training with n_factors=20, alpha=0.1


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 3 jobs)
NUTS: [user_features, item_features]
Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 71 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


Validation MSE: 4.862215775520062
Training with n_factors=20, alpha=0.3


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 3 jobs)
NUTS: [user_features, item_features]
Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 25 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


Validation MSE: 2.682629680770151
Training with n_factors=20, alpha=0.5


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 3 jobs)
NUTS: [user_features, item_features]
Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 16 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


Validation MSE: 2.7373646800949065
Best hyperparameters: {'n_factors': 5, 'alpha': 0.5} with MSE: 2.027215714321588


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 3 jobs)
NUTS: [user_features, item_features]


Output()

Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 16 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
