In [50]:
from torch.utils.data import Dataset
import torch
from torch.utils.data import DataLoader

In [51]:
class datasetReader(Dataset):
    def __init__(self, df, dataset_name):
        self.df = df
        self.name = dataset_name
        print(f"{self.name} : {self.df.shape[0]}")

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        sd = self.df.iloc[idx]
        user = sd['user_id']
        item = sd['item_id']
        rating = sd['rating']
        return torch.tensor(user-1).long(), torch.tensor(item-1).long(), torch.tensor(rating).float()

def get_datasets(bucket='datasets', dataset='ml-25m', split=['test', 'train', 'val']):
    from pyarrow import fs, parquet
    valid_splits = ['test', 'train', 'val']
    data_map = {}
    minio = fs.S3FileSystem(
        endpoint_override='http://minio-service.kubeflow:9000',
         access_key='minio',
         secret_key='minio123',
         scheme='http')

    if type(split) is not list:
        split = [split]

    for dataset_name in valid_splits:
        paraquet_data = minio.open_input_file(f'{bucket}/{dataset}/{dataset_name}.parquet.gzip')
        df = parquet.read_table(paraquet_data).to_pandas()
        data_map['n_users'] = max(data_map['n_users'], df.user_id.max())
        data_map['n_items'] = max(data_map['n_items'], df.item_id.max())
        if dataset_name in split:
            data_map[dataset_name] = datasetReader(df, dataset_name=dataset_name)
    
    assert list(data_map.keys()) == split, f"Mismatched or invalid splits. Received {split} but can only process {valid_splits}"
    return data_map


In [60]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import numpy as np

### Testing code.
def negative_sampling(ratings, num_ng_test=10):
    print('x')
    item_pool = set(ratings['item_id'].unique())
    print('y')
    interact_status = (
			ratings.groupby('user_id')['item_id']
			.apply(set)
			.reset_index()
			.rename(columns={'item_id': 'interacted_items'}))
    print('z')
    interact_status['negative_samples'] = interact_status['interacted_items'].apply(lambda x: np.random.choice(list(item_pool - x), num_ng_test))
    interact_status['rating'] = 0.0
    interact_status['timestamp'] = 1051631039
    interact_status = interact_status.drop(columns=['interacted_items']).explode('negative_samples').rename(columns={'negative_samples':'item_id'})
    print('a')
    #ret = ratings.append(interact_status, ignore_index=True)
    ret = pd.concat([ratings, interact_status], ignore_index=True)
    return ret

def split_dataset(path_to_ml_25m = "/Users/shanoop/Downloads/ml-25m",  random_state: int = 42):
    train_ratio = 0.75
    validation_ratio = 0.15
    test_ratio = 0.10
	
    num_ng_test = 10
    num_ng = 10

    names = ['user_id', 'item_id', 'rating', 'timestamp']
    ratings_df = pd.read_csv(os.path.join(path_to_ml_25m, 'ratings.csv'),names=names, index_col=False, skiprows=1)
    ratings_df = ratings_df.iloc[:806400]
    ratings_df = negative_sampling(ratings_df)
	

    n_users = ratings_df.user_id.max()
    n_items = ratings_df.item_id.max()

    # train is now 75% of the entire data set
    train, test = train_test_split(
        ratings_df,                                    
        test_size=1 - train_ratio,
        random_state=random_state)
	
    # preprocess
    #train_ratings, test_ratings = leave_one_out(preprocess_ratings)

	

    # test is now 10% of the initial data set
    # validation is now 15% of the initial data set
    val, test = train_test_split(   
        test,
        test_size=test_ratio / (test_ratio + validation_ratio),
        random_state=random_state)
	
    
    return train, test, val, (n_users, n_items)

def get_datasets_local(bucket='datasets', dataset='ml-25m', split=['test', 'train', 'val']):
    train, test, val, (n_users, n_items) = split_dataset()

    data_map = {'n_users': n_users, 'n_items': n_items, 'train': train, 'test': test, 'val': val}

    for dataset_name in ['train', 'test', 'val']:
        data_map[dataset_name] = datasetReader(data_map[dataset_name], dataset_name=dataset_name)
    #    data_map['n_users'] = max(data_map['n_users'], df.user_id.max())
    #    data_map['n_items'] = max(data_map['n_items'], df.item_id.max())
    
    assert data_map['n_users'] == n_users
    assert data_map['n_items'] == n_items
    return data_map

### Model training component

In [65]:
def train_model(mlflow_experiment_name='recommender', mlflow_run_id=None, mlflow_tags={},
                hot_reload_model_run_id=None,
                model_embedding_factors=20, model_learning_rate=1e-3,model_hidden_dims=256, model_dropout_rate=0.2,
                optimizer_step_size=10, optimizer_gamma=0.1,
                training_epochs=30,
                train_batch_size=64, test_batch_size=64, shuffle_training_data=True, shuffle_testing_data=True):
    input_params = {}
    for k, v in locals().items():
        if k == 'input_params':
            continue
        input_params[k] = v
    import torch
    from torch.autograd import Variable
    from torch.utils.data import DataLoader
    import mlflow
    from torchinfo import summary
    from mlflow.models import infer_signature

    
    class BiasedMatrixFactorization(torch.nn.Module):
        def __init__(self, n_users, n_items, n_factors, hidden_dim, dropout_rate):
            super().__init__()
            self.n_items = n_items
            self.user_factors = torch.nn.Embedding(n_users+1, 
                                               n_factors,
                                               sparse=False)
            self.item_factors = torch.nn.Embedding(n_items+1, 
                                               n_factors,
                                               sparse=False)
            #self.user_biases = torch.nn.Embedding(n_users+1, 
            #                                      1,
            #                                      sparse=True)
            #self.item_biases = torch.nn.Embedding(n_items+1,
            #                                      1,
            #                                      sparse=True)
        
            self.linear = torch.nn.Linear(in_features=n_factors, out_features=hidden_dim)
            self.linear2 = torch.nn.Linear(in_features=hidden_dim, out_features=1)
            self.dropout = torch.nn.Dropout(p=dropout_rate)
            self.relu = torch.nn.ReLU()
        
        def forward(self, user, item):
            user_embedding = self.user_factors(user)
            item_embedding = self.item_factors(item)
            embeddding_vector = torch.mul(user_embedding, item_embedding)
            x = self.relu(self.linear(embeddding_vector))
            x = self.dropout(x)
            #rating = self.sigmoid(self.linear(embeddding_vector))
            rating = self.linear2(x)
            return rating
    
    dataset_map = get_datasets_local(split=['train', 'test'])
    
    if hot_reload_model_run_id is not None:
        model_uri = f"runs:/{hot_reload_model_run_id}/model"
        model = mlflow.pytorch.load_model(model_uri)
    else:
        model = BiasedMatrixFactorization(dataset_map['n_users'], dataset_map['n_items'], n_factors=model_embedding_factors, hidden_dim=model_hidden_dims, dropout_rate=model_dropout_rate)

    optimizer = torch.optim.SGD(model.parameters(), lr=model_learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=optimizer_step_size, gamma=optimizer_gamma)
    loss_func = torch.nn.L1Loss()
    train_dataloader = DataLoader(dataset_map['train'], batch_size=train_batch_size, shuffle=shuffle_training_data)
    test_dataloader = DataLoader(dataset_map['test'], batch_size=test_batch_size, shuffle=shuffle_testing_data)

    # Set our tracking server uri for logging
    mlflow.set_tracking_uri(uri="http://192.168.1.104:8080")

    # Create a new MLflow Experiment
    mlflow.set_experiment(mlflow_experiment_name)

    with mlflow.start_run(run_id=mlflow_run_id):
        for k,v in input_params.items():
            if 'mlflow_' not in k:
                mlflow.log_param(k, v)
        mlflow.log_param("loss_function", loss_func.__class__.__name__)
        #mlflow.log_param("metric_function", metric_fn.__class__.__name__,
        mlflow.log_param("optimizer", "SGD")
        mlflow.log_params({'n_user': dataset_map['n_users'], 'n_items': dataset_map['n_items']})
    
        for k,v in mlflow_tags.items():
            mlflow.set_tag(k, v)

        with open("model_summary.txt", "w") as f:
            f.write(str(summary(model)))
        mlflow.log_artifact("model_summary.txt")

        model_signature = None

        for train_iter in range(training_epochs):
            print(train_iter)
            model.train()
            t_loss = 0
            t_count = 0
            for row, col, rating in train_dataloader:
                # Predict and calculate loss
                #try:
                prediction = model(row, col)
                if model_signature is None:
                    model_signature = infer_signature({'user': row.cpu().detach().numpy(), 'movie': col.cpu().detach().numpy()}, prediction.cpu().detach().numpy())

                #except Exception as e:
                #print(f"R:{row}, C:{col}")
                loss = loss_func(prediction, rating.unsqueeze(1))
                t_loss += loss
                t_count += 1

                # Backpropagate
                loss.backward()

                # Update the parameters
                optimizer.step()
                optimizer.zero_grad()
            mlflow.log_metric("avg_training_loss", f"{(t_loss/t_count):3f}", step=train_iter)
            scheduler.step()
            model.eval()
            te_loss = 0
            te_count = 0
            print('Evaluating')
            with torch.no_grad():
                #HR, NDCG = metrics(model, test_dataloader, 5)
                for row, col,rating in test_dataloader:
                    prediction = model(row, col)
                    loss = loss_func(prediction, rating.unsqueeze(1))
                    te_loss += loss
                    te_count += 1
            mlflow.log_metric("avg_testing_loss", f"{(te_loss/te_count):3f}", step=train_iter)
            #print(f"HR: {HR} NDCG:{NDCG}")
            print(f"Test loss: {te_loss/te_count}")
            print(f"Train loss: {t_loss/t_count}")

        mlflow.pytorch.log_model(model, "model", signature=model_signature)

train_model(train_batch_size=64, test_batch_size=64, training_epochs=30, mlflow_tags={'negative_sampling': 'True'})

x
y
z
a
train : 645570
test : 86076
val : 129114


2024/05/12 16:17:19 INFO mlflow.types.utils: MLflow 2.9.0 introduces model signature with new data types for lists and dictionaries. For input such as Dict[str, Union[scalars, List, Dict]], we infer dictionary values types as `List -> Array` and `Dict -> Object`. 


0
Evaluating
Test loss: 1.0483975410461426
Train loss: 1.163323163986206
1
Evaluating
Test loss: 1.0204118490219116
Train loss: 1.0520703792572021
2
Evaluating
Test loss: 1.0143870115280151
Train loss: 1.0359437465667725
3
Evaluating
Test loss: 1.0118434429168701
Train loss: 1.0291141271591187
4
Evaluating
Test loss: 1.0107758045196533
Train loss: 1.0246461629867554
5
Evaluating
Test loss: 1.0099152326583862
Train loss: 1.0214537382125854
6
Evaluating
Test loss: 1.0109100341796875
Train loss: 1.0196093320846558
7
Evaluating
Test loss: 1.0098681449890137
Train loss: 1.0177663564682007
8
Evaluating
Test loss: 1.0097213983535767
Train loss: 1.0160810947418213
9
Evaluating
Test loss: 1.0085365772247314
Train loss: 1.014838695526123
10
Evaluating
Test loss: 1.0084292888641357
Train loss: 1.0141973495483398
11
Evaluating
Test loss: 1.0085948705673218
Train loss: 1.0143283605575562
12
Evaluating
Test loss: 1.0084978342056274
Train loss: 1.0139269828796387
13
Evaluating
Test loss: 1.0084267854



## Pipeline definition

In [None]:
import kfp.dsl as dsl
client = kfp.Client() # change arguments accordingly
@dsl.pipeline(
  name='Model training pipeline',
  description='A pipeline to train models on the movielens dataset for recommenders'
)
def training_pipeline(
    minio_bucket:str='datasets',
    trainig_batch_size: int = 1,
    training_learning_rate:float = 0.001,
    training_factors: int = 20,
    optimizer_step_size: float= 25.0,
    optimizer_gamma: float = 0.1):
    check_dataset
    train_model
    validate_model
    model_quality_gate
    promote_model

    download_dataset = download_op()
    unzip_folder = unzip_op(download_dataset.output)
    ratings_parquet_op = csv_to_parquet_op(unzip_folder.outputs['ratings_output'])
    movies_parquet_op = csv_to_parquet_op(unzip_folder.outputs['movies_output'])
    split_op = split_dataset_op(ratings_parquet_op.output,random_state=random_init)
    u1 = upload_to_minio_op(movies_parquet_op.output, upload_file_name='movies.parquet.gzip', bucket=minio_bucket)
    u2 = upload_to_minio_op(split_op.output, bucket=minio_bucket)
    qa_component_op(bucket=minio_bucket).after(u2)

# Create a pipeline run, using the client you initialized in a prior step.
kfp.compiler.Compiler().compile(
    pipeline_func=dataprep_pipeline,
    package_path='dataPrep_pipeline.yaml')

In [69]:

def evaluate_model(model_run_id, top_k=50, threshold=3, val_batch_size=32):
    from collections import defaultdict
    import torch
    import mlflow.pytorch
    from sklearn.metrics import mean_squared_error

    mlflow.set_tracking_uri(uri="http://192.168.1.104:8080")

    model_uri = f"runs:/{model_run_id}/model"
    recommendation_model = mlflow.pytorch.load_model(model_uri)

    def calculate_precision_recall(user_ratings, k, threshold):
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
        n_rec_k = sum(est >= threshold for est, _ in user_ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold) for est, true_r in user_ratings[:k])

        precision = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recall = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
        return precision, recall

    user_ratings_comparison = defaultdict(list)

    dataset_map = get_datasets_local(split=['val'])
    val_dataloader = DataLoader(dataset_map['val'], batch_size=val_batch_size, shuffle=True)

    y_pred = []
    y_true = []

    recommendation_model.eval()

    with torch.no_grad():
        for users, movies, ratings in val_dataloader:
            output = recommendation_model(users, movies)

            y_pred.append(output.sum().item() / len(users))
            y_true.append(ratings.sum().item() / len(users))

            for user, pred, true in zip(users, output, ratings):
                user_ratings_comparison[user.item()].append((pred[0].item(), true.item()))

    user_precisions = dict()
    user_based_recalls = dict()

    k = top_k

    for user_id, user_ratings in user_ratings_comparison.items():
        precision, recall = calculate_precision_recall(user_ratings, k, threshold)
        user_precisions[user_id] = precision
        user_based_recalls[user_id] = recall


    average_precision = sum(prec for prec in user_precisions.values()) / len(user_precisions)
    average_recall = sum(rec for rec in user_based_recalls.values()) / len(user_based_recalls)
    rms = mean_squared_error(y_true, y_pred, squared=False)

    print(f"precision_{k}: {average_precision:.4f}")
    print(f"recall_{k}: {average_recall:.4f}")
    print(f"rms: {rms:.4f}")
    mlflow.log_metric(f"precision_{k}", average_precision, run_id=model_run_id)
    mlflow.log_metric(f"recall_{k}", average_recall, run_id=model_run_id)
    mlflow.log_metric("rms", rms, run_id=model_run_id)

evaluate_model('c76393853a034a938656f990e6dc783f')

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/05/12 16:50:21 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


x
y
z
a
train : 645570
test : 86076
val : 129114
precision_50: 0.7344
recall_50: 0.9575
rms: 0.3358


