In [17]:
from torch.utils.data import Dataset
import torch
from torch.utils.data import DataLoader

In [None]:


def get_datasets(bucket='datasets', dataset='ml-25m', split=['test', 'train', 'val']):
    """Load the parquet splits of a dataset from MinIO and wrap the requested ones.

    All three splits ('test', 'train', 'val') are always read, because
    ``n_users`` / ``n_items`` are computed as the largest ``user_id`` /
    ``item_id`` seen across every split, not only the requested ones.

    Args:
        bucket: Name of the MinIO bucket holding the dataset.
        dataset: Folder (prefix) inside the bucket; files are expected at
            ``{bucket}/{dataset}/{split}.parquet.gzip``.
        split: A single split name or a list/tuple/set of split names to
            return. Every name must be one of 'test', 'train', 'val'.

    Returns:
        dict with one ``datasetReader`` per requested split (keyed by split
        name) plus the keys ``'n_users'`` and ``'n_items'``.
        NOTE(review): these two values are maximum ids, not counts; confirm
        whether callers that size embeddings need ``+ 1``.

    Raises:
        ValueError: if ``split`` contains a name that is not a valid split.
    """
    valid_splits = ['test', 'train', 'val']

    # Normalise to a list: a bare name (or any non-collection) becomes a
    # one-element list, collections are copied so the caller's object (and the
    # shared default list) is never touched.
    requested = list(split) if isinstance(split, (list, tuple, set)) else [split]

    # Fail fast, before importing pyarrow or touching the network.
    unknown = [name for name in requested if name not in valid_splits]
    if unknown:
        raise ValueError(
            f"Mismatched or invalid splits. Received {requested} but can only process {valid_splits}"
        )

    # Imports stay function-local, as in the original definition.
    import os
    from pyarrow import fs, parquet

    # Credentials can be overridden through the environment; the fallbacks are
    # the values that were previously hard-coded, so existing setups keep working.
    minio = fs.S3FileSystem(
        endpoint_override='http://minio-service.kubeflow:9000',
        access_key=os.environ.get('MINIO_ACCESS_KEY', 'minio'),
        secret_key=os.environ.get('MINIO_SECRET_KEY', 'minio123'),
        scheme='http')

    data_map = {}
    user_id_maxima = []
    item_id_maxima = []

    for dataset_name in valid_splits:
        path = f'{bucket}/{dataset}/{dataset_name}.parquet.gzip'
        # Close the remote file handle as soon as the table has been read.
        with minio.open_input_file(path) as parquet_file:
            df = parquet.read_table(parquet_file).to_pandas()

        user_id_maxima.append(df.user_id.max())
        item_id_maxima.append(df.item_id.max())

        if dataset_name in requested:
            data_map[dataset_name] = datasetReader(df, dataset_name=dataset_name)

    # Largest ids over all splits, regardless of which splits were requested.
    data_map['n_users'] = max(user_id_maxima)
    data_map['n_items'] = max(item_id_maxima)
    return data_map


### Test Dataset

### Model training component

In [None]:


# Run the training step directly from the notebook with explicit
# hyper-parameters and MLflow tags.
train_model(
    train_batch_size=64,
    test_batch_size=64,
    training_epochs=30,
    model_learning_rate=1e-2,
    mlflow_tags={
        'negative_sampling': 'True',
        'testing_sample': 'tracking',
    },
)

x
y
z
a
train : 5145
test : 686
val : 1029


2024/05/13 01:48:51 INFO mlflow.types.utils: MLflow 2.9.0 introduces model signature with new data types for lists and dictionaries. For input such as Dict[str, Union[scalars, List, Dict]], we infer dictionary values types as `List -> Array` and `Dict -> Object`. 


0
Evaluating
Test loss: 1.1749027967453003
Train loss: 1.4642317295074463
1
Evaluating
Test loss: 1.1438854932785034
Train loss: 1.1712555885314941
2
Evaluating
Test loss: 1.121619462966919
Train loss: 1.141432523727417
3
Evaluating
Test loss: 1.102582573890686
Train loss: 1.1218864917755127
4
Evaluating
Test loss: 1.090576410293579
Train loss: 1.113316535949707
5
Evaluating
Test loss: 1.0769343376159668
Train loss: 1.0954406261444092
6
Evaluating
Test loss: 1.0638604164123535
Train loss: 1.0800702571868896
7
Evaluating
Test loss: 1.0568610429763794
Train loss: 1.0677101612091064
8
Evaluating
Test loss: 1.0546883344650269
Train loss: 1.057148814201355
9
Evaluating
Test loss: 1.04603910446167
Train loss: 1.046576976776123
10
Evaluating
Test loss: 1.0394843816757202
Train loss: 1.037510633468628
11
Evaluating
Test loss: 1.041893482208252
Train loss: 1.042807936668396
12
Evaluating
Test loss: 1.0418674945831299
Train loss: 1.0450609922409058
13
Evaluating
Test loss: 1.039739727973938
Trai



### Model validation

In [None]:


# Validate a model identified by a hard-coded id.
# NOTE(review): presumably an MLflow run id produced by a training run — confirm;
# as a literal it has to be updated by hand whenever the model is retrained.
validate_model('0b2bb12d3db94a938005dfe1651a1d9f')

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/05/13 01:50:04 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


x
y
z
a
train : 5145
test : 686
val : 1029
precision_50: 0.7442
recall_50: 0.8649
rms: 0.2836




### Model quality gate

In [None]:
def model_quality_gate():
    """Placeholder for the model quality gate step.

    Not implemented yet: takes no arguments, performs no checks and
    returns ``None``.
    """
    return None

### Model registration

In [None]:




# Promote a model identified by a hard-coded id.
# NOTE(review): this id differs from the one passed to validate_model
# ('0b2bb12d3db94a938005dfe1651a1d9f') — confirm the promoted model is the
# one that was actually validated.
promote_model('9cc6321b8dcf4e43a1fd6b339107d10e')

Registered model 'recommender_production' already exists. Creating a new version of this model...
2024/05/13 00:49:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: recommender_production, version 4
Created version '4' of model 'recommender_production'.


## Pipeline definition

In [None]:
# `import kfp.dsl as dsl` binds only the name `dsl`; import the package (and
# its compiler submodule) explicitly so `kfp.Client` and `kfp.compiler`
# resolve when this cell is run on a fresh kernel.
import kfp
import kfp.compiler
import kfp.dsl as dsl

client = kfp.Client() # change arguments accordingly
@dsl.pipeline(
  name='Model training pipeline',
  description='A pipeline to train models on the movielens dataset for recommenders'
)
def training_pipeline(
    minio_bucket:str='datasets',
    trainig_batch_size: int = 1,
    training_learning_rate:float = 0.001,
    training_factors: int = 20,
    optimizer_step_size: float= 25.0,
    optimizer_gamma: float = 0.1):
    """Pipeline function registered as 'Model training pipeline'.

    As written, the body wires download -> unzip -> CSV-to-parquet -> split ->
    upload-to-MinIO -> QA steps. Of the declared parameters only
    ``minio_bucket`` is referenced; the batch size, learning rate, factors and
    optimizer parameters are currently unused.

    NOTE(review): the steps below look like they were carried over from a
    data-preparation pipeline rather than a training one — confirm the
    intended wiring.
    """
    # NOTE(review): the five statements below are bare name expressions. They
    # only look the names up and discard the result; nothing is called and no
    # pipeline step is created. Presumably a TODO list of the steps this
    # pipeline should contain — confirm.
    check_dataset
    train_model
    validate_model
    model_quality_gate
    promote_model

    download_dataset = download_op()
    unzip_folder = unzip_op(download_dataset.output)
    ratings_parquet_op = csv_to_parquet_op(unzip_folder.outputs['ratings_output'])
    movies_parquet_op = csv_to_parquet_op(unzip_folder.outputs['movies_output'])
    # NOTE(review): `random_init` is neither a parameter of this function nor
    # defined in this cell; it must exist as a global or this line raises
    # NameError when the pipeline function is executed.
    split_op = split_dataset_op(ratings_parquet_op.output,random_state=random_init)
    u1 = upload_to_minio_op(movies_parquet_op.output, upload_file_name='movies.parquet.gzip', bucket=minio_bucket)
    u2 = upload_to_minio_op(split_op.output, bucket=minio_bucket)
    # Run the QA step only after the split upload (u2) has finished.
    qa_component_op(bucket=minio_bucket).after(u2)

# Compile the pipeline function defined in this cell into a YAML package.
# This only writes the package file; it does not create a run.
# The original call compiled `dataprep_pipeline` into 'dataPrep_pipeline.yaml',
# neither of which belongs to this notebook's training pipeline.
kfp.compiler.Compiler().compile(
    pipeline_func=training_pipeline,
    package_path='training_pipeline.yaml')