### Factorization Machines - Movie Recommendation Model
Input features: ['userId','movieId']
Target: rating

In [1]:
import numpy as np
import pandas as pd

#Define IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


#### Upload Data to S3

In [2]:
### Import s3 bucket name as environment variable

import os

# Load KEY=VALUE pairs from the local .env file into os.environ so that later
# cells can read settings (e.g. BUCKET_NAME) without hardcoding them.
# The file is read with plain Python rather than the `!cat` shell escape, so the
# cell does not depend on a POSIX shell and a missing file raises a clear
# FileNotFoundError instead of feeding cat's error text into the parser.
with open('./.env') as env_file:
    env_vars = env_file.read().splitlines()

for line_no, var in enumerate(env_vars, start=1):
    entry = var.strip()
    # Blank lines and '#' comment lines carry no assignment.
    if not entry or entry.startswith('#'):
        continue
    # Split on the FIRST '=' only, so values that themselves contain '=' survive.
    key, separator, value = entry.partition('=')
    if not separator:
        # The line content is deliberately left out of the message: .env files hold secrets.
        raise ValueError(f'Malformed line {line_no} in .env (expected KEY=VALUE)')
    os.environ[key.strip()] = value.strip()

In [4]:
# Bucket name comes from the environment; object keys for the RecordIO datasets are fixed.
bucket_name = os.environ['BUCKET_NAME']
training_file_key = 'movie/user_movie_train.recordio'
test_file_key = 'movie/user_movie_test.recordio'

# Full S3 URIs for the model output and the train/test inputs.
s3_model_output_location = f's3://{bucket_name}/movie/model'
s3_training_file_location = f's3://{bucket_name}/{training_file_key}'
s3_test_file_location = f's3://{bucket_name}/{test_file_key}'

In [5]:
# Feature dimension: number of unique users + number of unique movies in our dataset.
# Start from 0, then replace it with the value stored in the file used for training.
dim_movie = 0

with open(r'ml-latest-small/movie_dimension.txt') as dimension_file:
    dimension_text = dimension_file.read()
dim_movie = int(dimension_text)

In [6]:
# Display the feature dimension that was read from movie_dimension.txt
dim_movie

10334

In [8]:
# Writing to and reading from S3:
# - files are referred to as objects in S3
# - a file name is referred to as a key name in S3
# - files stored in S3 are automatically replicated across 3 different
#   availability zones in the region where the bucket was created

def write_to_s3(filename,bucket,key):
    """Upload a local file to an S3 bucket.

    Args:
        filename: Path of the local file; it is opened in binary mode.
        bucket: Name of the destination S3 bucket.
        key: Object key (the "file name" in S3) to write to.
    """
    with open(filename, 'rb') as local_file:
        s3_resource = boto3.Session().resource('s3')
        destination = s3_resource.Bucket(bucket).Object(key)
        destination.upload_fileobj(local_file)

In [9]:
# Upload the training and test RecordIO files to their S3 keys.
for local_path, s3_key in (
    (r'ml-latest-small/user_movie_train.recordio', training_file_key),
    (r'ml-latest-small/user_movie_test.recordio', test_file_key),
):
    write_to_s3(local_path, bucket_name, s3_key)

### Training Algorithm Docker Image

In [10]:
# We use spot instances for training
use_spot_instances = True
max_run = 3600
max_wait = 3600

job_name = 'fm-movie-v4'

# A checkpoint location is only set when training on spot instances; otherwise it stays None.
checkpoint_s3_uri = (
    f's3://{bucket_name}/movie/checkpoints/{job_name}'
    if use_spot_instances
    else None
)

In [12]:
# Create the SageMaker session (passed to the estimator later) and display it
sess = sagemaker.Session()
sess

<sagemaker.session.Session at 0x7f9321763be0>

In [13]:
# IAM execution role that is handed to the training estimator
role = get_execution_role()

In [15]:
# Use factorization-machines: look up the algorithm image for the session's region

container = sagemaker.image_uris.retrieve(
    framework="factorization-machines",
    region=sess.boto_region_name,
)
print(f'using FM container {container}')

using FM container 382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:1


#### Build Model

In [16]:
# Configure the training job:
# - type and number of instances to use
# - S3 location where the final artifacts need to be stored
# - spot-instance settings and checkpoint location defined earlier
#
# SDK 2.x does not require the train prefix for instance count and type

estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name=job_name,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

#### New Configuration after Model tuning

In [17]:
# Hyperparameter configuration chosen after model tuning.
# feature_dim must match the dimension read from the training data file.
tuned_hyperparameters = {
    'feature_dim': dim_movie,
    'num_factors': 8,
    'predictor_type': 'regressor',
    'mini_batch_size': 994,
    'epochs': 91,
    'bias_init_method': 'normal',
    'bias_lr': 0.21899531189430518,
    'factors_init_method': 'normal',
    'factors_lr': 5.357593337770278e-05,
    'linear_init_method': 'normal',
    'linear_lr': 0.00021524948053767607,
}
estimator.set_hyperparameters(**tuned_hyperparameters)

In [18]:
# Display the hyperparameters currently set on the estimator
estimator.hyperparameters()

{'feature_dim': 10334,
 'num_factors': 8,
 'predictor_type': 'regressor',
 'mini_batch_size': 994,
 'epochs': 91,
 'bias_init_method': 'normal',
 'bias_lr': 0.21899531189430518,
 'factors_init_method': 'normal',
 'factors_lr': 5.357593337770278e-05,
 'linear_init_method': 'normal',
 'linear_lr': 0.00021524948053767607}

#### Train the model

In [None]:
# Launch the training job with the 'train' and 'test' channels pointing at the uploaded S3 files
estimator.fit(
    inputs={
        'train': s3_training_file_location,
        'test': s3_test_file_location,
    }
)

INFO:sagemaker:Creating training-job with name: fm-movie-v4-2024-05-14-23-24-41-131


2024-05-14 23:24:41 Starting - Starting the training job...
2024-05-14 23:24:57 Starting - Preparing the instances for training...
2024-05-14 23:25:24 Downloading - Downloading input data...
2024-05-14 23:25:45 Downloading - Downloading the training image..................