# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import sagemaker
import boto3,io,json
from sklearn.model_selection import train_test_split
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_serializer, json_deserializer
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
from sagemaker.amazon.amazon_estimator import get_image_uri
sage_client = boto3.Session().client('sagemaker')
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

## Copying dataset from S3

In [2]:
# Read the ratings CSV (relative path, so it must sit in the notebook's
# working directory) into the DataFrame the following cells clean up.
df = pd.read_csv('run-1607268992801-part-r-00000.csv')

## Dropping the timestamp column and setting userid as the index

In [3]:
# Rows whose userid holds the literal text 'userId' are removed first
# (presumably header lines repeated inside the CSV -- TODO confirm); only then
# can the id and rating columns be cast to numeric types.
df = (
    df.drop(df[df.userid == 'userId'].index)
      .astype({'userid': int, 'movieid': int, 'rating': float})
      .drop('timestamp', axis=1)    # timestamp is discarded before encoding
      .set_index('userid')          # userid becomes the index, not a column
)

## Creating Label vector

In [7]:
# Label vector: the rating column as a float32 NumPy array.
y = np.asarray(df['rating'], dtype='float32')

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

## Creating a sparse matrix (users to movies) using one-hot encoding

In [9]:
# One-hot encoder for the feature columns. With handle_unknown='ignore',
# categories not seen during fit are encoded as all zeros instead of raising.
# The encoder returns a SciPy sparse matrix by default, so the legacy
# `sparse=True` flag is omitted: it was renamed to `sparse_output` in
# scikit-learn 1.2 and passing `sparse=` fails on releases that removed it.
encoder = OneHotEncoder(handle_unknown='ignore')

In [11]:
# One-hot encode every column of `df`, then store the sparse result as float32.
# NOTE(review): at this point `userid` is the index and `rating` is still a
# column, so the encoder appears to see movieid + rating only -- i.e. the
# label leaks into X and the user is not a feature. If that is unintended,
# encode df.reset_index()[['userid', 'movieid']] instead and keep the
# estimator's feature_dim equal to X.shape[1].
X = encoder.fit_transform(df)
X = X.astype('float32')

## Create a training and test sets

In [12]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Creating data repositories for input/output and model data

In [17]:
# S3 bucket and the top-level key prefix shared by everything this notebook writes.
bucket = 'movies-mlready-bucket'
prefix = 'fm-regression'

# Object names and per-split key prefixes for the serialized datasets.
train_key, test_key = 'train.protobuf', 'test.protobuf'
train_prefix = '{}/{}'.format(prefix, 'train')
test_prefix = '{}/{}'.format(prefix, 'test')

# Full S3 URI passed to the estimator as its output path.
output_prefix = 's3://{}/{}/output'.format(bucket, prefix)

## Helper function to convert input data to record-io format

In [18]:
def writeDatasetToProtobuf(X, bucket, prefix, key, d_type, Y=None):
    """Serialize a feature matrix to record-io protobuf and upload it to S3.

    Args:
        X: Feature matrix; handed to the sparse-tensor writer when
            ``d_type == "sparse"``, otherwise to the dense-tensor writer.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix placed in front of ``key``.
        key: Object name appended to ``prefix``.
        d_type: ``"sparse"`` selects the sparse writer; any other value
            selects the dense writer.
        Y: Optional labels forwarded to the writer as ``labels``.

    Returns:
        The ``s3://bucket/prefix/key`` URI of the uploaded object.
    """
    if d_type == "sparse":
        encode = smac.write_spmatrix_to_sparse_tensor
    else:
        encode = smac.write_numpy_to_dense_tensor

    payload = io.BytesIO()
    encode(payload, X, labels=Y)
    payload.seek(0)  # rewind so the upload starts at the first byte

    object_key = '{}/{}'.format(prefix, key)
    s3_object = boto3.resource('s3').Bucket(bucket).Object(object_key)
    s3_object.upload_fileobj(payload)
    return 's3://{}/{}'.format(bucket, object_key)
    
# Serialize both splits in sparse form and upload them under their prefixes.
fm_regression_train_data_path = writeDatasetToProtobuf(
    X_train, bucket, train_prefix, train_key, d_type="sparse", Y=y_train
)
fm_regression_test_data_path = writeDatasetToProtobuf(
    X_test, bucket, test_prefix, test_key, d_type="sparse", Y=y_test
)

# Echo the resolved S3 locations so they appear in the cell output.
print("Training data S3 path: ", fm_regression_train_data_path)
print("Test data S3 path: ", fm_regression_test_data_path)
print("FM model output S3 path: {}".format(output_prefix))

Training data S3 path:  s3://movies-mlready-bucket/fm-regression/train/train.protobuf
Test data S3 path:  s3://movies-mlready-bucket/fm-regression/test/test.protobuf
FM model output S3 path: s3://movies-mlready-bucket/fm-regression/output


## Create a Factorization Machines Estimator

In [19]:
# IAM role handed to the estimator below.
# NOTE(review): the account-specific ARN is hardcoded; get_execution_role()
# is already imported and may be preferable when running inside SageMaker.
role = 'arn:aws:iam::719009365707:role/role_sagemaker'
instance_type = 'ml.m5.large'
# feature_dim has to equal the width of the encoded matrix, so derive it from
# X instead of hardcoding 9734, which goes stale as soon as the data or the
# encoded columns change.
features = X.shape[1]


In [20]:
# Estimator for the built-in factorization-machines image in the current
# session's region; model artifacts are written to output_prefix.
fm_regression = sagemaker.estimator.Estimator(
    get_image_uri(boto3.Session().region_name, "factorization-machines"),
    role,
    train_instance_count=1,
    train_instance_type=instance_type,
    base_job_name='FM-Regression-prod2',
    output_path=output_prefix,
    sagemaker_session=sagemaker.Session(),
)

# Static hyperparameters; the tuner only varies the ones listed in its ranges.
fm_regression.set_hyperparameters(
    predictor_type='regressor',
    feature_dim=features,
    num_factors=64,
    mini_batch_size=1000,
    epochs=50,
)

## Define Auto-tuning job with HyperParameter Optimization (HPO) objective metrics and HPO parameters

In [21]:
# The estimator is configured as a regressor and the tuner minimizes its
# objective, so the metric must be the regression error (test:rmse).
# 'test:accuracy' is not a metric the Factorization Machines regressor emits.
objective_metric_name = 'test:rmse'


In [22]:
# Each bias-term hyperparameter is searched over the same continuous
# interval [1e-8, 2].
hyperparameter_ranges = {
    name: ContinuousParameter(1e-8, 2)
    for name in ('bias_init_sigma', 'bias_lr', 'bias_wd')
}

In [23]:
# Tuning job over the ranges above: at most 5 training jobs, 2 at a time.
# (A comma was missing after objective_type, which made this cell a
# SyntaxError.)
tuner_regression = HyperparameterTuner(fm_regression,
                            objective_metric_name,
                            hyperparameter_ranges,
                            objective_type='Minimize',
                            base_tuning_job_name='FM-Regression',
                            max_jobs=5,
                            max_parallel_jobs=2)

## Fit the HPO tuning job

In [24]:
# Launch the tuning job on the train/test channels, then block until it has
# finished: deploy() needs a completed best training job, so without the wait
# a fresh "Run All" can reach the deploy cell while tuning is still running.
tuner_regression.fit({'train': fm_regression_train_data_path, 'test': fm_regression_test_data_path})
tuner_regression.wait()

## Deploy the best model found by the hyperparameter tuner

In [26]:
# Host the tuner's best model on a single ml.m5.large endpoint instance and
# keep the returned predictor for the inference cells.
fm_regression_pred = tuner_regression.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
)

2021-01-06 16:38:23 Starting - Preparing the instances for training
2021-01-06 16:38:23 Downloading - Downloading input data
2021-01-06 16:38:23 Training - Training image download completed. Training in progress.
2021-01-06 16:38:23 Uploading - Uploading generated training model
2021-01-06 16:38:23 Completed - Training job completed[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from numpy.testing import nosetester[0m
[34m[01/06/2021 16:37:41 INFO 140480896128832] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_tuning_objective_metr

-------------!

## Inference Serialization

In [27]:
def fm_serializer(data):
    """Encode rows as the JSON request body sent to the endpoint.

    Args:
        data: Iterable of rows; each row must provide ``tolist()``
            (e.g. the rows of a 2-D NumPy array).

    Returns:
        UTF-8 bytes of ``{"instances": [{"features": [...]}, ...]}`` with
        one entry per row, in input order.
    """
    instances = [{"features": row.tolist()} for row in data]
    return json.dumps({"instances": instances}).encode()


# Wire request/response handling onto the predictor: responses are parsed
# with json_deserializer, requests are built by fm_serializer.
fm_regression_pred.deserializer = json_deserializer
fm_regression_pred.serializer = fm_serializer

## Inference and Predictions

In [28]:
# Score seven held-out rows (positions 1002-1008). The sparse slice is
# densified first because fm_serializer calls .tolist() on each row.
fm_regression_result = fm_regression_pred.predict(X_test[1002:1009].toarray())


In [29]:
# Ground-truth ratings for the same positions that were sent to the endpoint.
Actuals = pd.DataFrame(y_test[1002:1009])

In [40]:
# Keep only the values of the response mapping (a dict view, not a list).
predictions = fm_regression_result.values()

In [42]:
# Display the predicted scores returned by the endpoint.
predictions

dict_values([[{'score': 4.0022454261779785}, {'score': 3.9643962383270264}, {'score': 4.015176773071289}, {'score': 2.282944440841675}, {'score': 1.9619837999343872}, {'score': 4.0184173583984375}, {'score': 4.961643218994141}]])

## Comparing Predictions vs Actuals

In [31]:
# Display the true ratings for side-by-side comparison with the predictions.
Actuals

Unnamed: 0,0
0,4.0
1,4.0
2,4.0
3,2.0
4,2.0
5,4.0
6,5.0
