## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import sagemaker
import boto3,io,json
from sklearn.model_selection import train_test_split
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_deserializer
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
from sagemaker.amazon.amazon_estimator import get_image_uri
sage_client = boto3.Session().client('sagemaker')
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

## Importing the datasets from S3 buckets and reading them into pandas DataFrames

In [2]:
# CSV files read into the ratings and movies frames; both paths are relative
# to the notebook's working directory.
ratings_csv_path = 'run-1607268992801-part-r-00000.csv'
movies_csv_path = 'run-1608674764528-part-r-00000.csv'

df_ratings = pd.read_csv(ratings_csv_path)
df_movies = pd.read_csv(movies_csv_path)

In [3]:
# Discard rows whose id column holds the literal header text ('userId' /
# 'movieId') -- presumably header lines repeated inside the data files.
ratings_is_header = df_ratings.userid == 'userId'
movies_is_header = df_movies.movieid == 'movieId'
df_ratings = df_ratings[~ratings_is_header]
df_movies = df_movies[~movies_is_header]

# With the header rows gone, cast the id columns to int and the rating to float.
df_ratings = df_ratings.astype({'userid': int, 'movieid': int, 'rating': float})
df_movies = df_movies.astype({'movieid': int})

## Dropping the unneeded columns (timestamp and genres)

In [4]:
# Remove the columns that the cells below do not use.
df_ratings = df_ratings.drop(columns=['timestamp'])
df_movies = df_movies.drop(columns=['genres'])

## Creating a pivot table of users and movies

In [5]:
# Join ratings with movies on the shared movieid key (pandas' default inner join).
df = df_ratings.merge(df_movies, on='movieid')

In [9]:
# Rows are movies, columns are users, cells hold the rating (duplicate
# movie/user pairs are combined by pivot_table's default mean aggregation).
ratings_by_movie = df.pivot_table(index='movieid', columns='userid', values='rating')
# Movie/user pairs without a rating become 0.
mtx = ratings_by_movie.fillna(0)

## Creating test and train sets

In [10]:
# Row-wise split of the movie x user matrix; the fixed random_state keeps the
# split reproducible between runs.
split_options = {'test_size': 0.33, 'random_state': 42}
X_train, X_test = train_test_split(mtx, **split_options)

## Create a sparse matrix for test and train sets

In [14]:
def _to_sparse_float32(frame):
    """Return the values of ``frame`` as a float32 CSR sparse matrix."""
    return csr_matrix(frame.values).astype('float32')


mtx_X_train = _to_sparse_float32(X_train)
mtx_X_test = _to_sparse_float32(X_test)

## Create data repositories for input/output and model data

In [18]:
#Change this value to your own bucket name
bucket = 'movies-mlready-bucket'
prefix = 'knn-regressor'

# Object names and key prefixes used for the training and test data.
key_template = '{}/{}'
train_key, test_key = 'train.protobuf', 'test.protobuf'
train_prefix = key_template.format(prefix, 'train')
test_prefix = key_template.format(prefix, 'test')

# S3 URI of the output folder located under the same prefix.
output_prefix = 's3://{}/{}/output'.format(bucket, prefix)

In [19]:
# One label per matrix row, numbered from 1. The lengths are read from the
# sparse matrices instead of being hard-coded, so they stay in step with the
# data and with the train/test split ratio.
# NOTE(review): these labels are row positions, not movieid values -- confirm
# that this is what the regressor is meant to predict.
knn_train_label = np.arange(1, mtx_X_train.shape[0] + 1)
knn_test_label = np.arange(1, mtx_X_test.shape[0] + 1)

## Converting the train and test data into RecordIO-protobuf format

In [21]:
def writeDatasetToProtobuf(X, bucket, prefix, key, d_type, Y=None):
    """Serialise a feature matrix and upload it to S3.

    Args:
        X: feature matrix handed to the SageMaker tensor writer.
        bucket: name of the destination S3 bucket.
        prefix: key prefix (folder) inside the bucket.
        key: object name appended to ``prefix``.
        d_type: ``"sparse"`` selects the sparse-tensor writer; any other
            value selects the dense-tensor writer.
        Y: optional labels passed to the writer alongside ``X``.

    Returns:
        str: the ``s3://bucket/prefix/key`` URI of the uploaded object.
    """
    stream = io.BytesIO()
    if d_type != "sparse":
        smac.write_numpy_to_dense_tensor(stream, X, labels=Y)
    else:
        smac.write_spmatrix_to_sparse_tensor(stream, X, labels=Y)

    # Rewind so the upload starts at the first byte that was written.
    stream.seek(0)
    object_key = '{}/{}'.format(prefix, key)
    target_bucket = boto3.resource('s3').Bucket(bucket)
    target_bucket.Object(object_key).upload_fileobj(stream)
    return 's3://{}/{}'.format(bucket, object_key)
    
# Serialise and upload both splits, keeping the S3 URI of each uploaded object.
knn_train_data_path = writeDatasetToProtobuf(
    X=mtx_X_train,
    bucket=bucket,
    prefix=train_prefix,
    key=train_key,
    d_type="sparse",
    Y=knn_train_label,
)
knn_test_data_path = writeDatasetToProtobuf(
    X=mtx_X_test,
    bucket=bucket,
    prefix=test_prefix,
    key=test_key,
    d_type="sparse",
    Y=knn_test_label,
)

print("Training data S3 path: ", knn_train_data_path)
print("Test data S3 path: ", knn_test_data_path)
# NOTE(review): the label says "FM" although this notebook trains a KNN model,
# and the estimator is given knn_output_prefix rather than output_prefix.
print("FM model output S3 path: {}".format(output_prefix))

Training data S3 path:  s3://movies-mlready-bucket/knn-regressor/train/train.protobuf
Test data S3 path:  s3://movies-mlready-bucket/knn-regressor/test/test.protobuf
FM model output S3 path: s3://movies-mlready-bucket/knn-regressor/output


In [22]:
# Output location for the KNN jobs: a separate 'knn' prefix in the same bucket.
knn_prefix = 'knn'
output_uri_template = 's3://{}/{}/output'
knn_output_prefix = output_uri_template.format(bucket, knn_prefix)

## Creating an Estimator for K-Nearest Neighbors

In [23]:
# Instance type used for the training jobs (passed to the Estimator as train_instance_type).
instance_type='ml.m5.large'
# IAM role ARN handed to the Estimator.
# NOTE(review): the account-specific ARN is hard-coded, while get_execution_role
# is imported in the first cell but not called in the cells shown here --
# consider resolving the role at run time instead.
role = 'arn:aws:iam::719009365707:role/role_sagemaker'

In [24]:
# Look up the "knn" algorithm image for the region of the default boto3 session.
knn_image = get_image_uri(boto3.Session().region_name, "knn")

# Single-instance training; model output is written below knn_output_prefix.
estimator_options = dict(
    train_instance_count=1,
    train_instance_type=instance_type,
    output_path=knn_output_prefix,
    sagemaker_session=sagemaker.Session(),
)
knn = sagemaker.estimator.Estimator(knn_image, role, **estimator_options)

## Define Auto-tuning job with HyperParameter Optimization (HPO) objective metrics and HPO parameters

In [25]:
# Fixed (non-tuned) hyperparameters. feature_dim is the number of features per
# record, i.e. the column count of the training matrix; reading it from the
# matrix keeps it correct when the number of users in the data changes.
knn.set_hyperparameters(
    feature_dim=str(mtx_X_train.shape[1]),
    index_metric="INNER_PRODUCT",
    predictor_type='regressor',
)

In [26]:
# Search space explored by the tuner: both hyperparameters are integer ranges.
hyperparameter_ranges = dict(
    k=IntegerParameter(1, 10),
    sample_size=IntegerParameter(5000, 20000),
)

In [27]:
# Name of the metric the tuner optimises; it is passed to HyperparameterTuner
# together with objective_type='Minimize'.
objective_metric_type='test:mse'

In [28]:
# Tuning job settings: minimise the objective metric, run at most 5 training
# jobs in total with at most 2 of them in parallel.
tuner_settings = dict(
    objective_type='Minimize',
    base_tuning_job_name='knn-HPO',
    max_jobs=5,
    max_parallel_jobs=2,
)
tuner_knn = HyperparameterTuner(
    knn,
    objective_metric_type,
    hyperparameter_ranges,
    **tuner_settings
)

## Fit the HPO tuning job

In [29]:
# Input name -> S3 URI of the protobuf file uploaded earlier.
fit_input = {'train': knn_train_data_path, 'test': knn_test_data_path}
tuner_knn.fit(fit_input)
# Block until the tuning job has finished. The deploy step needs a completed
# best training job, and depending on the SDK version fit() may return as soon
# as the job has been launched; waiting makes "Restart & Run All" work.
tuner_knn.wait()

## Deploy the best model found by the hyperparameter tuner

In [30]:
# Deploy through the tuner onto an endpoint backed by one ml.m5.large instance.
endpoint_options = {'initial_instance_count': 1, 'instance_type': 'ml.m5.large'}
knn_predictor = tuner_knn.deploy(**endpoint_options)

2021-01-10 14:56:04 Starting - Preparing the instances for training
2021-01-10 14:56:04 Downloading - Downloading input data
2021-01-10 14:56:04 Training - Training image download completed. Training in progress.
2021-01-10 14:56:04 Uploading - Uploading generated training model
2021-01-10 14:56:04 Completed - Training job completed[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[01/10/2021 14:55:50 INFO 140043370022720] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'index_metric': u'L2', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'_log_level': u'info', u'feature_dim': u'auto', u'faiss_index_ivf_nlists': u'auto', u'epochs': u'1', u'index_type': u'faiss.Flat', u'_faiss_index_nprobe': u'5', u'_kvstore': u'dist_async', u'_num_kv_servers': u'1', u'mini_batch_size': u'5000'}[0m
[34m[01/10/2021 14:55:50 INFO 140043370022720] Merging w

-------------!

## Prediction data input

## Inference input serialization

In [31]:
def knn_serializer(data):
    """Encode rows as the JSON request body sent to the endpoint.

    Args:
        data: iterable of rows; every row must provide ``tolist()``
            (for example the rows of a 2-D NumPy array).

    Returns:
        bytes: encoded JSON of the form
        ``{"instances": [{"features": [...]}, ...]}``.
    """
    instances = [{"features": row.tolist()} for row in data]
    return json.dumps({"instances": instances}).encode()


# Requests are encoded with knn_serializer and responses decoded with
# json_deserializer.
knn_predictor.serializer = knn_serializer
knn_predictor.deserializer = json_deserializer
# Accept value set on the predictor (JSON with verbose=true).
knn_predictor.accept = 'application/json; verbose=true'

# NOTE(review): this module-level variable is not read by any cell shown here;
# it looks like an unused JSON Lines alternative -- confirm before removing.
accept="application/jsonlines; verbose=true"

## Predict the labels and nearest neighbor distances

In [32]:
# Densify two rows of the test matrix (positions 1002 and 1003) and send them
# to the endpoint.
sample_rows = mtx_X_test[1002:1004].toarray()
knn_result = knn_predictor.predict(sample_rows)

In [33]:
import json  # NOTE: redundant -- json is already imported in the first cell

In [34]:
# Render the prediction response as JSON text, indented by 10 spaces per level.
results_json = json.dumps(knn_result, indent=10)

In [150]:
# Display the formatted response text.
print(results_json)

{
          "predictions": [
                    {
                              "distances": [
                                        0.0,
                                        0.0,
                                        0.0,
                                        0.0,
                                        0.0,
                                        0.0,
                                        0.0,
                                        0.0,
                                        0.0,
                                        0.0,
                                        0.0,
                                        0.0,
                                        0.15014603734016418,
                                        0.3081578314304352,
                                        0.4481106996536255
                              ],
                              "predicted_label": 3098.8,
                              "labels": [
                                        1647.0,
    