## Importing Libraries

In [23]:
import pandas as pd
import numpy as np
import sagemaker
import boto3,io,json
from sklearn.model_selection import train_test_split
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_deserializer
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
from sagemaker.amazon.amazon_estimator import get_image_uri
sage_client = boto3.Session().client('sagemaker')
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

## Importing datasets from S3 buckets and reading them into pandas DataFrames

In [24]:
# Read the ratings and movies CSV part files into DataFrames.
ratings_csv = 'run-1607268992801-part-r-00000.csv'
movies_csv = 'run-1608674764528-part-r-00000.csv'
df_ratings, df_movies = (pd.read_csv(csv_path) for csv_path in (ratings_csv, movies_csv))

In [25]:
# Preview two rows; the output shows a stray header row ('movieId', 'title', 'genres')
# embedded in the data, which the next cell removes.
df_movies.head(2)

Unnamed: 0,movieid,title,genres
0,579,"Escort, The (Scorta, La) (1993)",Crime|Thriller
1,movieId,title,genres


In [26]:
# Keep every row except the stray header rows, i.e. rows whose id column
# holds the column name itself ('userId' / 'movieId').
is_ratings_header = df_ratings.userid == 'userId'
is_movies_header = df_movies.movieid == 'movieId'
df_ratings = df_ratings[~is_ratings_header]
df_movies = df_movies[~is_movies_header]

In [27]:
# Cast the id columns to int and the rating column to float in one pass per frame.
df_ratings = df_ratings.astype({'userid': int, 'movieid': int, 'rating': float})
df_movies = df_movies.astype({'movieid': int})

## Dropping the unused columns (timestamp and genres)

In [28]:
# Remove the columns that are not used further down: timestamp (ratings) and genres (movies).
df_ratings = df_ratings.drop(columns='timestamp')
df_movies = df_movies.drop(columns='genres')

## Creating a pivot table of users and movies

In [29]:
# Join each rating with its movie title on the shared movieid column.
df = df_ratings.merge(df_movies, on='movieid')

In [30]:
# Preview the merged ratings/titles frame.
df.head(2)

Unnamed: 0,userid,movieid,rating,title
0,7,30816,2.0,"Phantom of the Opera, The (2004)"
1,21,30816,3.0,"Phantom of the Opera, The (2004)"


In [31]:
# Title x userid matrix of ratings; combinations with no rating are filled with 0.
rating_pivot = pd.pivot_table(df, index='title', columns='userid', values='rating')
mtx = rating_pivot.fillna(0)

In [32]:
# Hold out 33% of the title rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test = train_test_split(
    mtx,
    random_state=42,
    test_size=0.33,
)

In [33]:
# Preview the title x userid rating matrix (missing ratings were filled with 0 above).
mtx.head(2)

userid,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# (rows, columns) of the held-out split.
X_test.shape

(3208, 610)

## Create sparse matrices from the train and test pivot tables

In [35]:
# Convert both splits to CSR sparse matrices, then cast the stored values to float32.
mtx_X_train = csr_matrix(X_train.to_numpy()).astype(np.float32)
mtx_X_test = csr_matrix(X_test.to_numpy()).astype(np.float32)

## Create data repositories for input/output and model data

In [36]:
# Change this value to your own bucket name
bucket = 'movies-mlready-bucket'
prefix = 'fm-knn-classifier'

# Object names and key prefixes for the serialized train / test splits.
train_key, test_key = 'train.protobuf', 'test.protobuf'
train_prefix, test_prefix = (
    '{}/{}'.format(prefix, split_name) for split_name in ('train', 'test')
)

# S3 URI for output written under the same prefix.
output_prefix = 's3://{}/{}/output'.format(bucket, prefix)

In [37]:
# One integer label per matrix row, numbered from 1 within each split.
# The lengths are derived from the matrices instead of being hard-coded, so the
# label vectors keep matching the row counts if the data or the split ratio changes.
knn_train_label = np.arange(1, mtx_X_train.shape[0] + 1)
knn_test_label = np.arange(1, mtx_X_test.shape[0] + 1)

## Converting train and test data into protobuf record-io format

In [38]:
def writeDatasetToProtobuf(X, bucket, prefix, key, d_type, Y=None):
    """Serialize a dataset into an in-memory buffer and upload it to S3.

    Args:
        X: Feature matrix handed to the writer.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix inside the bucket.
        key: Object name appended to ``prefix``.
        d_type: ``"sparse"`` selects ``smac.write_spmatrix_to_sparse_tensor``;
            any other value selects ``smac.write_numpy_to_dense_tensor``.
        Y: Optional labels forwarded to the writer as ``labels``.

    Returns:
        The ``s3://bucket/prefix/key`` URI of the uploaded object.
    """
    encode = (smac.write_spmatrix_to_sparse_tensor if d_type == "sparse"
              else smac.write_numpy_to_dense_tensor)

    payload = io.BytesIO()
    encode(payload, X, labels=Y)
    # Rewind so the upload reads the buffer from its first byte.
    payload.seek(0)

    object_key = '{}/{}'.format(prefix, key)
    s3_object = boto3.resource('s3').Bucket(bucket).Object(object_key)
    s3_object.upload_fileobj(payload)
    return 's3://{}/{}'.format(bucket, object_key)
    
# Serialize and upload both splits as sparse tensors, keeping the returned S3 URIs.
fm_knn_train_data_path = writeDatasetToProtobuf(
    mtx_X_train, bucket, train_prefix, train_key, "sparse", Y=knn_train_label
)
fm_knn_test_data_path = writeDatasetToProtobuf(
    mtx_X_test, bucket, test_prefix, test_key, "sparse", Y=knn_test_label
)

print("Training data S3 path: ", fm_knn_train_data_path)
print("Test data S3 path: ", fm_knn_test_data_path)
print("FM model output S3 path: {}".format(output_prefix))

Training data S3 path:  s3://movies-mlready-bucket/fm-knn-classifier/train/train.protobuf
Test data S3 path:  s3://movies-mlready-bucket/fm-knn-classifier/test/test.protobuf
FM model output S3 path: s3://movies-mlready-bucket/fm-knn-classifier/output


In [39]:
# nb_recommendations is later passed as k to the estimator;
# knn_prefix is the S3 key prefix for the k-NN output.
nb_recommendations, knn_prefix = 5, 'knn'
knn_output_prefix = 's3://{}/{}/output'.format(bucket, knn_prefix)

## Creating an Estimator for K-Nearest Neighbors

In [40]:
# Instance type passed to the Estimator as train_instance_type.
instance_type='ml.m5.large'
# NOTE(review): the IAM role ARN (including the account id) is hard-coded, so the notebook
# only runs against this account. `get_execution_role` is imported at the top but not used
# in the visible cells — confirm whether it was meant to supply this value.
role = 'arn:aws:iam::719009365707:role/role_sagemaker'

In [41]:
# Look up the "knn" algorithm image for the session's region, then configure a
# single-instance training job that writes its output to knn_output_prefix.
knn_image_uri = get_image_uri(boto3.Session().region_name, "knn")
knn = sagemaker.estimator.Estimator(
    knn_image_uri,
    role,
    train_instance_count=1,
    train_instance_type=instance_type,
    output_path=knn_output_prefix,
    sagemaker_session=sagemaker.Session(),
)

## Setting Hyperparameters

In [42]:
# feature_dim is taken from the training matrix's column count (one column per userid)
# instead of a hard-coded '610', so it keeps matching the data if the user set changes.
# k is the number of neighbours looked up per query row.
knn.set_hyperparameters(feature_dim=int(mtx_X_train.shape[1]),
                        k=nb_recommendations,
                        index_metric="L2",
                        predictor_type='classifier',
                        sample_size=5000)

## Fit the K-NN estimator

In [43]:
# Channel name -> S3 URI of the serialized split uploaded earlier.
fit_input = dict(train=fm_knn_train_data_path, test=fm_knn_test_data_path)
knn.fit(fit_input)

2021-01-06 16:31:40 Starting - Starting the training job...
2021-01-06 16:31:42 Starting - Launching requested ML instances.........
2021-01-06 16:33:14 Starting - Preparing the instances for training...
2021-01-06 16:33:57 Downloading - Downloading input data...
2021-01-06 16:34:26 Training - Downloading the training image...
2021-01-06 16:35:05 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[01/06/2021 16:35:09 INFO 139752276612928] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'index_metric': u'L2', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'_log_level': u'info', u'feature_dim': u'auto', u'faiss_index_ivf_nlists': u'auto', u'epochs': u'1', u'index_type': u'faiss.Flat', u'_faiss_index_nprobe': u'5', u'_kvstore': u'dist_async', u'_num_kv_servers': u'1', u'mini_b

[34m[01/06/2021 16:35:10 INFO 139752276612928] Checkpoint loading and saving are disabled.[0m
[34m[01/06/2021 16:35:10 INFO 139752276612928] nvidia-smi took: 0.0251741409302 secs to identify 0 gpus[0m
[34m[01/06/2021 16:35:10 INFO 139752276612928] Create Store: dist_async[0m
[34m[01/06/2021 16:35:10 ERROR 139752276612928] nvidia-smi: failed to run (127): /bin/sh: nvidia-smi: command not found[0m
[34m[01/06/2021 16:35:10 INFO 139752276612928] Using per-worker sample size = 5000 (Available virtual memory = 6404329472 bytes, GPU free memory = 0 bytes, number of workers = 1). If an out-of-memory error occurs, choose a larger instance type, use dimension reduction, decrease sample_size, and/or decrease mini_batch_size.[0m
[34m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 0, "sum": 0.0, "min": 0}, "Number of Batches Since Last Reset": {"count": 1, "max": 0, "sum": 0.0, "min": 0}, "Number of Records Since Last Reset": {"count": 1, "max": 0, "sum": 0.0

## Create a deployment endpoint for predictions

In [44]:
# Deploy the trained model to a single-instance endpoint and keep the predictor handle.
knn_predictor = knn.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
)

---------------!

## Prediction data input

## Inference input serialization

In [45]:
def knn_serializer(data):
    """Encode rows as the JSON request body sent to the endpoint.

    Args:
        data: Iterable of rows; each row must provide ``tolist()``
            (for example the rows of a 2-D NumPy array).

    Returns:
        UTF-8 encoded bytes of ``{"instances": [{"features": [...]}, ...]}``,
        with one entry per row, in input order.
    """
    payload = {"instances": [{"features": record.tolist()} for record in data]}
    return json.dumps(payload).encode()


# Attach the custom JSON request serializer and the SDK's JSON response
# deserializer to the deployed predictor.
knn_predictor.serializer = knn_serializer
knn_predictor.deserializer = json_deserializer
# Accept value set on the predictor; 'verbose=true' presumably asks the endpoint to return
# per-neighbour labels and distances (the printed response contains both) — TODO confirm.
knn_predictor.accept = 'application/json; verbose=true'

# NOTE(review): `accept` is assigned here but not referenced anywhere in the visible
# notebook — confirm whether it is a leftover.
accept="application/jsonlines; verbose=true"

## Predict the labels and nearest neighbor distances

In [46]:
# Query the endpoint with two held-out rows (positions 1002 and 1003), densified for the serializer.
query_rows = mtx_X_test[1002:1004].toarray()
knn_result = knn_predictor.predict(query_rows)

In [47]:
import json

In [48]:
# Pretty-print the deserialized response with a 10-space indent per nesting level.
results_json = json.dumps(knn_result, indent=10)

In [49]:
# Show the formatted response: per query row, the neighbour distances, their labels and the predicted label.
print(results_json)

{
          "predictions": [
                    {
                              "distances": [
                                        15.0,
                                        15.0,
                                        15.0,
                                        15.0,
                                        15.0
                              ],
                              "predicted_label": 1049.0,
                              "labels": [
                                        1106.0,
                                        1946.0,
                                        1049.0,
                                        5273.0,
                                        2325.0
                              ]
                    },
                    {
                              "distances": [
                                        17.5,
                                        17.5,
                                        17.5,
                                        17.5