## Importing Libraries

In [42]:
import pandas as pd
import numpy as np
import sagemaker
import boto3,io,json
from sklearn.model_selection import train_test_split
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_deserializer
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
from sagemaker.amazon.amazon_estimator import get_image_uri
sage_client = boto3.Session().client('sagemaker')
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

## Importing datasets from S3 buckets and reading them into pandas DataFrames

In [43]:
# Read the ratings and movies CSV exports into DataFrames.
ratings_csv = 'run-1607268992801-part-r-00000.csv'
movies_csv = 'run-1608674764528-part-r-00000.csv'

df_ratings = pd.read_csv(ratings_csv)
df_movies = pd.read_csv(movies_csv)

In [47]:
# Remove every row whose userid cell holds the literal text 'userId'
# (presumably header lines repeated inside the export -- confirm against the source files).
ratings_header_idx = df_ratings.index[df_ratings.userid == 'userId']
df_ratings = df_ratings.drop(ratings_header_idx)

In [68]:
# Cast the rating columns to numeric dtypes in one pass.
df_ratings = df_ratings.astype({'userid': int, 'movieid': int, 'rating': float})

# In top-to-bottom order this cell runs before the cell that drops rows whose
# movieid is the literal text 'movieId', so the int cast would raise on those rows.
# Filter them out here first; the later drop then simply finds nothing to remove.
df_movies = df_movies[df_movies.movieid != 'movieId']
df_movies = df_movies.astype({'movieid': int})

In [49]:
# Remove every row whose movieid cell holds the literal text 'movieId'.
movies_header_idx = df_movies.index[df_movies.movieid == 'movieId']
df_movies = df_movies.drop(movies_header_idx)

In [50]:
# Assign back to the existing `movieid` column. The previous spelling (`movied`)
# only set a new attribute on the DataFrame object and left the column itself uncast.
df_movies.movieid = df_movies.movieid.astype(int)
# Titles are handled as plain Python strings.
df_movies.title = df_movies.title.astype(str)

  """Entry point for launching an IPython kernel.


## Dropping the unused timestamp and genres columns

In [51]:
# Remove the 'timestamp' column from the ratings and the 'genres' column from the movies.
df_ratings = df_ratings.drop(columns=['timestamp'])
df_movies = df_movies.drop(columns=['genres'])

## Creating a pivot table of users and movies

In [52]:
# One row per movieid, one column per userid, cell value = rating.
ratings_by_movie = df_ratings.pivot(index='movieid', columns='userid', values='rating')
# Movie/user pairs without a rating are filled with 0.
movies_users = ratings_by_movie.fillna(0)

In [69]:
# Largest movieid value present in the movies table.
nb_movies = df_movies.movieid.max()

## Create a sparse matrix for pivot table

In [55]:
# Convert the dense pivot table to CSR form, then cast the stored values to float32.
dense_ratings = movies_users.values
mat_movies_users = csr_matrix(dense_ratings).astype('float32')

## Create data repositories for input/output and model data

In [57]:
# Change this value to your own bucket name
bucket = 'movies-mlready-bucket'
# Key prefix under which all artifacts of this notebook are stored.
prefix = 'fm-knn-classifier'

# Object names of the serialized datasets.
train_key = 'train.protobuf'
test_key = 'test.protobuf'

# Key prefixes ("folders") for the train and test objects.
train_prefix = prefix + '/train'
test_prefix = prefix + '/test'

# S3 URI built from the bucket and prefix above, ending in /output.
output_prefix = 's3://' + bucket + '/' + prefix + '/output'

In [79]:
# Largest movieid value present in the movies table.
nb_movies = df_movies['movieid'].max()

# One label per row of the training matrix, numbered from 1.
# The count is derived from the matrix instead of a hard-coded 9725, so labels
# and rows stay aligned when the input data changes.
knn_train_label = np.arange(1, mat_movies_users.shape[0] + 1)

## Converting train data into record-io format

In [80]:
def writeDatasetToProtobuf(X, bucket, prefix, key, d_type, Y=None):
    """Serialize a dataset with the sagemaker.amazon.common writers and upload it to S3.

    Parameters:
        X: feature matrix handed to the writer.
        bucket: name of the destination S3 bucket.
        prefix: key prefix inside the bucket.
        key: object name appended to ``prefix``.
        d_type: "sparse" selects ``write_spmatrix_to_sparse_tensor``; any other
            value selects ``write_numpy_to_dense_tensor``.
        Y: optional labels forwarded to the writer as ``labels``.

    Returns:
        The ``s3://bucket/prefix/key`` URI of the uploaded object.
    """
    # Pick the writer up front so the serialization itself is a single call.
    if d_type == "sparse":
        write_tensor = smac.write_spmatrix_to_sparse_tensor
    else:
        write_tensor = smac.write_numpy_to_dense_tensor

    stream = io.BytesIO()
    write_tensor(stream, X, labels=Y)
    # Rewind so the upload reads the buffer from its beginning.
    stream.seek(0)

    object_key = '{}/{}'.format(prefix, key)
    target = boto3.resource('s3').Bucket(bucket).Object(object_key)
    target.upload_fileobj(stream)
    return 's3://{}/{}'.format(bucket, object_key)
    
# Serialize the sparse movie/user matrix together with its labels and upload it.
fm_knn_train_data_path = writeDatasetToProtobuf(
    X=mat_movies_users,
    bucket=bucket,
    prefix=train_prefix,
    key=train_key,
    d_type="sparse",
    Y=knn_train_label,
)

print("Training data S3 path: ", fm_knn_train_data_path)
print("FM model output S3 path: {}".format(output_prefix))

Training data S3 path:  s3://movies-mlready-bucket/fm-knn-classifier/train/train.protobuf
FM model output S3 path: s3://movies-mlready-bucket/fm-knn-classifier/output


In [85]:
# S3 location for the k-NN training output.
knn_prefix = 'knn'
knn_output_prefix = 's3://{}/{}/output'.format(bucket, knn_prefix)

# Passed to the estimator as its `k` hyperparameter.
nb_recommendations = 10

## Creating an Estimator for K-Nearest Neighbors

In [82]:
# Instance type used for the training job.
instance_type = 'ml.m5.large'

# Resolve the IAM role from the notebook's execution context instead of
# hard-coding an account-specific ARN. get_execution_role is already imported.
# NOTE(review): the resolved role needs access to the S3 bucket used above.
role = get_execution_role()

In [86]:
# Look up the k-NN algorithm image for the region of the current boto3 session.
knn_image = get_image_uri(boto3.Session().region_name, "knn")

# Generic estimator around that image: one training instance, output under knn_output_prefix.
knn = sagemaker.estimator.Estimator(
    knn_image,
    role,
    train_instance_count=1,
    train_instance_type=instance_type,
    output_path=knn_output_prefix,
    sagemaker_session=sagemaker.Session(),
)

## Setting Hyperparameters

In [93]:
# feature_dim must equal the number of columns of the training matrix, so read it
# from the matrix instead of hard-coding '610'.
knn.set_hyperparameters(feature_dim=mat_movies_users.shape[1],
                        k=nb_recommendations,
                        index_metric="INNER_PRODUCT",
                        predictor_type='classifier',
                        sample_size=200000)

## Fit the K-NN estimator

In [94]:
# Map the 'train' channel to the uploaded training data, then start the training job.
fit_input = dict(train=fm_knn_train_data_path)
knn.fit(inputs=fit_input)

2020-12-23 00:01:22 Starting - Starting the training job...
2020-12-23 00:01:25 Starting - Launching requested ML instances......
2020-12-23 00:02:38 Starting - Preparing the instances for training...
2020-12-23 00:03:23 Downloading - Downloading input data...
2020-12-23 00:03:34 Training - Downloading the training image....[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[12/23/2020 00:04:30 INFO 139694968457024] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'index_metric': u'L2', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'_log_level': u'info', u'feature_dim': u'auto', u'faiss_index_ivf_nlists': u'auto', u'epochs': u'1', u'index_type': u'faiss.Flat', u'_faiss_index_nprobe': u'5', u'_kvstore': u'dist_async', u'_num_kv_servers': u'1', u'mini_batch_size': u'5000'}[0m
[34m[12/23/2020 00:04:30 INFO 139694968457024] Merging with prov


2020-12-23 00:04:44 Uploading - Uploading generated training model
2020-12-23 00:04:44 Completed - Training job completed
Training seconds: 81
Billable seconds: 81


## Create a deployment endpoint for predictions

In [95]:
# Deploy the trained model behind an endpoint with a single ml.m5.large instance.
knn_predictor = knn.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
)

---------------!

## Prediction data input

In [None]:
# Rows 100..108 of the movie/user matrix (all columns) serve as the prediction input.
testinput = mat_movies_users[100:109, :]

## Inference input serialization

In [110]:
def knn_serializer(data):
    """Encode rows as the JSON request body sent to the endpoint.

    Each row of ``data`` must provide ``tolist()``. The rows are wrapped as
    ``{"instances": [{"features": [...]}, ...]}`` and returned as encoded bytes.
    """
    instances = [{"features": row.tolist()} for row in data]
    payload = {"instances": instances}
    return json.dumps(payload).encode()


# Request side: encode inputs with the custom JSON serializer.
knn_predictor.serializer = knn_serializer

# Response side: ask for verbose JSON and decode it with the JSON deserializer.
knn_predictor.accept = 'application/json; verbose=true'
knn_predictor.deserializer = json_deserializer

# NOTE(review): this module-level name is not read anywhere in the visible cells;
# it looks like a leftover alternative accept value -- confirm before removing.
accept = "application/jsonlines; verbose=true"

## Predict the labels and nearest neighbor distances

In [111]:
# Densify the sparse slice so the serializer can iterate over array rows.
dense_testinput = testinput.toarray()
result = knn_predictor.predict(dense_testinput)

In [114]:
# Display the deserialized response returned by the endpoint.
result

{'predictions': [{'distances': [43.0,
    43.0,
    43.0,
    43.0,
    43.0,
    44.0,
    44.0,
    46.0,
    50.0,
    50.0],
   'labels': [308.0,
    33.0,
    509.0,
    437.0,
    399.0,
    124.0,
    511.0,
    335.0,
    315.0,
    278.0],
   'predicted_label': 33.0},
  {'distances': [49.75,
    50.5,
    54.0,
    54.0,
    55.75,
    61.0,
    65.0,
    66.0,
    71.5,
    80.25],
   'labels': [810.0,
    14.0,
    521.0,
    225.0,
    1046.0,
    32.0,
    1.0,
    616.0,
    1044.0,
    102.0],
   'predicted_label': 1.0},
  {'distances': [12.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0],
   'labels': [33.0,
    258.0,
    286.0,
    523.0,
    2.0,
    325.0,
    511.0,
    524.0,
    250.0,
    218.0],
   'predicted_label': 2.0},
  {'distances': [25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0],
   'labels': [900.0,
    686.0,
    1158.0,
    508.0,
    687.0,
    984.0,
    2225.0,
    2195.0,
    899.0,
    312.0],
   'predicted_label': 312.0},
  