# Recommendation System (prediction)

In [1]:
import numpy as np
import pandas as pd
import json
from sklearn.metrics import mean_squared_error as MSE
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.deserializers import JSONDeserializer

In [2]:
# Create a Predictor bound to the named inference endpoint.
# NOTE(review): "recommentation" looks like a typo of "recommendation", but this string
# has to match the endpoint's actual name exactly — confirm before changing it.
recommendation_predictor = sagemaker.predictor.Predictor(endpoint_name="recommentation-anime")

In [3]:
# Create sparse serializer
# Width of the sparse feature vector sent to the model.
# NOTE(review): presumably 69 users + 3098 anime titles — confirm against the training data.
dim = 69 + 3098  # = 3167


def sparse_serializer(user_anime_data, n_features=None):
    """Serialize rows of active feature indices into a sparse JSON request body.

    Parameters
    ----------
    user_anime_data : iterable of array-like
        One entry per instance. Each entry lists the indices of the features
        that are set (for example ``[user_index, anime_index]``). NumPy arrays,
        lists and tuples are all accepted.
    n_features : int, optional
        Length of the sparse vector reported in ``"shape"``. Defaults to the
        module-level ``dim``, looked up at call time.

    Returns
    -------
    str
        JSON text of the form
        ``{"instances": [{"data": {"features": {"keys": [...], "shape": [n], "values": [...]}}}]}``
        where every listed key gets the value ``1``.
    """
    if n_features is None:
        n_features = dim

    instances = []
    for row in user_anime_data:
        # np.asarray lets plain lists/tuples through as well as ndarrays, and
        # .tolist() yields built-in ints that json.dumps can encode.
        keys = np.asarray(row).tolist()
        instances.append(
            {
                "data": {
                    "features": {
                        "keys": keys,
                        "shape": [n_features],
                        "values": [1] * len(keys),
                    }
                }
            }
        )
    return json.dumps({"instances": instances})

Reference: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html

In [4]:
# Set serializers and deserializer to predictor
# The predictor's existing serializer object is patched in place: its `serialize`
# attribute is replaced by `sparse_serializer` and its content type is set to JSON.
# NOTE(review): this relies on the serializer exposing writable `serialize` and
# `content_type` attributes; a dedicated serializer class may be more robust —
# confirm against the installed SageMaker SDK version.
recommendation_predictor.serializer.serialize = sparse_serializer
recommendation_predictor.serializer.content_type = "application/json"
# Responses are decoded with JSONDeserializer.
recommendation_predictor.deserializer = JSONDeserializer()

In [5]:
# Test the sparse serializer on a single row whose active feature indices are 0 and 318
sparse_serializer([np.array([0,318])])

'{"instances": [{"data": {"features": {"keys": [0, 318], "shape": [3167], "values": [1, 1]}}}]}'

In [6]:
# Predict a single row.
# Source row in svm format: `7 0:1 318:1` — feature indices 0 and 318 are set; the
# leading 7 is presumably the actual rating to compare the score against (TODO confirm).

recommendation_predictor.predict([np.array([0,318])])

{'predictions': [{'score': 7.559768199920654}]}

In [7]:
# Test the sparse serializer with three rows; each row becomes one entry under "instances"
sparse_serializer([np.array([1,3007]), np.array([2,2579]), np.array([3,1177])])

'{"instances": [{"data": {"features": {"keys": [1, 3007], "shape": [3167], "values": [1, 1]}}}, {"data": {"features": {"keys": [2, 2579], "shape": [3167], "values": [1, 1]}}}, {"data": {"features": {"keys": [3, 1177], "shape": [3167], "values": [1, 1]}}}]}'

In [8]:
# Predict several rows in one request.
# Source rows in svm format; the leading number is presumably the actual rating (TODO confirm):
# 8 1:1 3007:1
# 7 2:1 2579:1
# 6 3:1 1177:1

recommendation_predictor.predict([np.array([1,3007]), np.array([2,2579]), np.array([3,1177])])

{'predictions': [{'score': 7.648407936096191},
  {'score': 7.642399787902832},
  {'score': 7.650603294372559}]}

## Predict the test data

In [9]:
# Read the space-separated test file and keep only the feature index in front of
# ":1" for the user and anime columns, converted to int.
anime_test = pd.read_csv("anime_test.svm", sep=" ", names=["rating", "user_id", "anime_id"]).assign(
    user_id=lambda frame: frame["user_id"].str.split(":").str[0].astype(int),
    anime_id=lambda frame: frame["anime_id"].str.split(":").str[0].astype(int),
)
print("Shape of the test data: {}".format(anime_test.shape))

anime_test.head()

Shape of the test data: (2000, 3)


Unnamed: 0,rating,user_id,anime_id
0,6,0,625
1,6,0,380
2,9,0,887
3,5,0,303
4,8,0,195


In [10]:
# Split predictions
def split_predictions(predictor, input_data, n_split):
    """Score ``input_data`` in ``n_split`` batches and return all scores as one flat list.

    The rows are divided with ``np.array_split``. Every non-empty piece is passed
    to ``predictor.predict``, and the ``'score'`` of each entry under
    ``'predictions'`` in the response is collected in input order.
    """
    scores = []
    for batch in np.array_split(input_data, n_split):
        if batch.shape[0] == 0:
            # array_split produces empty pieces when n_split exceeds the row count
            continue
        response = predictor.predict(batch)
        scores.extend(item['score'] for item in response['predictions'])
    return scores

In [11]:
# Predict the test data
# Only the two feature-index columns are sent. With the 2000-row test file, 200 splits
# means 10 rows per request.
pred = split_predictions(recommendation_predictor, anime_test[["user_id", "anime_id"]].values, 200)

In [12]:
# Root-mean-squared error between the actual ratings and the predicted scores:
# the square root of sklearn's mean_squared_error (imported as MSE).
rmse = MSE(anime_test["rating"], pred)**0.5
print(f"The RMSE is: {rmse}")

The RMSE is: 1.5746361808686264


In [13]:
# Attach the predicted scores to a re-indexed copy of the test rows and write the
# result to CSV without the index column.
pred_df = anime_test.reset_index(drop=True).assign(pred_rating=pred)
pred_df.to_csv("anime_pred.csv", index=False)

In [14]:
# Preview the first three rows with the new pred_rating column
pred_df.head(3)

Unnamed: 0,rating,user_id,anime_id,pred_rating
0,6,0,625,7.629499
1,6,0,380,7.402591
2,9,0,887,7.560416


## Predict a single user's ratings for all anime titles

In [15]:
# Load the user/anime pairs to score (svm format). The first column is read as "id";
# for the two feature columns only the index in front of ":1" is kept, as int.
# NOTE(review): "id" is presumably the original anime identifier — confirm.
users_anime = pd.read_csv("user_animes.svm", sep=" ", names=["id", "user_id", "anime_id"]).assign(
    user_id=lambda frame: frame["user_id"].str.split(":").str[0].astype(int),
    anime_id=lambda frame: frame["anime_id"].str.split(":").str[0].astype(int),
)
print("Shape of the test data: {}".format(users_anime.shape))

users_anime.head()

Shape of the test data: (3098, 3)


Unnamed: 0,id,user_id,anime_id
0,430,1,293
1,1004,1,498
2,3010,1,839
3,570,1,366
4,2762,1,789


In [16]:
# Predict a rating for every row of users_anime.
# Only the two feature-index columns are sent, split into 200 requests.
pred_users_anime = split_predictions(recommendation_predictor, users_anime[["user_id", "anime_id"]].values, 200)

In [17]:
# Attach the predicted ratings as "pred_rating" to a re-indexed copy of users_anime
pred_users_anime_df = users_anime.reset_index(drop=True).assign(pred_rating=pred_users_anime)

In [18]:
# Save the predictions.
# index=False matches how anime_pred.csv is written: the index is just the fresh
# 0..n-1 range from reset_index(drop=True), so writing it would only add an unnamed
# column when the file is read back.
pred_users_anime_df.to_csv("pred_users_anime_df.csv", index=False)

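Delete the endpoint once it is no longer needed, since a running endpoint keeps incurring charges — for example with `recommendation_predictor.delete_endpoint()`.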