# Hotel Recommender System
Train and deploy a Hotel Recommender System using the public Expedia data from Kaggle competition https://www.kaggle.com/c/expedia-hotel-recommendations/overview

**The goal is to help Expedia visitors find their dream hotel under cold start conditions!**
It's a hard problem as we don't have historical information for new hotel clusters and we want to offer to customer more diversity when it comes to trip planning.

Download data from https://www.kaggle.com/c/expedia-hotel-recommendations/data and unzip the downloaded file.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./expedia-hotel-recommendations/train.csv')

In [3]:
df.head(100)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,0,1,14984,1,0,1,2,50,1457,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2014-01-08 14:09:47,2,3,66,462,41898,2454.8588,1482,0,0,...,0,2,28494,6,0,1,2,50,680,95
96,2014-01-08 14:15:40,2,3,66,462,41898,2455.2272,1482,0,0,...,0,1,28494,6,0,4,2,50,680,77
97,2014-01-08 14:18:31,2,3,66,462,41898,2455.2272,1482,0,0,...,0,1,28494,6,0,4,2,50,680,77
98,2014-01-08 14:30:25,2,3,66,462,41898,2455.2272,1482,0,0,...,0,2,28494,6,0,2,2,50,680,77


In [4]:
df.dtypes

date_time                     object
site_name                      int64
posa_continent                 int64
user_location_country          int64
user_location_region           int64
user_location_city             int64
orig_destination_distance    float64
user_id                        int64
is_mobile                      int64
is_package                     int64
channel                        int64
srch_ci                       object
srch_co                       object
srch_adults_cnt                int64
srch_children_cnt              int64
srch_rm_cnt                    int64
srch_destination_id            int64
srch_destination_type_id       int64
is_booking                     int64
cnt                            int64
hotel_continent                int64
hotel_country                  int64
hotel_market                   int64
hotel_cluster                  int64
dtype: object

In [5]:
df['date_time'].head(10)

0    2014-08-11 07:46:59
1    2014-08-11 08:22:12
2    2014-08-11 08:24:33
3    2014-08-09 18:05:16
4    2014-08-09 18:08:18
5    2014-08-09 18:13:12
6    2014-07-16 09:42:23
7    2014-07-16 09:45:48
8    2014-07-16 09:52:11
9    2014-07-16 09:55:24
Name: date_time, dtype: object

In [6]:
df['ts'] = pd.to_datetime(df['date_time'], format="%Y-%m-%d %H:%M:%S").values.astype(np.int64) // 10**6

In [7]:
df.dtypes

date_time                     object
site_name                      int64
posa_continent                 int64
user_location_country          int64
user_location_region           int64
user_location_city             int64
orig_destination_distance    float64
user_id                        int64
is_mobile                      int64
is_package                     int64
channel                        int64
srch_ci                       object
srch_co                       object
srch_adults_cnt                int64
srch_children_cnt              int64
srch_rm_cnt                    int64
srch_destination_id            int64
srch_destination_type_id       int64
is_booking                     int64
cnt                            int64
hotel_continent                int64
hotel_country                  int64
hotel_market                   int64
hotel_cluster                  int64
ts                             int64
dtype: object

In [8]:
df.head(10)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,ts
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,1,8250,1,0,3,2,50,628,1,1407743219000
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,1,8250,1,1,1,2,50,628,1,1407745332000
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,1,8250,1,0,1,2,50,628,1,1407745473000
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,1,14984,1,0,1,2,50,1457,80,1407607516000
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,1,14984,1,0,1,2,50,1457,21,1407607698000
5,2014-08-09 18:13:12,2,3,66,442,35390,911.5142,93,0,0,...,1,14984,1,0,1,2,50,1457,92,1407607992000
6,2014-07-16 09:42:23,2,3,66,189,10067,,501,0,0,...,1,8267,1,0,2,2,50,675,41,1405503743000
7,2014-07-16 09:45:48,2,3,66,189,10067,,501,0,1,...,1,8267,1,0,1,2,50,675,41,1405503948000
8,2014-07-16 09:52:11,2,3,66,189,10067,,501,0,0,...,1,8267,1,0,1,2,50,675,69,1405504331000
9,2014-07-16 09:55:24,2,3,66,189,10067,,501,0,0,...,1,8267,1,0,1,2,50,675,70,1405504524000


Let's split dataset into train/test

In [9]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

### Let's remove some hotel clusters to emulate the case of new items (i.e. hotel clusters)

In [10]:
unique_items = df['hotel_cluster'].unique()
len(unique_items)

100

### Build the user-item interactions data set

In [11]:
df_interactions = train_df[['user_id', 'hotel_cluster', 'ts']]

In [12]:
df_interactions.columns = ['USER_ID','ITEM_ID', 'TIMESTAMP']

In [13]:
df_interactions.head(10)

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP
20542980,584697,48,1372521804000
31497156,935951,80,1392768291000
7873479,228619,8,1383321425000
30009687,643555,8,1406298060000
1522023,36116,52,1406497296000
28173360,691268,40,1381564837000
6478397,1182855,81,1415913819000
27571869,250680,91,1391013654000
17974516,761281,53,1412619393000
21497693,10488,46,1400508137000


In [14]:
import boto3

import json
import numpy as np
import pandas as pd
import time

session = boto3.Session(profile_name='personalize')  # replace with an aws profile with access to S3 and Personalize
personalize = session.client('personalize', region_name='us-east-1')
personalize_runtime = session.client('personalize-runtime', region_name='us-east-1')

In [15]:
bucket = "personalize-hotels"  # replace with the name of your S3 bucket. Make sure the bucket is already created.
filename = "hotels-interactions.csv"  # replace with a name that you want to save the dataset under

In [16]:
# Save user-item interactions data set in a file locally
df_interactions.to_csv(filename, index=False)

# Upload user-item interactions data set file to S3
session.resource('s3').Bucket(bucket).Object(filename).upload_file(filename)

### Let's add metadata (i.e. features in terms of Machine Learning)
In our case, we can add features for hotel clusters and/or features related to the context of a user

In [17]:
df_item_metadata = train_df[['hotel_cluster', 'hotel_continent', 'hotel_country', 'hotel_market']]

df_item_metadata.columns = ['ITEM_ID', 'HOTEL_CONTINENT', 'HOTEL_COUNTRY', 'HOTEL_MARKET']

In [18]:
filename_item_metadata = "hotel-metadata"

In [19]:
df_item_metadata.to_csv(filename_item_metadata, index=False)

# Upload item metadata file to S3
session.resource('s3').Bucket(bucket).Object(filename_item_metadata).upload_file(filename_item_metadata)

Time for user/context metadata/features

In [20]:
df_user_metadata = train_df[
    [
        'user_id', 'user_location_country', 'user_location_region', 'user_location_city',
        'srch_adults_cnt', 'srch_children_cnt'
    ]
]

df_user_metadata.columns = [
    'USER_ID', 'USER_LOCATION', 'USER_LOCATION_REGION', 'USER_LOCATION_CITY',
    'SRCH_ADULTS_CNT', 'SRCH_CHILDREN_CNT'
]

In [21]:
filename_user_metadata = "user-metadata.csv"

In [22]:
df_user_metadata.to_csv(filename_user_metadata, index=False)

# Upload item metadata file to S3
session.resource('s3').Bucket(bucket).Object(filename_user_metadata).upload_file(filename_user_metadata)

### Define user-item interactions/item-metadata/user-metadata data set schema in Amazon Personalize

In [23]:
schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {
            "name": "USER_ID",
            "type": "string"
        },
        {
            "name": "ITEM_ID",
            "type": "string"
        },
        {
            "name": "TIMESTAMP",
            "type": "long"
        }
    ],
    "version": "1.0"
}

create_schema_response = personalize.create_schema(
    name = "hotel-recommender-cold-start-schema-v2",
    schema = json.dumps(schema)
)

schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-east-1:296654805457:schema/hotel-recommender-cold-start-schema-v2",
  "ResponseMetadata": {
    "RequestId": "83ea74b8-ae5d-4cae-87ca-a7f9d14e1c46",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 25 Nov 2019 05:37:05 GMT",
      "x-amzn-requestid": "83ea74b8-ae5d-4cae-87ca-a7f9d14e1c46",
      "content-length": "104",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [24]:
metadata_schema = {
    "type": "record",
    "name": "Items",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
    {
        "name": "ITEM_ID",
        "type": "string"
    },
    {
        "name": "HOTEL_CONTINENT",
        "type": "int",
        "categorical": True
    },
    {
        "name": "HOTEL_COUNTRY",
        "type": "int",
        "categorical": True
    },
    {
        "name": "HOTEL_MARKET",
        "type": "int",
        "categorical": True
    }
    ],
    "version": "1.0"
}

create_metadata_schema_response = personalize.create_schema(
    name = "hotel-recommender-item-metadata-cold-start-schema-v2",
    schema = json.dumps(metadata_schema)
)

metadata_schema_arn = create_metadata_schema_response['schemaArn']
print(json.dumps(create_metadata_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-east-1:296654805457:schema/hotel-recommender-item-metadata-cold-start-schema-v2",
  "ResponseMetadata": {
    "RequestId": "77750c3b-c0de-4c9f-bf9b-317588d2837a",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 25 Nov 2019 05:37:05 GMT",
      "x-amzn-requestid": "77750c3b-c0de-4c9f-bf9b-317588d2837a",
      "content-length": "118",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [25]:
user_metadata_schema = {
  "type": "record",
  "name": "Users",
  "namespace": "com.amazonaws.personalize.schema",
  "fields": [
      {
          "name": "USER_ID",
          "type": "string"
      },
      {
          "name": "USER_LOCATION",
          "type": "int",
          "categorical": True
      },
      {
          "name": "USER_LOCATION_REGION",
          "type": "int",
          "categorical": True
      },
      {
          "name": "USER_LOCATION_CITY",
          "type": "int",
          "categorical": True
      },
      {
          "name": "SRCH_ADULTS_CNT",
          "type": "int"
      },
      {
          "name": "SRCH_CHILDREN_CNT",
          "type": "int"
      }
  ],
  "version": "1.0"
}

create_user_metadata_schema_response = personalize.create_schema(
    name = "hotel-recommender-user-metadata-cold-start-schema-v2",
    schema = json.dumps(user_metadata_schema)
)

user_metadata_schema_arn = create_user_metadata_schema_response['schemaArn']
print(json.dumps(create_user_metadata_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-east-1:296654805457:schema/hotel-recommender-user-metadata-cold-start-schema-v2",
  "ResponseMetadata": {
    "RequestId": "783323c5-0968-4f76-a98d-92384d654f76",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 25 Nov 2019 05:37:05 GMT",
      "x-amzn-requestid": "783323c5-0968-4f76-a98d-92384d654f76",
      "content-length": "118",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Create data set group in Amazon Personalize

In [26]:
create_dataset_group_response = personalize.create_dataset_group(
    name = "cold-start-hotel-recommender-group-v2"
)

dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

{
  "datasetGroupArn": "arn:aws:personalize:us-east-1:296654805457:dataset-group/cold-start-hotel-recommender-group-v2",
  "ResponseMetadata": {
    "RequestId": "75be46de-251d-489b-8623-93b88295fc94",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 25 Nov 2019 05:37:06 GMT",
      "x-amzn-requestid": "75be46de-251d-489b-8623-93b88295fc94",
      "content-length": "116",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [27]:
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn = dataset_group_arn
    )
    status = describe_dataset_group_response["datasetGroup"]["status"]
    print("DatasetGroup: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

DatasetGroup: CREATE PENDING
DatasetGroup: ACTIVE


### Create interaction data set in the group in Amazon Personalize

In [28]:
create_dataset_interactions_response = personalize.create_dataset(
    name = "hotel-recommender-cold-start-dataset-v2",
    datasetType = "INTERACTIONS",
    datasetGroupArn = dataset_group_arn,
    schemaArn = schema_arn
)

dataset_interactions_arn = create_dataset_interactions_response['datasetArn']
print(json.dumps(create_dataset_interactions_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:296654805457:dataset/cold-start-hotel-recommender-group-v2/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "edb02df9-0236-48f8-840e-73f53fd63192",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 25 Nov 2019 05:38:07 GMT",
      "x-amzn-requestid": "edb02df9-0236-48f8-840e-73f53fd63192",
      "content-length": "118",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [29]:
role_arn = "arn:aws:iam::296654805457:role/Personalize"  # replace with a Role that has access to Personalize

create_dataset_interactions_import_job_response = personalize.create_dataset_import_job(
    jobName = "cold-start-hotel-recommender-dataset-import-job-v2",
    datasetArn = dataset_interactions_arn,
    dataSource = {
        "dataLocation": "s3://{}/{}".format(bucket, filename)
    },
    roleArn = role_arn
)

dataset_interactions_import_job_arn = create_dataset_interactions_import_job_response['datasetImportJobArn']
print(json.dumps(dataset_interactions_import_job_arn, indent=2))

"arn:aws:personalize:us-east-1:296654805457:dataset-import-job/cold-start-hotel-recommender-dataset-import-job-v2"


In [30]:
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_interactions_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("DatasetImportJob: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

DatasetImportJob: CREATE PENDING
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: ACTIVE


### Create item metadata data set in the group in Amazon Personalize

In [31]:
create_dataset_item_metadata_response = personalize.create_dataset(
    name = "hotel-recommender-item-metadata-cold-start-dataset-v2",
    datasetType = "ITEMS",
    datasetGroupArn = dataset_group_arn,
    schemaArn = metadata_schema_arn
)

dataset_item_metadata_arn = create_dataset_item_metadata_response['datasetArn']
print(json.dumps(create_dataset_item_metadata_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:296654805457:dataset/cold-start-hotel-recommender-group-v2/ITEMS",
  "ResponseMetadata": {
    "RequestId": "c3de9bf1-08cb-4d68-aa76-2b69eec1c2c5",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 25 Nov 2019 05:51:14 GMT",
      "x-amzn-requestid": "c3de9bf1-08cb-4d68-aa76-2b69eec1c2c5",
      "content-length": "111",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [33]:
create_dataset_item_metatadata_import_job_response = personalize.create_dataset_import_job(
    jobName = "cold-start-hotel-recommender-item-metatadata-dataset-job-v2",
    datasetArn = dataset_item_metadata_arn,
    dataSource = {
        "dataLocation": "s3://{}/{}".format(bucket, filename_item_metadata)
    },
    roleArn = role_arn
)

dataset_item_metatadata_import_job_arn = create_dataset_item_metatadata_import_job_response['datasetImportJobArn']
print(json.dumps(dataset_item_metatadata_import_job_arn, indent=2))

"arn:aws:personalize:us-east-1:296654805457:dataset-import-job/cold-start-hotel-recommender-item-metatadata-dataset-job-v2"


In [34]:
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_item_metatadata_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("DatasetImportJob: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

DatasetImportJob: CREATE PENDING
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: ACTIVE


### Create user metadata data set in the group in Amazon Personalize

In [35]:
create_dataset_user_metadata_response = personalize.create_dataset(
    name = "hotel-recommender-user-metadata-cold-start-dataset-v2",
    datasetType = "USERS",
    datasetGroupArn = dataset_group_arn,
    schemaArn = user_metadata_schema_arn
)

dataset_user_metadata_arn = create_dataset_user_metadata_response['datasetArn']
print(json.dumps(create_dataset_user_metadata_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:296654805457:dataset/cold-start-hotel-recommender-group-v2/USERS",
  "ResponseMetadata": {
    "RequestId": "7ae439ff-57b4-4bac-a0f5-042b057c1e83",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 25 Nov 2019 09:22:34 GMT",
      "x-amzn-requestid": "7ae439ff-57b4-4bac-a0f5-042b057c1e83",
      "content-length": "111",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [37]:
create_dataset_user_metatadata_import_job_response = personalize.create_dataset_import_job(
    jobName = "cold-start-hotel-recommender-user-metatadata-dataset-import-v2",
    datasetArn = dataset_user_metadata_arn,
    dataSource = {
        "dataLocation": "s3://{}/{}".format(bucket, filename_user_metadata)
    },
    roleArn = role_arn
)

dataset_user_metatadata_import_job_arn = create_dataset_user_metatadata_import_job_response['datasetImportJobArn']
print(json.dumps(dataset_user_metatadata_import_job_arn, indent=2))

"arn:aws:personalize:us-east-1:296654805457:dataset-import-job/cold-start-hotel-recommender-user-metatadata-dataset-import-v2"


In [38]:
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_user_metatadata_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("DatasetImportJob: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: ACTIVE


### List recommender algorithms/recipes available in Amazon Personalize

In [39]:
list_recipes_response = personalize.list_recipes()
list_recipes_response

{'recipes': [{'name': 'aws-hrnn',
   'recipeArn': 'arn:aws:personalize:::recipe/aws-hrnn',
   'status': 'ACTIVE',
   'creationDateTime': datetime.datetime(2019, 6, 10, 1, 0, tzinfo=tzlocal()),
   'lastUpdatedDateTime': datetime.datetime(2019, 6, 20, 1, 39, 17, 65000, tzinfo=tzlocal())},
  {'name': 'aws-hrnn-coldstart',
   'recipeArn': 'arn:aws:personalize:::recipe/aws-hrnn-coldstart',
   'status': 'ACTIVE',
   'creationDateTime': datetime.datetime(2019, 6, 10, 1, 0, tzinfo=tzlocal()),
   'lastUpdatedDateTime': datetime.datetime(2019, 6, 20, 1, 39, 17, 64000, tzinfo=tzlocal())},
  {'name': 'aws-hrnn-metadata',
   'recipeArn': 'arn:aws:personalize:::recipe/aws-hrnn-metadata',
   'status': 'ACTIVE',
   'creationDateTime': datetime.datetime(2019, 6, 10, 1, 0, tzinfo=tzlocal()),
   'lastUpdatedDateTime': datetime.datetime(2019, 6, 20, 1, 39, 17, 64000, tzinfo=tzlocal())},
  {'name': 'aws-personalized-ranking',
   'recipeArn': 'arn:aws:personalize:::recipe/aws-personalized-ranking',
   'stat

In [40]:
recipe_arn = "arn:aws:personalize:::recipe/aws-hrnn-metadata"

### Create solution in Amazon Personalize
In other words, let's train the hotel-recommender system!

In [42]:
create_solution_response = personalize.create_solution(
    name = "hotel-recommender-cold-start-metadata-v2",
    datasetGroupArn = dataset_group_arn,
    recipeArn = recipe_arn
)

solution_arn = create_solution_response['solutionArn']
print(json.dumps(create_solution_response, indent=2))

{
  "solutionArn": "arn:aws:personalize:us-east-1:296654805457:solution/hotel-recommender-cold-start-metadata-v2",
  "ResponseMetadata": {
    "RequestId": "d4bbd5b7-c11c-4cfa-b631-6cd114ec238f",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 25 Nov 2019 09:59:30 GMT",
      "x-amzn-requestid": "d4bbd5b7-c11c-4cfa-b631-6cd114ec238f",
      "content-length": "110",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [43]:
create_solution_version_response = personalize.create_solution_version(
    solutionArn = solution_arn
)

solution_version_arn = create_solution_version_response['solutionVersionArn']
print(json.dumps(create_solution_version_response, indent=2))

{
  "solutionVersionArn": "arn:aws:personalize:us-east-1:296654805457:solution/hotel-recommender-cold-start-metadata-v2/2cc183a8",
  "ResponseMetadata": {
    "RequestId": "bdcde85d-1d5b-40a2-a916-4d3afa66af0e",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 25 Nov 2019 09:59:31 GMT",
      "x-amzn-requestid": "bdcde85d-1d5b-40a2-a916-4d3afa66af0e",
      "content-length": "126",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [44]:
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_solution_version_response = personalize.describe_solution_version(
        solutionVersionArn = solution_version_arn
    )
    status = describe_solution_version_response["solutionVersion"]["status"]
    print("SolutionVersion: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

SolutionVersion: CREATE PENDING
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGR

In [45]:
describe_solution_version_response["solutionVersion"]

{'solutionVersionArn': 'arn:aws:personalize:us-east-1:296654805457:solution/hotel-recommender-cold-start-metadata-v2/2cc183a8',
 'solutionArn': 'arn:aws:personalize:us-east-1:296654805457:solution/hotel-recommender-cold-start-metadata-v2',
 'performHPO': False,
 'performAutoML': False,
 'recipeArn': 'arn:aws:personalize:::recipe/aws-hrnn-metadata',
 'datasetGroupArn': 'arn:aws:personalize:us-east-1:296654805457:dataset-group/cold-start-hotel-recommender-group-v2',
 'trainingHours': 31.842,
 'status': 'ACTIVE',
 'creationDateTime': datetime.datetime(2019, 11, 25, 9, 59, 32, 119000, tzinfo=tzlocal()),
 'lastUpdatedDateTime': datetime.datetime(2019, 11, 25, 12, 30, 0, 146000, tzinfo=tzlocal())}

### Time to retrieve accuracy metrics of the trained recommender system model!

In [46]:
get_solution_metrics_response = personalize.get_solution_metrics(
    solutionVersionArn = solution_version_arn
)

print(json.dumps(get_solution_metrics_response, indent=2))

{
  "solutionVersionArn": "arn:aws:personalize:us-east-1:296654805457:solution/hotel-recommender-cold-start-metadata-v2/2cc183a8",
  "metrics": {
    "coverage": 0.9901,
    "mean_reciprocal_rank_at_25": 0.4308,
    "normalized_discounted_cumulative_gain_at_10": 0.5153,
    "normalized_discounted_cumulative_gain_at_25": 0.5558,
    "normalized_discounted_cumulative_gain_at_5": 0.481,
    "precision_at_10": 0.0636,
    "precision_at_25": 0.0319,
    "precision_at_5": 0.1071
  },
  "ResponseMetadata": {
    "RequestId": "70a6de50-897f-4bac-af6d-e4770be34b53",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 25 Nov 2019 13:30:05 GMT",
      "x-amzn-requestid": "70a6de50-897f-4bac-af6d-e4770be34b53",
      "content-length": "423",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Let's deploy the trained recommender model!

In [48]:
create_campaign_response = personalize.create_campaign(
    name = "hotel-recommender-cold-start-metadata-campaign-v1",
    solutionVersionArn = solution_version_arn,
    minProvisionedTPS = 1
)

campaign_arn = create_campaign_response['campaignArn']
print(json.dumps(create_campaign_response, indent=2))

{
  "campaignArn": "arn:aws:personalize:us-east-1:296654805457:campaign/hotel-recommender-cold-start-metadata-campaign-v1",
  "ResponseMetadata": {
    "RequestId": "f1639f83-0ef6-444b-ad5e-18ab960bcd2b",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 25 Nov 2019 13:30:36 GMT",
      "x-amzn-requestid": "f1639f83-0ef6-444b-ad5e-18ab960bcd2b",
      "content-length": "119",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [49]:
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_campaign_response = personalize.describe_campaign(
        campaignArn = campaign_arn
    )
    status = describe_campaign_response["campaign"]["status"]
    print("Campaign: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

Campaign: CREATE PENDING
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: ACTIVE


In [69]:
train_df[train_df['user_id'] == 194297]

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,ts
13356550,2014-01-15 15:21:24,2,3,66,348,53377,1559.5506,194297,0,1,...,1,8791,1,0,2,4,8,110,65,1389799284000
13356547,2014-01-15 14:36:15,2,3,66,348,53377,1097.9567,194297,0,1,...,1,8260,1,0,1,2,50,701,51,1389796575000
13356554,2014-11-24 18:15:07,2,3,66,348,48862,1553.378,194297,1,1,...,1,8791,1,0,1,4,8,110,52,1416852907000
13356552,2014-01-16 10:49:26,2,3,66,348,53377,1559.5506,194297,0,1,...,1,8791,1,0,2,4,8,110,65,1389869366000
13356549,2014-01-15 15:17:55,2,3,66,348,53377,1559.5506,194297,0,1,...,1,8791,1,0,2,4,8,110,65,1389799075000
13356553,2014-01-16 11:17:43,2,3,66,348,53377,1559.5506,194297,0,1,...,1,8791,1,1,1,4,8,110,65,1389871063000


In [68]:
test_df[test_df['user_id'] == 194297]

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,ts
13356548,2014-01-15 14:37:26,2,3,66,348,53377,1101.6106,194297,0,1,...,1,8260,1,0,1,2,50,701,17,1389796646000
13356551,2014-01-15 15:44:47,2,3,66,348,53377,1559.5506,194297,0,1,...,1,8791,1,0,1,4,8,110,65,1389800687000


In [70]:
get_recommendations_response = personalize_runtime.get_recommendations(
    campaignArn = campaign_arn,
    userId = '194297'
)

In [71]:
get_recommendations_response

{'ResponseMetadata': {'RequestId': '552831ed-2451-4ba6-8689-541af53f4508',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/json',
   'date': 'Mon, 25 Nov 2019 13:47:22 GMT',
   'x-amzn-requestid': '552831ed-2451-4ba6-8689-541af53f4508',
   'content-length': '411',
   'connection': 'keep-alive'},
  'RetryAttempts': 0},
 'itemList': [{'itemId': '65'},
  {'itemId': '52'},
  {'itemId': '87'},
  {'itemId': '66'},
  {'itemId': '31'},
  {'itemId': '96'},
  {'itemId': '80'},
  {'itemId': '89'},
  {'itemId': '26'},
  {'itemId': '0'},
  {'itemId': '73'},
  {'itemId': '34'},
  {'itemId': '84'},
  {'itemId': '92'},
  {'itemId': '44'},
  {'itemId': '51'},
  {'itemId': '91'},
  {'itemId': '41'},
  {'itemId': '25'},
  {'itemId': '83'},
  {'itemId': '64'},
  {'itemId': '5'},
  {'itemId': '1'},
  {'itemId': '21'},
  {'itemId': '76'}]}

In [80]:
train_df[train_df['user_id'] == 837487]

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,ts
34300529,2013-11-12 18:09:16,2,3,66,331,40794,418.4469,837487,1,0,...,1,12009,1,0,1,2,50,680,59,1384279756000
34300532,2014-05-27 07:19:28,2,3,66,331,40794,1412.9784,837487,0,0,...,1,12789,5,0,1,2,50,566,72,1401175168000
34300530,2013-11-12 18:14:01,2,3,66,331,40794,418.4469,837487,1,0,...,1,12009,1,1,1,2,50,680,59,1384280041000
34300524,2013-03-06 13:06:23,2,3,66,324,4430,,837487,1,0,...,1,8219,1,0,2,2,50,688,68,1362575183000
34300540,2014-10-01 11:00:15,2,3,66,331,40794,1.7247,837487,0,0,...,1,12792,5,1,1,2,50,684,42,1412161215000
34300539,2014-10-01 10:56:40,2,3,66,331,40794,1.7247,837487,0,0,...,1,12792,5,0,1,2,50,684,42,1412161000000
34300527,2013-08-02 06:47:31,2,3,66,331,40794,582.8581,837487,1,0,...,1,14840,1,0,2,2,50,1628,31,1375426051000
34300528,2013-08-02 06:49:05,2,3,66,331,40794,582.9969,837487,1,0,...,1,14840,1,0,1,2,50,1628,34,1375426145000
34300526,2013-03-06 13:28:32,2,3,66,324,4430,,837487,1,0,...,1,8219,1,0,1,2,50,688,69,1362576512000
34300536,2014-07-16 11:08:39,2,3,66,331,40794,355.8718,837487,0,0,...,1,8216,1,0,1,2,50,350,64,1405508919000


In [81]:
test_df[test_df['user_id'] == 837487]

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,ts
34300535,2014-07-16 11:08:08,2,3,66,331,40794,351.9947,837487,0,0,...,1,8216,1,0,1,2,50,350,69,1405508888000
34300531,2014-05-20 09:12:59,2,3,66,331,40794,1412.9784,837487,0,0,...,1,27262,1,0,2,2,50,566,42,1400577179000


In [82]:
personalize_runtime.get_recommendations(
    campaignArn = campaign_arn,
    userId = '837487'
)

{'ResponseMetadata': {'RequestId': '05d095b2-72d5-490f-8a9a-2d1f5d413797',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/json',
   'date': 'Mon, 25 Nov 2019 13:50:18 GMT',
   'x-amzn-requestid': '05d095b2-72d5-490f-8a9a-2d1f5d413797',
   'content-length': '411',
   'connection': 'keep-alive'},
  'RetryAttempts': 0},
 'itemList': [{'itemId': '42'},
  {'itemId': '91'},
  {'itemId': '64'},
  {'itemId': '69'},
  {'itemId': '59'},
  {'itemId': '68'},
  {'itemId': '28'},
  {'itemId': '37'},
  {'itemId': '18'},
  {'itemId': '41'},
  {'itemId': '97'},
  {'itemId': '48'},
  {'itemId': '25'},
  {'itemId': '2'},
  {'itemId': '94'},
  {'itemId': '16'},
  {'itemId': '95'},
  {'itemId': '72'},
  {'itemId': '70'},
  {'itemId': '34'},
  {'itemId': '90'},
  {'itemId': '98'},
  {'itemId': '21'},
  {'itemId': '5'},
  {'itemId': '9'}]}