# Hotel Recommender System
Train and deploy a Hotel Recommender System using the public Expedia data from the Kaggle competition: https://www.kaggle.com/c/expedia-hotel-recommendations/overview

**The goal is to help Expedia visitors find their dream hotel!**

Download data from https://www.kaggle.com/c/expedia-hotel-recommendations/data and unzip the downloaded file.

In [None]:
import pandas as pd

In [3]:
# Read the Kaggle training file from the directory the downloaded archive was unzipped into.
train_csv_path = './expedia-hotel-recommendations/train.csv'
df = pd.read_csv(train_csv_path)

In [5]:
# Preview the first 100 rows; the rendered table elides the middle rows and columns.
df.head(100)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,0,1,14984,1,0,1,2,50,1457,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2014-01-08 14:09:47,2,3,66,462,41898,2454.8588,1482,0,0,...,0,2,28494,6,0,1,2,50,680,95
96,2014-01-08 14:15:40,2,3,66,462,41898,2455.2272,1482,0,0,...,0,1,28494,6,0,4,2,50,680,77
97,2014-01-08 14:18:31,2,3,66,462,41898,2455.2272,1482,0,0,...,0,1,28494,6,0,4,2,50,680,77
98,2014-01-08 14:30:25,2,3,66,462,41898,2455.2272,1482,0,0,...,0,2,28494,6,0,2,2,50,680,77


In [4]:
# Inspect the column dtypes; 'date_time', 'srch_ci' and 'srch_co' come back as object dtype.
df.dtypes

date_time                     object
site_name                      int64
posa_continent                 int64
user_location_country          int64
user_location_region           int64
user_location_city             int64
orig_destination_distance    float64
user_id                        int64
is_mobile                      int64
is_package                     int64
channel                        int64
srch_ci                       object
srch_co                       object
srch_adults_cnt                int64
srch_children_cnt              int64
srch_rm_cnt                    int64
srch_destination_id            int64
srch_destination_type_id       int64
is_booking                     int64
cnt                            int64
hotel_continent                int64
hotel_country                  int64
hotel_market                   int64
hotel_cluster                  int64
dtype: object

In [6]:
# Look at the raw 'date_time' values to see which format has to be parsed.
df['date_time'].head(10)

0    2014-08-11 07:46:59
1    2014-08-11 08:22:12
2    2014-08-11 08:24:33
3    2014-08-09 18:05:16
4    2014-08-09 18:08:18
5    2014-08-09 18:13:12
6    2014-07-16 09:42:23
7    2014-07-16 09:45:48
8    2014-07-16 09:52:11
9    2014-07-16 09:55:24
Name: date_time, dtype: object

In [7]:
# Add a 'ts' column holding 'date_time' as Unix epoch milliseconds (int64).
#
# The offset from the epoch is divided by a 1 ms Timedelta instead of casting the raw
# datetime64 values to integers: the result does not depend on the datetime resolution
# pandas chooses while parsing, and no numpy name is needed (numpy is only imported in
# a later cell, so `np` is undefined at this point on a fresh kernel).
#
# NOTE(review): Amazon Personalize documents TIMESTAMP as Unix epoch time in *seconds*;
# these values are milliseconds. Confirm whether pd.Timedelta("1s") is wanted here.
date_time_parsed = pd.to_datetime(df['date_time'], format="%Y-%m-%d %H:%M:%S")
df['ts'] = (date_time_parsed - pd.Timestamp("1970-01-01")) // pd.Timedelta("1ms")

In [8]:
# Confirm that the new 'ts' column is present with an integer dtype.
df.dtypes

date_time                     object
site_name                      int64
posa_continent                 int64
user_location_country          int64
user_location_region           int64
user_location_city             int64
orig_destination_distance    float64
user_id                        int64
is_mobile                      int64
is_package                     int64
channel                        int64
srch_ci                       object
srch_co                       object
srch_adults_cnt                int64
srch_children_cnt              int64
srch_rm_cnt                    int64
srch_destination_id            int64
srch_destination_type_id       int64
is_booking                     int64
cnt                            int64
hotel_continent                int64
hotel_country                  int64
hotel_market                   int64
hotel_cluster                  int64
ts                             int64
dtype: object

In [9]:
# Spot-check the 'ts' values next to the original 'date_time' strings.
df.head(10)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,ts
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,1,8250,1,0,3,2,50,628,1,1407743219000
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,1,8250,1,1,1,2,50,628,1,1407745332000
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,1,8250,1,0,1,2,50,628,1,1407745473000
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,1,14984,1,0,1,2,50,1457,80,1407607516000
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,1,14984,1,0,1,2,50,1457,21,1407607698000
5,2014-08-09 18:13:12,2,3,66,442,35390,911.5142,93,0,0,...,1,14984,1,0,1,2,50,1457,92,1407607992000
6,2014-07-16 09:42:23,2,3,66,189,10067,,501,0,0,...,1,8267,1,0,2,2,50,675,41,1405503743000
7,2014-07-16 09:45:48,2,3,66,189,10067,,501,0,1,...,1,8267,1,0,1,2,50,675,41,1405503948000
8,2014-07-16 09:52:11,2,3,66,189,10067,,501,0,0,...,1,8267,1,0,1,2,50,675,69,1405504331000
9,2014-07-16 09:55:24,2,3,66,189,10067,,501,0,0,...,1,8267,1,0,1,2,50,675,70,1405504524000


### Build the user-item interactions data set

In [10]:
# Keep only the three columns that describe an interaction: who, which hotel cluster, when.
interaction_columns = ['user_id', 'hotel_cluster', 'ts']
df_subset = df[interaction_columns]

In [11]:
# Preview the three-column interactions frame.
df_subset.head(10)

Unnamed: 0,user_id,hotel_cluster,ts
0,12,1,1407743219000
1,12,1,1407745332000
2,12,1,1407745473000
3,93,80,1407607516000
4,93,21,1407607698000
5,93,92,1407607992000
6,501,41,1405503743000
7,501,41,1405503948000
8,501,69,1405504331000
9,501,70,1405504524000


In [12]:
# Rename the columns positionally (user_id, hotel_cluster, ts) to the field names
# declared in the Personalize interactions schema defined further down.
df_subset.columns = ['USER_ID','ITEM_ID', 'TIMESTAMP']

In [13]:
import boto3

import json
import numpy as np
import pandas as pd
import time

# Replace 'personalize' with an AWS profile that has access to S3 and Personalize.
session = boto3.Session(profile_name='personalize')

# Both Personalize clients (control plane and runtime) are created in the same region.
aws_region = 'us-east-1'
personalize = session.client('personalize', region_name=aws_region)
personalize_runtime = session.client('personalize-runtime', region_name=aws_region)

In [14]:
# Name of your S3 bucket -- replace as needed. The bucket must already exist.
bucket = "personalize-hotels"

# File name under which the interactions data set is saved (locally and as the S3 key).
filename = "hotels-interactions.csv"

In [15]:
# Write the user-item interactions to a local CSV file, without the DataFrame index.
df_subset.to_csv(filename, index=False)

# Copy that file to S3, using the same name for the object key.
s3 = session.resource('s3')
s3.Object(bucket, filename).upload_file(filename)

### Define user-item interactions data set schema in Amazon Personalize

In [18]:
# Avro record schema for the interactions data set: one (name, type) pair per CSV column.
interaction_fields = [
    ("USER_ID", "string"),
    ("ITEM_ID", "string"),
    ("TIMESTAMP", "long"),
]

schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": field_name, "type": field_type}
        for field_name, field_type in interaction_fields
    ],
    "version": "1.0"
}

# Register the schema with Personalize; it is sent as a JSON string.
schema_json = json.dumps(schema)
create_schema_response = personalize.create_schema(
    name="hotel-recommender-schema",
    schema=schema_json
)

# Keep the ARN for the create_dataset call and print the whole response for reference.
schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-east-1:296654805457:schema/hotel-recommender-schema",
  "ResponseMetadata": {
    "RequestId": "5e11481f-f9cb-4bf0-97fc-337479173fe4",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sat, 19 Oct 2019 11:19:29 GMT",
      "x-amzn-requestid": "5e11481f-f9cb-4bf0-97fc-337479173fe4",
      "content-length": "90",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Create data set group in Amazon Personalize

In [19]:
# Create the dataset group that the data set, solution and campaign below are attached to.
dataset_group_request = {"name": "hotel-recommender-group"}
create_dataset_group_response = personalize.create_dataset_group(**dataset_group_request)

# Keep the ARN for later calls and print the whole response for reference.
dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

{
  "datasetGroupArn": "arn:aws:personalize:us-east-1:296654805457:dataset-group/hotel-recommender-group",
  "ResponseMetadata": {
    "RequestId": "9f330167-d501-43f9-ba64-abd7141dba10",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sat, 19 Oct 2019 11:19:56 GMT",
      "x-amzn-requestid": "9f330167-d501-43f9-ba64-abd7141dba10",
      "content-length": "102",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [20]:
# Poll once a minute until the dataset group is ACTIVE.
# A failed creation or an expired deadline raises, so the cells that follow never run
# against a dataset group that is not usable.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn = dataset_group_arn
    )
    dataset_group = describe_dataset_group_response["datasetGroup"]
    status = dataset_group["status"]
    print("DatasetGroup: {}".format(status))

    if status == "ACTIVE":
        break
    if status == "CREATE FAILED":
        raise RuntimeError(
            "DatasetGroup creation failed: {}".format(
                dataset_group.get("failureReason", "no failure reason reported")
            )
        )

    time.sleep(60)
else:
    # while/else: reached only when the deadline passed without hitting `break`.
    raise RuntimeError(
        "DatasetGroup did not become ACTIVE within 3 hours (last status: {})".format(status)
    )

DatasetGroup: CREATE PENDING
DatasetGroup: ACTIVE


### Create data set in the group in Amazon Personalize

In [21]:
# Create the interactions data set inside the group, bound to the schema registered above.
dataset_type = "INTERACTIONS"
dataset_request = {
    "name": "hotel-recommender-dataset",
    "datasetType": dataset_type,
    "datasetGroupArn": dataset_group_arn,
    "schemaArn": schema_arn,
}
create_dataset_response = personalize.create_dataset(**dataset_request)

# Keep the ARN for the import job and print the whole response for reference.
dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:296654805457:dataset/hotel-recommender-group/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "992a19a9-662b-4d7f-a8e8-b7af6637af34",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sat, 19 Oct 2019 11:20:57 GMT",
      "x-amzn-requestid": "992a19a9-662b-4d7f-a8e8-b7af6637af34",
      "content-length": "104",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [22]:
# Replace with a Role that has access to Personalize.
role_arn = "arn:aws:iam::296654805457:role/Personalize"

# S3 location of the interactions CSV uploaded earlier.
data_location = "s3://%s/%s" % (bucket, filename)

# Start importing the CSV into the interactions data set.
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName="hotel-recommender-dataset-import-job",
    datasetArn=dataset_arn,
    dataSource={"dataLocation": data_location},
    roleArn=role_arn
)

# Keep the job ARN for status polling and print the whole response for reference.
dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:296654805457:dataset-import-job/hotel-recommender-dataset-import-job",
  "ResponseMetadata": {
    "RequestId": "1202f4e6-6b2a-418a-81bb-e6fc8124a6a7",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sat, 19 Oct 2019 11:20:57 GMT",
      "x-amzn-requestid": "1202f4e6-6b2a-418a-81bb-e6fc8124a6a7",
      "content-length": "124",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [23]:
# Poll once a minute until the dataset import job is ACTIVE.
# A failed import or an expired deadline raises, so training is never started on a
# data set that was not imported.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_import_job_arn
    )
    dataset_import_job = describe_dataset_import_job_response["datasetImportJob"]
    status = dataset_import_job['status']
    print("DatasetImportJob: {}".format(status))

    if status == "ACTIVE":
        break
    if status == "CREATE FAILED":
        raise RuntimeError(
            "DatasetImportJob failed: {}".format(
                dataset_import_job.get("failureReason", "no failure reason reported")
            )
        )

    time.sleep(60)
else:
    # while/else: reached only when the deadline passed without hitting `break`.
    raise RuntimeError(
        "DatasetImportJob did not become ACTIVE within 3 hours (last status: {})".format(status)
    )

DatasetImportJob: CREATE PENDING
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: ACTIVE


### List recommender algorithms/recipes available in Amazon Personalize

In [24]:
# Fetch the recipes Personalize offers and display the raw response
# (each entry carries a name, recipeArn and status).
list_recipes_response = personalize.list_recipes()
list_recipes_response

{'recipes': [{'name': 'aws-hrnn',
   'recipeArn': 'arn:aws:personalize:::recipe/aws-hrnn',
   'status': 'ACTIVE',
   'creationDateTime': datetime.datetime(2019, 6, 10, 1, 0, tzinfo=tzlocal()),
   'lastUpdatedDateTime': datetime.datetime(2019, 6, 20, 1, 39, 17, 65000, tzinfo=tzlocal())},
  {'name': 'aws-hrnn-coldstart',
   'recipeArn': 'arn:aws:personalize:::recipe/aws-hrnn-coldstart',
   'status': 'ACTIVE',
   'creationDateTime': datetime.datetime(2019, 6, 10, 1, 0, tzinfo=tzlocal()),
   'lastUpdatedDateTime': datetime.datetime(2019, 6, 20, 1, 39, 17, 64000, tzinfo=tzlocal())},
  {'name': 'aws-hrnn-metadata',
   'recipeArn': 'arn:aws:personalize:::recipe/aws-hrnn-metadata',
   'status': 'ACTIVE',
   'creationDateTime': datetime.datetime(2019, 6, 10, 1, 0, tzinfo=tzlocal()),
   'lastUpdatedDateTime': datetime.datetime(2019, 6, 20, 1, 39, 17, 64000, tzinfo=tzlocal())},
  {'name': 'aws-personalized-ranking',
   'recipeArn': 'arn:aws:personalize:::recipe/aws-personalized-ranking',
   'stat

In [25]:
# ARN of the 'aws-hrnn' recipe, as returned by list_recipes; used to create the solution.
recipe_arn = "arn:aws:personalize:::recipe/aws-hrnn"

### Create solution in Amazon Personalize
In other words, let's train the hotel-recommender system!

In [26]:
# Define the solution: the chosen recipe applied to the data in the dataset group.
solution_request = {
    "name": "hotel-recommender-solution",
    "datasetGroupArn": dataset_group_arn,
    "recipeArn": recipe_arn,
}
create_solution_response = personalize.create_solution(**solution_request)

# Keep the ARN for the solution version and print the whole response for reference.
solution_arn = create_solution_response['solutionArn']
print(json.dumps(create_solution_response, indent=2))

{
  "solutionArn": "arn:aws:personalize:us-east-1:296654805457:solution/hotel-recommender-solution",
  "ResponseMetadata": {
    "RequestId": "968f5e33-8b77-4433-89d2-c285d7d8a5c8",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sat, 19 Oct 2019 11:40:02 GMT",
      "x-amzn-requestid": "968f5e33-8b77-4433-89d2-c285d7d8a5c8",
      "content-length": "96",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [27]:
# Create a version of the solution; its status is polled in the next cell.
create_solution_version_response = personalize.create_solution_version(solutionArn=solution_arn)

# Keep the version ARN for polling, metrics and deployment; print the whole response.
solution_version_arn = create_solution_version_response['solutionVersionArn']
print(json.dumps(create_solution_version_response, indent=2))

{
  "solutionVersionArn": "arn:aws:personalize:us-east-1:296654805457:solution/hotel-recommender-solution/c6fa5fc8",
  "ResponseMetadata": {
    "RequestId": "437614c5-d428-49fa-8a08-a267f5e3f749",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sat, 19 Oct 2019 11:40:02 GMT",
      "x-amzn-requestid": "437614c5-d428-49fa-8a08-a267f5e3f749",
      "content-length": "112",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [28]:
# Poll once a minute until the solution version is ACTIVE.
# A failed creation or an expired deadline raises, so metrics and deployment are never
# requested for a solution version that is not usable.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_solution_version_response = personalize.describe_solution_version(
        solutionVersionArn = solution_version_arn
    )
    solution_version = describe_solution_version_response["solutionVersion"]
    status = solution_version["status"]
    print("SolutionVersion: {}".format(status))

    if status == "ACTIVE":
        break
    if status == "CREATE FAILED":
        raise RuntimeError(
            "SolutionVersion creation failed: {}".format(
                solution_version.get("failureReason", "no failure reason reported")
            )
        )

    time.sleep(60)
else:
    # while/else: reached only when the deadline passed without hitting `break`.
    raise RuntimeError(
        "SolutionVersion did not become ACTIVE within 3 hours (last status: {})".format(status)
    )

SolutionVersion: CREATE PENDING
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGR

### Time to retrieve accuracy metrics of the trained recommender system model!

In [29]:
# Retrieve the metrics Personalize reports for the solution version and print the response.
get_solution_metrics_response = personalize.get_solution_metrics(solutionVersionArn=solution_version_arn)
metrics_report = json.dumps(get_solution_metrics_response, indent=2)
print(metrics_report)

{
  "solutionVersionArn": "arn:aws:personalize:us-east-1:296654805457:solution/hotel-recommender-solution/c6fa5fc8",
  "metrics": {
    "coverage": 0.9901,
    "mean_reciprocal_rank_at_25": 0.477,
    "normalized_discounted_cumulative_gain_at_10": 0.5562,
    "normalized_discounted_cumulative_gain_at_25": 0.5922,
    "normalized_discounted_cumulative_gain_at_5": 0.5262,
    "precision_at_10": 0.0659,
    "precision_at_25": 0.0321,
    "precision_at_5": 0.1143
  },
  "ResponseMetadata": {
    "RequestId": "eefdea27-bbf6-4605-afb0-a181e1349e43",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sat, 19 Oct 2019 13:56:41 GMT",
      "x-amzn-requestid": "eefdea27-bbf6-4605-afb0-a181e1349e43",
      "content-length": "409",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Let's deploy the trained recommender model!

In [30]:
# Deploy the solution version as a campaign with minProvisionedTPS set to 1.
campaign_request = {
    "name": "hotel-recommender-campaign",
    "solutionVersionArn": solution_version_arn,
    "minProvisionedTPS": 1,
}
create_campaign_response = personalize.create_campaign(**campaign_request)

# Keep the ARN for polling and for recommendation requests; print the whole response.
campaign_arn = create_campaign_response['campaignArn']
print(json.dumps(create_campaign_response, indent=2))

{
  "campaignArn": "arn:aws:personalize:us-east-1:296654805457:campaign/hotel-recommender-campaign",
  "ResponseMetadata": {
    "RequestId": "79caabf9-b1c3-4700-a3b2-229184b24bd8",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sat, 19 Oct 2019 15:04:49 GMT",
      "x-amzn-requestid": "79caabf9-b1c3-4700-a3b2-229184b24bd8",
      "content-length": "96",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [31]:
# Poll once a minute until the campaign is ACTIVE.
# A failed deployment or an expired deadline raises, so recommendations are never
# requested from a campaign that is not serving.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_campaign_response = personalize.describe_campaign(
        campaignArn = campaign_arn
    )
    campaign = describe_campaign_response["campaign"]
    status = campaign["status"]
    print("Campaign: {}".format(status))

    if status == "ACTIVE":
        break
    if status == "CREATE FAILED":
        raise RuntimeError(
            "Campaign creation failed: {}".format(
                campaign.get("failureReason", "no failure reason reported")
            )
        )

    time.sleep(60)
else:
    # while/else: reached only when the deadline passed without hitting `break`.
    raise RuntimeError(
        "Campaign did not become ACTIVE within 3 hours (last status: {})".format(status)
    )

Campaign: CREATE PENDING
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: ACTIVE


In [42]:
# Request recommendations from the campaign for one sample user; IDs are passed as strings.
sample_user_id = '93'
get_recommendations_response = personalize_runtime.get_recommendations(
    campaignArn=campaign_arn,
    userId=sample_user_id
)

In [43]:
# Display the raw response; 'itemList' holds the recommended item IDs.
get_recommendations_response

{'ResponseMetadata': {'RequestId': '48857bee-0c39-4917-8fca-1c1819d3ce2d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/json',
   'date': 'Sat, 19 Oct 2019 18:32:12 GMT',
   'x-amzn-requestid': '48857bee-0c39-4917-8fca-1c1819d3ce2d',
   'content-length': '413',
   'connection': 'keep-alive'},
  'RetryAttempts': 0},
 'itemList': [{'itemId': '92'},
  {'itemId': '80'},
  {'itemId': '26'},
  {'itemId': '84'},
  {'itemId': '52'},
  {'itemId': '44'},
  {'itemId': '86'},
  {'itemId': '0'},
  {'itemId': '34'},
  {'itemId': '65'},
  {'itemId': '63'},
  {'itemId': '66'},
  {'itemId': '96'},
  {'itemId': '64'},
  {'itemId': '41'},
  {'itemId': '27'},
  {'itemId': '73'},
  {'itemId': '69'},
  {'itemId': '98'},
  {'itemId': '22'},
  {'itemId': '97'},
  {'itemId': '21'},
  {'itemId': '58'},
  {'itemId': '70'},
  {'itemId': '93'}]}