# Download data
## Install Kaggle

In [1]:
!pip install kaggle

Collecting kaggle
  Downloading kaggle-1.5.8.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 5.4 MB/s eta 0:00:011
[?25hCollecting urllib3<1.25,>=1.21.1
  Downloading urllib3-1.24.3-py2.py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 11.6 MB/s eta 0:00:01
Collecting python-slugify
  Downloading python-slugify-4.0.1.tar.gz (11 kB)
Collecting slugify
  Downloading slugify-0.0.1.tar.gz (1.2 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 8.9 MB/s  eta 0:00:01
[?25hBuilding wheels for collected packages: kaggle, python-slugify, slugify
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.8-py3-none-any.whl size=73274 sha256=6415a396e7b60a7bd14977350de6228cef55068fd5df63227ee670c81bc5a08a
  Stored in directory: /home/ec2-user/.cache/pip/wheels/cf/aa/f0/ed1179bbcd729b29d0dfda59826fb3b55f0a4a0c3f7

### Kaggle Settings

Before running the kaggle download, kaggle configurations should saved locally in the notebook using a terminal prompt. 
```
sh-4.2$ history
sh-4.2$ mkdir ~/.kaggle
sh-4.2$ vi kaggle.json //add the kaggle credentials
sh-4.2$ mv kaggle.json ~/.kaggle/kaggle.json
sh-4.2$ chmod 600 ~/.kaggle/kaggle.json
sh-4.2$ kaggle
```

In [5]:
!kaggle datasets download --force  janiobachmann/bank-marketing-dataset

Downloading bank-marketing-dataset.zip to /home/ec2-user/SageMaker/amazon-personalize
  0%|                                                | 0.00/142k [00:00<?, ?B/s]
100%|████████████████████████████████████████| 142k/142k [00:00<00:00, 20.9MB/s]


In [6]:
!unzip bank-marketing-dataset.zip

Archive:  bank-marketing-dataset.zip
  inflating: bank.csv                


# Prepare Customer Data


## Imports


In [48]:
# Imports
import boto3
import json
import numpy as np
import pandas as pd
import time

In [52]:
df=pd.read_csv("bank.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


### Add a unique customer ID to the data

In [72]:
df['USER_ID'] = np.arange(len(df))+30908
# cols = df.columns.tolist()
# cols = cols[-1:] + cols[:-1]

## max 5 user attributes, get rid of some columns not very useful 
df_custmer = df[['USER_ID','deposit','age', 'job','marital','education']]
df_custmer.to_csv('customer11k.csv')
df_custmer.head()

Unnamed: 0,USER_ID,deposit,age,job,marital,education
0,30908,yes,59,admin.,married,secondary
1,30909,yes,56,admin.,married,secondary
2,30910,yes,41,technician,married,secondary
3,30911,yes,55,services,married,secondary
4,30912,yes,54,admin.,married,tertiary


## Upload to S3

In [74]:
bucket ='personalize-custdata'      # replace with the name of your S3 bucket
filename ='customer11k.csv'
boto3.Session().resource('s3').Bucket(bucket).Object(filename).upload_file(filename)

## Create schema for customer data

In [73]:
# Configure the SDK to Personalize:
personalize = boto3.client('personalize')
personalize_runtime = boto3.client('personalize-runtime')

customer_schema = {
    "type": "record",
    "name": "Users",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {
            "name": "USER_ID",
            "type": "string"
        },
        {
            "name": "deposit",
            "type": "string"
        },
        {
            "name": "age",
            "type": "long"
        },
        {
            "name": "job",
            "type": "string"
        },
        {
            "name": "marital",
            "type": "string"
        },
        {
            "name": "education",
            "type": "string"
        }
    ],
    "version": "1.0"
}

create_schema_response = personalize.create_schema(
    name = "personalize-customer-schema",
    schema = json.dumps(customer_schema)
)

customer_schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))


{
  "schemaArn": "arn:aws:personalize:ap-southeast-1:248025046818:schema/personalize-customer-schema",
  "ResponseMetadata": {
    "RequestId": "9a926beb-c05c-4083-bbe9-c0979e7e000c",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sun, 06 Sep 2020 13:27:08 GMT",
      "x-amzn-requestid": "9a926beb-c05c-4083-bbe9-c0979e7e000c",
      "content-length": "98",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


# Create Interactions Data 

In [80]:
import json
import csv
from datetime import datetime
from datetime import timedelta
from datetime import time
from calendar import monthrange
from random import seed
from random import randint
import numpy as np
#import platform
# print(platform.python_version())


###################
## Declare constants 
###################
# customer ID start
starting_cust_id=30908

# customer ID end
ending_cust_id=30908+11000

# how many evemts we need?
max_interactions_sequence_count=8000

# starting date of the dataset
event_start_date = datetime(2019, 3, 3)
# datetime.datetime.utcfromtimestamp(0)
epoch_event_start_date = int(event_start_date.strftime('%s'))
# print(epoch_event_start_date)

event_time_range = 31536000 #seconds in a year. 

max_time_between_events = 1059200  
min_time_between_events = 3900 # 3 days. 


# set of possible interactions 
# Lets say there are 10 different types of events. 
# this array defines a few set of interaction sequence 
# we will use these fixed set of interaction sequence to generate synthetic data
array = np.arange(10)
newarray = np.repeat(array, 3)
np.random.shuffle(newarray)

interaction_sequence_length = 5
number_different_interaction_sequences=6
interaction_array = newarray.reshape(number_different_interaction_sequences,interaction_sequence_length)



#data column headers 
# USER_ID (string), ITEM_ID (string), TIMESTAMP (long), IMPRESSION


with open('interactions.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(["USER_ID", "ITEM_ID", "TIMESTAMP"])

    ## Loop for interaction sequences 
    interactions_sequence_count=0 
    while ( interactions_sequence_count <= max_interactions_sequence_count ):

        #pic a customer id randomly 
        customer_id = randint(starting_cust_id, ending_cust_id)

        #pic a event sequence randomly 
        sequence_id =  randint(0, number_different_interaction_sequences-1)

        # select a event sequence length 
        max_sequence_length = randint(0, interaction_sequence_length-1)

        sequence_length = 0
        time_lapse_between_event = epoch_event_start_date+event_time_range
        next_event_time =randint(epoch_event_start_date , time_lapse_between_event)
        
        # Loop for generating individual interaction events within a sequence
        while (sequence_length <= max_sequence_length):
            writer.writerow([customer_id, interaction_array[sequence_id][sequence_length], next_event_time])
            sequence_length+=1
            next_event_time += randint(min_time_between_events, max_time_between_events)

        interactions_sequence_count+=1

file.close()

interactions_df=pd.read_csv("interactions.csv")
print(interactions_df.describe())
interactions_df.head(20)

            USER_ID       ITEM_ID     TIMESTAMP
count  23969.000000  23969.000000  2.396900e+04
mean   36377.740123      4.724728  1.568092e+09
std     3173.550875      2.855511  9.203915e+06
min    30908.000000      0.000000  1.551572e+09
25%    33608.000000      3.000000  1.560049e+09
50%    36375.000000      5.000000  1.568023e+09
75%    39160.000000      7.000000  1.576135e+09
max    41908.000000      9.000000  1.585917e+09


Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP
0,35873,5,1578556273
1,35873,3,1578604177
2,35873,9,1579509607
3,31950,3,1580779928
4,31950,3,1581746466
5,31950,8,1582464100
6,31950,7,1582649990
7,38544,7,1569982368
8,39338,6,1561972882
9,39338,0,1562581877


## Upload the interactions data to S3

In [81]:
bucket ='personalize-custdata'      # replace with the name of your S3 bucket
filename ='interactions.csv'
boto3.Session().resource('s3').Bucket(bucket).Object(filename).upload_file(filename)

## Create a schema definition in Amazon Personalize

In [89]:
interactions_schema =  {
  "type": "record",
  "name": "Interactions",
  "namespace": "com.amazonaws.personalize.schema",
  "fields": [
      {
          "name": "USER_ID",
          "type": "string"
      },
      {
          "name": "ITEM_ID",
          "type": "string"
      },
      {
          "name": "TIMESTAMP",
          "type": "long"
      }
  ],
  "version": "1.0"
}

create_schema_response = personalize.create_schema(
    name = "personalize-intertactions-schema",
    schema = json.dumps(interactions_schema)
)

interactions_schema_arn = create_schema_response['schemaArn']

print(json.dumps(create_schema_response, indent=2))

ResourceAlreadyExistsException: An error occurred (ResourceAlreadyExistsException) when calling the CreateSchema operation: Another resource with Arn arn:aws:personalize:ap-southeast-1:248025046818:schema/personalize-intertactions-schema already exists.

## Createa a dataset group

In [86]:
create_dataset_group_response = personalize.create_dataset_group(
    name = "personalize-demo-dataset"
)

dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

{
  "datasetGroupArn": "arn:aws:personalize:ap-southeast-1:248025046818:dataset-group/personalize-demo-dataset",
  "ResponseMetadata": {
    "RequestId": "d1fe52be-da51-4299-acb2-f7a2b0de2889",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 07 Sep 2020 09:07:10 GMT",
      "x-amzn-requestid": "d1fe52be-da51-4299-acb2-f7a2b0de2889",
      "content-length": "108",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [88]:
import time
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn = dataset_group_arn
    )
    status = describe_dataset_group_response["datasetGroup"]["status"]
    print("DatasetGroup: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

DatasetGroup: ACTIVE


## Create datasets 

### DELTE THIS BLOCK LATER

In [98]:
personalize.list_schemas()
interactions_schema_arn = 'arn:aws:personalize:ap-southeast-1:248025046818:schema/personalize-intertactions-schema'
customer_schema_arn = 'arn:aws:personalize:ap-southeast-1:248025046818:schema/personalize-customer-schema'

In [99]:
dataset_type = "INTERACTIONS"
create_dataset_response = personalize.create_dataset(
    datasetType = dataset_type,
    datasetGroupArn = dataset_group_arn,
    schemaArn = interactions_schema_arn,
    name = "demo-dataset"
)

interactions_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))


{
  "datasetArn": "arn:aws:personalize:ap-southeast-1:248025046818:dataset/personalize-demo-dataset/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "2186a438-8dfb-4945-acae-6faba1477892",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 07 Sep 2020 09:26:42 GMT",
      "x-amzn-requestid": "2186a438-8dfb-4945-acae-6faba1477892",
      "content-length": "110",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [101]:
dataset_type = "USERS"
create_dataset_response = personalize.create_dataset(
    datasetType = dataset_type,
    datasetGroupArn = dataset_group_arn,
    schemaArn = customer_schema_arn,
    name = "demo-interection-dataset"
)

CUSTOMER_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))


{
  "datasetArn": "arn:aws:personalize:ap-southeast-1:248025046818:dataset/personalize-demo-dataset/USERS",
  "ResponseMetadata": {
    "RequestId": "dd9f135e-a947-40d2-b5af-d917691d2a23",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 07 Sep 2020 09:28:06 GMT",
      "x-amzn-requestid": "dd9f135e-a947-40d2-b5af-d917691d2a23",
      "content-length": "103",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


## Create Data Import Jobs

In [105]:
role_arn = 'arn:aws:iam::248025046818:role/AmazonPersonalizeRole'
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName = "interactions-dataset-import-job",
    datasetArn = interactions_dataset_arn,
    dataSource = {
        "dataLocation": "s3://{}/{}".format(bucket, 'interactions.csv')
        
    }, roleArn = role_arn
)

interactions_dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:ap-southeast-1:248025046818:dataset-import-job/interactions-dataset-import-job",
  "ResponseMetadata": {
    "RequestId": "572d5e7b-4ac0-4435-9628-faf7e2671412",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 07 Sep 2020 09:40:25 GMT",
      "x-amzn-requestid": "572d5e7b-4ac0-4435-9628-faf7e2671412",
      "content-length": "124",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [106]:
role_arn = 'arn:aws:iam::248025046818:role/AmazonPersonalizeRole'
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName = "user-dataset-import-job",
    datasetArn = CUSTOMER_dataset_arn,
    dataSource = {
        "dataLocation": "s3://{}/{}".format(bucket, 'customer11k.csv')
        
    }, roleArn = role_arn
)

users_dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:ap-southeast-1:248025046818:dataset-import-job/user-dataset-import-job",
  "ResponseMetadata": {
    "RequestId": "4708054d-7e9b-4923-8ae8-48b91da9c56c",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 07 Sep 2020 09:42:07 GMT",
      "x-amzn-requestid": "4708054d-7e9b-4923-8ae8-48b91da9c56c",
      "content-length": "116",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [113]:
personalize.list_dataset_import_jobs()
interactions_dataset_import_job_arn = 'arn:aws:personalize:ap-southeast-1:248025046818:dataset-import-job/interactions-dataset-import-job'
users_dataset_import_job_arn ='arn:aws:personalize:ap-southeast-1:248025046818:dataset-import-job/user-dataset-import-job'

In [114]:
status = None
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = users_dataset_import_job_arn
    )
    
    dataset_import_job = describe_dataset_import_job_response["datasetImportJob"]
    if "latestDatasetImportJobRun" not in dataset_import_job:
        status = dataset_import_job["status"]
        print("DatasetImportJob: {}".format(status))
    else:
        status = dataset_import_job["latestDatasetImportJobRun"]["status"]
        print("LatestDatasetImportJobRun: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

DatasetImportJob: CREATE FAILED


In [122]:
status = None
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = interactions_dataset_import_job_arn
    )
    
    dataset_import_job = describe_dataset_import_job_response["datasetImportJob"]
    print(dataset_import_job)
    if "latestDatasetImportJobRun" not in dataset_import_job:
        status = dataset_import_job["status"]
        print("DatasetImportJob: {}".format(status))
    else:
        status = dataset_import_job["latestDatasetImportJobRun"]["status"]
        print("LatestDatasetImportJobRun: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

{'jobName': 'interactions-dataset-import-job', 'datasetImportJobArn': 'arn:aws:personalize:ap-southeast-1:248025046818:dataset-import-job/interactions-dataset-import-job', 'datasetArn': 'arn:aws:personalize:ap-southeast-1:248025046818:dataset/personalize-demo-dataset/INTERACTIONS', 'dataSource': {'dataLocation': 's3://personalize-custdata/interactions.csv'}, 'roleArn': 'arn:aws:iam::248025046818:role/AmazonPersonalizeRole', 'status': 'CREATE FAILED', 'creationDateTime': datetime.datetime(2020, 9, 7, 9, 40, 26, 234000, tzinfo=tzlocal()), 'lastUpdatedDateTime': datetime.datetime(2020, 9, 7, 9, 40, 55, 68000, tzinfo=tzlocal()), 'failureReason': 'Insufficient privileges for accessing data in S3. Please look at https://docs.aws.amazon.com/personalize/latest/dg/getting-started.html#gs-upload-to-bucket and fix bucket policy on personalize-custdata.'}
DatasetImportJob: CREATE FAILED
