# Pizza recommendation on Amazon SageMaker with Factorization Machines

### Use this OLO dataset

In [65]:
!head -10 UpdatedOLOdata.csv

UserID,OrderDate,Month,Order ID,BPO,MenuCode,SizeCode,CouponCode,Price,Flag_CorePizza,Flag_PizzaMania,Flag_EdvReg,Flag_EDVMed,Flag_Side,Flag_Dessert,OrderDateInEpoch
6207089.068,4-Feb-18, 2 ,DPI66285_43135_174,289,PIZ0103,HT07,,69,0,1,0,0,0,0,1517702400
6207089.068,04-02-18, 2 ,DPI66285_43135_174,289,PIZ0104,HT07,,75,0,1,0,0,0,0,1517702400
6207089.068,04-02-18, 2 ,DPI66285_43135_174,289,VGPARCEL,PVEG01,,35,0,0,0,0,1,0,1517702400
6207089.068,08-02-18, 2 ,DPI66285_43139_158,165,PIZ0100,BHT07,,165,1,0,0,0,0,0,1518048000
6207089.068,18-02-18, 2 ,DPI66285_43149_76,165,PIZ0100,BHT07,,165,1,0,0,0,0,0,1518912000
6207089.068,23-03-18, 3 ,DPI66285_43182_20,165,PIZ0100,BHT07,,165,1,0,0,0,0,0,1521763200
6207089.068,15-04-18, 4 ,DPI66285_43205_421,303,NVPARCEL,PNVG01,,39,0,0,0,0,1,0,1523750400
6207089.068,15-04-18, 4 ,DPI66285_43205_421,303,PIZ0133,FP07,,120,0,1,0,0,0,0,1523750400
6207089.068,15-04-18, 4 ,DPI66285_43205_421,303,VGPARCEL,PVEG01,,35,0,0,0,0,1,0,1523750400


In [28]:
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_deserializer

import boto3, csv, io, json
import numpy as np
from scipy.sparse import lil_matrix

import pandas as pd
from sklearn.model_selection import train_test_split

import datetime
from dateutil.parser import parse

## Load data

Check how man

In [12]:
df = pd.read_csv("UpdatedOLOdata.csv", usecols=[0,1,5]) 
df = df[df.OrderDate == df.OrderDate]
df = df[df.OrderDate != '4-Feb-18']
movie_dict = dict([(y,x+1) for x,y in enumerate(sorted(set(df['MenuCode'])))])
user_dict = dict([(y,x+1) for x,y in enumerate(sorted(set(df['UserID'])))])
user_reverse_dict = dict([(x+1,y) for x,y in enumerate(sorted(set(df['UserID'])))])
movie_reverse_dict = dict([(x+1, y) for x,y in enumerate(sorted(set(df['MenuCode'])))])

#idx = df.index[df['OrderDate'] == 'nan'].tolist()
#idx = df[df['OrderDate'].isnull()]

print(df.isnull().any().any())

#df['OrderDate'] = [parse(str(x)) for x in df['OrderDate']]

#d['PIZ0100']
df['MenuCode'] = df['MenuCode'].apply(lambda x: int(movie_dict[str(x)]))
df['UserID'] = df['UserID'].apply(lambda x: int(user_dict[float(x)]))
df['DayOfWeek'] = df['OrderDate'].apply(lambda x: datetime.datetime.strptime(x, "%d-%m-%y").date().weekday())
df['WeekOfYear'] = df['OrderDate'].apply(lambda x: datetime.datetime.strptime(x, "%d-%m-%y").date().isocalendar()[1])
#df

df_g = df.groupby(['UserID', 'MenuCode', 'WeekOfYear']).size().reset_index(name='count')

df_g[df_g['count'] >2]

False


Unnamed: 0,UserID,MenuCode,WeekOfYear,count
94,8,36,14,3
95,8,36,15,3
96,8,36,16,3
97,8,36,17,4
98,8,36,18,3
127,8,115,19,3
312,15,36,10,3
318,15,83,12,4
416,19,36,23,3
430,20,13,15,3


In [13]:
numberOfUsers=len(df['UserID'].unique().tolist())
numberOfMenuItems=len(df['MenuCode'].unique().tolist())
numFeatures = numberOfUsers + numberOfMenuItems
print(numberOfUsers)
print(numberOfMenuItems)
print(len(df))

14813
115
379187


## Add Rating into data set
Rating Assumption:
If a customer has eaten once but not repeated a pizza (count <2) then she didnt like it.

In [14]:
#df['MenuCode'] = df['MenuCode'].apply(lambda x: int(movie_dict[str(x)]))
#df['UserID'] = df['UserID'].apply(lambda x: int(user_dict[float(x)]))

df_g = df.groupby(['UserID', 'MenuCode']).size().reset_index(name='count')

df_g[df_g['count'] >1]

df_g['rating'] = df_g['count'].apply(lambda x: 1 if x>1 else 0)

print('Rating 1 : {}'.format(df_g[df_g['rating']==1].count()))
print('Rating 0 : {}'.format(df_g[df_g['rating']==0].count()))

Rating 1 : UserID      27643
MenuCode    27643
count       27643
rating      27643
dtype: int64
Rating 0 : UserID      120024
MenuCode    120024
count       120024
rating      120024
dtype: int64


## Downsampling the  data to reduce Label 0

In [15]:
df = df_g.drop('count', axis=1)
print(df.shape)
df = df.drop(df.query('rating == 0').sample(frac=.5).index)
print(df.shape)
print('Rating 1 : {}'.format(df[df['rating']==1].count()))
print('Rating 0 : {}'.format(df[df['rating']==0].count()))

(147667, 3)
(87655, 3)
Rating 1 : UserID      27643
MenuCode    27643
rating      27643
dtype: int64
Rating 0 : UserID      60012
MenuCode    60012
rating      60012
dtype: int64


### Spilt into test and train

In [16]:
train, test = train_test_split(df, test_size=0.2)
numberOfRowsTrain = len(train)
numberOfRowsTest = len(test)
print(numberOfRowsTrain,numberOfRowsTest)
print(train[2:5])

(70124, 17531)
        UserID  MenuCode  rating
107162   10681       114       1
93218     9287        38       0
42784     4220        78       0


## One-hot encode the data
[user1,user2,...,user n, menu1, menu2, ..., menu n]

[1,0,0,...,0,0,1,0,...,0]

In [17]:
#Do OneHot Encoding
def loadDataset(dfname, lines, columns):
    # Features are one-hot encoded in a sparse matrix
    X = lil_matrix((lines, columns)).astype('float32')
    # Labels are stored in a vector
    Y = []
    line=0
    for userId, menuCode, rating in dfname.values:
        X[line,userId] = 1
        X[line,int(numberOfUsers)+int(menuCode)-1] = 1
        Y.append(rating)
        line=line+1
            
    Y=np.array(Y).astype('float32')
    #print (X,Y)
    return X,Y

In [18]:
X_train, Y_train = loadDataset(train,numberOfRowsTrain,numFeatures)
X_test, Y_test = loadDataset(test,numberOfRowsTest,numFeatures)

In [38]:
print(X_train.shape)
print(Y_train.shape)
assert X_train.shape == (numberOfRowsTrain, numFeatures)
assert Y_train.shape == (numberOfRowsTrain, )
non_zero_labels = np.count_nonzero(Y_train)
print("Training labels: %d zeros, %d ones" % (numberOfRowsTrain-non_zero_labels, non_zero_labels))

print(X_test.shape)
print(Y_test.shape)
assert X_test.shape  == (numberOfRowsTest, numFeatures)
assert Y_test.shape  == (numberOfRowsTest, )
zero_labels = np.count_nonzero(Y_test)
print("Test labels: %d zeros, %d ones" % (numberOfRowsTest-zero_labels, zero_labels))

(118133, 14928)
(118133,)
Training labels: 114747 zeros, 3386 ones
(29534, 14928)
(29534,)
Test labels: 28677 zeros, 857 ones


In [51]:
print(train.values[1])
print(X_train[1])
print(train.values[2])
print(X_train[2])

[9937    6]
  (0, 9937)	1.0
  (0, 14819)	1.0
[10018    11]
  (0, 10018)	1.0
  (0, 14824)	1.0


### Convert to protobuf and save to S3

In [19]:
bucket = 'ar-sm-bucket'
prefix = 'Jubilant'

train_key      = 'train.protobuf'
train_prefix   = '{}/{}'.format(prefix, 'train3')

test_key       = 'test.protobuf'
test_prefix    = '{}/{}'.format(prefix, 'test3')

output_prefix  = 's3://{}/{}/output'.format(bucket, prefix)
print(output_prefix)

s3://ar-sm-bucket/Jubilant/output


In [20]:
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket,obj)
    
train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key)    
test_data  = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key)    
  
print(train_data)
print(test_data)
print('Output: {}'.format(output_prefix))

s3://ar-sm-bucket/Jubilant/train3/train.protobuf
s3://ar-sm-bucket/Jubilant/test3/test.protobuf
Output: s3://ar-sm-bucket/Jubilant/output


### Run training job

In [21]:
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:latest'}

In [22]:
fm = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                   get_execution_role(), 
                                   train_instance_count=1, 
                                   train_instance_type='ml.c4.xlarge',
                                   output_path=output_prefix,
                                   sagemaker_session=sagemaker.Session())

fm.set_hyperparameters(feature_dim=numFeatures,
                      predictor_type='binary_classifier',
                      mini_batch_size=1000,
                      num_factors=64,
                      epochs=100)

fm.fit({'train': train_data, 'test': test_data})

INFO:sagemaker:Creating training-job with name: factorization-machines-2018-09-18-11-27-58-824


....................
[31mDocker entrypoint called with argument(s): train[0m
[31m[09/18/2018 11:31:09 INFO 139994573981504] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_tuning_objective_metric': u'', u'bias_wd': u'0.01', u'use_linear': u'true', u'bias_lr': u'0.1', u'mini_batch_size': u'1000', u'_use_full_symbolic': u'true', u'batch_metrics_publish_interval': u'500', u'bias_init_sigma': u'0.01', u'_num_gpus': u'auto', u'_data_format': u'record', u'factors_wd': u'0.00001', u'linear_wd': u'0.001', u'_kvstore': u'auto', u'_learning_rate': u'1.0', u'_optimizer': u'adam'}[0m
[31m[09/18/2018 11:31:09 I

[31m[09/18/2018 11:31:17 INFO 139994573981504] #quality_metric: host=algo-1, epoch=13, train binary_classification_accuracy <score>=0.706225352113[0m
[31m[09/18/2018 11:31:17 INFO 139994573981504] #quality_metric: host=algo-1, epoch=13, train binary_classification_cross_entropy <loss>=0.56127275021[0m
[31m[09/18/2018 11:31:17 INFO 139994573981504] #quality_metric: host=algo-1, epoch=13, train binary_f_1.000 <score>=0.113406443934[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 550.4038333892822, "sum": 550.4038333892822, "min": 550.4038333892822}}, "EndTime": 1537270277.878297, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1537270277.327464}
[0m
[31m[09/18/2018 11:31:17 INFO 139994573981504] #progress_metric: host=algo-1, completed 14 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 71, "sum": 71.0, "min": 71}, "Number of Batches Since Last Reset": 

[31m[09/18/2018 11:31:27 INFO 139994573981504] #quality_metric: host=algo-1, epoch=30, train binary_classification_accuracy <score>=0.776929577465[0m
[31m[09/18/2018 11:31:27 INFO 139994573981504] #quality_metric: host=algo-1, epoch=30, train binary_classification_cross_entropy <loss>=0.484592259367[0m
[31m[09/18/2018 11:31:27 INFO 139994573981504] #quality_metric: host=algo-1, epoch=30, train binary_f_1.000 <score>=0.454050327473[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 562.2560977935791, "sum": 562.2560977935791, "min": 562.2560977935791}}, "EndTime": 1537270287.708179, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1537270287.144431}
[0m
[31m[09/18/2018 11:31:27 INFO 139994573981504] #progress_metric: host=algo-1, completed 31 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 71, "sum": 71.0, "min": 71}, "Number of Batches Since Last Reset":

[31m[09/18/2018 11:31:37 INFO 139994573981504] #quality_metric: host=algo-1, epoch=47, train binary_classification_accuracy <score>=0.842281690141[0m
[31m[09/18/2018 11:31:37 INFO 139994573981504] #quality_metric: host=algo-1, epoch=47, train binary_classification_cross_entropy <loss>=0.408714198583[0m
[31m[09/18/2018 11:31:37 INFO 139994573981504] #quality_metric: host=algo-1, epoch=47, train binary_f_1.000 <score>=0.66757703497[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 525.3419876098633, "sum": 525.3419876098633, "min": 525.3419876098633}}, "EndTime": 1537270297.422551, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1537270296.896751}
[0m
[31m[09/18/2018 11:31:37 INFO 139994573981504] #progress_metric: host=algo-1, completed 48 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 71, "sum": 71.0, "min": 71}, "Number of Batches Since Last Reset": 

[31m[09/18/2018 11:31:47 INFO 139994573981504] #quality_metric: host=algo-1, epoch=65, train binary_classification_accuracy <score>=0.905957746479[0m
[31m[09/18/2018 11:31:47 INFO 139994573981504] #quality_metric: host=algo-1, epoch=65, train binary_classification_cross_entropy <loss>=0.338614915875[0m
[31m[09/18/2018 11:31:47 INFO 139994573981504] #quality_metric: host=algo-1, epoch=65, train binary_f_1.000 <score>=0.825232299437[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 585.2491855621338, "sum": 585.2491855621338, "min": 585.2491855621338}}, "EndTime": 1537270307.894712, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1537270307.308864}
[0m
[31m[09/18/2018 11:31:47 INFO 139994573981504] #progress_metric: host=algo-1, completed 66 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 71, "sum": 71.0, "min": 71}, "Number of Batches Since Last Reset":

[31m[09/18/2018 11:31:57 INFO 139994573981504] #quality_metric: host=algo-1, epoch=82, train binary_classification_accuracy <score>=0.939422535211[0m
[31m[09/18/2018 11:31:57 INFO 139994573981504] #quality_metric: host=algo-1, epoch=82, train binary_classification_cross_entropy <loss>=0.283636722189[0m
[31m[09/18/2018 11:31:57 INFO 139994573981504] #quality_metric: host=algo-1, epoch=82, train binary_f_1.000 <score>=0.893915102484[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 593.1200981140137, "sum": 593.1200981140137, "min": 593.1200981140137}}, "EndTime": 1537270317.851552, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1537270317.257984}
[0m
[31m[09/18/2018 11:31:57 INFO 139994573981504] #progress_metric: host=algo-1, completed 83 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 71, "sum": 71.0, "min": 71}, "Number of Batches Since Last Reset":

[31m[09/18/2018 11:32:07 INFO 139994573981504] #quality_metric: host=algo-1, epoch=99, train binary_classification_accuracy <score>=0.959169014085[0m
[31m[09/18/2018 11:32:07 INFO 139994573981504] #quality_metric: host=algo-1, epoch=99, train binary_classification_cross_entropy <loss>=0.238441539926[0m
[31m[09/18/2018 11:32:07 INFO 139994573981504] #quality_metric: host=algo-1, epoch=99, train binary_f_1.000 <score>=0.930733758631[0m
[31m[09/18/2018 11:32:07 INFO 139994573981504] #quality_metric: host=algo-1, train binary_classification_accuracy <score>=0.959169014085[0m
[31m[09/18/2018 11:32:07 INFO 139994573981504] #quality_metric: host=algo-1, train binary_classification_cross_entropy <loss>=0.238441539926[0m
[31m[09/18/2018 11:32:07 INFO 139994573981504] #quality_metric: host=algo-1, train binary_f_1.000 <score>=0.930733758631[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 599.0359783172607, "sum": 599.0359783172607, "min": 599.0359783172607}}, "EndTim

### Deploy model

In [23]:
fm_predictor = fm.deploy(instance_type='ml.c4.xlarge', initial_instance_count=1)

INFO:sagemaker:Creating model with name: factorization-machines-2018-09-18-11-36-03-018
INFO:sagemaker:Creating endpoint with name factorization-machines-2018-09-18-11-27-58-824


--------------------------------------------------------------!

### Run predictions

In [24]:
def fm_serializer(data):
    js = {'instances': []}
    for row in data:
        js['instances'].append({'features': row.tolist()})
    #print js
    return json.dumps(js)

fm_predictor.content_type = 'application/json'
fm_predictor.serializer = fm_serializer
fm_predictor.deserializer = json_deserializer

In [25]:
predictions = []
for array in np.array_split(X_test.toarray(), 550):
    result = fm_predictor.predict(array)
    predictions += [r['predicted_label'] for r in result['predictions']]

predictions = np.array(predictions)

Three experiments were run
1. One, with all data and considering if a cust bought an item more than thrice in an year then that's a positive feedback on that item. Results was highly biased towards outcome of 0 since the data is heavily scewed towards 0.
2. Second, with all data and considering if a cust bought an item twice or more in an year then that's a positive feedback on that item. Lowering the threshold in the data, increased the percentage of positive labels a bit the result improved a bit but still highly biased
3. The final experiment was by under-sampling the data (remmoving random 50% of the 0 labeled records) This gave even better results. 


Result of Experiement number 3

In [26]:
pd.crosstab(Y_test, predictions, rownames=['actuals'], colnames=['predictions'])

predictions,0.0,1.0
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,11299,756
1.0,4085,1391


Result of experiment number 2

In [112]:
pd.crosstab(Y_test, predictions, rownames=['actuals'], colnames=['predictions'])

predictions,0.0,1.0
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,23264,702
1.0,4594,974


Result of experiment number 1

In [88]:
pd.crosstab(Y_test, predictions, rownames=['actuals'], colnames=['predictions'])

predictions,0.0,1.0
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,28579,98
1.0,799,58


## Analysis
1. Transaction data is highly skewed against repeat sale. So making the model learn positive results will involve a lot of negative sampling.
2. We should have better way of gathering implicit metric for dislikes. One way is from click stream, if we capture that these the 5 pizzas shown to the user and this is the one she chose.
3. We should also do a content-based analysis if we capture the metadata  for different menu like bread_type,toppings, duration in oven etc. Then we can create similarity of ingredients and do a content-based recommendation
4. We are also doing Market Basket Analysis, to see whether there is any affinity of two types of Menu occuring together. So that if one is bought we can recommend the other. But the data doesn't show any such trend.