In [43]:
#Model training with XGboost
!pip install xgboost

import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer


# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import roc_auc_score

def train(dataFrame, y, max_depth, eta, num_boost_round=10, nthread=8):
    """Vectorize ``dataFrame`` and fit an XGBoost regressor on it.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Feature columns. Rows are converted to dicts and passed through a
        freshly fitted ``DictVectorizer`` (string-valued fields are one-hot
        encoded by the vectorizer).
    y : array-like
        Regression target, passed to ``xgb.DMatrix`` as ``label``.
    max_depth : int
        XGBoost ``max_depth`` parameter.
    eta : float
        XGBoost ``eta`` (learning rate) parameter.
    num_boost_round : int, optional
        Number of boosting rounds (default 10, the previously hard-coded value).
    nthread : int, optional
        XGBoost ``nthread`` parameter (default 8, the previously hard-coded value).

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted ``DictVectorizer`` and the trained booster.
        Both are needed by ``predict``.
    """
    # One-hot encoding via DictVectorizer
    records = dataFrame.to_dict(orient="records")
    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform(records)

    # `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # use its replacement and only fall back on old installations.
    if hasattr(dv, "get_feature_names_out"):
        features = [str(name) for name in dv.get_feature_names_out()]
    else:
        features = list(dv.get_feature_names())
    dtrain = xgb.DMatrix(X, label=y, feature_names=features)

    # train
    xgb_params = {
        'eta': eta,
        'max_depth': max_depth,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': nthread,
        'seed': 1,
        'verbosity': 0,
    }
    model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_round)
    return dv, model


def predict(dataFrame, dv, model):
    """Predict with a trained booster using an already fitted vectorizer.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Feature columns; rows are converted to dicts and transformed with ``dv``.
    dv : DictVectorizer
        Vectorizer returned by ``train`` (it is only applied here, not refitted).
    model : xgboost booster
        Model returned by ``train``.

    Returns
    -------
    tuple
        ``(y_pred, X)``: the model predictions and the vectorized feature
        matrix they were computed from.
    """
    records = dataFrame.to_dict(orient="records")
    X = dv.transform(records)

    # `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # use its replacement and only fall back on old installations.
    if hasattr(dv, "get_feature_names_out"):
        features = [str(name) for name in dv.get_feature_names_out()]
    else:
        features = list(dv.get_feature_names())

    dval = xgb.DMatrix(X, feature_names=features)
    y_pred = model.predict(dval)
    return y_pred, X





In [65]:
import numpy as np
import pandas as pd

# Toy data set: one row per order, keyed by restaurant.
# Restaurant "1001" has three orders; "1002" and "1003" have one each.
df = pd.DataFrame({
    'restaurantId': ["1001", "1001", "1001", "1002", "1003"],
    'menu_items': [3, 2, 1, 1, 2],
    'bill': [50, 30, 10, 15, 25],
})

df


Unnamed: 0,restaurantId,menu_items,bill
0,1001,3,50
1,1001,2,30
2,1001,1,10
3,1002,1,15
4,1003,2,25


In [96]:
# Training after one-hot encoding restaurantId
# (restaurantId is a string column, so the DictVectorizer inside `train` encodes it.)
df_train = df.loc[:, ['restaurantId', 'menu_items']]
y_train = df.loc[:, ['bill']]

display("Train", df_train)
display("Y", y_train)

# Fit on the full toy frame, then predict on the same rows.
dv, model = train(dataFrame=df_train, y=y_train, max_depth=6, eta=1)
y_pred_val, X_val = predict(dataFrame=df_train, dv=dv, model=model)

# Show actual vs. predicted bill side by side.
df.assign(predicted_bill=np.round(y_pred_val).astype('int'))

'Train'

Unnamed: 0,restaurantId,menu_items
0,1001,3
1,1001,2
2,1001,1
3,1002,1
4,1003,2


'Y'

Unnamed: 0,bill
0,50
1,30
2,10
3,15
4,25


Unnamed: 0,restaurantId,menu_items,bill,predicted_bill
0,1001,3,50,50
1,1001,2,30,30
2,1001,1,10,10
3,1002,1,15,15
4,1003,2,25,25


In [95]:
# Training without restaurantId: menu_items is the only feature
df_train = df.loc[:, ['menu_items']]
y_train = df.loc[:, ['bill']]

display("Train", df_train)
display("Y", y_train)

# Fit on the full toy frame, then predict on the same rows.
dv, model = train(dataFrame=df_train, y=y_train, max_depth=6, eta=1)
y_pred_val, X_val = predict(dataFrame=df_train, dv=dv, model=model)

# Show actual vs. predicted bill side by side.
df.assign(predicted_bill=np.round(y_pred_val).astype('int'))

'Train'

Unnamed: 0,menu_items
0,3
1,2
2,1
3,1
4,2


'Y'

Unnamed: 0,bill
0,50
1,30
2,10
3,15
4,25


Unnamed: 0,restaurantId,menu_items,bill,predicted_bill
0,1001,3,50,50
1,1001,2,30,28
2,1001,1,10,12
3,1002,1,15,12
4,1003,2,25,28


In [97]:
#Target encoding formula
def get_target_encoding(df, field, target_field):
    """Return the mean of ``target_field`` for every distinct value of ``field``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    field : str
        Name of the categorical column to encode.
    target_field : str
        Name of the numeric column whose per-category mean is the encoding.

    Returns
    -------
    dict
        ``{category value: mean of target_field over rows with that value}``,
        with keys in order of first appearance in ``df``. Rows whose ``field``
        is missing (NaN) do not contribute a key.
    """
    # One grouped pass instead of re-filtering the whole frame per category.
    # sort=False keeps first-appearance order; observed=True ignores unused
    # categories when `field` is a pandas categorical.
    category_means = df.groupby(field, sort=False, observed=True)[target_field].mean()
    return category_means.to_dict()

In [101]:
# Training after target encoding restaurantId
# Replace the restaurant id by the mean bill observed for that restaurant.
targets = get_target_encoding(df, 'restaurantId', 'bill')
df2 = df.assign(mean_bill=df['restaurantId'].map(targets))
display(targets)

# Features are the item count and the encoded restaurant; the target is the bill.
df_train2 = df2.loc[:, ['menu_items', 'mean_bill']]
y_train2 = df2.loc[:, ['bill']]

display("df", df2)
display("Train", df_train2)
display("Y", y_train2)

# Fit on the full toy frame, then predict on the same rows.
dv2, model2 = train(dataFrame=df_train2, y=y_train2, max_depth=6, eta=1)
y_pred_val2, X_val2 = predict(dataFrame=df_train2, dv=dv2, model=model2)

# Show actual vs. predicted bill side by side.
df2.assign(predicted_bill=np.round(y_pred_val2).astype('int'))

{'1001': 30.0, '1002': 15.0, '1003': 25.0}

'df'

Unnamed: 0,restaurantId,menu_items,bill,mean_bill
0,1001,3,50,30.0
1,1001,2,30,30.0
2,1001,1,10,30.0
3,1002,1,15,15.0
4,1003,2,25,25.0


'Train'

Unnamed: 0,menu_items,mean_bill
0,3,30.0
1,2,30.0
2,1,30.0
3,1,15.0
4,2,25.0


'Y'

Unnamed: 0,bill
0,50
1,30
2,10
3,15
4,25


Unnamed: 0,restaurantId,menu_items,bill,mean_bill,predicted_bill
0,1001,3,50,30.0,50
1,1001,2,30,30.0,30
2,1001,1,10,30.0,10
3,1002,1,15,15.0,15
4,1003,2,25,25.0,25


In [100]:
# Training after target encoding restaurantId, with another restaurant having the same mean_bill
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# replacement and returns a new frame, so `df` itself is left untouched.
df3 = pd.concat(
    [df, pd.DataFrame([{'restaurantId': "1004", 'menu_items': 1, 'bill': 30}])],
    ignore_index=True,
)


# Replace the restaurant id by the mean bill observed for that restaurant.
targets = get_target_encoding(df3, 'restaurantId', 'bill')
df3['mean_bill'] = df3['restaurantId'].apply(lambda x: targets[x])
display(targets)


df_train3 = df3[['menu_items', 'mean_bill']]
y_train3 = df3[['bill']]

display("df3", df3)
display("Train3", df_train3)
display("Y3", y_train3)


# Restaurants "1001" and "1004" now share mean_bill == 30.0, so rows with the
# same menu_items from those two restaurants have identical feature vectors.
dv3, model3 = train(df_train3, y_train3, 6, 1)
y_pred_val3, X_val3 = predict(df_train3, dv3, model3)


df3.assign(predicted_bill = np.round(y_pred_val3).astype('int'))

{'1001': 30.0, '1002': 15.0, '1003': 25.0, '1004': 30.0}

'df3'

Unnamed: 0,restaurantId,menu_items,bill,mean_bill
0,1001,3,50,30.0
1,1001,2,30,30.0
2,1001,1,10,30.0
3,1002,1,15,15.0
4,1003,2,25,25.0
5,1004,1,30,30.0


'Train3'

Unnamed: 0,menu_items,mean_bill
0,3,30.0
1,2,30.0
2,1,30.0
3,1,15.0
4,2,25.0
5,1,30.0


'Y3'

Unnamed: 0,bill
0,50
1,30
2,10
3,15
4,25
5,30


Unnamed: 0,restaurantId,menu_items,bill,mean_bill,predicted_bill
0,1001,3,50,30.0,50
1,1001,2,30,30.0,30
2,1001,1,10,30.0,20
3,1002,1,15,15.0,15
4,1003,2,25,25.0,25
5,1004,1,30,30.0,20


In [144]:
# Training after target encoding restaurantId, with another restaurant whose mean_bill is very close
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# replacement. Both extra rows are added in a single concat.
df4 = pd.concat(
    [
        df,
        pd.DataFrame([
            {'restaurantId': "1004", 'menu_items': 1, 'bill': 31.1},
            {'restaurantId': "1004", 'menu_items': 1, 'bill': 29},
        ]),
    ],
    ignore_index=True,
)


# Replace the restaurant id by the mean bill observed for that restaurant.
targets = get_target_encoding(df4, 'restaurantId', 'bill')
df4['mean_bill'] = df4['restaurantId'].apply(lambda x: targets[x])
display(targets)


df_train4 = df4[['menu_items', 'mean_bill']]
y_train4 = df4[['bill']]

display("df4", df4)
display("Train4", df_train4)
display("Y4", y_train4)


dv4, model4 = train(df_train4, y_train4, 6, 1)
y_pred_val4, X_val4 = predict(df_train4, dv4, model4)


df4.assign(predicted_bill = np.round(y_pred_val4).astype('int'))

# A slight difference in `mean_bill` keeps the restaurants distinguishable and the
# model accurate, but this depends on the hyperparameters of the model.

{'1001': 30.0, '1002': 15.0, '1003': 25.0, '1004': 30.05}

Unnamed: 0,restaurantId,menu_items,bill,mean_bill,predicted_bill
0,1001,3,50.0,30.0,50
1,1001,2,30.0,30.0,30
2,1001,1,10.0,30.0,10
3,1002,1,15.0,15.0,15
4,1003,2,25.0,25.0,25
5,1004,1,31.1,30.05,30
6,1004,1,29.0,30.05,30


In [146]:
# Training after target encoding restaurantId, with another restaurant having the same mean_bill,
# plus a mean_bill_deviation feature
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# replacement. All extra rows are added in a single concat.
df5 = pd.concat(
    [
        df,
        pd.DataFrame([
            {'restaurantId': "1004", 'menu_items': 1, 'bill': 30},
            {'restaurantId': "1004", 'menu_items': 1, 'bill': 31},
            {'restaurantId': "1004", 'menu_items': 1, 'bill': 29},
        ]),
    ],
    ignore_index=True,
)


# Replace the restaurant id by the mean bill observed for that restaurant.
targets = get_target_encoding(df5, 'restaurantId', 'bill')
df5['mean_bill'] = df5['restaurantId'].apply(lambda x: targets[x])
# NOTE(review): this feature is computed from the target itself, so
# mean_bill + mean_bill_deviation == bill for every row (target leakage).
# It cannot be computed for an order whose bill is not yet known.
df5['mean_bill_deviation'] = df5['bill'] - df5['mean_bill']
display(targets)


df_train5 = df5[['menu_items', 'mean_bill', 'mean_bill_deviation']]
y_train5 = df5[['bill']]

display("df5", df5)
display("Train5", df_train5)
display("Y5", y_train5)


dv5, model5 = train(df_train5, y_train5, 6, 1)
y_pred_val5, X_val5 = predict(df_train5, dv5, model5)


df5.assign(predicted_bill = np.round(y_pred_val5).astype('int'))

# Using both `mean_bill` and `mean_bill_deviation` keeps the model accurate on these
# rows, but only because the two features together already determine `bill`.

{'1001': 30.0, '1002': 15.0, '1003': 25.0, '1004': 30.0}

'df5'

Unnamed: 0,restaurantId,menu_items,bill,mean_bill,mean_bill_deviation
0,1001,3,50,30.0,20.0
1,1001,2,30,30.0,0.0
2,1001,1,10,30.0,-20.0
3,1002,1,15,15.0,0.0
4,1003,2,25,25.0,0.0
5,1004,1,30,30.0,0.0
6,1004,1,31,30.0,1.0
7,1004,1,29,30.0,-1.0


'Train5'

Unnamed: 0,menu_items,mean_bill,mean_bill_deviation
0,3,30.0,20.0
1,2,30.0,0.0
2,1,30.0,-20.0
3,1,15.0,0.0
4,2,25.0,0.0
5,1,30.0,0.0
6,1,30.0,1.0
7,1,30.0,-1.0


'Y5'

Unnamed: 0,bill
0,50
1,30
2,10
3,15
4,25
5,30
6,31
7,29


Unnamed: 0,restaurantId,menu_items,bill,mean_bill,mean_bill_deviation,predicted_bill
0,1001,3,50,30.0,20.0,50
1,1001,2,30,30.0,0.0,30
2,1001,1,10,30.0,-20.0,10
3,1002,1,15,15.0,0.0,15
4,1003,2,25,25.0,0.0,25
5,1004,1,30,30.0,0.0,30
6,1004,1,31,30.0,1.0,31
7,1004,1,29,30.0,-1.0,29
