In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc

In [2]:
# Order-level driver table, restricted to rows whose `counter` exceeds 1.
orders = pd.read_csv('../data/driver/driver_order.csv')
orders = orders.loc[orders['counter'] > 1]

# Order/product lines; print the size before the join for comparison.
products = pd.read_csv('../data/driver/driver_order_products.csv')
print(products.shape)

# The inner join keeps only product lines whose order_id survived the filter
# above and attaches the order-level columns; `eval_set` is dropped afterwards.
products = (
    products
    .merge(orders, on='order_id', how='inner')
    .drop(columns=['eval_set'])
)
print(products.shape)

(34010012, 6)
(32616793, 12)


In [3]:
# Per-user aggregates over every product line in `products`.
# Each entry maps a source column to (aggregation, output feature name).
# Keeping the output name next to its source column means a feature can never
# be mislabelled if the aggregated columns come back in a different order
# (positional `columns = [...]` assignment silently relies on that order).
# String aliases are used where pandas has a built-in reduction; passing
# np.sum / np.median is deprecated in recent pandas and the aliases keep the
# same results.
user_agg_spec = {
    'reordered': ('sum', 'usr_sum_rdr'),
    # Kept as a callable: counts the rows whose `counter` is non-zero.
    'counter': (np.count_nonzero, 'usr_cnt_prd'),
    'order_id': ('nunique', 'usr_cnt_ord'),
    'product_id': ('nunique', 'usr_cds_prd'),
    'aisle_id': ('nunique', 'usr_cds_ais'),
    'department_id': ('nunique', 'usr_cds_dep'),
    'days_since_prior_order': ('median', 'usr_med_dysc'),
}

# `aggregate` and `features` keep their original names and contents' meaning
# in case other cells refer to them.
aggregate = {column: spec[0] for column, spec in user_agg_spec.items()}
features = [spec[1] for spec in user_agg_spec.values()]
feature_names = {column: spec[1] for column, spec in user_agg_spec.items()}

# One row per user_id; columns are user_id followed by the renamed features.
user_common = (
    products
    .groupby('user_id')
    .agg(aggregate)
    .rename(columns=feature_names)
    .reset_index()
)

In [4]:
# Average cart length per user: count the product lines in each order, then
# average those per-order counts over the user's orders.
lines_per_order = products.groupby(['user_id', 'order_id'])['product_id'].count()
cart_length = (
    lines_per_order
    .groupby(level='user_id')
    .mean()
    .reset_index()
    .rename(columns={'product_id': 'cartlen'})
)

In [5]:
# Average cart diversity per user: number of distinct aisles in each order,
# then the mean of that count over the user's orders.
# The built-in groupby `.nunique()` replaces `.apply(pd.Series.nunique)`: same
# values, but without calling a Python function once per (user, order) group.
cart_diverse = products.groupby(['user_id','order_id'])['aisle_id'].nunique().reset_index()
cart_diverse = cart_diverse.groupby(['user_id'])['aisle_id'].mean().reset_index()
cart_diverse = cart_diverse.rename(columns={'aisle_id':'cartdiv'})

In [6]:
# Mean of `reordered` taken in two steps: first within each order, then across
# a user's orders, so every order carries the same weight regardless of size.
reorder_rate_per_order = (
    products
    .groupby(['user_id', 'order_id'], as_index=False)['reordered']
    .mean()
)
avg_reorder = (
    reorder_rate_per_order
    .groupby('user_id', as_index=False)['reordered']
    .mean()
    .rename(columns={'reordered': 'usr_avg_rdr'})
)

In [7]:
# Per-user mean of `reordered` in the dependent_n_1 file.
# NOTE(review): the file name suggests this is the target from one step
# earlier (a lagged outcome) — confirm against the data preparation code.
lag_reorder = (
    pd.read_csv('../data/model/dependent/dependent_n_1.csv')
    .groupby('user_id')['reordered']
    .mean()
    .rename('usr_lag_rdr')
    .reset_index()
)

In [8]:
# Start from the user driver table and left-join each per-user feature frame
# on user_id, in the same order as the features were built. Left joins keep
# every row of the driver table even when a feature frame has no match.
user_profile = pd.read_csv('../data/driver/driver_user.csv')
for feature_frame in (user_common, cart_length, cart_diverse, avg_reorder, lag_reorder):
    user_profile = user_profile.merge(feature_frame, on='user_id', how='left')

In [9]:
# Persist the assembled user profile; the DataFrame index is not written.
user_profile_path = '../data/profile/user_profile.csv'
user_profile.to_csv(user_profile_path, index=False)

In [10]:
# Attach the user profile to the target rows; the inner join keeps only
# target rows whose user_id is present in the profile.
target = pd.read_csv('../data/model/dependent/dependent_n.csv').merge(
    user_profile, on='user_id', how='inner'
)

# Univariate screen of every profile column after the first two.
# The value printed under the label "auc" is 2*AUC - 1 expressed in percent,
# so 0 means no ranking power and a negative value means the feature ranks
# the target in the opposite direction.
for feat in user_profile.columns[2:]:
    fpr, tpr, thresholds = roc_curve(target['reordered'], target[feat])
    rescaled_auc = 100 * (2 * auc(fpr, tpr) - 1)
    print('feat:', feat, 'auc:', round(rescaled_auc, 2))

feat: usr_sum_rdr auc: -10.91
feat: usr_cnt_prd auc: -14.45
feat: usr_cnt_ord auc: -19.81
feat: usr_cds_prd auc: -19.95
feat: usr_cds_ais auc: -19.44
feat: usr_cds_dep auc: -18.54
feat: usr_med_dysc auc: 13.25
feat: cartlen auc: 7.58
feat: cartdiv auc: 7.06
feat: usr_avg_rdr auc: -0.94
feat: usr_lag_rdr auc: 27.71
