In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc

In [2]:
# Build `data`: one row per ordered product, annotated with the owning user,
# the order's `counter`, and `cum_days` (running per-user sum of order gaps).
orders = pd.read_csv('../data/driver/driver_order.csv')
products = pd.read_csv('../data/driver/driver_order_products.csv')

# Take an explicit copy: assigning a new column onto a bare column-subset of
# another frame is the pattern that triggers pandas' SettingWithCopyWarning.
orders = orders[['user_id','order_id','counter','days_since_prior_order']].copy()

# Each row receives the `days_since_prior_order` of the row that precedes it
# within the same user. This depends on the row order of the CSV.
# NOTE(review): presumably the file is sorted by user_id and counter - confirm.
orders['days'] = orders.groupby('user_id')['days_since_prior_order'].shift(1)

# Keep only orders with counter > 1, then accumulate the shifted gaps per user.
orders = orders[orders['counter'] > 1].drop(columns='days_since_prior_order')
orders['cum_days'] = orders.groupby('user_id')['days'].cumsum()
orders = orders.drop(columns='days')

# merge() defaults to an inner join, so product rows belonging to orders that
# were filtered out above do not appear in `data`.
data = products.merge(orders, on=['order_id'])

In [3]:
# Preview the first five rows of the merged product/order frame.
data.head(n=5)

Unnamed: 0,order_id,product_id,aisle_id,department_id,add_to_cart_order,reordered,user_id,counter,cum_days
0,2,33120,86,16,1,1,202279,7,153.0
1,2,28985,83,4,2,1,202279,7,153.0
2,2,9327,104,13,3,0,202279,7,153.0
3,2,45918,19,13,4,1,202279,7,153.0
4,2,30035,17,13,5,0,202279,7,153.0


### product : median days between consecutive orders

In [4]:
# Per-product median number of days between a user's consecutive purchases of
# that product. Result: `product_time` with columns
# ['product_id', 'prd_med_dydiff'].
purchase_days = data[['user_id','product_id','cum_days']].sort_values(
    by=['user_id','product_id','cum_days'], ascending=[True,True,False]
)

# Rows are ordered by descending cum_days inside each (user, product) pair, so
# shift(1) yields the next-larger cum_days at which the same user bought the
# same product; the first row of every pair has no predecessor and gets NaN.
shift_days = purchase_days.groupby(['user_id','product_id'])['cum_days'].shift(1)

# assign() returns a new frame, so no column is ever written onto a filtered
# slice (the pattern that raises SettingWithCopyWarning).
purchase_gaps = purchase_days.assign(
    shift_days=shift_days,
    days_diff=shift_days - purchase_days['cum_days'],
)
purchase_gaps = purchase_gaps[purchase_gaps['shift_days'].notnull()]

# The built-in group median skips missing values, consistent with the
# NaN-skipping median used for prd_med_orddiff, and avoids a Python-level
# apply() per product.
product_time = purchase_gaps.groupby('product_id')['days_diff'].median().reset_index()
product_time.columns = ['product_id','prd_med_dydiff']
product_time['prd_med_dydiff'].describe()

count    45306.000000
mean        26.857028
std         23.224859
min          0.000000
25%         14.000000
50%         20.500000
75%         30.000000
max        356.000000
Name: prd_med_dydiff, dtype: float64

### product : median # of orders between consecutive purchases

In [5]:
# Per-product median gap, measured in `counter` steps, between a user's
# consecutive purchases of that product. Result: `product_order` with columns
# ['product_id', 'prd_med_orddiff'].
order_positions = data[['user_id','product_id','counter']].sort_values(
    by=['user_id','product_id','counter']
)

# Within each (user, product) pair rows are in ascending counter order, so
# shift(1) is the previous counter at which the pair occurred; the first row
# of every pair has no predecessor and gets NaN.
shift_counter = order_positions.groupby(['user_id','product_id'])['counter'].shift(1)

# assign() returns a new frame, so no column is ever written onto a filtered
# slice (the pattern that raises SettingWithCopyWarning).
order_gaps = order_positions.assign(
    shift_counter=shift_counter,
    counter_diff=order_positions['counter'] - shift_counter,
)
order_gaps = order_gaps[order_gaps['shift_counter'].notnull()]

# The built-in group median skips missing values just like np.nanmedian, but
# without a Python-level apply() per product.
product_order = order_gaps.groupby('product_id')['counter_diff'].median().reset_index()
product_order.columns = ['product_id','prd_med_orddiff']
product_order['prd_med_orddiff'].describe()

count    45306.000000
mean         2.969397
std          3.319074
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         94.000000
Name: prd_med_orddiff, dtype: float64

In [7]:
# Assemble the product profile: attach both median-gap features to the product
# table, discard the aisle/department ids, and persist the result.
profile = (
    pd.read_csv('../data/driver/driver_product.csv')
    # Left joins keep every product, including those with no computed gap.
    .merge(product_time, on='product_id', how='left')
    .merge(product_order, on='product_id', how='left')
    .drop(columns=['department_id','aisle_id'])
    # Every remaining missing value (in any column) is replaced by 0.
    .fillna(0.)
)
profile.to_csv('../data/profile/product_time_profile.csv', index=False)

In [8]:
# Univariate screen: score each profile feature on its own against the
# `reordered` label.
target = pd.read_csv('../data/model/dependent/dependent_n_1.csv').merge(
    profile, on='product_id', how='inner'
)

# columns[1:] skips the first profile column.
# NOTE(review): presumably that first column is product_id - confirm.
for feat in profile.columns[1:]:
    fpr, tpr, thresholds = roc_curve(target['reordered'], target[feat])
    # The printed number is 100 * (2*AUC - 1), not the raw AUC; it is negative
    # whenever the raw AUC is below 0.5.
    gini = 2 * auc(fpr, tpr) - 1
    print('feat:', feat, 'auc:', round(100 * gini, 2))

feat: prd_med_dydiff auc: -23.9
feat: prd_med_orddiff auc: -22.31
