In [86]:
import numpy as np
import pandas as pd
import lightgbm as lgb

# Data Read

In [87]:
# Load the six CSV tables from the data/ directory into DataFrames.
products = pd.read_csv("data/products.csv")
aisles = pd.read_csv("data/aisles.csv")
departments = pd.read_csv("data/departments.csv")
orders = pd.read_csv("data/orders.csv")
# Order lines (one row per product in an order), split into "prior" and "train" files.
order_products_prior = pd.read_csv("data/order_products__prior.csv")
order_products_train = pd.read_csv("data/order_products__train.csv")

### orders.csv

In [88]:
# Preview the first rows of the orders table.
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


### order_products_prior.csv

In [89]:
# Preview the first rows of the prior order lines.
order_products_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


### products.csv

In [90]:
# Preview the first rows of the products table.
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


# Products에 새로운 features 추가

In [91]:
# Per-product purchase statistics derived from the prior order lines.
# The grouping is computed once and reused; the original re-grouped the whole
# frame separately for the index, the sizes and the sums.
reordered_by_product = order_products_prior.groupby('product_id')['reordered']
prods = pd.DataFrame(
    {
        'total_orders': reordered_by_product.size(),   # number of order lines per product
        'total_reorders': reordered_by_product.sum(),  # sum of the `reordered` column per product
    },
    columns=['total_orders', 'total_reorders'],
).reset_index()  # turn the product_id group key into the first column
# Share of a product's order lines that were reorders.
prods['reorder_rate'] = prods.total_reorders / prods.total_orders
prods.head()

Unnamed: 0,product_id,total_orders,total_reorders,reorder_rate
0,1,1852,1136,0.613391
1,2,90,12,0.133333
2,3,277,203,0.732852
3,4,329,147,0.446809
4,5,15,9,0.6


In [92]:
# Attach the per-product statistics to the products table.
# Statistic columns left over from a previous run of this cell are dropped
# first, so re-executing it does not create duplicated `_x` / `_y` columns.
# NOTE: the inner join keeps only products that have a row in `prods`.
stat_cols = [col for col in prods.columns if col != 'product_id']
products = (
    products
    .drop(columns=stat_cols, errors='ignore')
    .merge(prods, on='product_id', how='inner')
)
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,total_orders,total_reorders,reorder_rate
0,1,Chocolate Sandwich Cookies,61,19,1852,1136,0.613391
1,2,All-Seasons Salt,104,13,90,12,0.133333
2,3,Robust Golden Unsweetened Oolong Tea,94,7,277,203,0.732852
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,329,147,0.446809
4,5,Green Chile Anytime Sauce,5,13,15,9,0.6


In [94]:
# Row count of the products frame.
len(products)

49677

# [order_id기준] prior와 orders 통합

In [93]:
# Attach the order-level columns from `orders` to every prior order line,
# matching on order_id (inner join, the pandas default).
priors = pd.merge(order_products_prior, orders, on='order_id', how='inner')
priors.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


In [95]:
# Row count of the merged prior order lines.
len(priors)

32434489

# [user_id기준] 미완성

In [96]:
# Per-user statistics computed from the orders table (indexed by user_id).
orders_by_user = orders.groupby('user_id')
usr = pd.DataFrame(
    {
        # Mean of days_since_prior_order over the user's orders.
        'average_days_between_orders': orders_by_user['days_since_prior_order'].mean().astype(np.float32),
        # Number of rows the user has in `orders`.
        'nb_orders': orders_by_user.size().astype(np.int16),
    },
    columns=['average_days_between_orders', 'nb_orders'],
)


In [104]:
# Show the first 30 rows of the orders table.
orders.head(30)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


In [97]:
# Preview a bounded number of rows instead of rendering the whole frame.
usr.head(10)

Unnamed: 0_level_0,average_days_between_orders,nb_orders
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,19.000000,11
2,16.285715,15
3,12.000000,13
4,17.000000,6
5,11.500000,5
6,13.333333,4
7,10.450000,21
8,23.333334,4
9,22.000000,4
10,21.799999,6


In [106]:
# Preview the first rows of the merged prior order lines.
priors.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


In [98]:
# Per-user basket statistics computed from the merged prior order lines
# (indexed by user_id).
priors_by_user = priors.groupby('user_id')
users = pd.DataFrame(
    {
        # Number of prior order lines for the user.
        'total_items': priors_by_user.size().astype(np.int16),
        # Set of every product_id found in the user's prior order lines.
        'all_products': priors_by_user['product_id'].apply(set),
    },
    columns=['total_items', 'all_products'],
)
# Size of that set, i.e. the number of distinct products.
users['total_distinct_items'] = users['all_products'].map(len).astype(np.int16)


In [99]:
# Preview a bounded number of rows instead of rendering the whole frame.
users.head(10)

Unnamed: 0_level_0,total_items,all_products,total_distinct_items
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23
6,14,"{40992, 27521, 20323, 48679, 8424, 45007, 2190...",12
7,206,"{11520, 35333, 519, 10504, 47623, 45066, 13198...",68
8,49,"{11136, 8193, 17794, 26882, 39812, 24838, 651,...",36
9,76,"{8834, 38277, 33787, 5002, 11790, 38159, 7952,...",58
10,143,"{36865, 20995, 13829, 43014, 11782, 18441, 476...",94


In [100]:
# Add the order-level user statistics from `usr`; both frames are indexed by user_id.
# Columns coming from `usr` are dropped first so that re-running this cell does
# not fail with an overlapping-columns error.
users = users.drop(columns=list(usr.columns), errors='ignore').join(usr)
# Preview a bounded number of rows instead of rendering the whole frame.
users.head(10)

Unnamed: 0_level_0,total_items,all_products,total_distinct_items,average_days_between_orders,nb_orders
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.000000,11
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.285715,15
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.000000,13
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.000000,6
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.500000,5
6,14,"{40992, 27521, 20323, 48679, 8424, 45007, 2190...",12,13.333333,4
7,206,"{11520, 35333, 519, 10504, 47623, 45066, 13198...",68,10.450000,21
8,49,"{11136, 8193, 17794, 26882, 39812, 24838, 651,...",36,23.333334,4
9,76,"{8834, 38277, 33787, 5002, 11790, 38159, 7952,...",58,22.000000,4
10,143,"{36865, 20995, 13829, 43014, 11782, 18441, 476...",94,21.799999,6


In [101]:
# Average number of items per order for each user.
# NOTE(review): total_items is counted from `priors`, while nb_orders counts every
# row the user has in `orders`; if `orders` also holds orders without prior lines,
# they inflate the denominator — confirm this is intended.
users['average_basket'] = users['total_items'].div(users['nb_orders']).astype(np.float32)
print('user f', users.shape)

user f (206209, 6)


In [107]:
# Preview a bounded number of rows instead of rendering the whole frame.
users.head(10)

Unnamed: 0_level_0,total_items,all_products,total_distinct_items,average_days_between_orders,nb_orders,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.000000,11,5.363636
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.285715,15,13.000000
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.000000,13,6.769231
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.000000,6,3.000000
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.500000,5,7.400000
6,14,"{40992, 27521, 20323, 48679, 8424, 45007, 2190...",12,13.333333,4,3.500000
7,206,"{11520, 35333, 519, 10504, 47623, 45066, 13198...",68,10.450000,21,9.809524
8,49,"{11136, 8193, 17794, 26882, 39812, 24838, 651,...",36,23.333334,4,12.250000
9,76,"{8834, 38277, 33787, 5002, 11790, 38159, 7952,...",58,22.000000,4,19.000000
10,143,"{36865, 20995, 13829, 43014, 11782, 18441, 476...",94,21.799999,6,23.833334


In [108]:
# Composite (user, product) key: user_id * 100000 + product_id.
# NOTE(review): the key is unambiguous only while every product_id is below
# 100000 — confirm against the products table.
priors['user_product'] = priors.product_id + priors.user_id * 100000
# Preview a bounded number of rows instead of rendering the whole frame.
priors.head(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,user_product
0,2,33120,1,1,202279,prior,3,5,9,8.0,20227933120
1,2,28985,2,1,202279,prior,3,5,9,8.0,20227928985
2,2,9327,3,0,202279,prior,3,5,9,8.0,20227909327
3,2,45918,4,1,202279,prior,3,5,9,8.0,20227945918
4,2,30035,5,0,202279,prior,3,5,9,8.0,20227930035
5,2,17794,6,1,202279,prior,3,5,9,8.0,20227917794
6,2,40141,7,1,202279,prior,3,5,9,8.0,20227940141
7,2,1819,8,1,202279,prior,3,5,9,8.0,20227901819
8,2,43668,9,0,202279,prior,3,5,9,8.0,20227943668
9,3,33754,1,1,205970,prior,16,5,17,12.0,20597033754


In [109]:
# Number of distinct (user, product) keys in the prior order lines.
priors['user_product'].nunique()

13307953

In [115]:
# Print the composite key of the first row only (nothing is printed for an empty frame).
for first_row in priors.head(1).itertuples():
    print(first_row.user_product)

20227933120


In [125]:
def _order_product_pairs(train_products):
    """Return the set of (order_id, product_id) pairs contained in ``train_products``."""
    index = train_products.index
    if isinstance(index, pd.MultiIndex) and list(index.names) == ['order_id', 'product_id']:
        # The frame has already been indexed by (order_id, product_id).
        return set(index.tolist())
    return set(zip(train_products['order_id'].tolist(),
                   train_products['product_id'].tolist()))


def features(selected_orders, labels_given=False, user_features=None, train_products=None):
    """Build the (order, candidate product) table for the given orders.

    For every row of ``selected_orders`` the candidates are all products stored
    in the ``all_products`` set of that row's user.

    Parameters
    ----------
    selected_orders : DataFrame
        Must provide ``order_id`` and ``user_id`` columns.
    labels_given : bool, default False
        When True, also compute a 0/1 label per candidate row: 1 if the
        (order_id, product_id) pair is present in ``train_products``.
    user_features : DataFrame, optional
        Frame indexed by user_id with an ``all_products`` column of sets.
        Defaults to the notebook-level ``users`` frame.
    train_products : DataFrame, optional
        Order lines used for labelling, with ``order_id`` and ``product_id``
        columns (or a MultiIndex with those names). Defaults to the
        notebook-level ``order_products_train`` frame. Only read when
        ``labels_given`` is True.

    Returns
    -------
    (DataFrame, ndarray)
        ``df`` with int32 columns ``order_id`` and ``product_id``, and an int8
        ``labels`` array aligned with ``df`` rows (empty when ``labels_given``
        is False).

    Notes
    -----
    The previous version tested ``(order_id, product) in order_products_train.index``.
    That frame is loaded with the default integer index, so the tuple membership
    test was always False and every label came out as 0. Membership is now
    checked against an explicit set of (order_id, product_id) pairs.
    """
    if user_features is None:
        user_features = users

    purchased_pairs = None
    if labels_given:
        if train_products is None:
            train_products = order_products_train
        purchased_pairs = _order_product_pairs(train_products)

    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    all_products = user_features['all_products']
    for i, row in enumerate(selected_orders.itertuples(), start=1):
        if i % 10000 == 0:
            print('order row', i)
        order_id = row.order_id
        # Materialise the set once so products and labels share the same order.
        candidates = list(all_products.loc[row.user_id])
        product_list += candidates
        order_list += [order_id] * len(candidates)
        if labels_given:
            labels += [(order_id, product) in purchased_pairs for product in candidates]

    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)

    return (df, labels)

In [126]:
# Build the candidate table and labels for the orders whose eval_set is 'train'.
df, labels = features(orders.loc[orders['eval_set'] == 'train'], labels_given=True)

build candidate list
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000


In [122]:
# Preview a bounded number of candidate rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,order_id,product_id
0,1187899,17122
1,1187899,196
2,1187899,26405
3,1187899,46149
4,1187899,14084
5,1187899,13032
6,1187899,26088
7,1187899,39657
8,1187899,12427
9,1187899,25133


In [134]:
# Sanity check: number of positive labels.
# The previous loop printed an undefined name `a`, which raises NameError as
# soon as a non-zero label is encountered; a single count also avoids flooding
# the output with one line per positive.
np.count_nonzero(labels)