In [3]:
import numpy as np
import pandas as pd

In [4]:
# Read the three raw tables (orders, prior order lines, train order lines)
# from the local data/ directory.
orders, priors, train = map(
    pd.read_csv,
    (
        "data/orders.csv",
        "data/order_products__prior.csv",
        "data/order_products__train.csv",
    ),
)

# Preprocessing

In [5]:
# Index orders by order_id (keeping the column) so it can be joined onto priors.
orders = orders.set_index('order_id', drop=False)
# Attach the order attributes to every prior order line. The key column coming
# from orders collides with priors' own and arrives as 'order_id_'; drop it.
priors = priors.join(orders, on='order_id', rsuffix='_').drop('order_id_', axis=1)

In [6]:
# One row per user_id holding the set of every product_id found in that
# user's prior order lines.
users = (
    priors
    .groupby('user_id')['product_id']
    .apply(set)
    .to_frame('all_products')
)

In [7]:
# Preview the first rows of the per-user product sets.
users.head()

Unnamed: 0_level_0,all_products
user_id,Unnamed: 1_level_1
1,"{17122, 196, 26405, 46149, 14084, 13032, 26088..."
2,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1..."
3,"{17668, 44683, 48523, 21903, 14992, 21137, 324..."
4,"{21573, 42329, 17769, 35469, 37646, 1200, 1905..."
5,"{11777, 40706, 28289, 48775, 20754, 6808, 1398..."


In [8]:
# Split the orders table by evaluation set.
test_orders = orders.loc[orders['eval_set'] == 'test']
train_orders = orders.loc[orders['eval_set'] == 'train']

# Index the train order lines by (order_id, product_id), keeping both columns,
# so that pair membership can be checked against train.index.
train = train.set_index(['order_id', 'product_id'], drop=False)

In [9]:
# Preview the first rows of the orders whose eval_set is 'train'.
train_orders.head()

Unnamed: 0_level_0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1187899,1187899,1,train,11,4,8,14.0
1492625,1492625,2,train,15,1,11,30.0
2196797,2196797,5,train,5,0,11,6.0
525192,525192,7,train,21,2,11,6.0
880375,880375,8,train,4,1,14,10.0


# Key Feature Generation
* **order_id와 유저(user_id)의 주문제품목록(users.all_products) 간의 조합**
* **따라서 (order_id, product_id)의 쌍(pair)을 Unique Key로 가짐.**  
<br>
* user_id : 유저번호
* order_id : 주문번호
* product_id : 제품번호
* products : 각 주문(order_id)에 해당 제품(product_id)을 포함시켰는지 여부, **즉 target class이다!**

In [10]:
def features(selected_orders, labels_given=False):
    """Build one candidate row per (order, previously purchased product) pair.

    For every order in ``selected_orders`` the ordering user's product set is
    looked up in the notebook-level ``users.all_products`` and expanded into
    one row per product.

    Parameters
    ----------
    selected_orders : pandas.DataFrame
        Orders to expand; must expose ``order_id`` and ``user_id`` columns.
    labels_given : bool, default False
        When True, each pair is labelled 1 if ``(order_id, product_id)`` is
        present in the notebook-level ``train.index`` and 0 otherwise.
        ``train`` is only accessed in this case.

    Returns
    -------
    tuple of (pandas.DataFrame, numpy.ndarray)
        The frame has int32 columns ``user_id``, ``order_id`` and
        ``product_id``. The array is int8 and row-aligned with the frame; it
        is empty when ``labels_given`` is False.

    Raises
    ------
    KeyError
        If an order's ``user_id`` has no entry in ``users.all_products``.
    """
    # Loop-invariant lookups: resolve them once instead of once per order.
    products_by_user = users.all_products
    train_pairs = train.index if labels_given else None

    user_list = []
    order_list = []
    product_list = []
    labels = []
    for count, row in enumerate(selected_orders.itertuples(), start=1):
        if count % 10000 == 0:
            print('order row', count)
        order_id = row.order_id
        user_id = row.user_id
        # Materialise the set once so the product rows and their labels are
        # produced from the very same iteration order.
        user_products = list(products_by_user[user_id])
        n_products = len(user_products)
        product_list += user_products
        order_list += [order_id] * n_products
        user_list += [user_id] * n_products
        if labels_given:
            labels += [(order_id, product) in train_pairs for product in user_products]

    df = pd.DataFrame(
        {'user_id': user_list, 'order_id': order_list, 'product_id': product_list},
        dtype=np.int32,
    )
    labels = np.array(labels, dtype=np.int8)

    return (df, labels)

In [11]:
# Expand the train orders into candidate (order, product) rows with labels.
df_train, labels = features(train_orders, labels_given=True)

order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000


In [12]:
# Add the label array as the 'products' column and fix the column order.
df_train = df_train.assign(products=labels).loc[
    :, ['user_id', 'order_id', 'product_id', 'products']
]
df_train.head()

Unnamed: 0,user_id,order_id,product_id,products
0,1,1187899,17122,0
1,1,1187899,196,1
2,1,1187899,26405,1
3,1,1187899,46149,1
4,1,1187899,14084,0


In [13]:
# Expand the test orders into candidate rows; with labels_given=False the
# returned label array is empty, so it is discarded.
df_test, _ = features(test_orders, labels_given=False)

order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000


In [14]:
# Keep the key columns in a fixed order.
df_test = df_test.loc[:, ['user_id', 'order_id', 'product_id']]
# NOTE(review): the saved output of this preview lists order_id 1187899, which
# the train_orders preview shows as a 'train' order. The saved output may be
# stale; re-run on a fresh kernel to confirm df_test holds test orders only.
df_test.head()

Unnamed: 0,user_id,order_id,product_id
0,1,1187899,17122
1,1,1187899,196
2,1,1187899,26405
3,1,1187899,46149
4,1,1187899,14084


# Save as CSV files

In [15]:
# Write both candidate frames to CSV without the row index.
for frame, csv_path in (
    (df_train, 'data/frame_train.csv'),
    (df_test, 'data/frame_test.csv'),
):
    frame.to_csv(csv_path, index=False)