# LightGBM 기반 UP(userXproduct) feature 추가 (LB 0.343645)

In [1]:
import numpy as np
import pandas as pd

# 0. Introduction of Data

In [2]:
products = pd.read_csv("data/products.csv")
aisles = pd.read_csv("data/aisles.csv")
departments = pd.read_csv("data/departments.csv")
orders = pd.read_csv("data/orders.csv")
prior = pd.read_csv("data/order_products__prior.csv")
train = pd.read_csv("data/order_products__train.csv")
frame_train = pd.read_csv("data/frame_train.csv")
frame_test = pd.read_csv("data/frame_test.csv")
userXproduct = pd.read_csv("data/userXproduct.csv")

### orders.csv

In [3]:
print("Size of orders : ", len(orders))
orders.head()

Size of orders :  3421083


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


### order_products_prior.csv

In [4]:
print("Size of prior : ", len(prior))
prior.head()

Size of prior :  32434489


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


### products.csv

In [5]:
print("Size of Products : ", len(products))
products.head()

Size of Products :  49688


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


### frame_train.csv

In [6]:
print("Size of frame of train : ", len(frame_train))
frame_train.head()

Size of frame of train :  8474661


Unnamed: 0,user_id,order_id,product_id,products
0,1,1187899,17122,0
1,1,1187899,196,1
2,1,1187899,26405,1
3,1,1187899,46149,1
4,1,1187899,14084,0


# 1. orders Information - categorization of order hour

In [7]:
orders['categorized_order_hour_of_day'] = 0
orders.loc[orders['order_hour_of_day']>=9,'categorized_order_hour_of_day'] = 1
orders.loc[orders['order_hour_of_day']>=13,'categorized_order_hour_of_day'] = 2
orders.loc[orders['order_hour_of_day']>=19,'categorized_order_hour_of_day'] = 3
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,categorized_order_hour_of_day
0,2539329,1,prior,1,2,8,,0
1,2398795,1,prior,2,3,7,15.0,0
2,473747,1,prior,3,3,12,21.0,1
3,2254736,1,prior,4,4,7,29.0,0
4,431534,1,prior,5,4,15,28.0,2


In [8]:
orders = orders.join(pd.get_dummies(orders['categorized_order_hour_of_day'], prefix='categorized_hour'))
orders.drop(['categorized_order_hour_of_day'], axis=1, inplace=True)

In [9]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,categorized_hour_0,categorized_hour_1,categorized_hour_2,categorized_hour_3
0,2539329,1,prior,1,2,8,,1,0,0,0
1,2398795,1,prior,2,3,7,15.0,1,0,0,0
2,473747,1,prior,3,3,12,21.0,0,1,0,0
3,2254736,1,prior,4,4,7,29.0,1,0,0,0
4,431534,1,prior,5,4,15,28.0,0,0,1,0


# 2. Integrate Products Information
* products와 prior에서 얻을 수 있는 제품 관련 정보를 통합
* 기존의 attribute인 product_id, product_name, aisle_id, department_id
* 여기에 prior에서 얻을 수 있는 해당 제품에 판매에 관련된 attribute를 추가
* 총주문량(total_orders), 총재주문량(total_reorders), 재주문율(reorder_rate)을 추가함

In [10]:
productsXprior = pd.DataFrame()
productsXprior['product_id'] = prior.groupby(prior.product_id).size().index
productsXprior['product_orders'] = prior.groupby(prior.product_id).size().values
productsXprior['product_reorders'] = prior['reordered'].groupby(prior.product_id).sum().values
productsXprior['product_reorder_rate'] = (productsXprior.product_reorders / productsXprior.product_orders)
print("Size of productsXprior : ", len(productsXprior))
productsXprior.head()

Size of productsXprior :  49677


Unnamed: 0,product_id,product_orders,product_reorders,product_reorder_rate
0,1,1852,1136,0.613391
1,2,90,12,0.133333
2,3,277,203,0.732852
3,4,329,147,0.446809
4,5,15,9,0.6


In [11]:
Products = products.merge(productsXprior, on='product_id', how='outer')
print("Size of Products : ", len(Products))
Products.head()

Size of Products :  49688


Unnamed: 0,product_id,product_name,aisle_id,department_id,product_orders,product_reorders,product_reorder_rate
0,1,Chocolate Sandwich Cookies,61,19,1852.0,1136.0,0.613391
1,2,All-Seasons Salt,104,13,90.0,12.0,0.133333
2,3,Robust Golden Unsweetened Oolong Tea,94,7,277.0,203.0,0.732852
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,329.0,147.0,0.446809
4,5,Green Chile Anytime Sauce,5,13,15.0,9.0,0.6


In [12]:
Products[Products.isnull().any(axis=1)]

Unnamed: 0,product_id,product_name,aisle_id,department_id,product_orders,product_reorders,product_reorder_rate
3629,3630,Protein Granola Apple Crisp,57,14,,,
3717,3718,Wasabi Cheddar Spreadable Cheese,21,16,,,
7044,7045,Unpeeled Apricot Halves in Heavy Syrup,88,13,,,
25382,25383,Chocolate Go Bites,61,19,,,
27498,27499,Non-Dairy Coconut Seven Layer Bar,100,21,,,
36232,36233,Water With Electrolytes,100,21,,,
37702,37703,Ultra Sun Blossom Liquid 90 loads Fabric Enhan...,75,17,,,
43724,43725,Sweetart Jelly Beans,100,21,,,
45970,45971,12 Inch Taper Candle White,101,17,,,
46624,46625,Single Barrel Kentucky Straight Bourbon Whiskey,31,7,,,


In [13]:
Products = Products.replace(np.NaN, 0)
print("Size of Products : ", len(Products))
Products.head()

Size of Products :  49688


Unnamed: 0,product_id,product_name,aisle_id,department_id,product_orders,product_reorders,product_reorder_rate
0,1,Chocolate Sandwich Cookies,61,19,1852.0,1136.0,0.613391
1,2,All-Seasons Salt,104,13,90.0,12.0,0.133333
2,3,Robust Golden Unsweetened Oolong Tea,94,7,277.0,203.0,0.732852
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,329.0,147.0,0.446809
4,5,Green Chile Anytime Sauce,5,13,15.0,9.0,0.6


# 3. Integrate_Users_Information
* prior와 orders를 통합(priorXorders)하여 user_id, order_id, product_id를 하나로 모음.
* priorXorders를 기반으로 하여 유저 정보를 담고 있는 Users를 생성.

In [14]:
priorXorders = prior.merge(orders, on='order_id')
print("Size of Prior : ", len(priorXorders))
priorXorders.head()

Size of Prior :  32434489


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,categorized_hour_0,categorized_hour_1,categorized_hour_2,categorized_hour_3
0,2,33120,1,1,202279,prior,3,5,9,8.0,0,1,0,0
1,2,28985,2,1,202279,prior,3,5,9,8.0,0,1,0,0
2,2,9327,3,0,202279,prior,3,5,9,8.0,0,1,0,0
3,2,45918,4,1,202279,prior,3,5,9,8.0,0,1,0,0
4,2,30035,5,0,202279,prior,3,5,9,8.0,0,1,0,0


In [15]:
user_orders = pd.DataFrame()
user_orders['user_avg_orders_period'] = orders.groupby('user_id')['days_since_prior_order'].mean()
user_orders['user_total_orders'] = orders.groupby('user_id').size()

user_orders['user_id'] = user_orders.index
user_orders.index.name = None

print("Size of user_orders : ", len(user_orders))
user_orders.head()

Size of user_orders :  206209


Unnamed: 0,user_avg_orders_period,user_total_orders,user_id
1,19.0,11,1
2,16.285714,15,2
3,12.0,13,3
4,17.0,6,4
5,11.5,5,5


In [16]:
user_prior = pd.DataFrame()
user_prior['user_total_products'] = priorXorders.groupby('user_id').size()
user_prior['ordered_products_set'] = priorXorders.groupby('user_id')['product_id'].apply(set)
user_prior['user_unique_products'] = (user_prior.ordered_products_set.map(len))
user_prior = user_prior[['user_total_products', 'user_unique_products', 'ordered_products_set']]

user_prior['user_id'] = user_prior.index
user_prior.index.name = None

print("Size of user_prior : ", len(user_prior))
user_prior.head()

Size of user_prior :  206209


Unnamed: 0,user_total_products,user_unique_products,ordered_products_set,user_id
1,59,18,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",1
2,195,102,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",2
3,88,33,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",3
4,18,17,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",4
5,37,23,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",5


In [17]:
Users = user_prior.merge(user_orders, on='user_id')
print("Size of users : ", len(Users))
Users.head()

Size of users :  206209


Unnamed: 0,user_total_products,user_unique_products,ordered_products_set,user_id,user_avg_orders_period,user_total_orders
0,59,18,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",1,19.0,11
1,195,102,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",2,16.285714,15
2,88,33,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",3,12.0,13
3,18,17,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",4,17.0,6
4,37,23,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",5,11.5,5


In [18]:
Users['user_avg_cart'] = (Users.user_total_products / (Users.user_total_orders - 1))
print("Size of users : ", len(Users))
Users.head()

Size of users :  206209


Unnamed: 0,user_total_products,user_unique_products,ordered_products_set,user_id,user_avg_orders_period,user_total_orders,user_avg_cart
0,59,18,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",1,19.0,11,5.9
1,195,102,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",2,16.285714,15,13.928571
2,88,33,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",3,12.0,13,7.333333
3,18,17,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",4,17.0,6,3.6
4,37,23,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",5,11.5,5,9.25


# 4. UP(userXproduct) Information

In [19]:
userXproduct.columns = ['user_product','UP_orders','UP_last_order_id','UP_sum_pos_in_cart']
print('Size of user X product', len(userXproduct))
userXproduct.head()

Size of user X product 13307953


Unnamed: 0,user_product,UP_orders,UP_last_order_id,UP_sum_pos_in_cart
0,14126415872,5,843810,63
1,20535312385,1,2699553,9
2,1375731717,7,2844957,51
3,8959033352,4,1924807,57
4,7342828204,3,723315,13


# 5. Train Data Generation
* ### 미리 만들어 둔 frame_train을 기반으로 위의 orders, Products, Users 정보를 통합시켜 train data를 만듬.
* order_prior, order_train은 학습데이터로 포함 X, order_test가 없기 때문에 test데이터에 해당 attribute를 포함시킬 수 없음.

In [20]:
train_df = frame_train.copy()
train_df.set_index(['user_id','order_id','product_id']).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,products
user_id,order_id,product_id,Unnamed: 3_level_1
1,1187899,17122,0
1,1187899,196,1
1,1187899,26405,1
1,1187899,46149,1
1,1187899,14084,0
1,1187899,13032,1
1,1187899,26088,1
1,1187899,39657,1
1,1187899,12427,0
1,1187899,25133,1


### 1) orders Information 통합

In [21]:
train_df = train_df.merge(orders, how='left', on=['user_id','order_id'])
train_df.head()

Unnamed: 0,user_id,order_id,product_id,products,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,categorized_hour_0,categorized_hour_1,categorized_hour_2,categorized_hour_3
0,1,1187899,17122,0,train,11,4,8,14.0,1,0,0,0
1,1,1187899,196,1,train,11,4,8,14.0,1,0,0,0
2,1,1187899,26405,1,train,11,4,8,14.0,1,0,0,0
3,1,1187899,46149,1,train,11,4,8,14.0,1,0,0,0
4,1,1187899,14084,0,train,11,4,8,14.0,1,0,0,0


### 2) Products Information 통합

In [22]:
train_df = train_df.merge(Products, how='left', on='product_id')
train_df.head()

Unnamed: 0,user_id,order_id,product_id,products,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,categorized_hour_0,categorized_hour_1,categorized_hour_2,categorized_hour_3,product_name,aisle_id,department_id,product_orders,product_reorders,product_reorder_rate
0,1,1187899,17122,0,train,11,4,8,14.0,1,0,0,0,Honeycrisp Apples,24,4,13880.0,9377.0,0.675576
1,1,1187899,196,1,train,11,4,8,14.0,1,0,0,0,Soda,77,7,35791.0,27791.0,0.77648
2,1,1187899,26405,1,train,11,4,8,14.0,1,0,0,0,XL Pick-A-Size Paper Towel Rolls,54,17,1214.0,536.0,0.441516
3,1,1187899,46149,1,train,11,4,8,14.0,1,0,0,0,Zero Calorie Cola,77,7,8558.0,6953.0,0.812456
4,1,1187899,14084,0,train,11,4,8,14.0,1,0,0,0,Organic Unsweetened Vanilla Almond Milk,91,16,15935.0,12923.0,0.810982


### 3) Users Information 통합

In [23]:
train_df = train_df.merge(Users, how='left', on='user_id')
train_df.head()

Unnamed: 0,user_id,order_id,product_id,products,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,categorized_hour_0,...,department_id,product_orders,product_reorders,product_reorder_rate,user_total_products,user_unique_products,ordered_products_set,user_avg_orders_period,user_total_orders,user_avg_cart
0,1,1187899,17122,0,train,11,4,8,14.0,1,...,4,13880.0,9377.0,0.675576,59,18,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",19.0,11,5.9
1,1,1187899,196,1,train,11,4,8,14.0,1,...,7,35791.0,27791.0,0.77648,59,18,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",19.0,11,5.9
2,1,1187899,26405,1,train,11,4,8,14.0,1,...,17,1214.0,536.0,0.441516,59,18,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",19.0,11,5.9
3,1,1187899,46149,1,train,11,4,8,14.0,1,...,7,8558.0,6953.0,0.812456,59,18,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",19.0,11,5.9
4,1,1187899,14084,0,train,11,4,8,14.0,1,...,16,15935.0,12923.0,0.810982,59,18,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",19.0,11,5.9


### Extra Feature) days_since_ratio 추가

In [24]:
train_df['days_since_ratio'] = train_df.days_since_prior_order / train_df.user_avg_orders_period

### 4) UP(userXproduct) Information 추가

In [25]:
train_df['user_product'] = train_df.user_id * 100000 + train_df.product_id
train_df = train_df.merge(userXproduct, how='left', on='user_product')
train_df.drop(['user_product'], axis=1, inplace=True)

In [26]:
train_df['UP_orders_ratio'] = train_df.UP_orders / train_df.user_total_orders
train_df['UP_avg_pos_in_cart'] = train_df.UP_sum_pos_in_cart / train_df.UP_orders
train_df['UP_reorder_rate'] = train_df.UP_orders / train_df.user_total_orders
train_df['UP_orders_since_last'] = train_df.user_total_orders - train_df.UP_last_order_id.map(orders.order_number)
train_df['UP_delta_hour_vs_last'] = abs(train_df.order_hour_of_day - train_df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x))

train_df.head()

Unnamed: 0,user_id,order_id,product_id,products,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,categorized_hour_0,...,user_avg_cart,days_since_ratio,UP_orders,UP_last_order_id,UP_sum_pos_in_cart,UP_orders_ratio,UP_avg_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last
0,1,1187899,17122,0,train,11,4,8,14.0,1,...,5.9,0.736842,1,431534,6,0.090909,6.0,0.090909,10,6
1,1,1187899,196,1,train,11,4,8,14.0,1,...,5.9,0.736842,10,2550362,14,0.909091,1.4,0.909091,8,5
2,1,1187899,26405,1,train,11,4,8,14.0,1,...,5.9,0.736842,2,2254736,10,0.181818,5.0,0.181818,-8,12
3,1,1187899,46149,1,train,11,4,8,14.0,1,...,5.9,0.736842,3,2550362,9,0.272727,3.0,0.272727,8,5
4,1,1187899,14084,0,train,11,4,8,14.0,1,...,5.9,0.736842,1,2539329,2,0.090909,2.0,0.090909,-32,5


### 모든 데이터 통합 완료 & 사용할 attribute 선정

In [27]:
train_df.columns

Index(['user_id', 'order_id', 'product_id', 'products', 'eval_set',
       'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order', 'categorized_hour_0', 'categorized_hour_1',
       'categorized_hour_2', 'categorized_hour_3', 'product_name', 'aisle_id',
       'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'user_total_products', 'user_unique_products',
       'ordered_products_set', 'user_avg_orders_period', 'user_total_orders',
       'user_avg_cart', 'days_since_ratio', 'UP_orders', 'UP_last_order_id',
       'UP_sum_pos_in_cart', 'UP_orders_ratio', 'UP_avg_pos_in_cart',
       'UP_reorder_rate', 'UP_orders_since_last', 'UP_delta_hour_vs_last'],
      dtype='object')

In [28]:
features = [
    # order information
    'order_hour_of_day',
    'days_since_prior_order', 'days_since_ratio',
    # product information
    'aisle_id', 'department_id', 'product_orders', 'product_reorders', 'product_reorder_rate', 
    # user information
    'user_total_products', 'user_unique_products', 'user_avg_orders_period', 'user_total_orders', 'user_avg_cart', 
    # userXproduct information
    'UP_orders', 'UP_orders_ratio', 'UP_avg_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last', 'UP_delta_hour_vs_last'
]

In [29]:
train_X = train_df[features]
train_y = train_df['products']

# 6. Test Data Generation

In [30]:
test_df = frame_test.copy()
test_df = test_df.merge(orders, how='left', on=['user_id','order_id'])
test_df = test_df.merge(Products, how='left', on='product_id')
test_df = test_df.merge(Users, how='left', on='user_id')

test_df['days_since_ratio'] = test_df.days_since_prior_order / test_df.user_avg_orders_period

test_df['user_product'] = test_df.user_id * 100000 + test_df.product_id
test_df = test_df.merge(userXproduct, how='left', on='user_product')
test_df.drop(['user_product'], axis=1, inplace=True)

test_df['UP_orders_ratio'] = test_df.UP_orders / test_df.user_total_orders
test_df['UP_avg_pos_in_cart'] = test_df.UP_sum_pos_in_cart / test_df.UP_orders
test_df['UP_reorder_rate'] = test_df.UP_orders / test_df.user_total_orders
test_df['UP_orders_since_last'] = test_df.user_total_orders - test_df.UP_last_order_id.map(orders.order_number)
test_df['UP_delta_hour_vs_last'] = abs(test_df.order_hour_of_day - test_df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x))


In [31]:
test_X = test_df[features]

# 7. Learning & Prediction

In [32]:
import lightgbm as lgb

In [33]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 100

### Learning

In [34]:
train_X_ds = lgb.Dataset(train_X, label=train_y, categorical_feature=['aisle_id','department_id'])

In [35]:
bst = lgb.train(params, train_X_ds, ROUNDS)

### Prediction

In [36]:
result = bst.predict(test_X)

In [37]:
result = pd.concat([frame_test,pd.DataFrame(result, columns=['predict'])], axis=1)

In [38]:
result.head()

Unnamed: 0,user_id,order_id,product_id,predict
0,3,2774568,17668,0.313816
1,3,2774568,44683,0.124779
2,3,2774568,48523,0.104822
3,3,2774568,21903,0.570984
4,3,2774568,14992,0.132532


# 8. Prediction Probability 기반 최종 products list 생성

In [39]:
sorted_result = result.sort_values(['user_id', 'predict'], ascending=[True,False])

In [40]:
sorted_result.head()

Unnamed: 0,user_id,order_id,product_id,predict
9,3,2774568,39190,0.709557
10,3,2774568,47766,0.653427
3,3,2774568,21903,0.570984
17,3,2774568,9387,0.319793
0,3,2774568,17668,0.313816


In [41]:
sorted_result.reset_index(inplace=True)
sorted_result.drop(['index'], axis=1, inplace=True)
sorted_result.head()

Unnamed: 0,user_id,order_id,product_id,predict
0,3,2774568,39190,0.709557
1,3,2774568,47766,0.653427
2,3,2774568,21903,0.570984
3,3,2774568,9387,0.319793
4,3,2774568,17668,0.313816


In [42]:
sorted_result

Unnamed: 0,user_id,order_id,product_id,predict
0,3,2774568,39190,0.709557
1,3,2774568,47766,0.653427
2,3,2774568,21903,0.570984
3,3,2774568,9387,0.319793
4,3,2774568,17668,0.313816
5,3,2774568,43961,0.269238
6,3,2774568,22035,0.233356
7,3,2774568,18599,0.224475
8,3,2774568,16797,0.219011
9,3,2774568,32402,0.217856


In [43]:
ordered_products = sorted_result.groupby(['user_id', 'order_id'])['product_id'].apply(list)
ordered_products

user_id  order_id
3        2774568     [39190, 47766, 21903, 9387, 17668, 43961, 2203...
4        329954      [35469, 19057, 7350, 37646, 36606, 26576, 1776...
6        1528013     [21903, 38293, 27521, 8424, 48679, 45007, 2032...
11       1376945     [8309, 14947, 27959, 34658, 28465, 42585, 3594...
12       1356845     [13176, 7076, 10863, 14992, 21616, 28134, 2035...
15       2161313     [14715, 12427, 11266, 37710, 10441, 27839, 196...
16       1416320     [21903, 5134, 24852, 21137, 41950, 17948, 2898...
19       1735923     [17008, 35123, 15131, 2192, 12108, 196, 18174,...
20       1980631       [13575, 9387, 22362, 46061, 6184, 13914, 41400]
22       139655      [22935, 27845, 13176, 21903, 24964, 32655, 851...
25       1411408     [43758, 22008, 27521, 26452, 10096, 37119, 261...
26       2940603     [7521, 44632, 14947, 19894, 30592, 31615, 1033...
28       1192143     [24852, 24759, 47626, 21513, 21903, 33731, 249...
31       280888      [19213, 41406, 12440, 13966, 48988, 28

In [44]:
d = dict()
for (uid, oid), row in ordered_products.iteritems():
    avg_cart = Users[Users.user_id == uid]['user_avg_cart'].values[0]
    
    row = row[:int(round(avg_cart))]
    
    for prod in row:    
        try:
            d[oid] += ' ' + str(prod)
        except:
            d[oid] = str(prod)

In [45]:
output = pd.DataFrame.from_dict(d, orient='index')
output.reset_index(inplace=True)
output.columns = ['order_id', 'products']
output.head()

Unnamed: 0,order_id,products
0,1654153,39046 13806 5077 24852 35102 30233 6631 19086 ...
1,1572866,19660 37646 27521 47626 47209 42445 27086 4481...
2,1703943,37710
3,1703945,40311 26324 34126 11520 30391
4,3276811,9076 8153 3161 10613 4329 9006 18339 40545


In [46]:
output.to_csv('submission/LightGBM-userXproduct.csv', index=False)