# normalization (LB 0.343645)

In [1]:
import numpy as np
import pandas as pd

# 0. Introduction of Data

In [2]:
products = pd.read_csv("../data/products.csv")
aisles = pd.read_csv("../data/aisles.csv")
departments = pd.read_csv("../data/departments.csv")
orders = pd.read_csv("../data/orders.csv")
prior = pd.read_csv("../data/order_products__prior.csv")
train = pd.read_csv("../data/order_products__train.csv")
frame_train = pd.read_csv("../data/frame_train.csv")
frame_test = pd.read_csv("../data/frame_test.csv")
userXproduct = pd.read_csv("../data/userXproduct.csv")

FileNotFoundError: File b'../data/userXproduct.csv' does not exist

### orders.csv

In [None]:
print("Size of orders : ", len(orders))
orders.head()

### order_products_prior.csv

In [None]:
print("Size of prior : ", len(prior))
prior.head()

### products.csv

In [None]:
print("Size of Products : ", len(products))
products.head()

### frame_train.csv

In [None]:
print("Size of frame of train : ", len(frame_train))
frame_train.head()

# 1. orders Information - categorization of order hour

In [None]:
orders['categorized_order_hour_of_day'] = 0
orders.loc[orders['order_hour_of_day']>=9,'categorized_order_hour_of_day'] = 1
orders.loc[orders['order_hour_of_day']>=13,'categorized_order_hour_of_day'] = 2
orders.loc[orders['order_hour_of_day']>=19,'categorized_order_hour_of_day'] = 3
orders.head()

In [None]:
orders = orders.join(pd.get_dummies(orders['categorized_order_hour_of_day'], prefix='categorized_hour'))
orders.drop(['categorized_order_hour_of_day'], axis=1, inplace=True)

In [None]:
orders.head()

# 2. Integrate Products Information
* products와 prior에서 얻을 수 있는 제품 관련 정보를 통합
* 기존의 attribute인 product_id, product_name, aisle_id, department_id
* 여기에 prior에서 얻을 수 있는 해당 제품에 판매에 관련된 attribute를 추가
* 총주문량(total_orders), 총재주문량(total_reorders), 재주문율(reorder_rate)을 추가함

In [None]:
productsXprior = pd.DataFrame()
productsXprior['product_id'] = prior.groupby(prior.product_id).size().index
productsXprior['product_orders'] = prior.groupby(prior.product_id).size().values
productsXprior['product_reorders'] = prior['reordered'].groupby(prior.product_id).sum().values
productsXprior['product_reorder_rate'] = (productsXprior.product_reorders / productsXprior.product_orders)
print("Size of productsXprior : ", len(productsXprior))
productsXprior.head()

In [None]:
Products = products.merge(productsXprior, on='product_id', how='outer')
print("Size of Products : ", len(Products))
Products.head()

Products[Products.isnull().any(axis=1)]

In [None]:
Products = Products.replace(np.NaN, 0)
print("Size of Products : ", len(Products))
Products.head()

# normalization product_reorders

In [None]:
Products['norm_product_reorders'] =(Products["product_reorders"]-Products["product_reorders"].min())/(Products["product_reorders"].max()-Products["product_reorders"].min())
Products

# 3. Integrate_Users_Information
* prior와 orders를 통합(priorXorders)하여 user_id, order_id, product_id를 하나로 모음.
* priorXorders를 기반으로 하여 유저 정보를 담고 있는 Users를 생성.

In [None]:
priorXorders = prior.merge(orders, on='order_id')
print("Size of Prior : ", len(priorXorders))
priorXorders.head()

In [None]:
user_orders = pd.DataFrame()
user_orders['user_avg_orders_period'] = orders.groupby('user_id')['days_since_prior_order'].mean()
user_orders['user_total_orders'] = orders.groupby('user_id').size()

user_orders['user_id'] = user_orders.index
user_orders.index.name = None

print("Size of user_orders : ", len(user_orders))
user_orders.head()

In [None]:
user_prior = pd.DataFrame()
user_prior['user_total_products'] = priorXorders.groupby('user_id').size()
user_prior['ordered_products_set'] = priorXorders.groupby('user_id')['product_id'].apply(set)
user_prior['user_unique_products'] = (user_prior.ordered_products_set.map(len))
user_prior = user_prior[['user_total_products', 'user_unique_products', 'ordered_products_set']]

user_prior['user_id'] = user_prior.index
user_prior.index.name = None

print("Size of user_prior : ", len(user_prior))
user_prior.head()

In [None]:
Users = user_prior.merge(user_orders, on='user_id')
print("Size of users : ", len(Users))
Users.head()

In [None]:
Users['user_avg_cart'] = (Users.user_total_products / (Users.user_total_orders - 1))
print("Size of users : ", len(Users))
Users.head()

# 4. UP(userXproduct) Information

In [None]:
userXproduct.columns = ['user_product','UP_orders','UP_last_order_id','UP_sum_pos_in_cart']
print('Size of user X product', len(userXproduct))
userXproduct.head()

# 5. Train Data Generation
* ### 미리 만들어 둔 frame_train을 기반으로 위의 orders, Products, Users 정보를 통합시켜 train data를 만듬.
* order_prior, order_train은 학습데이터로 포함 X, order_test가 없기 때문에 test데이터에 해당 attribute를 포함시킬 수 없음.

In [None]:
train_df = frame_train.copy()
train_df.set_index(['user_id','order_id','product_id']).head(10)

### 1) orders Information 통합

In [None]:
train_df = train_df.merge(orders, how='left', on=['user_id','order_id'])
train_df.head()

### 2) Products Information 통합

In [None]:
train_df = train_df.merge(Products, how='left', on='product_id')
train_df.head()

### 3) Users Information 통합

In [None]:
train_df = train_df.merge(Users, how='left', on='user_id')
train_df.head()

### Extra Feature) days_since_ratio 추가

In [None]:
train_df['days_since_ratio'] = train_df.days_since_prior_order / train_df.user_avg_orders_period

### 4) UP(userXproduct) Information 추가

In [None]:
train_df['user_product'] = train_df.user_id * 100000 + train_df.product_id
train_df = train_df.merge(userXproduct, how='left', on='user_product')
train_df.drop(['user_product'], axis=1, inplace=True)

In [None]:
train_df['UP_orders_ratio'] = train_df.UP_orders / train_df.user_total_orders
train_df['UP_avg_pos_in_cart'] = train_df.UP_sum_pos_in_cart / train_df.UP_orders
train_df['UP_reorder_rate'] = train_df.UP_orders / train_df.user_total_orders
train_df['UP_orders_since_last'] = train_df.user_total_orders - train_df.UP_last_order_id.map(orders.order_number)
train_df['UP_delta_hour_vs_last'] = abs(train_df.order_hour_of_day - train_df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x))

train_df.head()

### 모든 데이터 통합 완료 & 사용할 attribute 선정

In [None]:
train_df.columns

In [None]:
features = [
    # order information
    'order_hour_of_day',
    'days_since_prior_order', 'days_since_ratio',
    # product information
    'aisle_id', 'department_id', 'product_orders', 'product_reorders', 'product_reorder_rate', 
    # user information
    'user_total_products', 'user_unique_products', 'user_avg_orders_period', 'user_total_orders', 'user_avg_cart', 
    # userXproduct information
    'UP_orders', 'UP_orders_ratio', 'UP_avg_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last', 'UP_delta_hour_vs_last'
]

In [None]:
train_X = train_df[features]
train_y = train_df['products']

# 6. Test Data Generation

In [None]:
test_df = frame_test.copy()
test_df = test_df.merge(orders, how='left', on=['user_id','order_id'])
test_df = test_df.merge(Products, how='left', on='product_id')
test_df = test_df.merge(Users, how='left', on='user_id')

test_df['days_since_ratio'] = test_df.days_since_prior_order / test_df.user_avg_orders_period

test_df['user_product'] = test_df.user_id * 100000 + test_df.product_id
test_df = test_df.merge(userXproduct, how='left', on='user_product')
test_df.drop(['user_product'], axis=1, inplace=True)

test_df['UP_orders_ratio'] = test_df.UP_orders / test_df.user_total_orders
test_df['UP_avg_pos_in_cart'] = test_df.UP_sum_pos_in_cart / test_df.UP_orders
test_df['UP_reorder_rate'] = test_df.UP_orders / test_df.user_total_orders
test_df['UP_orders_since_last'] = test_df.user_total_orders - test_df.UP_last_order_id.map(orders.order_number)
test_df['UP_delta_hour_vs_last'] = abs(test_df.order_hour_of_day - test_df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x))


In [34]:
test_X = test_df[features]

# 7. Learning & Prediction

In [3]:
import lightgbm as lgb

In [4]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 100

### Learning

In [5]:
train_X_ds = lgb.Dataset(train_X, label=train_y, categorical_feature=['aisle_id','department_id'])

NameError: name 'train_X' is not defined

In [None]:
bst = lgb.train(params, train_X_ds, ROUNDS)

### Prediction

In [None]:
result = bst.predict(test_X)

In [None]:
result = pd.concat([frame_test,pd.DataFrame(result, columns=['predict'])], axis=1)

In [None]:
result.head()

# 8. Prediction Probability 기반 최종 products list 생성

In [None]:
sorted_result = result.sort_values(['user_id', 'predict'], ascending=[True,False])

In [None]:
sorted_result.head()

In [None]:
sorted_result.reset_index(inplace=True)
sorted_result.drop(['index'], axis=1, inplace=True)
sorted_result.head()

In [None]:
sorted_result

In [None]:
ordered_products = sorted_result.groupby(['user_id', 'order_id'])['product_id'].apply(list)
ordered_products

In [None]:
d = dict()
for (uid, oid), row in ordered_products.iteritems():
    avg_cart = Users[Users.user_id == uid]['user_avg_cart'].values[0]
    
    row = row[:int(round(avg_cart))]
    
    for prod in row:    
        try:
            d[oid] += ' ' + str(prod)
        except:
            d[oid] = str(prod)

In [None]:
output = pd.DataFrame.from_dict(d, orient='index')
output.reset_index(inplace=True)
output.columns = ['order_id', 'products']
output.head()

In [None]:
output.to_csv('LightGBM-normalization.csv', index=False)