### 1. Importing packages

In [None]:
import pandas as pd
import numpy as np
import gc
import warnings
import time

# Suppress every warning for the rest of the session (this includes any
# warnings pandas would emit from the cells below).
warnings.filterwarnings("ignore")

### 2. Features creation

###### Loading initial data

In [None]:
# Directory holding the input CSVs; the feature pickles and column lists written
# at the end are saved here too. The value is concatenated directly with file
# names, so it must end with a path separator.
path = 'C:/Users/AM000110/Desktop/ML/Kaggle/Instacart_Market_Basket_Analysis/input_data/'

In [None]:
# Load Data ---------------------------------------------------------------
# Every table is read with explicit, compact dtypes to keep memory usage down.

# order_products__prior.csv and order_products__train.csv share one schema.
order_products_dtypes = {
    'order_id': np.uint32,
    'product_id': np.uint16,
    'add_to_cart_order': np.uint8,
    'reordered': bool,
}

aisles = pd.read_csv(
    path + 'aisles.csv',
    dtype={'aisle_id': np.uint8, 'aisle': 'category'},
)
departments = pd.read_csv(
    path + 'departments.csv',
    dtype={'department_id': np.uint8, 'department': 'category'},
)
orderp = pd.read_csv(path + 'order_products__prior.csv', dtype=order_products_dtypes)
ordert = pd.read_csv(path + 'order_products__train.csv', dtype=order_products_dtypes)
orders = pd.read_csv(
    path + 'orders.csv',
    dtype={
        'order_id': np.uint32,
        'user_id': np.uint32,
        'eval_set': 'category',
        'order_number': np.uint8,
        'order_dow': np.uint8,
        'order_hour_of_day': np.uint8,
    },
)
products = pd.read_csv(
    path + 'products.csv',
    dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8,
    },
)

###### Joining to product aisle and department names

In [None]:
# Inner-join the products against the aisle and department lookup tables.
# Only the id columns are kept afterwards, so the net effect of the joins is to
# drop products whose aisle_id / department_id has no match in the lookups.
products = (
    products
    .merge(aisles, on='aisle_id')
    .merge(departments, on='department_id')
    [['product_id', 'aisle_id', 'department_id']]
)

del aisles, departments
gc.collect()

###### Adding user_ids to order_train

In [None]:
# Find the user behind every order present in the train table and attach the
# user_id to the train rows.
in_train = orders['order_id'].isin(ordert['order_id'].values)
orders_users = orders.loc[in_train, ['order_id', 'user_id']]
ordert = ordert.merge(orders_users, on='order_id')

del orders_users, in_train
gc.collect()

###### Joining orders data with prior data

In [None]:
# Main working table: one row per product line, enriched with the order header
# and the product's aisle / department ids. Both joins are inner joins; orderp
# is read from order_products__prior.csv, so presumably only prior orders remain.
orders_products = (
    orders
    .merge(orderp, on='order_id')
    .merge(products, on='product_id')
)

del orderp, products
gc.collect()

###### Adding order f1_score and order_size

In [None]:
# f1_score = pd.read_csv('C:/Users/AM000110/Desktop/ML/Kaggle/Instacart_Market_Basket_Analysis/reordered_basket_size_prediction/f1_score.csv')
# f1_score = f1_score[['order_id', 'average_f1_score']]

# orders_products = pd.merge(orders_products, f1_score, on='order_id')
# del f1_score
# gc.collect()

In [None]:
# Number of product lines in each order, attached to every row as order_size.
temp = (
    orders_products.groupby('order_id')
    .size()
    .astype(np.int32)
    .rename('order_size')
    .reset_index()
)
orders_products = pd.merge(orders_products, temp, on='order_id')

# Rows without a previous order get a 9999 sentinel. The filled column is
# assigned back explicitly: fillna(inplace=True) through attribute access works
# on an intermediate object and is not guaranteed to update the frame (it does
# not under pandas copy-on-write).
orders_products['days_since_prior_order'] = orders_products['days_since_prior_order'].fillna(9999)

del temp
gc.collect()

###### Product general features

for each product:
* total number of orders (prod_orders)
* sum of reorders (prod_reorders)
* number of unique users who buy it (prod_first_orders)
* number of unique users who buy it more than once (prod_second_orders)
* product reorder probability = prod_second_orders / prod_first_orders
* prod_reorder_times = 1 + prod_reorders / prod_first_orders
* prod_reorder_ratio = prod_reorders / prod_orders
* prod_average_order_hour_of_day (from me)
* prod_average_order_dow (from me)
* prod_frequency = prod_orders / total number of orders (from me)
* prod_reorder_frequency = prod_reorders / sum of total reorders (from me)
* prod_most_frequent_reorder_day (from me)
* product second reorder probability (from me)
* product aisle market share = aisle_orders / total_orders (from me)
* product department market share = department orders / total orders (from me)
* product market share in aisle = prod_orders / aisle_orders (from me)
* product market share in department = prod_orders / department_orders (from me)
* product order mean size (from me)
* product mean share within order (from me)
* product reorder std within users

In [None]:
# Product-level feature table. It starts empty; the first column assigned to it
# comes from a groupby on product_id, which gives the frame its index.
prd = pd.DataFrame()

In [None]:
# Product-level aggregates over all rows of orders_products and over the rows
# flagged as reorders. The two groupers are built once and reused, instead of
# re-filtering the reorder rows and re-grouping the frame for every feature.
# Products that never appear in the reorder subset come out as NaN after index
# alignment; those NaNs are either filled here or by the final prd.fillna(0).
by_product = orders_products.groupby('product_id')
reorders_by_product = orders_products[orders_products.reordered > 0].groupby('product_id')

# --- order / reorder counts -------------------------------------------------
prd['prod_total_orders'] = by_product['order_id'].size().astype(np.int32)
prd['prod_total_reorders'] = by_product['reordered'].sum().astype(np.int32)
prd['prod_reorder_ratio'] = (prd.prod_total_reorders / prd.prod_total_orders).astype(np.float32)

prd['prod_unique_users'] = by_product['user_id'].nunique().astype(np.int32)
prd['prod_unique_reorder_users'] = reorders_by_product['user_id'].nunique().astype(np.int32)
# Assign the filled column back rather than using fillna(inplace=True) on an
# attribute, which is not guaranteed to modify the frame.
prd['prod_unique_reorder_users'] = prd['prod_unique_reorder_users'].fillna(0).astype(np.int32)

prd['prod_reorder_general_probability'] = (
    prd.prod_unique_reorder_users / prd.prod_unique_users
).astype(np.float32)
prd['prod_reorder_times'] = (1 + prd.prod_total_reorders / prd.prod_unique_users).astype(np.float32)

# --- cart position ----------------------------------------------------------
prd['prod_mean_add_to_cart_order'] = by_product['add_to_cart_order'].mean().astype(np.float32)
prd['prod_median_add_to_cart_order'] = by_product['add_to_cart_order'].median().astype(np.float32)
prd['prod_reorder_mean_add_to_cart_order'] = reorders_by_product['add_to_cart_order'].mean()
prd['prod_reorder_mean_add_to_cart_order'] = prd['prod_reorder_mean_add_to_cart_order'].astype(np.float32)
prd['prod_reorder_median_add_to_cart_order'] = reorders_by_product['add_to_cart_order'].median()
prd['prod_reorder_median_add_to_cart_order'] = prd['prod_reorder_median_add_to_cart_order'].astype(np.float32)

# --- days since prior order -------------------------------------------------
prd['prod_mean_days_since_prior'] = by_product['days_since_prior_order'].mean().astype(np.float32)
prd['prod_median_days_since_prior'] = by_product['days_since_prior_order'].median().astype(np.float32)

prd['prod_reoreder_mean_days_since_prior'] = reorders_by_product['days_since_prior_order'].mean()
prd['prod_reoreder_mean_days_since_prior'] = prd['prod_reoreder_mean_days_since_prior'].astype(np.float32)

# The median is the only reorder statistic filled with the 9999 sentinel before
# being truncated to an integer.
prd['prod_reoreder_median_days_since_prior'] = (
    reorders_by_product['days_since_prior_order'].median().astype(np.float32)
)
prd['prod_reoreder_median_days_since_prior'] = (
    prd['prod_reoreder_median_days_since_prior'].fillna(9999).astype(np.int32)
)

prd['prod_reoreder_var_days_since_prior'] = reorders_by_product['days_since_prior_order'].var()
prd['prod_reoreder_var_days_since_prior'] = prd['prod_reoreder_var_days_since_prior'].astype(np.float32)

prd['prod_period'] = by_product['days_since_prior_order'].sum().astype(np.int32)

# --- time of order ----------------------------------------------------------
prd['prod_mean_order_hour_of_day'] = by_product['order_hour_of_day'].mean().astype(np.float32)
prd['prod_mean_order_dow'] = by_product['order_dow'].mean().astype(np.float32)

# prd['prod_mean_f1_score'] = orders_products.groupby('product_id')['average_f1_score'].mean().astype(np.float32)

# --- order number -----------------------------------------------------------
prd['prod_mean_order_number'] = by_product['order_number'].mean().astype(np.float32)
prd['prod_reorder_mean_order_number'] = reorders_by_product['order_number'].mean().astype(np.float32)
prd['prod_reorder_mean_order_number'] = prd['prod_reorder_mean_order_number'].astype(np.float32)

# --- shares relative to the whole data set ----------------------------------
prd['prod_share_in_orders'] = (
    prd.prod_total_orders / orders_products.order_id.nunique()
).astype(np.float32)
prd['prod_share_in_reorders'] = (
    prd.prod_total_reorders / orders_products.reordered.sum()
).astype(np.float32)
prd['prod_share_in_users'] = (
    prd.prod_unique_users / orders_products.user_id.nunique()
).astype(np.float32)

# aisle_id / department_id are taken as the per-product minimum of the column.
prd['aisle_id'] = by_product['aisle_id'].min().astype(np.int16)
prd['department_id'] = by_product['department_id'].min().astype(np.int16)

del by_product, reorders_by_product

In [None]:
# For every (product, user) pair: the fraction of the pair's rows flagged as
# reorders. The product feature is the mean of that fraction over its users.
pair_reordered = orders_products.groupby(['product_id', 'user_id'])['reordered']
pair_stats = pd.DataFrame({
    'reordered': pair_reordered.sum(),
    'total': pair_reordered.size(),
})
pair_stats['reorder_prob'] = pair_stats['reordered'] / pair_stats['total']

prd['prod_reorder_up_mean_probability'] = (
    pair_stats.groupby('product_id')['reorder_prob'].mean().astype(np.float32)
)

del pair_reordered, pair_stats
gc.collect()

In [None]:
# Rows per (product, user) pair. A plain groupby size is used instead of
# value_counts(): the name of the value_counts() result differs between pandas
# versions ('user_id' vs 'count'), so renaming it to 'counts' and reading that
# column back is not portable.
pair_counts = orders_products.groupby(['product_id', 'user_id']).size()

# Number of users who have the product in more than two of their rows.
# Products without such users stay NaN here (filled by the later prd.fillna).
prd['prod_unique_second_reorder_users'] = (
    pair_counts[pair_counts > 2].groupby(level='product_id').size().astype(np.int32)
)

del pair_counts
gc.collect()

In [None]:
# Per (product_id, user_id) pair: first and last order_number in which the pair
# occurs, their difference, and the list of order numbers flagged as reorders.
temp_temp = pd.DataFrame()
temp = orders_products[['product_id','user_id', 'order_id', 'order_number', 'reordered']]

temp_temp['first_order'] = temp.groupby(['product_id', 'user_id'])['order_number'].min()
temp_temp['last_order'] = temp.groupby(['product_id', 'user_id'])['order_number'].max()
temp_temp['dif'] = temp_temp.last_order - temp_temp.first_order
# Pairs without any reorder row are missing from this grouped result and end up
# as NaN after index alignment.
temp_temp['reorder_order_numbers'] = temp[temp.reordered > 0].groupby(['product_id', 'user_id'])['order_number'].apply(list)

del temp
gc.collect()

# Keep only the pairs that have at least one reorder (no NaN in any column).
temp = temp_temp.dropna().copy()    
# Default value of 'mean': the first-to-last gap; it is overwritten below for
# pairs with more than one reorder.
temp['mean'] = temp['dif']

means = []
# For pairs with more than one reorder: average gap between consecutive entries
# of [first_order] + reorder_order_numbers.
# NOTE(review): the lists keep the row order of orders_products; this presumably
# assumes they are ascending by order_number - TODO confirm.
for row in temp[temp.reorder_order_numbers.map(len) > 1].iterrows():

    # insert() mutates the list object stored in the frame, so the lists of
    # these pairs are one element longer after the loop.
    row[1].reorder_order_numbers.insert(0, row[1].first_order)
    means.append(np.mean(np.array(row[1].reorder_order_numbers)[1:] - np.array(row[1].reorder_order_numbers)[:-1]))    
# The mask is recomputed after the loop; lists that had more than one element
# still do, so it selects the same rows, in the same order as `means`.
temp.loc[temp.reorder_order_numbers.map(len) > 1, 'mean'] = means

temp_temp.reset_index(inplace=True)
temp.reset_index(inplace=True)

temp = temp[['product_id', 'user_id', 'mean',]]
# Left join: pairs without reorders have no 'mean' and are set to 0 by the
# fillna below (which also fills their NaN reorder_order_numbers with 0).
temp_temp = pd.merge(temp_temp, temp, on=['product_id', 'user_id'], how='left')
temp_temp.fillna(value=0, inplace=True)

# Product-level summaries over the users of each product.
prd['prod_average_first_last_orders_dif_mean'] = temp_temp.groupby('product_id')['dif'].mean().astype(np.float32)
prd['prod_average_first_last_orders_dif_meadian'] = temp_temp.groupby('product_id')['dif'].median().astype(np.float32)
prd['prod_reorder_order_since_prior_mean_mean'] = temp_temp.groupby('product_id')['mean'].mean().astype(np.float32)
prd['prod_reorder_order_since_prior_mean_var'] = temp_temp.groupby('product_id')['mean'].var().astype(np.float32)

In [None]:
# Start the (user, product) feature table from the per-pair statistics held in
# temp_temp. The min() aggregation re-indexes the 'mean' and 'dif' columns by
# (user_id, product_id); both are stored as float32.
data = pd.DataFrame()

pair_groups = temp_temp.groupby(['user_id', 'product_id'])
data['up_reorder_order_since_prior_mean'] = pair_groups['mean'].min().astype(np.float32)
data['up_prod_first_last_orders_dif'] = pair_groups['dif'].min().astype(np.float32)

del temp_temp, temp, means, pair_groups
gc.collect()

In [None]:
# Row count (product lines) per aisle. The default as_index=True is used on
# purpose: with as_index=False, SeriesGroupBy.size() returns a Series in old
# pandas but a DataFrame in newer releases, which cannot be assigned to a
# single column.
temp = (
    orders_products.groupby('aisle_id')['order_id']
    .size()
    .rename('aisle_orders')
    .reset_index()
)
# Aisle row count relative to the number of distinct orders.
temp['prod_aisle_market_share'] = temp.aisle_orders / orders_products.order_id.nunique()

# One row per product with its aisle, joined to the aisle statistics.
temp_temp = orders_products.groupby('product_id')['aisle_id'].min().reset_index()
temp_temp = temp_temp.merge(temp, on='aisle_id')

prd['prod_aisle_share_in_orders'] = (
    temp_temp.groupby('product_id')['prod_aisle_market_share'].min().astype(np.float32)
)
prd['prod_aisle_orders'] = temp_temp.groupby('product_id')['aisle_orders'].min().astype(np.int32)

del temp, temp_temp
gc.collect()

In [None]:
# Row count (product lines) per department. The default as_index=True is used
# on purpose: with as_index=False, SeriesGroupBy.size() returns a Series in old
# pandas but a DataFrame in newer releases, which cannot be assigned to a
# single column.
temp = (
    orders_products.groupby('department_id')['order_id']
    .size()
    .rename('department_orders')
    .reset_index()
)
# Department row count relative to the number of distinct orders.
temp['prod_department_market_share'] = temp.department_orders / orders_products.order_id.nunique()

# One row per product with its department, joined to the department statistics.
temp_temp = orders_products.groupby('product_id')['department_id'].min().reset_index()
temp_temp = temp_temp.merge(temp, on='department_id')

prd['prod_department_share_in_orders'] = (
    temp_temp.groupby('product_id')['prod_department_market_share'].min().astype(np.float32)
)
prd['prod_department_orders'] = (
    temp_temp.groupby('product_id')['department_orders'].min().astype(np.int32)
)

del temp, temp_temp
gc.collect()

In [None]:
# The product's row count as a fraction of its aisle's and of its department's
# row count, stored as float32.
prd['prod_share_in_aisle'] = (
    prd['prod_total_orders'] / prd['prod_aisle_orders']
).astype(np.float32)

prd['prod_share_in_department'] = (
    prd['prod_total_orders'] / prd['prod_department_orders']
).astype(np.float32)

In [None]:
# Mean size of the orders in which each product appears. orders_products
# already carries the per-order row count in its order_size column, so it is
# aggregated directly instead of being recomputed and merged back in. This also
# removes the groupby(..., as_index=False)[...].size() call, whose return type
# differs between pandas versions.
prd['prod_mean_order_size'] = (
    orders_products.groupby('product_id')['order_size'].mean().astype(np.float32)
)

gc.collect()

In [None]:
# Any product feature that is still missing at this point is set to 0.
prd.fillna(value=0, inplace=True)
gc.collect()

###### User general features

for each user:
* total number of orders (user_orders)
* sum of days_since_prior_order (user_period)
* mean of days_since_prior_order (user_mean_days_since_prior)
* total number of ordered products (user_total_products)
* user_reorder_ratio = sum(user_reordered_products) / (total number of products without first order)
* total number of ordered unique products (user_distinct_products)
* user max reorder rate (from me)
* user mean reorder rate (from me)
* user average reorder (from me)
* user average hour of order (from me)
* user average order dow (from me)


In [None]:
# User-level feature table. It starts empty; the first column assigned to it
# comes from a groupby on user_id, which gives the frame its index.
users = pd.DataFrame()

In [None]:
# User-level aggregates over all rows of orders_products and over the rows
# flagged as reorders. The two groupers are built once and reused, instead of
# re-filtering the reorder rows and re-grouping the frame for every feature.
# Users absent from the reorder subset come out as NaN after index alignment,
# which is why every reorder feature is filled before its final cast. Filled
# columns are assigned back explicitly rather than using fillna(inplace=True)
# on an attribute, which is not guaranteed to modify the frame.
by_user = orders_products.groupby('user_id')
reorders_by_user = orders_products[orders_products.reordered > 0].groupby('user_id')

# --- all rows ---------------------------------------------------------------
users['user_orders'] = by_user['order_number'].max().astype(np.int32)
users['user_period'] = by_user['days_since_prior_order'].sum().astype(np.int32)
users['user_days_since_prior_mean'] = by_user['days_since_prior_order'].mean().astype(np.float32)
users['user_days_since_prior_median'] = by_user['days_since_prior_order'].median().astype(np.int32)

users['user_total_products'] = by_user['user_id'].size().astype(np.int32)
users['user_distinct_products'] = by_user['product_id'].nunique().astype(np.int32)
users['user_distinct_aisles'] = by_user['aisle_id'].nunique().astype(np.int32)
users['user_distinct_departments'] = by_user['department_id'].nunique().astype(np.int32)
users['user_mean_order_hour_of_day'] = by_user['order_hour_of_day'].mean().astype(np.float32)
users['user_mean_order_dow'] = by_user['order_dow'].mean().astype(np.float32)

# --- reorder rows only ------------------------------------------------------
users['user_reorder_mean_add_to_cart_order'] = reorders_by_user['add_to_cart_order'].mean()
users['user_reorder_mean_add_to_cart_order'] = (
    users['user_reorder_mean_add_to_cart_order'].fillna(9999).astype(np.float32)
)

users['user_reorder_median_add_to_cart_order'] = reorders_by_user['add_to_cart_order'].median()
users['user_reorder_median_add_to_cart_order'] = (
    users['user_reorder_median_add_to_cart_order'].fillna(0).astype(np.int32)
)

users['user_reoreder_days_since_prior_mean'] = reorders_by_user['days_since_prior_order'].mean()
users['user_reoreder_days_since_prior_mean'] = (
    users['user_reoreder_days_since_prior_mean'].fillna(9999).astype(np.float32)
)

users['user_reorder_total_products'] = reorders_by_user['user_id'].size().astype(np.int32)
users['user_reorder_total_products'] = (
    users['user_reorder_total_products'].fillna(0).astype(np.int32)
)

users['user_reorder_distinct_products'] = reorders_by_user['product_id'].nunique().astype(np.int32)
users['user_reorder_distinct_products'] = (
    users['user_reorder_distinct_products'].fillna(0).astype(np.int32)
)

users['user_reorder_distinct_aisles'] = reorders_by_user['aisle_id'].nunique().astype(np.int32)
users['user_reorder_distinct_aisles'] = (
    users['user_reorder_distinct_aisles'].fillna(0).astype(np.int32)
)

users['user_reorder_distinct_departments'] = reorders_by_user['department_id'].nunique().astype(np.int32)
users['user_reorder_distinct_departments'] = (
    users['user_reorder_distinct_departments'].fillna(0).astype(np.int32)
)

# users['user_mean_f1_score'] = orders_products.groupby('user_id')['average_f1_score'].mean().astype(np.float32)

# --- ratios -----------------------------------------------------------------
users['user_total_reorders'] = by_user['reordered'].sum()
users['user_total_reorders'] = users['user_total_reorders'].fillna(0).astype(np.int32)

users['user_reorder_general_ratio'] = (
    users.user_total_reorders / users.user_total_products
).astype(np.float32)

users['user_general_average_basket'] = (
    users.user_total_products / users.user_orders
).astype(np.float32)

del by_user, reorders_by_user

In [None]:
# Reorder ratio of each order (reordered rows / order_size), summarised per
# user. The frame keeps one row per product line, so every order enters the
# user's mean / median once per line it contains.
order_lines = orders_products[['user_id', 'order_id', 'order_size', 'reordered']]
reorders_per_order = (
    order_lines.groupby('order_id')['reordered']
    .sum()
    .rename('order_reorder')
    .reset_index()
)
order_lines = pd.merge(order_lines, reorders_per_order, on='order_id')

del reorders_per_order
gc.collect()

order_lines['order_reorder_ratio'] = order_lines['order_reorder'] / order_lines['order_size']
ratio_by_user = order_lines.groupby('user_id')['order_reorder_ratio']
users['user_order_reorder_ratio_mean'] = ratio_by_user.mean().astype(np.float32)
users['user_order_reorder_ratio_median'] = ratio_by_user.median().astype(np.float32)

del order_lines, ratio_by_user
gc.collect()

In [None]:
# One row per order. drop_duplicates() is applied as a regular (non in-place)
# call: the in-place form would operate on a column slice of orders_products,
# which pandas only tolerates with a SettingWithCopy warning.
temp = orders_products[['user_id', 'order_id', 'order_number', 'order_size']].drop_duplicates()

# Highest order_number of each user, attached to every one of their orders.
temp_temp = (
    temp.groupby('user_id')['order_number']
    .max()
    .rename('user_orders')
    .reset_index()
)
temp = pd.merge(temp_temp, temp, on='user_id')

del temp_temp
gc.collect()

# Order size scaled by order_number / user_orders, so later orders weigh more;
# the feature is the per-user mean of the scaled sizes.
temp['weighted_order_size'] = temp.order_number / temp.user_orders * temp.order_size
users['user_weighted_average_basket'] = (
    temp.groupby('user_id')['weighted_order_size'].mean().astype(np.float32)
)

del temp
gc.collect()

###### Joining test and train orders data

In [None]:
# Attach the non-prior rows of orders (eval_set other than 'prior') to the user
# features, matching on user_id.
non_prior_columns = [
    'user_id', 'order_id', 'eval_set', 'days_since_prior_order',
    'order_dow', 'order_hour_of_day', 'order_number',
]
non_prior_orders = orders[orders['eval_set'] != 'prior'][non_prior_columns].set_index('user_id')

users = users.join(non_prior_orders)
users['days_since_prior_order'] = users['days_since_prior_order'].astype(np.int32)

del non_prior_orders, non_prior_columns
gc.collect()

###### Database for training

for each user + product pair:
* total number of orders by user ordered the product (up_orders)
* first order number of the product for user (up_first_order)
* last order number of the product for user (up_last_order)
* mean add_to_cart_order of product for user (up_average_cart_position)
* up_order_rate = up_orders / user_orders
* up_orders_since_last_order = user_orders - up_last_order
* up_order_rate_since_first_order = up_orders/(user_orders - up_first_order + 1)
* up_sum_cart_position (from me)
* up_reorder_frequency (from me)
* up_product_score (from me)

In [None]:
# data = pd.DataFrame()

In [None]:
# Two further encodings of the cart position: the number of lines left in the
# order after this one, and the position as a fraction of the order size.
orders_products['add_to_cart_order_inverted'] = (
    orders_products['order_size'] - orders_products['add_to_cart_order']
)
orders_products['add_to_cart_order_relative'] = (
    orders_products['add_to_cart_order'] / orders_products['order_size']
)

In [None]:
# Aggregates per (user_id, product_id) pair. The grouper is built once and
# reused for every feature, and the order_hour_of_day features are computed a
# single time (they were previously computed twice, with identical results).
by_pair = orders_products.groupby(['user_id', 'product_id'])

# --- order counts and order numbers ----------------------------------------
data['up_orders'] = by_pair['order_id'].size().astype(np.int32)
data['up_first_order'] = by_pair['order_number'].min().astype(np.int32)
data['up_last_order'] = by_pair['order_number'].max().astype(np.int32)
# Pairs without a reorder row are missing from the filtered result and get 0.
# The filled column is assigned back explicitly; fillna(inplace=True) through
# attribute access is not guaranteed to modify the frame.
data['up_last_reorder_order'] = (
    orders_products[orders_products.reordered > 0]
    .groupby(['user_id', 'product_id'])['order_number']
    .max()
)
data['up_last_reorder_order'] = data['up_last_reorder_order'].fillna(0).astype(np.int32)

# --- cart position ----------------------------------------------------------
data['up_mean_cart_position'] = by_pair['add_to_cart_order'].mean().astype(np.float32)
data['up_median_cart_position'] = by_pair['add_to_cart_order'].median().astype(np.int32)
data['up_sum_cart_position'] = by_pair['add_to_cart_order'].sum().astype(np.int32)

# --- days since prior order -------------------------------------------------
data['up_days_since_prior_order_mean'] = by_pair['days_since_prior_order'].mean().astype(np.float32)
data['up_days_since_prior_order_median'] = by_pair['days_since_prior_order'].median().astype(np.float32)
data['up_days_since_prior_order_sum'] = by_pair['days_since_prior_order'].sum().astype(np.float32)

# --- day of week / hour of day ----------------------------------------------
data['up_order_dow_mean'] = by_pair['order_dow'].mean().astype(np.float32)
data['up_order_dow_median'] = by_pair['order_dow'].median().astype(np.int32)

data['up_order_hour_of_day_mean'] = by_pair['order_hour_of_day'].mean().astype(np.float32)
data['up_order_hour_of_day_median'] = by_pair['order_hour_of_day'].median().astype(np.int32)

# --- alternative cart position encodings ------------------------------------
data['up_add_to_cart_order_inverted_mean'] = by_pair['add_to_cart_order_inverted'].mean().astype(np.float32)
data['up_add_to_cart_order_inverted_median'] = by_pair['add_to_cart_order_inverted'].median().astype(np.int32)

data['up_add_to_cart_order_relative_mean'] = by_pair['add_to_cart_order_relative'].mean().astype(np.float32)
data['up_add_to_cart_order_relative_median'] = by_pair['add_to_cart_order_relative'].median().astype(np.float32)

# --- reorders ---------------------------------------------------------------
data['up_reorder_sum'] = by_pair['reordered'].sum().astype(np.int32)
# data['up_average_f1_score'] = orders_products.groupby(['user_id', 'product_id'])['average_f1_score'].mean().astype(np.float32)
data['up_order_number_mean'] = by_pair['order_number'].mean().astype(np.float32)

del by_pair

In [None]:
# Mean order number of the pair divided by the user's highest order number:
# values near 1 mean the product shows up mostly in the user's later orders.
user_max_order = orders_products.groupby(['user_id'])['order_number'].max().reset_index()
pair_means = data[['up_order_number_mean']].reset_index()
# user_id is the only column the two frames share, so it is the join key.
pair_means = pd.merge(pair_means, user_max_order, on='user_id')
pair_means['up_product_order_number_skew'] = (
    pair_means['up_order_number_mean'] / pair_means['order_number']
)

data['up_product_order_number_skew'] = (
    pair_means.groupby(['user_id', 'product_id'])['up_product_order_number_skew']
    .min()
    .astype(np.float32)
)

del user_max_order, pair_means
gc.collect()

In [None]:
# Share of the user's reorders that fall on each product.
temp = data[['up_reorder_sum']].reset_index()

# Total reorders of each user, joined back on the explicit key.
temp_temp = (
    temp.groupby('user_id')['up_reorder_sum']
    .sum()
    .rename('user_reorders')
    .reset_index()
)
temp = pd.merge(temp, temp_temp, on='user_id')

del temp_temp
gc.collect()

temp['up_reorder_share_in_user'] = temp.up_reorder_sum / temp.user_reorders
data['up_reorder_share_in_user'] = (
    temp.groupby(['user_id', 'product_id'])['up_reorder_share_in_user'].min().astype(np.float32)
)
# Users without any reorder produce 0 / 0 = NaN; treat their share as 0. The
# filled column is assigned back explicitly because fillna(inplace=True) through
# attribute access is not guaranteed to modify the frame.
data['up_reorder_share_in_user'] = data['up_reorder_share_in_user'].fillna(0)

del temp
gc.collect()

In [None]:
# Work on an explicit copy: the columns added and overwritten below must not be
# written onto a slice of orders_products.
temp = orders_products[
    ['user_id', 'product_id', 'order_number', 'days_since_prior_order', 'add_to_cart_order']
].copy()
# NOTE(review): days_since_prior_order is filled with 9999 earlier in the
# notebook, so this fill with 1 looks like a no-op - confirm which value the
# score is meant to use for rows without a previous order.
temp = temp.fillna(value=1)
# Highest order number of the user, repeated on every row. The aggregation is
# named by string rather than by passing the builtin max.
temp['num_orders'] = temp.groupby(['user_id'])['order_number'].transform('max')
# Sixth roots dampen the influence of both quantities.
temp['days_since_prior_order'] = np.power(temp['days_since_prior_order'], 1/6)
temp['add_to_cart_order'] = np.power(temp['add_to_cart_order'], 1/6)
# Row score: grows with order_number relative to the user's highest order
# number and with the gap since the prior order, shrinks with cart position.
temp['up_product_score'] = (temp['order_number'] * temp['days_since_prior_order']
                             / (temp['add_to_cart_order'] * temp['num_orders']))

# The pair score is the sum of its row scores.
data['up_product_score'] = temp.groupby(['user_id', 'product_id'])['up_product_score'].sum().astype(np.float32)

del temp
gc.collect()

In [None]:
# Per (user, department): number of distinct products and number of reorder rows.
by_user_department = orders_products.groupby(['user_id', 'department_id'])
dep = pd.DataFrame({
    'dep_products': by_user_department['product_id'].nunique(),
    'dep_reordered': by_user_department['reordered'].sum(),
}).reset_index()
dep = dep.fillna(value=0)
for count_column in ('dep_products', 'dep_reordered'):
    dep[count_column] = dep[count_column].astype(np.int32)

del by_user_department, count_column

In [None]:
# Per (user, aisle): number of distinct products and number of reorder rows.
by_user_aisle = orders_products.groupby(['user_id', 'aisle_id'])
ais = pd.DataFrame({
    'ais_products': by_user_aisle['product_id'].nunique(),
    'ais_reordered': by_user_aisle['reordered'].sum(),
}).reset_index()
ais = ais.fillna(value=0)
for count_column in ('ais_products', 'ais_reordered'):
    ais[count_column] = ais[count_column].astype(np.int32)

del by_user_aisle, count_column

In [None]:
# The raw order tables are not referenced again below; release them before the merges.
del orders_products, orders
gc.collect()

###### Joining data with prd and users, dep and ais

In [None]:
# Move the index of each feature table back into ordinary columns so it can be
# used as a merge key.
data = data.reset_index()
prd = prd.reset_index()
users = users.reset_index()

In [None]:
# Inner-join the product, user, user-department and user-aisle features onto
# the (user, product) table. Each source frame is released right after its
# merge to keep peak memory down.
data = data.merge(prd, on='product_id')
del prd
gc.collect()

data = data.merge(users, on='user_id')
del users
gc.collect()

data = data.merge(dep, on=['user_id', 'department_id'])
del dep
gc.collect()

data = data.merge(ais, on=['user_id', 'aisle_id'])
del ais
gc.collect()

In [None]:
# Features derived from the merged pair, user and product columns.

# NOTE(review): if every purchase after the first one is flagged as a reorder,
# up_reorder_sum + 1 equals up_orders and this ratio is constant - confirm.
data['user_product_reordered_ratio'] = (
    (data.up_reorder_sum + 1.0) / data.up_orders
).astype(np.float32)

# Fraction of the user's orders that contain the product.
data['up_order_rate'] = (data.up_orders / data.user_orders).astype(np.float32)

# Number of orders the user placed after last buying the product. This is
# user_orders - up_last_order, as stated in the feature list; subtracting from
# up_orders (the pair's own order count) gave meaningless, mostly negative values.
data['up_orders_since_last_order'] = (data.user_orders - data.up_last_order).astype(np.int32)

# Order rate counted from the first order that contained the product.
data['up_order_rate_since_first_order'] = (
    data.up_orders / (data.user_orders - data.up_first_order + 1)
).astype(np.float32)

# order_number is the number of the joined train/test order.
data['up_orders_since_last_reorder_order'] = (
    data.order_number - data.up_last_reorder_order
).astype(np.float32)

# Difference between the user-level and the pair-level sums of days_since_prior_order.
data['up_days_since_last_order'] = (
    data.user_period - data.up_days_since_prior_order_sum
).astype(np.float32)

# The same difference expressed in hours, plus the hour of the joined order.
data['up_hours_since_last_order'] = (
    (data.user_period - data.up_days_since_prior_order_sum) * 24 + data.order_hour_of_day
).astype(np.float32)

# data['user_mean_f1_average_basket'] = data.user_mean_f1_score * data.user_general_average_basket
# data['user_mean_f1_average_basket'] = data['user_mean_f1_average_basket'].astype(np.float32)

In [None]:
# Left-join the train rows onto the pairs to bring in the 'reordered' label;
# pairs without a matching train row keep a missing label.
train_labels = ordert[['user_id', 'product_id', 'reordered']]
data = data.merge(train_labels, how='left', on=['user_id', 'product_id'])

del ordert, train_labels
gc.collect()

In [None]:
# Print the name of every column that still contains missing values.
for column in data.columns.values:
    # The result of .any() is used directly as the condition; comparing it
    # with == True is redundant.
    if data[column].isnull().any():
        print(column)

In [None]:
# Pairs without a matching train row get label 0. The filled column is assigned
# back explicitly: fillna(inplace=True) through attribute access works on an
# intermediate object and is not guaranteed to modify the frame.
data['reordered'] = data['reordered'].fillna(0).astype(np.int16)
gc.collect()

###### Train and Test Datasets

In [None]:
# Train set: rows whose joined order has eval_set 'train'; the label column
# 'reordered' is kept. The column list and the pickled frame are written to `path`.
is_train = data['eval_set'] == 'train'
train = data[is_train].drop(['eval_set'], axis=1)
train_columns = train.columns

np.savetxt(path + 'xgb_train_columns_4.csv', train_columns, delimiter=",", fmt='%s', header='')
train.to_pickle(path + 'xgb_train_features_4')

del train, is_train
gc.collect()

# Test set: rows whose joined order has eval_set 'test'; the label column is
# dropped together with eval_set.
is_test = data['eval_set'] == 'test'
test = data[is_test].drop(['eval_set', 'reordered'], axis=1)
test_columns = test.columns

np.savetxt(path + 'xgb_test_columns_4.csv', test_columns, delimiter=",", fmt='%s', header='')
test.to_pickle(path + 'xgb_test_features_4')

del data, is_test
gc.collect()