### 1. Importing packages

In [None]:
import pandas as pd
import numpy as np
import gc
import warnings
import matplotlib.pyplot as plt
import time

%matplotlib inline
warnings.filterwarnings("ignore")

In [None]:
# Force a garbage-collection pass before any data is loaded.
gc.collect()

### 2. Features creation

###### Loading initial data

In [None]:
# Directory holding the input CSVs. File names are appended with plain string
# concatenation below, so the value must end with a path separator.
path = 'C:/Users/AM000110/Desktop/ML/Kaggle/Instacart_Market_Basket_Analysis/input_data/'

In [None]:
# Load Data ---------------------------------------------------------------
# The six raw tables are read from `path` in a fixed order and bound to the
# names the rest of the notebook expects.
aisles, departments, orderp, ordert, orders, products = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'aisles.csv',
        'departments.csv',
        'order_products__prior.csv',
        'order_products__train.csv',
        'orders.csv',
        'products.csv',
    )
)

###### Joining to product aisle and department names

In [None]:
# Inner-join products with the aisle and department tables, then keep only
# the three id columns used downstream.
products = (
    products
    .merge(aisles, on='aisle_id')
    .merge(departments, on='department_id')
    [['product_id', 'aisle_id', 'department_id']]
)

del aisles, departments
gc.collect()

###### Adding user_ids to order_train

In [None]:
# Attach user_id to every train order line by joining on order_id.
orders_users = orders[['order_id', 'user_id']]
orders_users = orders_users[orders_users['order_id'].isin(ordert.order_id.values)]
ordert = pd.merge(ordert, orders_users, on='order_id')

# The lookup table is only needed for the merge above. The original line was a
# bare `orders_users` expression, which neither displays nor frees anything.
del orders_users
gc.collect()

###### Joining orders data with prior data

In [None]:
# One row per (order, product) of the prior set, enriched with the order
# attributes and the product's aisle/department ids (both joins are inner).
orders_products = (
    orders
    .merge(orderp, on='order_id')
    .merge(products, on='product_id')
)

del orderp, products
gc.collect()

In [None]:
# Attach the per-order average F1 score produced by another notebook.
# The merge is an inner join, so order lines whose order_id is missing from
# f1_score.csv are dropped here.
f1_score = pd.read_csv(
    'C:/Users/AM000110/Desktop/ML/Kaggle/Instacart_Market_Basket_Analysis/reordered_basket_size_prediction/f1_score.csv'
)[['order_id', 'average_f1_score']]

orders_products = orders_products.merge(f1_score, on='order_id')
del f1_score
gc.collect()

In [None]:
# Highest order_number seen for each user, attached to every order line.
temp = (
    orders_products
    .groupby('user_id', as_index=False)['order_number']
    .max()
    .rename(columns={'order_number': 'max_order_number'})
)

orders_products = orders_products.merge(temp, on='user_id')
del temp
gc.collect()

In [None]:
# dif == 0 marks the lines of each user's highest-numbered order; those lines
# become prior_train (with days_since_prior_order renamed).
orders_products['dif'] = orders_products['max_order_number'] - orders_products['order_number']
prior_train = orders_products.loc[
    orders_products['dif'] == 0,
    ['user_id', 'order_id', 'product_id', 'reordered',
     'days_since_prior_order', 'order_dow', 'order_hour_of_day'],
].rename(columns={'days_since_prior_order': 'time_since_last_order'})

In [None]:
# Keep only lines from orders that precede each user's highest-numbered order;
# all features below are computed from this reduced history.
orders_products = orders_products[orders_products.dif > 0]
gc.collect()

###### Product general features

for each product:
* total number of orders (prod_orders)
* sum of reorders (prod_reorders)
* number of unique users who buy it (prod_first_orders)
* number of unique users who buy it more than 1 time (prod_second_orders)
* product reorder probability = prod_second_orders / prod_first_orders
* prod_reorder_times = 1 + prod_reorders / prod_first_orders
* prod_reorder_ratio = prod_reorders / prod_orders
* prod_average_order_hour_of_day (from me)
* prod_average_order_dow (from me)
* prod_frequency = prod_orders / total number of orders (from me)
* prod_reorder_frequency = prod_reorders / sum of total reorders (from me)
* prod_most_frequent_reorder_day (from me)
* product second reorder probability (from me)
* product aisle market share = aisle_orders / total_orders (from me)
* product department market share = department orders / total orders (from me)
* product market share in aisle = prod_orders / aisle_orders (from me)
* product market share in department = prod_orders / department_orders (from me)
* product order mean size (from me)
* product mean share within order (from me)
* product reorder std within users

In [None]:
# Product-level feature table. It starts empty; the first column assigned
# below gives it its product_id index.
prd = pd.DataFrame()

In [None]:
# Per-product aggregates over all order lines (by_product) and over the
# reordered lines only (reordered_by_product).
by_product = orders_products.groupby('product_id')
reordered_by_product = orders_products[orders_products.reordered > 0].groupby('product_id')

prd['prod_orders'] = by_product['order_id'].size()
prd['prod_reorders'] = by_product['reordered'].sum()
prd['prod_first_orders'] = by_product['user_id'].nunique()
prd['aisle_id'] = by_product['aisle_id'].min().astype(int)
prd['department_id'] = by_product['department_id'].min().astype(int)
prd['prod_average_order_hour_of_day'] = by_product['order_hour_of_day'].mean()
prd['prod_average_order_dow'] = by_product['order_dow'].mean()
prd['prod_period'] = by_product['days_since_prior_order'].sum()
# prd['prod_average_f1_score'] = orders_products.groupby('product_id')['average_f1_score'].mean()
# prd['prod_max_f1_score'] = orders_products.groupby('product_id')['average_f1_score'].max()
# prd['prod_std_f1_score'] = orders_products.groupby('product_id')['average_f1_score'].std()
prd['prod_reoreder_mean_days'] = reordered_by_product['days_since_prior_order'].mean()
prd['prod_reoreder_mean_hours'] = reordered_by_product['order_hour_of_day'].mean()
prd['prod_reoreder_mean_dow'] = reordered_by_product['order_dow'].mean()
prd['prod_mean_order_number'] = by_product['order_number'].mean()
prd['prod_reorder_mean_order_number'] = reordered_by_product['order_number'].mean()
prd['prod_reorder_user_share'] = (
    reordered_by_product['user_id'].nunique() / orders_products.user_id.nunique()
)
prd['prod_reorder_mean_add_to_cart_order'] = reordered_by_product['add_to_cart_order'].mean()

# Drop the helper groupers so they do not keep the large frames alive.
del by_product, reordered_by_product

In [None]:
# Number of order lines per (product, user) pair.
# The previous version wrapped value_counts() in a DataFrame and renamed the
# 'user_id' column, which only works while pandas names the counts after the
# counted column; size() gives the same counts regardless of pandas version.
temp = orders_products.groupby(['product_id', 'user_id']).size()

# Users who bought the product more than once / more than twice.
prd['prod_second_orders'] = temp[temp > 1].groupby(level='product_id').size()
prd['prod_third_orders'] = temp[temp > 2].groupby(level='product_id').size()

del temp
gc.collect()

In [None]:
# Ratios derived from the per-product counts; the two global totals are
# computed once up front.
_n_distinct_orders = len(np.unique(orders_products.order_id.values))
_n_reordered_lines = orders_products.reordered.sum()

prd['prod_reorder_probability'] = prd['prod_second_orders'] / prd['prod_first_orders']
prd['prod_reorder_times'] = 1 + prd['prod_reorders'] / prd['prod_first_orders']
prd['prod_reorder_ratio'] = prd['prod_reorders'] / prd['prod_orders']
prd['prod_second_reorder_probability'] = prd['prod_third_orders'] / prd['prod_second_orders']
prd['prod_frequency'] = prd['prod_orders'] / _n_distinct_orders
prd['prod_reorder_frequency'] = prd['prod_reorders'] / _n_reordered_lines

del _n_distinct_orders, _n_reordered_lines

In [None]:
# Most frequent day of week among a product's reordered lines
# (ties are resolved in favour of the smallest order_dow).
temp = orders_products[orders_products['reordered'] != 0]

# Naming the counts explicitly before reset_index() avoids depending on the
# name pandas gives the value_counts() result (it changed in pandas 2.0).
temp = (
    temp.groupby('product_id')['order_dow']
    .value_counts()
    .rename('order_dow_counts')
    .reset_index()
)
temp['max'] = temp.groupby('product_id')['order_dow_counts'].transform('max')
temp = temp[temp['order_dow_counts'] == temp['max']]
temp = (
    temp.groupby('product_id')[['order_dow']]
    .min()
    .rename(columns={'order_dow': 'prod_most_frequent_reroder_order_dow'})
)

prd = prd.join(temp)

del temp
gc.collect()

In [None]:
# Most frequent hour of day among a product's reordered lines
# (ties are resolved in favour of the smallest hour).
temp = orders_products[orders_products['reordered'] != 0]

# Naming the counts explicitly before reset_index() avoids depending on the
# name pandas gives the value_counts() result (it changed in pandas 2.0).
temp = (
    temp.groupby('product_id')['order_hour_of_day']
    .value_counts()
    .rename('order_hour_of_day_counts')
    .reset_index()
)
# BUG FIX: the per-product maximum must be taken over the *counts*. The old
# code took the max of the hour values themselves, so the rows kept were those
# whose count happened to equal the latest hour, not the most frequent hour.
temp['max'] = temp.groupby('product_id')['order_hour_of_day_counts'].transform('max')
temp = temp[temp['order_hour_of_day_counts'] == temp['max']]
temp = (
    temp.groupby('product_id')[['order_hour_of_day']]
    .min()
    .rename(columns={'order_hour_of_day': 'prod_most_frequent_reroder_order_hour_of_day'})
)

prd = prd.join(temp)

del temp
gc.collect()

In [None]:
# For every (product, user) pair: first and last order_number, their
# difference, and the list of order_numbers on which the product was a reorder
# (NaN for pairs that have no reordered line).
temp = orders_products[['product_id','user_id', 'order_id', 'order_number', 'reordered']]

temp_temp = (
    temp.groupby(['product_id', 'user_id'])['order_number']
    .agg(['min', 'max'])
    .rename(columns={'min': 'first_order', 'max': 'last_order'})
)
temp_temp['dif'] = temp_temp['last_order'] - temp_temp['first_order']
temp_temp['reorder_order_numbers'] = (
    temp[temp.reordered > 0]
    .groupby(['product_id', 'user_id'])['order_number']
    .apply(list)
)

del temp
gc.collect()

In [None]:
# Pairs with at least one reorder: dropna() removes pairs whose
# reorder_order_numbers is NaN.
a = temp_temp.dropna().copy()    
# Default mean gap = last_order - first_order; it is overwritten further down
# for pairs that have more than one reorder.
a['mean'] = a['dif']

In [None]:
# Mean gap (in order numbers) between consecutive purchases of a product by a
# user, for pairs with more than one reorder. The sequence is the first order
# followed by every reorder order_number.
#
# Changes versus the previous loop:
#   * first_order is prepended to a *new* list, not insert()-ed into the list
#     stored in the frame, so re-running the cell no longer prepends it again;
#   * the selection mask is computed once and reused for the assignment;
#   * the unused counter `i` is gone.
_multi_reorder = a.reorder_order_numbers.map(len) > 1

t = time.time()
means = [
    np.mean(np.diff([first_order] + order_numbers))
    for first_order, order_numbers in zip(
        a.loc[_multi_reorder, 'first_order'],
        a.loc[_multi_reorder, 'reorder_order_numbers'],
    )
]

print(time.time() - t)
a.loc[_multi_reorder, 'mean'] = means

del _multi_reorder

In [None]:
# Bring the mean gap back onto the full pair table; pairs without a reorder
# have no match in `a` and end up with 0 after the fill.
temp_temp = temp_temp.reset_index()
a = a.reset_index()[['product_id', 'user_id', 'mean']]

temp_temp = (
    temp_temp
    .merge(a, on=['product_id', 'user_id'], how='left')
    .fillna(value=0)
)

In [None]:
# Per-product averages of the pair-level span (dif) and mean reorder gap.
_pair_means = temp_temp.groupby('product_id')[['dif', 'mean']].mean()
prd['prod_average_first_last_orders_dif'] = _pair_means['dif']
prd['prod_reorder_order_since_prior_mean'] = _pair_means['mean']
del _pair_means

In [None]:
# User x product feature table. This first assignment gives it a
# (user_id, product_id) MultiIndex, which the later `data[...] = groupby(...)`
# assignments align on.
data = pd.DataFrame()
# Each (user, product) pair occurs once in temp_temp, so min() just carries the
# pair's mean gap over under the new index order.
data['up_reorder_order_since_prior_mean'] = temp_temp.groupby(['user_id', 'product_id'])['mean'].min()

In [None]:
# Intentionally empty.
# This cell used to call temp.reset_index() and merge `temp` into `data`, but
# `temp` has already been deleted by an earlier cell, so the cell raised
# NameError when the notebook was run top to bottom. The merge would also have
# replaced data's (user_id, product_id) index with a plain range index, which
# the later `data[...] = groupby(...)` assignments need for alignment.

In [None]:
# Release the pair-level temporaries used for the reorder-gap features.
del temp_temp, a, means
gc.collect()

In [None]:
# Aisle reorder probability = users with a reordered line in the aisle
# divided by users with any line in the aisle.
temp = orders_products[['user_id', 'aisle_id', 'reordered']]

_aisle_users = temp.groupby('aisle_id')['user_id'].nunique()
_aisle_reorder_users = (
    temp[temp.reordered > 0]
    .groupby('aisle_id')['user_id']
    .nunique()
    .reindex(_aisle_users.index)
)
temp_temp = (
    (_aisle_reorder_users / _aisle_users)
    .to_frame('aisle_reorder_probability')
    .reset_index()
)

del temp, _aisle_users, _aisle_reorder_users
gc.collect()

In [None]:
# Map each product to its aisle's reorder probability.
temp = (
    orders_products[['product_id', 'aisle_id']]
    .drop_duplicates()
    .merge(temp_temp, on='aisle_id', how='left')
)
prd['prod_aisle_reorder_probability'] = (
    temp.groupby('product_id')['aisle_reorder_probability'].min()
)

del temp
gc.collect()

In [None]:
# Department reorder probability = users with a reordered line in the
# department divided by users with any line in the department.
temp = orders_products[['user_id', 'department_id', 'reordered']]

_dept_users = temp.groupby('department_id')['user_id'].nunique()
_dept_reorder_users = (
    temp[temp.reordered > 0]
    .groupby('department_id')['user_id']
    .nunique()
    .reindex(_dept_users.index)
)
temp_temp = (
    (_dept_reorder_users / _dept_users)
    .to_frame('department_reorder_probability')
    .reset_index()
)

del temp, _dept_users, _dept_reorder_users
gc.collect()

In [None]:
# Map each product to its department's reorder probability.
temp = (
    orders_products[['product_id', 'department_id']]
    .drop_duplicates()
    .merge(temp_temp, on='department_id', how='left')
)
prd['prod_department_reorder_probability'] = (
    temp.groupby('product_id')['department_reorder_probability'].min()
)

del temp
gc.collect()

In [None]:
# Aisle-level volume and its share of all orders, mapped onto each product.
# NOTE(review): aisle_orders counts order *lines* while the denominator counts
# distinct orders, so the "share" can exceed 1 - confirm this is intended.
temp = pd.DataFrame()
# size() is called on a plain groupby (no as_index=False): with as_index=False
# recent pandas returns a two-column DataFrame, which cannot be assigned to a
# single column.
temp['aisle_orders'] = orders_products.groupby('aisle_id')['order_id'].size()
temp['prod_aisle_market_share'] = temp.aisle_orders / len(np.unique(orders_products.order_id.values))
temp.reset_index(inplace = True)

# product_id -> aisle_id lookup, then attach the aisle figures.
temp_temp = pd.DataFrame(orders_products.groupby('product_id')['aisle_id'].min())
temp_temp.reset_index(inplace=True)
temp_temp = temp_temp.merge(temp, on='aisle_id')

prd['prod_aisle_market_share']  = temp_temp.groupby('product_id')['prod_aisle_market_share'].min()
prd['prod_aisle_orders']  = temp_temp.groupby('product_id')['aisle_orders'].min()

del temp, temp_temp
gc.collect()

In [None]:
# Department-level volume and its share of all orders, mapped onto each product.
# NOTE(review): department_orders counts order *lines* while the denominator
# counts distinct orders, so the "share" can exceed 1 - confirm this is intended.
temp = pd.DataFrame()
# size() is called on a plain groupby (no as_index=False): with as_index=False
# recent pandas returns a two-column DataFrame, which cannot be assigned to a
# single column.
temp['department_orders'] = orders_products.groupby('department_id')['order_id'].size()
temp['prod_department_market_share'] = temp.department_orders / len(np.unique(orders_products.order_id.values))
temp.reset_index(inplace = True)

# product_id -> department_id lookup, then attach the department figures.
temp_temp = pd.DataFrame(orders_products.groupby('product_id')['department_id'].min())
temp_temp.reset_index(inplace=True)
temp_temp = temp_temp.merge(temp, on='department_id')

prd['prod_department_market_share']  = temp_temp.groupby('product_id')['prod_department_market_share'].min()
prd['prod_department_orders']  = temp_temp.groupby('product_id')['department_orders'].min()

del temp, temp_temp
gc.collect()

In [None]:
# Product's share of the order lines of its own aisle / department.
prd['prod_market_share_in_aisle'] = prd.prod_orders / prd.prod_aisle_orders
prd['prod_market_share_in_department'] = prd.prod_orders / prd.prod_department_orders

In [None]:
# Population std / variance of per-user reorder counts for each product, taken
# over ALL N users: users who never bought the product count as 0.
#
# The previous loop padded every product's list with zeros up to N users and
# called np.std / np.var on it (and read the list via positional row[1][0],
# which newer pandas deprecates). The same quantities are computed here in
# closed form:
#   mean = sum(x) / N
#   var  = (sum over buyers of (x - mean)^2 + (N - n_buyers) * mean^2) / N
# Results agree with the padded computation up to floating-point rounding.
temp = (
    orders_products
    .groupby(['product_id', 'user_id'])['reordered']
    .sum()
    .reset_index()
)
N = orders_products.user_id.nunique()

_by_product = temp.groupby('product_id')['reordered']
_mean_all_users = _by_product.sum() / N
_n_buyers = _by_product.size()

# Squared deviations of the buyers from the all-user mean, summed per product.
_deviation = temp['reordered'] - temp['product_id'].map(_mean_all_users)
_ss_buyers = (_deviation ** 2).groupby(temp['product_id']).sum()

_variance = (_ss_buyers + (N - _n_buyers) * _mean_all_users ** 2) / N

prd['prod_reorder_std'] = np.sqrt(_variance)
prd['prod_reorder_variance'] = _variance

del temp, _by_product, _mean_all_users, _n_buyers, _deviation, _ss_buyers, _variance
gc.collect()

In [None]:
# Size of every order and the share one product line represents in it (1/size),
# averaged per product.
# size() is called on a plain groupby (no as_index=False): with as_index=False
# recent pandas returns a two-column DataFrame, which made the single-name
# `temp.columns = ['order_size']` assignment fail.
temp = orders_products.groupby('order_id')['product_id'].size().to_frame('order_size')
temp['product_share'] = 1 / temp.order_size
temp.reset_index(inplace=True)

temp_temp = orders_products[['order_id', 'product_id']]
temp_temp = pd.merge(temp_temp, temp, on='order_id')

prd['prod_mean_order_size'] = temp_temp.groupby('product_id')['order_size'].mean()
prd['prod_mean_order_share'] = temp_temp.groupby('product_id')['product_share'].mean()

del temp, temp_temp
gc.collect()

In [None]:
# prd.drop(['prod_reorders', 'prod_first_orders', 'prod_second_orders', 'prod_aisle_orders', 'prod_department_orders'], axis=1, inplace=True)

###### User general features

for each user:
* total number of orders (user_orders)
* sum of days_since_prior_order (user_period)
* mean of days_since_prior_order (user_mean_days_since_prior)
* total number of ordered products (user_total_products)
* user_reorder_ratio = sum(user_reordered_products) / (total number of products without first order)
* total number of ordered unique products (user_distinct_products)
* user max reorder rate (from me)
* user mean reorder rate (from me)
* user average reorder (from me)
* user average hour of order (from me)
* user average order dow (from me)


In [None]:
# User-level feature table. It starts empty; the first column assigned below
# gives it its user_id index.
users = pd.DataFrame()

In [None]:
# Per-user aggregates over all order lines (by_user) and over the reordered
# lines only (reordered_by_user).
by_user = orders_products.groupby('user_id')
reordered_by_user = orders_products[orders_products.reordered > 0].groupby('user_id')

users['user_distinct_aisles'] = by_user['aisle_id'].nunique()
users['user_distinct_departments'] = by_user['department_id'].nunique()
users['user_average_order_hour_of_day'] = by_user['order_hour_of_day'].mean()
users['user_average_order_dow'] = by_user['order_dow'].mean()

# users['user_average_f1_score'] = orders_products.groupby('user_id')['average_f1_score'].mean()
# users['user_max_f1_score'] = orders_products.groupby('user_id')['average_f1_score'].max()
# users['user_std_f1_score'] = orders_products.groupby('user_id')['average_f1_score'].std()
users['user_reoreder_mean_days'] = reordered_by_user['days_since_prior_order'].mean()
users['user_reoreder_mean_hours'] = reordered_by_user['order_hour_of_day'].mean()
users['user_reoreder_mean_dow'] = reordered_by_user['order_dow'].mean()

users['user_reorder_distinct_products'] = reordered_by_user['product_id'].nunique()
users['user_reorder_distinct_aisles'] = reordered_by_user['aisle_id'].nunique()
users['user_reorder_distinct_departments'] = reordered_by_user['department_id'].nunique()

# Drop the helper groupers so they do not keep the large frames alive.
del by_user, reordered_by_user


In [None]:
# Order-history features taken from the orders table (prior rows only).
temp = orders.loc[orders['eval_set'] == 'prior'].groupby('user_id')
_gap_days = temp['days_since_prior_order']

users['user_orders'] = temp['order_number'].max()
users['user_period'] = _gap_days.sum()
users['user_mean_days_since_prior'] = _gap_days.mean()

del temp, _gap_days
gc.collect()

In [None]:
# Per-order reorder count and reorder ratio, then aggregated per user.
temp = (
    orders_products
    .groupby(['user_id', 'order_id'])['reordered']
    .sum()
    .reset_index()
)

# Order size = number of product lines in the order. The previous version
# built a Python list of product ids per order just to take its length;
# size() returns the same count directly.
temp_products = (
    orders_products
    .groupby('order_id')['product_id']
    .size()
    .reset_index(name='order_size')
)

temp = pd.merge(temp, temp_products, on = 'order_id')
temp['reorder_ratio'] = temp.reordered / temp.order_size

users['user_max_reorder_rate'] = temp.groupby('user_id')['reorder_ratio'].max()
users['user_mean_reorder_rate'] = temp.groupby('user_id')['reorder_ratio'].mean()
users['user_average_reorder'] = temp.groupby('user_id')['reordered'].mean()

del temp, temp_products
gc.collect()

In [None]:
# Total and distinct product counts per user.
_user_products = orders_products.groupby('user_id')['product_id']

us = pd.DataFrame()
us['user_total_products'] = _user_products.size()
us['user_distinct_products'] = _user_products.nunique()

del _user_products

In [None]:
# Reordered lines divided by the lines bought after the user's first order.
_reordered_lines = orders_products.groupby('user_id')['reordered'].sum()
_lines_after_first = (
    orders_products[orders_products['order_number'] > 1]
    .groupby('user_id')['product_id']
    .size()
)
us['user_reorder_ratio'] = _reordered_lines / _lines_after_first

del _reordered_lines, _lines_after_first
gc.collect()

In [None]:
# Add the product-count columns (index-aligned on user_id) and derive the
# average number of product lines per order.
users = users.join(us)
users['user_average_basket'] = users.user_total_products / users.user_orders

del us
gc.collect()

###### Joining test and train orders data

In [None]:
# Attach each user's non-prior order from the orders table
# (with days_since_prior_order renamed).
temp = (
    orders.loc[
        orders['eval_set'] != 'prior',
        ['user_id', 'order_id', 'eval_set', 'days_since_prior_order', 'order_dow', 'order_hour_of_day'],
    ]
    .rename(columns={'days_since_prior_order' : 'time_since_last_order'})
    .set_index('user_id')
)

users = users.join(temp)

del temp
gc.collect()

###### Database for training

for each user + product pair:
* total number of orders by user ordered the product (up_orders)
* first order number of the product for user (up_first_order)
* last order number of the product for user (up_last_order)
* mean add_to_cart_order of product for user (up_average_cart_position)
* up_order_rate = up_orders / user_orders
* up_orders_since_last_order = user_orders - up_last_order
* up_order_rate_since_first_order = up_orders/(user_orders - up_first_order + 1)
* up_sum_cart_position (from me)
* up_reoreder_frequency (from me)
* up_product_score (from me)

In [None]:
# data = pd.DataFrame()

In [None]:
# Aggregates per (user, product) pair; assignments align on data's
# (user_id, product_id) index.
_by_pair = orders_products.groupby(['user_id', 'product_id'])
_pair_order_number = _by_pair['order_number']
_pair_cart_position = _by_pair['add_to_cart_order']

data['up_orders'] = _pair_order_number.size()
data['up_first_order'] = _pair_order_number.min()
data['up_last_order'] = _pair_order_number.max()
data['up_average_cart_position'] = _pair_cart_position.mean()
data['up_sum_cart_position'] = _pair_cart_position.sum()
data['up_product_period'] = _by_pair['days_since_prior_order'].sum()
data['up_product_sum_order_number'] = _pair_order_number.sum()
# data['up_average_f1_scorer'] = orders_products.groupby(['user_id', 'product_id'])['average_f1_score'].mean()

# Drop the helper groupers so they do not keep orders_products alive.
del _by_pair, _pair_order_number, _pair_cart_position


In [None]:
# Share of a user's reordered lines that belong to each product.
_pair_reorders = orders_products.groupby(['user_id', 'product_id'])['reordered'].sum()
_user_reorders = _pair_reorders.groupby(level='user_id').transform('sum')

temp = (_pair_reorders / _user_reorders).to_frame('up_reoreder_frequency')
data = data.join(temp)

del temp, _pair_reorders, _user_reorders
gc.collect()

In [None]:
# Per-line score, summed per (user, product):
#   reordered * order_number * days^(1/6) / (cart_position^(1/6) * num_orders)
# Missing values (e.g. days_since_prior_order) are replaced by 1 first.
temp = orders_products[
    ['user_id', 'product_id', 'order_number', 'reordered', 'days_since_prior_order', 'add_to_cart_order']
].fillna(value=1)

temp['num_orders'] = temp.groupby('user_id')['order_number'].transform('max')
for _col in ('days_since_prior_order', 'add_to_cart_order'):
    temp[_col] = np.power(temp[_col], 1/6)
del _col

_numerator = temp['reordered'] * temp['order_number'] * temp['days_since_prior_order']
_denominator = temp['add_to_cart_order'] * temp['num_orders']
temp['up_product_score'] = _numerator / _denominator

data['up_product_score'] = temp.groupby(['user_id', 'product_id'])['up_product_score'].sum()

del temp, _numerator, _denominator
gc.collect()

In [None]:
# Same per-line score as up_product_score but without the `reordered` factor:
#   order_number * days^(1/6) / (cart_position^(1/6) * num_orders)
# Missing values are replaced by 1 first.
temp = orders_products[
    ['user_id', 'product_id', 'order_number', 'days_since_prior_order', 'add_to_cart_order']
].fillna(value=1)

temp['num_orders'] = temp.groupby('user_id')['order_number'].transform('max')
for _col in ('days_since_prior_order', 'add_to_cart_order'):
    temp[_col] = np.power(temp[_col], 1/6)
del _col

_numerator = temp['order_number'] * temp['days_since_prior_order']
_denominator = temp['add_to_cart_order'] * temp['num_orders']
temp['up_product_score2'] = _numerator / _denominator

data['up_product_score2'] = temp.groupby(['user_id', 'product_id'])['up_product_score2'].sum()

del temp, _numerator, _denominator
gc.collect()

In [None]:
# Share of a user's order lines that belong to each product.
temp = (
    orders_products
    .groupby(['user_id'])['product_id']
    .size()
    .rename('order_size')      # total lines bought by the user
    .reset_index()
)
temp_temp = (
    orders_products
    .groupby(['user_id', 'product_id'])['order_id']
    .size()                    # lines of this product for the user
    .reset_index()
    .merge(temp, on = 'user_id')
)
temp_temp['up_product_user_share'] = temp_temp['order_id'] / temp_temp['order_size']

# Each pair occurs once, so min() only re-indexes by (user_id, product_id).
data['up_product_user_share'] = (
    temp_temp.groupby(['user_id', 'product_id'])['up_product_user_share'].min()
)

del temp, temp_temp
gc.collect()

In [None]:
# All features derived from the raw order tables are built; release them
# before the wide merges below.
del orders_products, orders
gc.collect()

###### Joining data with prd and users

In [None]:
# Turn the index keys (user_id/product_id, product_id, user_id) into ordinary
# columns so the tables can be merged on them.
data.reset_index(inplace=True)
prd.reset_index(inplace=True)
users.reset_index(inplace=True)

In [None]:
# Widen the pair table with product-level, then user-level features
# (inner joins), freeing each source table as soon as it has been merged.
data = data.merge(prd, on='product_id')

del prd
gc.collect()

data = data.merge(users, on='user_id')

del users
gc.collect()

In [None]:
# Features that combine pair-level and user-level columns.
data['up_order_rate'] = data.up_orders / data.user_orders
data['up_orders_since_last_order'] = data.user_orders - data.up_last_order
data['up_order_rate_since_first_order'] = data.up_orders / (data.user_orders - data.up_first_order + 1)
# NOTE(review): up_product_period is the sum of days_since_prior_order over the
# orders that contain the product, so this difference is not literally
# "days since the product was last ordered" - confirm the intended meaning.
data['up_days_since_last_order'] = data.user_period - data.up_product_period
data['up_product_average_sum_order_numbers'] = data.up_product_sum_order_number / (data.user_orders * data.up_orders)
# data['user_mean_f1_average_basket'] = data.user_average_f1_score * data.user_average_basket

In [None]:
# Left-join the lines of each user's highest-numbered order (prior_train) onto
# the pair table; pairs absent from that order get NaN in its columns.
# NOTE(review): prior_train shares order_id, time_since_last_order, order_dow
# and order_hour_of_day with columns already in `data`, so pandas will suffix
# both sides (_x / _y) - confirm downstream code expects those names.
data = pd.merge(data, prior_train, how='left', on=['user_id', 'product_id'])

del ordert, prior_train
gc.collect()

###### Changing data types for memory optimization

In [None]:
# Header-less two-column CSV: column name and the dtype expression to cast it to.
data_dtypes = pd.read_csv(path + 'xgb_dtype_columns.csv', names=['column', 'dtype'], header=None)

In [None]:
# Replace every missing value with 0, then cast each listed column to its
# target dtype. Columns whose dtype string is 'np.float64' are skipped.
data.fillna(value=0, inplace=True)
for row in data_dtypes.itertuples():
    if row.dtype != 'np.float64':
        # SECURITY NOTE(review): eval() executes whatever text is stored in
        # xgb_dtype_columns.csv. Only run this with a trusted file, or replace
        # it with an explicit lookup of allowed dtype names.
        data[row.column] = data[row.column].astype(eval(row.dtype))
        print(row.column, row.dtype)

###### Saving created features

In [None]:
# Final cleanup of any raw tables that are still around.
# Earlier cells already delete orders, orders_products and ordert, so a plain
# `del` of those names raised NameError here. Removing each name only if it
# still exists makes the cell safe to run at any point.
for _name in ('orders', 'orders_products', 'orders_users', 'ordert'):
    globals().pop(_name, None)
del _name
gc.collect()

In [None]:
# Persist the finished feature table as a pickle next to the input data.
data.to_pickle(path + 'xgb_prior_train')

In [None]:
# train = pd.read_pickle(path + 'xgb_prior_train')

In [None]:
# orders = pd.read_csv(path + 'orders.csv')
# orders = orders[orders.eval_set == 'test']
# test_users = orders.user_id.values

# del orders
# gc.collect()

In [None]:
# test_user_data = train[train.user_id.isin(test_users)][['user_id', 'order_id', 'product_id', 'reordered']]
# test_user_data.to_pickle(path + 'xgb_prior_train_test')

# del test_user_data
# gc.collect()

In [None]:
# # test_user_test = train[train.user_id.isin(test_users)]
# test_user_test.to_pickle(path + 'xgb_prior_train_test_data')

# del test_user_data
# gc.collect()