### Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import random
import gensim
from sklearn.cluster import KMeans
from collections import Counter
%matplotlib inline

In [5]:
import pickle as pkl
scores = pkl.load(open("../Data/prediction_arboretum (1).pkl", 'rb'))

ValueError: unsupported pickle protocol: 4

### Read Data

In [2]:
aisles = pd.read_csv("../Data/aisles.csv", dtype = {"aisle_id" : np.int16})
departments = pd.read_csv("../Data/departments.csv", dtype = {"department_id" : np.int16})
order_products_prior = pd.read_csv("../Data/order_products__prior.csv", dtype = {
                                                    "order_id" : np.int32,
                                                    "product_id" : np.int32,
                                                    "add_to_cart_order" : np.int16,
                                                    "reordered" : np.int16})
order_products_train = pd.read_csv("../Data/order_products__train.csv", dtype = {
                                                    "order_id" : np.int32,
                                                    "product_id" : np.int32,
                                                    "add_to_cart_order" : np.int16,
                                                    "reordered" : np.int16})
orders = pd.read_csv("../Data/orders.csv", dtype = {
                                                    "order_id" : np.int32,
                                                    "user_id" : np.int32,
                                                    "order_number" : np.int16,
                                                    "order_dow" : np.int16,
                                                    "order_hour_of_day" : np.int16,
                                                    "day_since_prior_order" : np.int16})
products = pd.read_csv("../Data/products.csv", dtype = {
                                                    "aisle_id" : np.int16,
                                                    "product_id" : np.int32,
                                                    "department_id" : np.int16})
# aisles_map = {row.aisle_id:row.aisle for row in aisles.itertuples()}
# departments_map = {row.department_id:row.department for row in departments.itertuples()}

### Enhance Products data
 - Add 'None' as a new product, we will add this product to every order which has no reorderes
 - Add additional product metadata variables extracted from word2vec clusters

In [3]:
products = pd.read_csv("../Data/products.csv")
products = pd.DataFrame({"product_name" : ["None"], "product_id" : [0], "aisle_id" : [0], "department_id" : [0]}
                       ).append(products).reset_index(drop = True)
products['product_name_length'] = products.product_name.map(lambda x : len(x.split()))
products['is_organic'] = products.product_name.map(lambda x : 1.0 if 'organic' in x.lower() else 0.0)
products['is_gluten_free'] = products.product_name.map(lambda x : 1.0 if 'gluten' in x.lower() and 'free' in x.lower() else 0.0)

num_products = products.shape[0]
product_name_idf = {w:np.log(num_products*1.0/(c + 20.0)) for w,c in Counter([word.lower() for product in 
                                    products.product_name.map(lambda x : x.split()) for word in product]).items()}
products['product_name_idf'] = products.product_name.map(lambda x : np.mean([product_name_idf[w] for w in x.lower().split()]))
products['product_name_idf_max'] = products.product_name.map(lambda x : np.max([product_name_idf[w] for w in x.lower().split()]))
wv_model = gensim.models.Doc2Vec.load("../Data/instacart.word2vec")
def get_product_vectors(p):
    try:
        return wv_model.wv['p' + str(p)]
    except KeyError:
        return None
products['vector'] = products.product_id.map(get_product_vectors)

kmeans = KMeans(n_clusters=150, random_state=2, n_jobs = 50)
kmeans.fit(np.vstack(products[products.vector.isnull() == False].vector))
product_clusters = products[products.vector.isnull() == False][['product_id']]
product_clusters['product_wv_cluster'] = kmeans.predict(np.vstack(products[products.vector.isnull() == False].vector))
products = products.merge(product_clusters, on = 'product_id', how = 'left')
products['product_wv_cluster'].fillna(-1, inplace = True)
del products['vector']

In [4]:
users = orders[['user_id']].drop_duplicates().reset_index(drop = True)
def get_user_vectors(u):
    try:
        return wv_model.docvecs['u' + str(u)]
    except KeyError:
        return None
users['vector'] = users.user_id.map(get_user_vectors)
kmeans = KMeans(n_clusters=275, random_state=2, n_jobs = 50)
kmeans.fit(np.vstack(users.vector))
users['user_cluster'] = kmeans.predict(np.vstack(users.vector))
del users['vector']
users.user_cluster.value_counts()

26     7764
23     6675
73     5473
190    4415
203    4178
49     3296
159    3033
179    2897
72     2851
181    2754
11     2735
258    2584
53     2564
107    2436
125    2410
68     2221
140    2176
114    2039
61     1981
271    1969
136    1951
66     1864
236    1863
178    1773
109    1769
112    1667
238    1646
205    1637
245    1601
83     1594
       ... 
226     169
154     163
247     158
119     158
56      157
101     153
239     153
104     150
199     149
197     144
246     144
175     143
220     136
237     134
60      133
137     130
100     123
222     120
177     120
59      118
74      116
214     111
21      100
256     100
103      65
227      52
90       52
260      10
235       5
165       4
Name: user_cluster, Length: 275, dtype: int64

### Add product 'None' to orders with no reordered

In [5]:
orders_without_reorder =  order_products_prior[['order_id', 'reordered']].append(order_products_train[['order_id', 
                                                'reordered']]).groupby('order_id').max().reset_index()
# first_orders = set(orders[orders.order_number == 1].order_id)
orders_without_reorder = set(orders_without_reorder[orders_without_reorder.reordered == 0].order_id)# - first_orders

prior_orders_with_reorder = order_products_prior[order_products_prior.order_id.isin(orders_without_reorder) == False]
train_orders_with_reorder = order_products_train[order_products_train.order_id.isin(orders_without_reorder) == False]

prior_orders_without_reorder = order_products_prior[order_products_prior.order_id.isin(orders_without_reorder)]
train_orders_without_reorder = order_products_train[order_products_train.order_id.isin(orders_without_reorder)]

orders_with_none = pd.DataFrame({"order_id"          : list(orders_without_reorder), 
                                 'product_id'        : list(np.zeros(len(orders_without_reorder))), 
                                 'add_to_cart_order' : list(np.zeros(len(orders_without_reorder))), 
                                 'reordered'         : list(np.ones(len(orders_without_reorder)))})
prior_orders_without_reorder = prior_orders_without_reorder.append(orders_with_none[orders_with_none.order_id.isin(
                                                                        set(prior_orders_without_reorder.order_id))])

train_orders_without_reorder = train_orders_without_reorder.append(orders_with_none[orders_with_none.order_id.isin(
                                                                        set(train_orders_without_reorder.order_id))])
order_products_prior = prior_orders_with_reorder.append(prior_orders_without_reorder).sort_values(by = 
                                                                                    ['order_id', 'add_to_cart_order'])
order_products_train = train_orders_with_reorder.append(train_orders_without_reorder).sort_values(by = 
                                                                                    ['order_id', 'add_to_cart_order'])
order_products_train[order_products_train.order_id.isin(orders_without_reorder)].head()

Unnamed: 0,add_to_cart_order,order_id,product_id,reordered
292,0.0,719,0.0,1.0
180,1.0,719,45683.0,0.0
363,0.0,904,0.0,1.0
223,1.0,904,8013.0,0.0
224,2.0,904,46149.0,0.0


### Create Day and Days ago variable....This will be our proxy for timestamp

In [6]:
orders['day'] = orders.groupby('user_id')['days_since_prior_order'].cumsum()
orders['day'].fillna(0.0, inplace = True)
user_day_max = orders[['user_id', 'day']].groupby('user_id').max().reset_index()
user_day_max.columns = ['user_id', 'day_max']
orders = orders.merge(user_day_max, on = 'user_id', how = 'inner')
orders['days_ago'] = orders['day_max'] - orders['day']
del orders['day_max']

orders.order_hour_of_day = orders.order_hour_of_day.map(lambda x : int(x/3))
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,day,days_ago
0,2539329,1,prior,1,2,2,,0.0,190.0
1,2398795,1,prior,2,3,2,15.0,15.0,175.0
2,473747,1,prior,3,3,4,21.0,36.0,154.0
3,2254736,1,prior,4,4,2,29.0,65.0,125.0
4,431534,1,prior,5,4,5,28.0,93.0,97.0


### Merge orders data with order_products data

In [7]:
prior_orders = orders.merge(order_products_prior, on = 'order_id', how = 'inner').merge(
    products, on = 'product_id', how = 'inner')
train_orders = orders.merge(order_products_train, on = 'order_id', how = 'inner').merge(
    products, on = 'product_id', how = 'inner')

### Some utility functions to help us in feature engineering later on
 - Kinetic Energy is kind of like entropy measure, KE = 1 implies no diversity in data lower the KE higher diversity

In [8]:
def get_kinetic_energy(x):
    sm = sum(x)
    p = [xi*1.0/sm for xi in x]
    return sum([pi**2 for pi in p]) 

### Create user specific features

In [9]:
user_cluster_with_low_users = set([u for u,c in users.user_cluster.value_counts().to_dict().items() if c < 100])
users.user_cluster = users.user_cluster.map(lambda x : -1 if x in user_cluster_with_low_users else x)

In [10]:
prior_orders_p = prior_orders[(prior_orders.order_number != 1) | (prior_orders.product_id != 0)].sort_values(by = 
                                                                                ['user_id', 'order_number'])
user_cluster_value_counts = users.user_cluster.value_counts().to_dict()
users['user_cluster_size'] = users.user_cluster.map(lambda x : user_cluster_value_counts[x])

prior_orders_pu = prior_orders[prior_orders.order_number != 1][
                    ['user_id', 'reordered', 'order_id', 'product_id']].merge(users, on = 'user_id', how = 'inner')
user_cluster_reorder_rate = {row[0]: row[1] for row in 
                            prior_orders_pu[['user_cluster', 'reordered']].groupby('user_cluster').mean().itertuples()}
over_all_reorder_rate = prior_orders_pu.reordered.mean()
over_all_reorder_rate_std = prior_orders_pu.reordered.std()

max_cluster_size = users.user_cluster_size.max()
users['user_cluster_reorder_rate'] = users['user_cluster'].map(lambda x : user_cluster_reorder_rate[x])
users['user_cluster_reorder_rate'] = users.apply(lambda row : row['user_cluster_reorder_rate']*float(row[
    'user_cluster_size'])/max_cluster_size + over_all_reorder_rate*float(max_cluster_size - 
    row['user_cluster_size'])/max_cluster_size + over_all_reorder_rate_std*np.random.uniform(), axis = 1)

user_cluster_mean_reorder_count = {row[0]: row[1] for row in 
                            prior_orders_pu[['user_cluster', 'order_id', 'reordered']].groupby(['user_cluster', 
                            'order_id']).sum().reset_index()[['user_cluster', 'reordered']].groupby(
                            'user_cluster').mean().itertuples()}
users['user_cluster_mean_reorder_count'] = users['user_cluster'].map(lambda x : user_cluster_mean_reorder_count[x])
users['user_organic_count'] = prior_orders_p[['user_id', 'product_id', 'is_organic']].drop_duplicates().groupby(
    'user_id').sum().reset_index()['is_organic']
users['user_organic_fraction'] = prior_orders_p[['user_id', 'product_id', 'is_organic']].drop_duplicates().groupby(
    'user_id').mean().reset_index()['is_organic']

users['user_organic_recency'] = prior_orders_p[['user_id','is_organic','days_ago']].groupby(['user_id', 'is_organic']).mean().reset_index()[
    ['user_id','days_ago']].groupby('user_id').aggregate(lambda x: tuple(x)).reset_index()['days_ago'].map(lambda x : x[1]/x[0] if len(x) == 2 else 0.0)
users['user_organic_recency'].fillna(users['user_organic_recency'].max(),inplace = True)
users['user_gluten_free_count'] = prior_orders_p[['user_id', 'product_id', 'is_gluten_free']].drop_duplicates().groupby(
    'user_id').sum().reset_index()['is_gluten_free']
users['user_gluten_free_fraction'] = prior_orders_p[['user_id', 'product_id', 'is_gluten_free']].drop_duplicates().groupby(
    'user_id').mean().reset_index()['is_gluten_free']
users['user_gluten_free_recency'] = prior_orders_p[['user_id','is_gluten_free','days_ago']].groupby(['user_id', 'is_gluten_free']).mean().reset_index()[
    ['user_id','days_ago']].groupby('user_id').aggregate(lambda x: tuple(x)).reset_index()['days_ago'].map(lambda x : x[1]/x[0] if len(x) == 2 else 0.0)
users['user_gluten_free_recency'].fillna(users['user_gluten_free_recency'].max(),inplace = True)

prior_orders_pu = prior_orders[(prior_orders.order_number != 1) | (prior_orders.product_id != 0)][
                    ['user_id', 'reordered', 'order_id', 'product_id']].merge(users, on = 'user_id', how = 'inner')
user_cluster_mean_product_counts = {row[0]: row[1] for row in 
                            prior_orders_pu[['user_cluster', 'product_id', 'user_id']].groupby(['user_cluster', 
                            'user_id']).count().reset_index()[['user_cluster', 'product_id']].groupby(
                            'user_cluster').mean().itertuples()}
users['user_cluster_mean_product_counts'] = users['user_cluster'].map(lambda x : user_cluster_mean_product_counts[x])

prior_orders_p = prior_orders_p.merge(users, on = 'user_id', how = 'inner')

user_features = prior_orders_p[['user_id', 'product_id']].drop_duplicates().groupby('user_id').count().reset_index()
user_features.columns = ['user_id', 'user_product_count']

user_features['user_order_frequency'] = orders[orders.days_since_prior_order.isnull() == False][
            ['user_id', 'days_since_prior_order']].groupby('user_id').median().reset_index()['days_since_prior_order']
user_features['user_order_frequency_std'] = orders[orders.days_since_prior_order.isnull() == False][
            ['user_id', 'days_since_prior_order']].groupby('user_id').std().reset_index()['days_since_prior_order']
user_features['user_order_frequency_std'] = user_features['user_order_frequency_std'] / user_features[
                    'user_order_frequency'].map(lambda x : x+1)
user_features['user_order_recency'] = orders[orders.eval_set == 'prior'][['user_id', 'days_ago']].groupby('user_id'
                            ).aggregate(lambda x : np.sum(np.exp(-np.array(x)/30.0))).reset_index()['days_ago']
user_features['user_order_recency_norm'] = orders[orders.eval_set == 'prior'][['user_id', 'days_ago']].groupby('user_id'
                            ).aggregate(lambda x : np.mean(np.exp(-np.array(x)/30.0))).reset_index()['days_ago']
user_features['user_day_kinetic_energy'] = orders[['user_id','order_dow', 'order_id']].groupby(['user_id', 
                                'order_dow']).count().reset_index()[['user_id', 'order_id']].groupby(
                                'user_id').aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']
user_features['user_tod_kinetic_energy'] = orders[['user_id','order_hour_of_day', 'order_id']].groupby(['user_id', 
                            'order_hour_of_day']).count().reset_index()[['user_id', 'order_id']].groupby(
                            'user_id').aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']

user_features['user_aisle_count'] = prior_orders_p[['user_id','aisle_id']].drop_duplicates().groupby('user_id'
                                                                        ).count().reset_index()['aisle_id']
user_features['user_department_count'] = prior_orders_p[['user_id','department_id']].drop_duplicates().groupby(
                                                                'user_id').count().reset_index()['department_id']
user_features['user_cluster_count'] = prior_orders_p[['user_id','product_wv_cluster']].drop_duplicates().groupby(
                                                                'user_id').count().reset_index()['product_wv_cluster']
user_features['user_product_kinetic_energy'] = prior_orders_p[['user_id', 'product_id', 'order_id']].groupby(
                        ['user_id', 'product_id']).count().reset_index()[['user_id', 'order_id'
                        ]].groupby('user_id').aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']
user_features['user_product_kinetic_energy_norm'] = user_features['user_product_kinetic_energy']*user_features[
                                                                                                    'user_product_count']

user_features['user_aisle_kinetic_energy'] = prior_orders_p[['user_id', 'aisle_id', 'order_id']].groupby(['user_id', 
                                    'aisle_id']).count().reset_index()[['user_id', 'order_id']].groupby(
                                    'user_id').aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']
user_features['user_department_kinetic_energy'] = prior_orders_p[['user_id', 'department_id', 'order_id']].groupby(
                                    ['user_id', 'department_id']).count().reset_index()[['user_id', 'order_id']
                            ].groupby('user_id').aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']
user_features['user_cluster_kinetic_energy'] = prior_orders_p[['user_id', 'product_wv_cluster', 'order_id']].groupby(
                                ['user_id', 'product_wv_cluster']).count().reset_index()[['user_id', 'order_id']
                            ].groupby('user_id').aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']

# user_features['user_reorder_frequency_mean'] = prior_orders[prior_orders.order_number != 1][
#                                     ['user_id', 'reordered']].groupby('user_id').mean().reset_index()['reordered']
# user_features['user_reorder_frequency_std'] = prior_orders[prior_orders.order_number != 1][
#                                     ['user_id', 'reordered']].groupby('user_id').std().reset_index()['reordered']


user_features['user_reorder_fraction_mean'] = prior_orders[prior_orders.order_number != 1][
                                    ['user_id','order_id', 'reordered']].groupby(['user_id', 'order_id']
                                    ).mean().reset_index()[['user_id', 'reordered']].groupby('user_id'
                                    ).mean().reset_index()['reordered']
user_features['user_reorder_fraction_std'] = prior_orders[prior_orders.order_number != 1][
                                    ['user_id','order_id', 'reordered']].groupby(['user_id', 'order_id']
                                    ).mean().reset_index()[['user_id', 'reordered']].groupby('user_id'
                                    ).std().reset_index()['reordered']

user_features['user_reorder_fraction_last'] = prior_orders[prior_orders.order_number != 1][
                                    ['user_id','order_number', 'reordered']].groupby(['user_id', 'order_number']
                                    ).mean().reset_index()[['user_id', 'reordered']].groupby('user_id'
                                    ).last().reset_index()['reordered']

user_features['user_reorder_count_mean'] = prior_orders[prior_orders.order_number != 1][
                                    ['user_id','order_id', 'reordered']].groupby(['user_id', 'order_id']
                                    ).sum().reset_index()[['user_id', 'reordered']].groupby('user_id'
                                    ).mean().reset_index()['reordered']
user_features['user_reorder_count_std'] = prior_orders[prior_orders.order_number != 1][
                                    ['user_id','order_id', 'reordered']].groupby(['user_id', 'order_id']
                                    ).sum().reset_index()[['user_id', 'reordered']].groupby('user_id'
                                    ).std().reset_index()['reordered']
user_features['user_reorder_count_last'] = prior_orders[prior_orders.order_number != 1][
                                    ['user_id','order_number', 'reordered']].groupby(['user_id', 'order_number']
                                    ).sum().reset_index()[['user_id', 'reordered']].groupby('user_id'
                                    ).last().reset_index()['reordered']
user_features['user_mean_basket_size'] = prior_orders_p[['user_id','order_id', 'product_id']].groupby(['user_id',
                                'order_id']).count().reset_index().groupby('user_id').mean().reset_index()['product_id']
user_features['user_std_basket_size'] = prior_orders_p[['user_id','order_id', 'product_id']].groupby(['user_id',
                                'order_id']).count().reset_index().groupby('user_id').std().reset_index()['product_id']
user_features['user_last_basket_size'] = prior_orders_p[['user_id','order_number', 'product_id']].groupby(['user_id',
                                'order_number']).count().reset_index().groupby('user_id').last().reset_index()['product_id']

user_features['user_tenure'] = orders[['user_id', 'days_since_prior_order']].groupby('user_id').sum().reset_index()[
                                                                                            'days_since_prior_order']
user_features['user_mean_product_name_length'] = prior_orders_p[['user_id', 'product_name_length']].groupby(
                                                'user_id').median().reset_index()['product_name_length']
user_features['user_std_product_name_length'] = prior_orders_p[['user_id', 'product_name_length']].groupby(
                                                'user_id').std().reset_index()['product_name_length']

user_features['user_mean_product_name_idf'] = prior_orders_p[['user_id', 'product_name_idf']].groupby(
                                                'user_id').median().reset_index()['product_name_idf']
user_features['user_mean_product_name_idf_max'] = prior_orders_p[['user_id', 'product_name_idf_max']].groupby(
                                                'user_id').median().reset_index()['product_name_idf_max']
user_features['user_std_product_name_idf'] = prior_orders_p[['user_id', 'product_name_idf']].groupby(
                                                'user_id').std().reset_index()['product_name_idf']
user_features['user_last_order_dow_diff'] = prior_orders_p[['user_id', 'order_dow']].groupby('user_id'
                                ).last().reset_index()['order_dow']
user_features['user_last_order_tod_diff'] = prior_orders_p[['user_id', 'order_hour_of_day']].groupby('user_id'
                                ).last().reset_index()['order_hour_of_day']

user_features.head()



Unnamed: 0,user_id,user_product_count,user_order_frequency,user_order_frequency_std,user_order_recency,user_order_recency_norm,user_day_kinetic_energy,user_tod_kinetic_energy,user_aisle_count,user_department_count,...,user_std_basket_size,user_last_basket_size,user_tenure,user_mean_product_name_length,user_std_product_name_length,user_mean_product_name_idf,user_mean_product_name_idf_max,user_std_product_name_idf,user_last_order_dow_diff,user_last_order_tod_diff
0,1,18,19.5,0.440527,1.372945,0.137295,0.272727,0.371901,12,7,...,1.523884,9,190.0,3.0,1.221519,5.23759,6.536873,1.009275,4,2
1,2,103,13.0,0.733494,0.769246,0.054946,0.297778,0.76,34,14,...,5.656854,16,228.0,4.0,2.349182,5.161159,6.719194,0.695488,3,3
2,3,33,11.0,0.427879,2.036934,0.169745,0.301775,0.538462,16,9,...,2.103388,6,144.0,3.0,1.467161,4.5561,6.40682,0.957862,1,5
3,4,18,19.0,0.548862,1.128509,0.225702,0.277778,0.388889,15,10,...,1.643168,4,85.0,4.0,2.080522,5.33908,6.921719,1.137373,5,4
4,5,23,10.5,0.473627,1.780547,0.445137,0.36,0.28,16,9,...,3.095696,12,46.0,3.0,1.364501,4.878953,7.099967,0.927473,1,6


In [12]:
user_orders = prior_orders_p[['user_id', 'order_number', 'product_id']].groupby(['user_id', 'order_number']).aggregate(
                    lambda x : set(tuple(x))).reset_index()[['user_id', 'product_id']].groupby('user_id').aggregate(
                    lambda x : tuple(x)).reset_index()

In [16]:
user_features['user_last_2_order_sim'] = user_orders.product_id.map(lambda x : len(x[-1].intersection(x[-2]))*1.0/(len(x[-1].union(x[-2]))))

In [28]:
user_features['user_last_n_order_sim'] = user_orders.product_id.map(lambda x : np.mean([len(x[-i].intersection(x[-i-1]))*1.0/(len(x[-i].union(x[-i-1]))) for i in range(len(x) -1, 0, -1)]))

In [32]:
user_features['user_num_product_with_no_reorder'] =  prior_orders_p[['user_id', 'product_id', 'reordered']].groupby(
                                                ['user_id', 'product_id']).max().reset_index()[['user_id', 'reordered']
                                                  ].groupby('user_id').sum().reset_index()['reordered']
user_features['user_fraction_product_with_no_reorder'] =  prior_orders_p[['user_id', 'product_id', 'reordered']].groupby(
                                                ['user_id', 'product_id']).max().reset_index()[['user_id', 'reordered']
                                                  ].groupby('user_id').mean().reset_index()['reordered']
user_features['user_num_orders_with_no_reorder'] =  prior_orders_p[prior_orders_p.product_id !=0][['user_id', 
                                                'order_id', 'reordered']].groupby(['user_id', 'order_id']
                                                ).max().reset_index()[['user_id', 'reordered']
                                                ].groupby('user_id').sum().reset_index()['reordered']
user_features['user_fraction_orders_with_no_reorder'] =  prior_orders_p[prior_orders_p.product_id !=0][['user_id', 
                                                'order_id', 'reordered']].groupby(['user_id', 'order_id']
                                                ).max().reset_index()[['user_id', 'reordered']
                                                ].groupby('user_id').mean().reset_index()['reordered']

In [33]:
user_daily_order_rate = orders[['user_id', 'order_dow']]
user_daily_order_rate['user_daily_order_rate'] = 1.0
user_daily_order_rate = user_daily_order_rate.groupby(['user_id', 'order_dow']).sum().reset_index()

user_hourly_order_rate = orders[['user_id', 'order_hour_of_day']]
user_hourly_order_rate['user_hourly_order_rate'] = 1.0
user_hourly_order_rate = user_hourly_order_rate.groupby(['user_id', 'order_hour_of_day']).sum().reset_index()

product_daily_order_rate = prior_orders_p[['product_id', 'order_dow']]
product_daily_order_rate['product_daily_order_rate'] = 1.0
product_daily_order_rate = product_daily_order_rate.groupby(['product_id', 'order_dow']).sum().reset_index()

product_hourly_order_rate = prior_orders_p[['product_id', 'order_hour_of_day']]
product_hourly_order_rate['product_hourly_order_rate'] = 1.0
product_hourly_order_rate = product_hourly_order_rate.groupby(['product_id', 'order_hour_of_day']).sum().reset_index()

user_product_daily_order_rate = prior_orders[['user_id', 'product_id', 'order_dow']]
user_product_daily_order_rate['user_product_daily_order_rate'] = 1.0
user_product_daily_order_rate = user_product_daily_order_rate.groupby(['user_id', 'product_id', 'order_dow']
                                                                     ).sum().reset_index()

user_product_hourly_order_rate = prior_orders[['user_id', 'product_id', 'order_hour_of_day']]
user_product_hourly_order_rate['user_product_hourly_order_rate'] = 1.0
user_product_hourly_order_rate = user_product_hourly_order_rate.groupby(['user_id', 'product_id', 
                                                                         'order_hour_of_day']).sum().reset_index()
prior_orders_p = prior_orders[(prior_orders.order_number != 1)]
user_daily_reorder_rate = prior_orders_p[['user_id', 'order_dow', 'reordered']].groupby(
                                        ['user_id', 'order_dow']).mean().reset_index()
user_daily_reorder_rate.columns = ['user_id', 'order_dow', 'user_daily_reorder_rate']
user_hourly_reorder_rate = prior_orders_p[['user_id', 'order_hour_of_day', 'reordered']
                                            ].groupby(['user_id', 'order_hour_of_day']).mean().reset_index()
user_hourly_reorder_rate.columns = ['user_id', 'order_hour_of_day', 'user_hourly_reorder_rate']

product_daily_reorder_rate = prior_orders_p[['product_id', 'order_dow', 'reordered']].groupby(
                                        ['product_id', 'order_dow']).mean().reset_index()
product_daily_reorder_rate.columns = ['product_id', 'order_dow', 'product_daily_reorder_rate']
product_hourly_reorder_rate = prior_orders_p[['product_id', 'order_hour_of_day', 'reordered']
                                            ].groupby(['product_id', 'order_hour_of_day']).mean().reset_index()
product_hourly_reorder_rate.columns = ['product_id', 'order_hour_of_day','product_hourly_reorder_rate']

user_product_daily_reorder_rate = prior_orders_p[['user_id', 'product_id', 
            'order_dow', 'reordered']].groupby(['user_id', 'product_id', 'order_dow']).mean().reset_index()
user_product_daily_reorder_rate.columns = ['user_id', 'product_id', 'order_dow','user_product_daily_reorder_rate']
user_product_hourly_reorder_rate = prior_orders_p[['user_id', 'product_id', 
                        'order_hour_of_day', 'reordered']].groupby(
                        ['user_id', 'product_id', 'order_hour_of_day']).mean().reset_index()
user_product_hourly_reorder_rate.columns = ['user_id', 'product_id', 'order_hour_of_day','user_product_hourly_reorder_rate']

user_aisle_daily_reorder_rate = prior_orders_p[['user_id', 'aisle_id', 
            'order_dow', 'reordered']].groupby(['user_id', 'aisle_id', 'order_dow']).mean().reset_index()
user_aisle_daily_reorder_rate.columns = ['user_id', 'aisle_id', 'order_dow','user_aisle_daily_reorder_rate']
user_aisle_hourly_reorder_rate = prior_orders_p[['user_id', 'aisle_id', 
                        'order_hour_of_day', 'reordered']].groupby(
                        ['user_id', 'aisle_id', 'order_hour_of_day']).mean().reset_index()
user_aisle_hourly_reorder_rate.columns = ['user_id', 'aisle_id', 'order_hour_of_day','user_aisle_hourly_reorder_rate']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/sta

### Product Features

In [34]:
prior_orders_p = prior_orders[(prior_orders.order_number != 1) | (prior_orders.product_id != 0)].sort_values(by = 
                                                                                ['user_id', 'order_number'])
prior_orders_pr = prior_orders[(prior_orders.order_number != 1)].sort_values(by = 
                                                                                ['user_id', 'order_number'])
prior_orders_p = prior_orders_p.merge(users, on = 'user_id', how = 'inner')
product_features = prior_orders_p[['user_id','product_id']].drop_duplicates().groupby('product_id').count().reset_index()
product_features.columns = ['product_id', 'product_user_count']
product_features.product_user_count = product_features.product_user_count.map(lambda x : np.log(x +1))
product_features['product_order_count'] = prior_orders_p[['order_id','product_id']].drop_duplicates().groupby(
                                            'product_id').count().reset_index()['order_id']
product_features['product_user_cluster_count'] = prior_orders_p[['user_cluster','product_id']].drop_duplicates().groupby(
                                            'product_id').count().reset_index()['user_cluster']

product_features['product_order_per_user'] = prior_orders_p[['order_id','product_id', 'user_id']].groupby(['user_id', 
                                'product_id']).count().reset_index()[['product_id', 'order_id']].groupby('product_id'
                                ).mean().reset_index()['order_id']
product_features['product_fraction_users_with_no_reorders'] = prior_orders_p[['reordered','product_id', 'user_id']].groupby(['user_id', 
                                'product_id']).max().reset_index()[['product_id', 'reordered']].groupby('product_id'
                                ).mean().reset_index()['reordered']
product_features['product_num_users_with_no_reorders'] = prior_orders_p[['reordered','product_id', 'user_id']].groupby(['user_id', 
                                'product_id']).max().reset_index()[['product_id', 'reordered']].groupby('product_id'
                                ).sum().reset_index()['reordered']

product_features['product_dow_kinetic_energy'] = prior_orders_p[['product_id','order_dow', 'order_id']].groupby(
                                        ['product_id', 'order_dow']).count().reset_index()[['product_id', 'order_id']
                                        ].groupby('product_id'
                                        ).aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']
product_features['product_tod_kinetic_energy'] = prior_orders_p[['product_id','order_hour_of_day', 'order_id']].groupby(
                                        ['product_id', 'order_hour_of_day']).count().reset_index()[['product_id', 
                                        'order_id']].groupby('product_id'
                                        ).aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']

# product_features['product_reorder_frequency'] = prior_orders_pr[['product_id', 'user_id', 'reordered']
#                                 ].groupby(['product_id', 'user_id']).mean().reset_index().groupby(
#                                 'product_id').mean().reset_index()['reordered']
product_features['product_mean_add_to_cart_order'] = prior_orders_p[['product_id','add_to_cart_order']].groupby(
                                                            'product_id').mean().reset_index()['add_to_cart_order']
# product_days_between_orders = prior_orders[prior_orders.days_since_prior_order.isnull()
#                         == False][['product_id', 'days_since_prior_order']].groupby('product_id').mean().reset_index()[[
#                         'product_id', 'days_since_prior_order']]
# product_days_between_orders.columns = ['product_id', 'product_mean_days_between_orders']
# product_days_between_orders['product_std_days_between_orders'] = prior_orders[prior_orders.days_since_prior_order.isnull()
#                         == False][['product_id', 'days_since_prior_order']].groupby('product_id').std().reset_index()[
#                         'days_since_prior_order']
# product_features = product_features.merge(product_days_between_orders, on = 'product_id', how = 'left')
# product_features['product_mean_days_between_orders'].fillna(30, inplace = True)
# product_features['product_std_days_between_orders'].fillna(0, inplace = True)
num_users = prior_orders.user_id.nunique()
product_features['product_idf'] = product_features.product_user_count.map(lambda x : np.log(num_users/(x + 50.0)))
product_features['product_recency'] = prior_orders_p[['product_id', 'days_ago']].groupby('product_id'
                                    ).aggregate(lambda x : np.mean([a**2 for a in x])).reset_index()[
                                'days_ago']
product_features['product_recency_norm'] = prior_orders_p[['product_id', 'days_ago']].groupby('product_id'
                                    ).aggregate(lambda x : np.mean(np.exp(-np.array(x)/15.0))).reset_index()[
                                'days_ago']
product_features['product_order_recency'] = prior_orders_p[['product_id', 'order_number']].groupby('product_id'
                                    ).aggregate(lambda x : np.mean([a**2 for a in x])).reset_index()[
                                'order_number']
product_features['product_order_recency_norm'] = prior_orders_p[['product_id', 'order_number']].groupby('product_id'
                                    ).aggregate(lambda x : np.mean(np.exp(-np.array(x)/3.0))).reset_index()[
                                'order_number']
product_features = product_features.merge(products, on = 'product_id', how = 'inner')
product_features.head()

Unnamed: 0,product_id,product_user_count,product_order_count,product_user_cluster_count,product_order_per_user,product_fraction_users_with_no_reorders,product_num_users_with_no_reorders,product_dow_kinetic_energy,product_tod_kinetic_energy,product_mean_add_to_cart_order,...,product_order_recency_norm,aisle_id,department_id,product_name,product_name_length,is_organic,is_gluten_free,product_name_idf,product_name_idf_max,product_wv_cluster
0,0,11.408687,182304,270,2.023352,1.0,90100.0,0.143799,0.203099,0.0,...,0.254642,0,0,,1,0.0,0.0,7.769016,7.769016,21.0
1,1,6.575076,1852,39,2.586592,0.385475,276.0,0.156298,0.208981,5.801836,...,0.119517,61,19,Chocolate Sandwich Cookies,3,0.0,0.0,4.26085,5.280149,63.0
2,2,4.369448,90,56,1.153846,0.102564,8.0,0.152346,0.231852,9.888889,...,0.093998,104,13,All-Seasons Salt,2,0.0,0.0,6.022208,7.769016,28.0
3,3,4.317488,277,45,3.743243,0.486486,36.0,0.164227,0.188495,6.415162,...,0.142834,94,7,Robust Golden Unsweetened Oolong Tea,5,0.0,0.0,5.974132,7.481334,100.0
4,4,5.209486,329,45,1.807692,0.351648,64.0,0.15398,0.194723,9.507599,...,0.23562,38,1,Smart Ones Classic Favorites Mini Rigatoni Wit...,10,0.0,0.0,5.284316,7.124659,37.0


### Aisle Features

In [35]:
aisle_features = prior_orders_p[['user_id','aisle_id']].drop_duplicates().groupby('aisle_id').count().reset_index()
aisle_features.columns = ['aisle_id', 'aisle_user_count']
aisle_features.aisle_user_count = aisle_features.aisle_user_count.map(lambda x : np.log(x +1))
aisle_features['aisle_order_per_user'] = prior_orders_p[['order_id','aisle_id', 'user_id']].drop_duplicates().groupby(
                        ['user_id', 'aisle_id']).count().reset_index()[['aisle_id', 'order_id']].groupby('aisle_id'
                                ).mean().reset_index()['order_id']

# aisle_features['aisle_order_count'] = prior_orders_p[['order_id','aisle_id']].drop_duplicates().groupby(
#                                             'aisle_id').count().reset_index()['order_id'].map(lambda x : np.log(x +1))
aisle_features['aisle_dow_kinetic_energy'] = prior_orders_p[['aisle_id','order_dow', 'order_id']].groupby(
                                        ['aisle_id', 'order_dow']).count().reset_index()[['aisle_id', 'order_id']
                                        ].groupby('aisle_id'
                                        ).aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']
aisle_features['aisle_tod_kinetic_energy'] = prior_orders_p[['aisle_id','order_hour_of_day', 'order_id']].groupby(
                                        ['aisle_id', 'order_hour_of_day']).count().reset_index()[['aisle_id', 
                                                                                                  
                                        'order_id']].groupby('aisle_id'
                                        ).aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']

aisle_features['aisle_reorder_frequency'] = prior_orders_p[
                                            ['aisle_id', 'reordered']].groupby('aisle_id').mean().reset_index()['reordered']
aisle_features['aisle_mean_add_to_cart_order'] = prior_orders_p[['aisle_id','add_to_cart_order']].groupby(
                                                        'aisle_id').mean().reset_index()['add_to_cart_order']
aisle_features['aisle_product_count'] = products[['aisle_id', 'product_id']].drop_duplicates().groupby('aisle_id').count().reset_index()[
                                                                                                        'product_id']
# aisle_days_between_orders = prior_orders[prior_orders.days_since_prior_order.isnull()
#                         == False][['aisle_id', 'days_since_prior_order']].groupby('aisle_id').mean().reset_index()[[
#                         'aisle_id', 'days_since_prior_order']]
# aisle_days_between_orders.columns = ['aisle_id', 'aisle_mean_days_between_orders']
# aisle_days_between_orders['aisle_std_days_between_orders'] = prior_orders[prior_orders.days_since_prior_order.isnull()
#                         == False][['aisle_id', 'days_since_prior_order']].groupby('aisle_id').std().reset_index()[
#                         'days_since_prior_order']
# aisle_features = aisle_features.merge(aisle_days_between_orders, on = 'aisle_id', how = 'left')
# aisle_features['aisle_mean_days_between_orders'].fillna(30, inplace = True)
# aisle_features['aisle_std_days_between_orders'].fillna(0, inplace = True)
aisle_features['aisle_recency'] = prior_orders_p[['aisle_id', 'days_ago']].groupby('aisle_id'
                                    ).aggregate(lambda x : np.mean([a**2 for a in x])).reset_index()[
                                'days_ago']
aisle_features['aisle_recency_norm'] = prior_orders_p[['aisle_id', 'days_ago']].groupby('aisle_id'
                                    ).aggregate(lambda x : np.mean(np.exp(-np.array(x)/15.0))).reset_index()[
                                'days_ago']
aisle_features['aisle_order_recency'] = prior_orders_p[['aisle_id', 'order_number']].groupby('aisle_id'
                                    ).aggregate(lambda x : np.mean([a**2 for a in x])).reset_index()[
                                'order_number']
aisle_features['aisle_order_recency_norm'] = prior_orders_p[['aisle_id', 'order_number']].groupby('aisle_id'
                                    ).aggregate(lambda x : np.mean(np.exp(-np.array(x)/3.0))).reset_index()[
                                'order_number']
aisle_features['aisle_idf'] = aisle_features.aisle_user_count.map(lambda x : np.log(num_users/(x + 50.0)))

aisle_features.head()

Unnamed: 0,aisle_id,aisle_user_count,aisle_order_per_user,aisle_dow_kinetic_energy,aisle_tod_kinetic_energy,aisle_reorder_frequency,aisle_mean_add_to_cart_order,aisle_product_count,aisle_recency,aisle_recency_norm,aisle_order_recency,aisle_order_recency_norm,aisle_idf
0,0,11.408687,2.023352,0.143799,0.203099,1.0,0.0,1,25792.553734,0.059412,194.472294,0.254642,8.119094
1,1,9.938469,3.047414,0.14855,0.201706,0.596597,8.16764,146,26663.247553,0.05863,603.542932,0.144337,8.143327
2,2,10.34891,2.471687,0.14666,0.199703,0.489326,9.275497,271,26265.178044,0.060298,571.980786,0.142674,8.136503
3,3,11.060259,4.373994,0.150279,0.204585,0.598007,9.571935,832,28296.839084,0.056813,613.596388,0.137316,8.124784
4,4,10.894756,3.071718,0.150333,0.198443,0.489533,10.16145,543,24789.633036,0.060402,475.872329,0.159026,8.127498


### Cluster Features

In [36]:
cluster_features = prior_orders_p[['user_id','product_wv_cluster']].drop_duplicates().groupby('product_wv_cluster'
                                                                                            ).count().reset_index()
cluster_features.columns = ['product_wv_cluster', 'cluster_user_count']
cluster_features.cluster_user_count = cluster_features.cluster_user_count.map(lambda x : np.log(x +1))
cluster_features['cluster_order_per_user'] = prior_orders_p[['order_id','product_wv_cluster', 'user_id']].groupby(['user_id', 
                                'product_wv_cluster']).count().reset_index()[['product_wv_cluster', 'order_id']].groupby('product_wv_cluster'
                                ).mean().reset_index()['order_id']

# cluster_features['cluster_order_count'] = prior_orders_p[['order_id','product_wv_cluster']].drop_duplicates().groupby(
#                                     'product_wv_cluster').count().reset_index()['order_id'].map(lambda x : np.log(x +1))
cluster_features['cluster_dow_kinetic_energy'] = prior_orders_p[['product_wv_cluster','order_dow', 'order_id']].groupby(
                        ['product_wv_cluster', 'order_dow']).count().reset_index()[['product_wv_cluster', 'order_id']
                                        ].groupby('product_wv_cluster'
                                        ).aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']
cluster_features['cluster_tod_kinetic_energy'] = prior_orders_p[['product_wv_cluster','order_hour_of_day', 
                                'order_id']].groupby(['product_wv_cluster', 'order_hour_of_day'
                                ]).count().reset_index()[['product_wv_cluster', 'order_id']].groupby(
                            'product_wv_cluster').aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']

cluster_features['cluster_reorder_frequency'] = prior_orders_p[['product_wv_cluster', 'reordered']
                                                ].groupby('product_wv_cluster').mean().reset_index()['reordered']
cluster_features['cluster_mean_add_to_cart_order'] = prior_orders_p[['product_wv_cluster','add_to_cart_order']].groupby(
                                                        'product_wv_cluster').mean().reset_index()['add_to_cart_order']
cluster_features['cluster_product_count'] = products[['product_wv_cluster', 'product_id']].drop_duplicates().groupby('product_wv_cluster'
                                                        ).count().reset_index()['product_id']
cluster_features['cluster_aisle_count'] = products[['product_wv_cluster', 'aisle_id']].drop_duplicates().groupby('product_wv_cluster'
                                                        ).count().reset_index()['aisle_id']

# cluster_days_between_orders = prior_orders[prior_orders.days_since_prior_order.isnull()
#                         == False][['product_wv_cluster', 'days_since_prior_order']].groupby('product_wv_cluster').mean().reset_index()[[
#                         'product_wv_cluster', 'days_since_prior_order']]
# cluster_days_between_orders.columns = ['product_wv_cluster', 'cluster_mean_days_between_orders']
# cluster_days_between_orders['cluster_std_days_between_orders'] = prior_orders[prior_orders.days_since_prior_order.isnull()
#                         == False][['product_wv_cluster', 'days_since_prior_order']].groupby('product_wv_cluster').std().reset_index()[
#                         'days_since_prior_order']
# cluster_features = cluster_features.merge(cluster_days_between_orders, on = 'product_wv_cluster', how = 'left')
# cluster_features['cluster_mean_days_between_orders'].fillna(30, inplace = True)
# cluster_features['cluster_std_days_between_orders'].fillna(0, inplace = True)

cluster_features['cluster_recency'] = prior_orders_p[['product_wv_cluster', 'days_ago']].groupby('product_wv_cluster'
                                    ).aggregate(lambda x : np.mean([a**2 for a in x])).reset_index()[
                                'days_ago']
cluster_features['cluster_recency_norm'] = prior_orders_p[['product_wv_cluster', 'days_ago']].groupby('product_wv_cluster'
                                    ).aggregate(lambda x : np.mean(np.exp(-np.array(x)/15.0))).reset_index()[
                                'days_ago']
cluster_features['cluster_order_recency'] = prior_orders_p[['product_wv_cluster', 'order_number']].groupby('product_wv_cluster'
                                    ).aggregate(lambda x : np.mean([a**2 for a in x])).reset_index()[
                                'order_number']
cluster_features['cluster_order_recency_norm'] = prior_orders_p[['product_wv_cluster', 'order_number']].groupby('product_wv_cluster'
                                    ).aggregate(lambda x : np.mean(np.exp(-np.array(x)/3.0))).reset_index()[
                                'order_number']

cluster_features['cluster_idf'] =cluster_features.cluster_user_count.map(lambda x : np.log(num_users/(x + 50.0)))

cluster_features.head()

Unnamed: 0,product_wv_cluster,cluster_user_count,cluster_order_per_user,cluster_dow_kinetic_energy,cluster_tod_kinetic_energy,cluster_reorder_frequency,cluster_mean_add_to_cart_order,cluster_product_count,cluster_aisle_count,cluster_recency,cluster_recency_norm,cluster_order_recency,cluster_order_recency_norm,cluster_idf
0,0.0,9.546455,3.182864,0.14948,0.194068,0.52601,9.413754,11,8,26732.204598,0.058991,562.773714,0.138995,8.149889
1,1.0,10.997071,2.976833,0.151796,0.19582,0.440547,10.020387,148,62,26354.241932,0.059123,527.441622,0.156587,8.12582
2,2.0,10.339902,3.341639,0.144419,0.191979,0.632668,6.202105,173,24,24807.245703,0.06597,660.63643,0.136636,8.136652
3,3.0,8.838407,2.8542,0.148968,0.204716,0.438142,9.332469,2431,126,20433.497814,0.069355,282.829826,0.204433,8.161851
4,4.0,8.546169,4.727944,0.144963,0.195797,0.597452,6.935265,126,38,23792.947061,0.059474,385.571147,0.168889,8.16683


### Department Features

In [37]:
department_features = prior_orders_p[['user_id','department_id']].drop_duplicates().groupby('department_id').count().reset_index()
department_features.columns = ['department_id', 'department_user_count']
department_features.department_user_count = department_features.department_user_count.map(lambda x : np.log(x +1))
department_features['department_order_per_user'] = prior_orders_p[['order_id','department_id', 'user_id']].groupby(['user_id', 
                                'department_id']).count().reset_index()[['department_id', 'order_id']].groupby('department_id'
                                ).mean().reset_index()['order_id']


# department_features['department_order_count'] = prior_orders_p[['order_id','department_id']].drop_duplicates().groupby(
#                                             'department_id').count().reset_index()['order_id'].map(lambda x : np.log(x +1))
department_features['department_dow_kinetic_energy'] = prior_orders_p[['department_id','order_dow', 'order_id']].groupby(
                                        ['department_id', 'order_dow']).count().reset_index()[['department_id', 'order_id']
                                        ].groupby('department_id'
                                        ).aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']
department_features['department_tod_kinetic_energy'] = prior_orders_p[['department_id','order_hour_of_day', 'order_id']].groupby(
                                        ['department_id', 'order_hour_of_day']).count().reset_index()[['department_id', 
                                        'order_id']].groupby('department_id'
                                        ).aggregate(lambda x : get_kinetic_energy(x)).reset_index()['order_id']

department_features['department_reorder_frequency'] = prior_orders_p[
                                            ['department_id', 'reordered']].groupby('department_id').mean().reset_index()['reordered']
department_features['department_mean_add_to_cart_order'] = prior_orders[['department_id','add_to_cart_order']].groupby(
                                                        'department_id').mean().reset_index()['add_to_cart_order']
department_features['department_product_count'] = products[['department_id', 'product_id']].drop_duplicates().groupby('department_id').count().reset_index()[
                                                                                                        'product_id']
# department_days_between_orders = prior_orders[prior_orders.days_since_prior_order.isnull()
#                         == False][['department_id', 'days_since_prior_order']].groupby('department_id').mean().reset_index()[[
#                         'department_id', 'days_since_prior_order']]
# department_days_between_orders.columns = ['department_id', 'department_mean_days_between_orders']
# department_days_between_orders['department_std_days_between_orders'] = prior_orders[prior_orders.days_since_prior_order.isnull()
#                         == False][['department_id', 'days_since_prior_order']].groupby('department_id').std().reset_index()[
#                         'days_since_prior_order']
# department_features = department_features.merge(department_days_between_orders, on = 'department_id', how = 'left')
# department_features['department_mean_days_between_orders'].fillna(30, inplace = True)
# department_features['department_std_days_between_orders'].fillna(0, inplace = True)
department_features['department_recency'] = prior_orders_p[['department_id', 'days_ago']].groupby('department_id'
                                    ).aggregate(lambda x : np.mean([a**2 for a in x])).reset_index()['days_ago']
department_features['department_recency_norm'] = prior_orders_p[['department_id', 'days_ago']].groupby('department_id'
                                    ).aggregate(lambda x : np.mean(np.exp(-np.array(x)/15.0))).reset_index()['days_ago']
department_features['department_order_recency'] = prior_orders_p[['department_id', 'order_number']].groupby('department_id'
                                    ).aggregate(lambda x : np.mean([a**2 for a in x])).reset_index()[
                                'order_number']
department_features['department_order_recency_norm'] = prior_orders_p[['department_id', 'order_number']].groupby('department_id'
                                    ).aggregate(lambda x : np.mean(np.exp(-np.array(x)/3.0))).reset_index()[
                                'order_number']

department_features['department_idf'] = department_features.department_user_count.map(lambda x : np.log(num_users/(x + 50.0)))

department_features.head()

Unnamed: 0,department_id,department_user_count,department_order_per_user,department_dow_kinetic_energy,department_tod_kinetic_energy,department_reorder_frequency,department_mean_add_to_cart_order,department_product_count,department_recency,department_recency_norm,department_order_recency,department_order_recency_norm,department_idf
0,0,11.408687,2.023352,0.143799,0.203099,1.0,0.0,1,25792.553734,0.059412,194.472294,0.254642,8.119094
1,1,12.00294,13.700857,0.147833,0.195011,0.541885,8.996414,4007,26500.694202,0.05871,502.486808,0.157487,8.109464
2,2,9.791214,2.030266,0.144567,0.196812,0.40798,8.277645,548,23112.071891,0.071092,620.08903,0.145864,8.145787
3,3,11.853767,8.369037,0.147772,0.19814,0.628141,8.084397,1516,27047.831417,0.057456,603.982114,0.143623,8.111872
4,4,12.171678,49.055259,0.150597,0.195262,0.649913,8.022875,1684,26369.667005,0.062608,642.476339,0.138724,8.106746


### User Product Interaction Features

In [38]:
user_product_interaction = prior_orders[['user_id', 'product_id', 'order_id']].groupby(
    ['user_id', 'product_id']).count().reset_index()
user_product_interaction.columns = ['user_id', 'product_id', 'user_product_purchase_count']
user_product_interaction['user_product_purchase_count'] = user_product_interaction.apply(lambda row : 
                            row['user_product_purchase_count'] - 1 if row['product_id'] == 0.0 else 
                            row['user_product_purchase_count'], axis = 1)
user_product_interaction['user_product_add_to_cart_order'] = prior_orders[['user_id', 'product_id', 'add_to_cart_order']
                                        ].groupby(['user_id', 'product_id']).mean().reset_index()['add_to_cart_order']
user_product_interaction['user_product_recency'] = prior_orders[['user_id', 'product_id', 'days_ago']
                    ].groupby(['user_id', 'product_id']).mean().reset_index()['days_ago']
user_product_interaction['user_product_recency_order'] = prior_orders[['user_id', 'product_id', 'order_number']
                    ].groupby(['user_id', 'product_id']).mean().reset_index()['order_number']

user_product_interaction['user_product_orders_since_last_order'] = prior_orders[['product_id', 'user_id', 
                            'order_number']].groupby(['user_id', 'product_id']).max().reset_index()['order_number']
user_product_interaction['user_product_last_dow'] = prior_orders[['product_id', 'user_id', 'order_number', 'order_dow'
                            ]].sort_values(by = ['user_id', 'product_id', 'order_number']).groupby(
                    ['user_id', 'product_id']).last().reset_index()['order_dow']
user_product_interaction['user_product_last_tod'] = prior_orders[['product_id', 'user_id', 'order_number', 'order_hour_of_day'
                            ]].sort_values(by = ['user_id', 'product_id', 'order_number']).groupby(
                    ['user_id', 'product_id']).last().reset_index()['order_hour_of_day']

product_daily_frequency = prior_orders[['user_id', 'product_id', 'day']].sort_values(['user_id', 'product_id', 'day'])
product_daily_frequency['days_since_last_purchase'] = product_daily_frequency['day'].diff()
mask = product_daily_frequency.product_id != product_daily_frequency.product_id.shift(1)
product_daily_frequency['days_since_last_purchase'][mask] = 60
product_daily_frequency1 = product_daily_frequency.groupby(['user_id', 'product_id']
                                            ).last().reset_index()[['user_id', 'product_id','days_since_last_purchase']]
product_daily_frequency1['user_product_mean_days_between_purchase'] = product_daily_frequency.groupby(
                                            ['user_id', 'product_id']).mean().reset_index()['days_since_last_purchase']
product_daily_frequency1['user_product_std_days_between_purchase'] = product_daily_frequency.groupby(
                        ['user_id', 'product_id']).std().reset_index()['days_since_last_purchase']

product_daily_frequency1.columns = ['user_id', 'product_id','user_product_days_since_last_order', 
                                   'user_product_mean_days_between_purchase','user_product_std_days_between_purchase']
user_product_interaction = user_product_interaction.merge(product_daily_frequency1, on = ['user_id', 
                                                                                         'product_id'], how = 'inner')
user_product_interaction.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,user_id,product_id,user_product_purchase_count,user_product_add_to_cart_order,user_product_recency,user_product_recency_order,user_product_orders_since_last_order,user_product_last_dow,user_product_last_tod,user_product_days_since_last_order,user_product_mean_days_between_purchase,user_product_std_days_between_purchase
0,1,0.0,0.0,0.0,190.0,1.0,1,2,2,60.0,60.0,
1,1,196.0,10.0,1.4,97.9,5.5,10,4,2,30.0,23.6,15.557778
2,1,10258.0,9.0,3.333333,87.666667,6.0,10,4,2,30.0,24.555556,16.187272
3,1,10326.0,1.0,5.0,97.0,5.0,5,4,5,60.0,60.0,
4,1,12427.0,10.0,3.3,97.9,5.5,10,4,2,30.0,23.6,15.557778


In [39]:
user_product_interaction['user_product_std_days_between_purchase'].fillna(0, inplace = True)

In [40]:
def get_corr(series):
    if len(series) < 3:
        return 0.0
    mn = min(series)
    mx = max(series)
    uniform_series = range(int(mn),int(mx+1), int((mx - mn)/len(series)) + 1)
    if len(uniform_series) > len(series):
        uniform_series = uniform_series[:len(series)]
    elif len(uniform_series) < len(series):
        while len(uniform_series) < len(series):
            mx += 1
            uniform_series = range(int(mn),int(mx+1), int((mx - mn)/len(series)) + 1)
    
    return np.corrcoef(series, uniform_series)[0][1]

In [41]:
user_product_interaction['user_product_order_corr'] = prior_orders[['user_id', 'product_id', 'order_number']].groupby(['user_id', 'product_id']).aggregate(
    lambda x : tuple(x)).reset_index()['order_number'].map(lambda x : get_corr(x))

In [42]:
user_product_interaction['user_product_day_corr'] = prior_orders[['user_id', 'product_id', 'day']].groupby(['user_id', 'product_id']).aggregate(
    lambda x : tuple(x)).reset_index()['day'].map(lambda x : get_corr(x))

  c /= stddev[:, None]
  c /= stddev[None, :]


In [43]:
user_product_interaction['user_product_day_corr'].fillna(0, inplace = True)

In [44]:
streak_features = prior_orders[['user_id', 'product_id', 'order_number']].sort_values(
    by = ['user_id', 'product_id', 'order_number'])
streak_features['is_streak'] = streak_features['order_number'] - streak_features['order_number'].shift(1)
streak_features['is_streak'][streak_features['product_id'] != streak_features['product_id'].shift(1)] = 0
def get_streak_id(streaks, product_ids):
    curr_id = 0
    streak_ids = []
    prev_product_id = -1
    for v, p in zip(streaks, product_ids):
        if prev_product_id != p:
            curr_id = 0
        if  v != 1:
            curr_id +=1
        streak_ids.append(curr_id)
        prev_product_id = p
    return streak_ids
            
streak_features['streak_id'] = get_streak_id(list(streak_features['is_streak']), list(streak_features['product_id']))
streak_features

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,user_id,product_id,order_number,is_streak,streak_id
0,1,0,1,0.0,1
388513,1,196,1,0.0,1
388514,1,196,2,1.0,1
388515,1,196,3,1.0,1
388516,1,196,4,1.0,1
388517,1,196,5,1.0,1
388518,1,196,6,1.0,1
388519,1,196,7,1.0,1
388520,1,196,8,1.0,1
388521,1,196,9,1.0,1


In [45]:
user_product_interaction['user_product_mean_streak_length'] = streak_features[['user_id', 'product_id', 'streak_id', 'is_streak']
                                ].groupby(['user_id', 'product_id', 'streak_id']).count().reset_index()[['user_id', 'product_id',
                                'is_streak']].groupby(['user_id', 'product_id']).mean().reset_index()['is_streak']
user_product_interaction['user_product_max_streak_length'] = streak_features[['user_id', 'product_id', 'streak_id', 'is_streak']
                                ].groupby(['user_id', 'product_id', 'streak_id']).count().reset_index()[['user_id', 'product_id',
                                'is_streak']].groupby(['user_id', 'product_id']).max().reset_index()['is_streak']
user_product_interaction['user_product_last_streak_length'] = streak_features[['user_id', 'product_id', 'streak_id', 'is_streak']
                                ].groupby(['user_id', 'product_id', 'streak_id']).count().reset_index()[['user_id', 'product_id',
                                'is_streak']].groupby(['user_id', 'product_id']).last().reset_index()['is_streak']

user_product_interaction['user_product_num_streak'] = streak_features[['user_id', 'product_id', 'streak_id']
                                ].groupby(['user_id', 'product_id']).max().reset_index()['streak_id'
                                ]/user_product_interaction['user_product_purchase_count']
user_product_interaction['user_product_in_streak'] = streak_features[['user_id', 'product_id', 'is_streak']
                                ].groupby(['user_id', 'product_id']).last().reset_index()['is_streak']
user_product_interaction['user_product_in_streak'] = user_product_interaction.apply(lambda row :1.0 if
                                        row['user_product_in_streak'] == 1.0 and row['user_product_last_streak_length'] > 3
                                        else 0.0, axis = 1)

user_features['user_mean_streak_length'] = user_product_interaction[['user_id', 'user_product_max_streak_length']].groupby(
                                                        'user_id').mean().reset_index()['user_product_max_streak_length']
product_features['product_mean_streak_length'] = user_product_interaction[['product_id', 'user_product_max_streak_length']].groupby(
                                                        'product_id').mean().reset_index()['user_product_max_streak_length']

In [46]:
product_features['product_mean_days_between_orders'] = user_product_interaction[['product_id',
                'user_product_days_since_last_order']].fillna(method = 'bfill').groupby(
    'product_id').mean().reset_index()['user_product_days_since_last_order']
product_features['product_std_days_between_orders'] = user_product_interaction[['product_id',
                'user_product_days_since_last_order']].fillna(method = 'bfill').groupby(
    'product_id').std().reset_index()['user_product_days_since_last_order']


### User Aisle, Department and Cluster Interaction Features

In [47]:
user_aisle_interaction = prior_orders[['user_id', 'aisle_id', 'order_id']].groupby(
    ['user_id','aisle_id']).count().reset_index()
user_aisle_interaction.columns = ['user_id', 'aisle_id', 'user_aisle_purchase_count']
user_aisle_interaction['user_aisle_orders_since_last_order'] = prior_orders[['aisle_id', 'user_id', 
                            'order_number']].groupby(['user_id', 'aisle_id']).last().reset_index()['order_number']
user_aisle_interaction['user_aisle_add_to_cart_order'] = prior_orders[['user_id', 'aisle_id', 'add_to_cart_order']
                                        ].groupby(['user_id', 'aisle_id']).mean().reset_index()['add_to_cart_order']
user_aisle_interaction['user_aisle_recency'] = prior_orders[['user_id', 'aisle_id', 'days_ago']
                    ].fillna(15).groupby(['user_id', 'aisle_id']).mean().reset_index()['days_ago']


user_department_interaction = prior_orders[['user_id', 'department_id', 'order_id']].groupby(
    ['user_id','department_id']).count().reset_index()
user_department_interaction.columns = ['user_id', 'department_id', 'user_department_purchase_count']

user_department_interaction['user_department_orders_since_last_order'] = prior_orders[['department_id', 'user_id', 
                            'order_number']].groupby(['user_id', 'department_id']).last().reset_index()['order_number']
user_department_interaction['user_department_add_to_cart_order'] = prior_orders[['user_id', 'department_id', 
                'add_to_cart_order']].groupby(['user_id', 'department_id']).mean().reset_index()['add_to_cart_order']
user_department_interaction['user_department_recency'] = prior_orders[['user_id', 'department_id', 'days_ago']
                    ].fillna(15).groupby(['user_id', 'department_id']).mean().reset_index()['days_ago']


user_cluster_interaction = prior_orders[['user_id', 'product_wv_cluster', 'order_id']].groupby(
    ['user_id','product_wv_cluster']).count().reset_index()
user_cluster_interaction.columns = ['user_id', 'product_wv_cluster', 'user_cluster_purchase_count']

user_cluster_interaction['user_cluster_orders_since_last_order'] = prior_orders[['product_wv_cluster', 'user_id', 
                    'order_number']].groupby(['user_id', 'product_wv_cluster']).last().reset_index()['order_number']
user_cluster_interaction['user_cluster_add_to_cart_order'] = prior_orders[['user_id', 'product_wv_cluster', 
                'add_to_cart_order']].groupby(['user_id', 'product_wv_cluster']).mean().reset_index()['add_to_cart_order']
user_cluster_interaction['user_cluster_recency'] = prior_orders[['user_id', 'product_wv_cluster', 'days_ago']
                    ].fillna(15).groupby(['user_id', 'product_wv_cluster']).mean().reset_index()['days_ago']


In [48]:
del user_orders
import gc
gc.collect()

542

### Create Final Data

In [49]:
train_users = orders[orders.eval_set == 'train'].user_id
test_users = orders[orders.eval_set == 'test'].user_id

final_train_data = user_product_interaction[user_product_interaction.user_id.isin(train_users)].merge(
                user_features, on = 'user_id', how = 'inner').merge(
                product_features, on = 'product_id', how = 'inner').merge(
                user_aisle_interaction, on = ['user_id', 'aisle_id'], how = 'inner').merge(
                user_department_interaction,on = ['user_id', 'department_id'], how = 'inner').merge(
                user_cluster_interaction, on = ['user_id', 'product_wv_cluster'], how = 'inner').merge(
                orders[orders.eval_set == 'train'], on = 'user_id', how = 'inner').merge(
                user_daily_order_rate, on = ['user_id', 'order_dow'], how = 'left').merge(
                user_hourly_order_rate, on = ['user_id','order_hour_of_day'], how = 'left').merge(
                product_daily_order_rate, on = ['product_id', 'order_dow'], how = 'left').merge(
                product_hourly_order_rate, on = ['product_id','order_hour_of_day'], how = 'left').merge(
                user_product_daily_order_rate, on = ['user_id','product_id', 'order_dow'], how = 'left').merge(
                user_product_hourly_order_rate, on = ['user_id', 'product_id','order_hour_of_day'], how = 'left').merge(
                user_daily_reorder_rate, on = ['user_id', 'order_dow'], how = 'left').merge(
                user_hourly_reorder_rate,on = ['user_id','order_hour_of_day'], how = 'left').merge(
                product_daily_reorder_rate, on = ['product_id','order_dow'], how = 'left').merge(
                product_hourly_reorder_rate, on = ['product_id','order_hour_of_day'], how = 'left').merge(
                user_product_daily_reorder_rate, on = ['user_id','product_id', 'order_dow'], how = 'left').merge(
                user_product_hourly_reorder_rate, on = ['user_id', 'product_id','order_hour_of_day'], how = 'left').merge(
                user_aisle_daily_reorder_rate, on = ['user_id', 'aisle_id', 'order_dow'], how = 'left').merge(
                user_aisle_hourly_reorder_rate, on = ['user_id', 'aisle_id', 'order_hour_of_day'], how = 'left').merge(
                aisle_features, on = 'aisle_id', how = 'inner').merge(
                department_features, on = 'department_id', how = 'inner').merge(
                cluster_features, on = 'product_wv_cluster', how = 'inner').merge(
                order_products_train, on = ['order_id', 'product_id'], how = 'left')
final_test_data = user_product_interaction[user_product_interaction.user_id.isin(test_users)].merge(
                user_features, on = 'user_id', how = 'inner').merge(
                product_features, on = 'product_id', how = 'inner').merge(
                user_aisle_interaction, on = ['user_id', 'aisle_id'], how = 'inner').merge(
                user_department_interaction, on = ['user_id', 'department_id'], how = 'inner').merge(
                user_cluster_interaction, on = ['user_id', 'product_wv_cluster'], how = 'inner').merge(
                orders[orders.eval_set == 'test'], on = 'user_id', how = 'inner').merge(
                user_daily_order_rate, on = ['user_id', 'order_dow'], how = 'left').merge(
                user_hourly_order_rate, on = ['user_id','order_hour_of_day'], how = 'left').merge(
                product_daily_order_rate, on = ['product_id', 'order_dow'], how = 'left').merge(
                product_hourly_order_rate, on = ['product_id','order_hour_of_day'], how = 'left').merge(
                user_product_daily_order_rate, on = ['user_id','product_id', 'order_dow'], how = 'left').merge(
                user_product_hourly_order_rate, on = ['user_id', 'product_id','order_hour_of_day'], how = 'left').merge(
                user_daily_reorder_rate, on = ['user_id', 'order_dow'], how = 'left').merge(
                user_hourly_reorder_rate, on = ['user_id','order_hour_of_day'], how = 'left').merge(
                product_daily_reorder_rate, on = ['product_id', 'order_dow'], how = 'left').merge(
                product_hourly_reorder_rate, on = ['product_id','order_hour_of_day'], how = 'left').merge(
                user_product_daily_reorder_rate, on = ['user_id','product_id', 'order_dow'], how = 'left').merge(
                user_product_hourly_reorder_rate, on = ['user_id', 'product_id','order_hour_of_day'], how = 'left').merge(
                user_aisle_daily_reorder_rate, on = ['user_id', 'aisle_id', 'order_dow'], how = 'left').merge(
                user_aisle_hourly_reorder_rate, on = ['user_id', 'aisle_id', 'order_hour_of_day'], how = 'left').merge(
                aisle_features, on = 'aisle_id', how = 'inner').merge(
                department_features, on = 'department_id', how = 'inner').merge(
                cluster_features, on = 'product_wv_cluster', how = 'inner')
print final_train_data.shape, final_test_data.shape

final_train_data['user_daily_order_rate'] = final_train_data['user_daily_order_rate']*1.0 / final_train_data['order_number']
final_train_data['user_hourly_order_rate'] = final_train_data['user_hourly_order_rate']*1.0 / final_train_data['order_number']
final_train_data['product_daily_order_rate'] = final_train_data['product_daily_order_rate']*1.0 / final_train_data['product_order_count']
final_train_data['product_hourly_order_rate'] = final_train_data['product_hourly_order_rate']*1.0 / final_train_data['product_order_count']
final_train_data['user_product_daily_order_rate'] = final_train_data['user_product_daily_order_rate']*1.0 / final_train_data['user_product_purchase_count']
final_train_data['user_product_hourly_order_rate'] = final_train_data['user_product_hourly_order_rate']*1.0 / final_train_data['user_product_purchase_count']


final_test_data['user_daily_order_rate'] = final_test_data['user_daily_order_rate']*1.0 / final_test_data['order_number']
final_test_data['user_hourly_order_rate'] = final_test_data['user_hourly_order_rate']*1.0 / final_test_data['order_number']
final_test_data['product_daily_order_rate'] = final_test_data['product_daily_order_rate']*1.0 / final_test_data['product_order_count']
final_test_data['product_hourly_order_rate'] = final_test_data['product_hourly_order_rate']*1.0 / final_test_data['product_order_count']
final_test_data['user_product_daily_order_rate'] = final_test_data['user_product_daily_order_rate']*1.0 / final_test_data['user_product_purchase_count']
final_test_data['user_product_hourly_order_rate'] = final_test_data['user_product_hourly_order_rate']*1.0 / final_test_data['user_product_purchase_count']


del final_train_data['add_to_cart_order']
final_train_data['user_product_days_since_last_order'].fillna(60, inplace = True)
final_train_data['reordered'].fillna(0,inplace = True)
final_train_data['user_daily_order_rate'].fillna(0,inplace = True)
final_train_data['user_hourly_order_rate'].fillna(0,inplace = True)
final_train_data['product_daily_order_rate'].fillna(0,inplace = True)
final_train_data['product_hourly_order_rate'].fillna(0,inplace = True)
final_train_data['user_product_daily_order_rate'].fillna(0,inplace = True)
final_train_data['user_product_hourly_order_rate'].fillna(0,inplace = True)
final_train_data['user_daily_reorder_rate'].fillna(0,inplace = True)
final_train_data['user_hourly_reorder_rate'].fillna(0,inplace = True)
final_train_data['product_daily_reorder_rate'].fillna(0,inplace = True)
final_train_data['product_hourly_reorder_rate'].fillna(0,inplace = True)
final_train_data['user_product_daily_reorder_rate'].fillna(0,inplace = True)
final_train_data['user_product_hourly_reorder_rate'].fillna(0,inplace = True)
final_train_data['user_aisle_daily_reorder_rate'].fillna(0,inplace = True)
final_train_data['user_aisle_hourly_reorder_rate'].fillna(0,inplace = True)


final_test_data['user_product_days_since_last_order'].fillna(60, inplace = True)
final_test_data['user_daily_order_rate'].fillna(0,inplace = True)
final_test_data['user_hourly_order_rate'].fillna(0,inplace = True)
final_test_data['product_daily_order_rate'].fillna(0,inplace = True)
final_test_data['product_hourly_order_rate'].fillna(0,inplace = True)
final_test_data['user_product_daily_order_rate'].fillna(0,inplace = True)
final_test_data['user_product_hourly_order_rate'].fillna(0,inplace = True)
final_test_data['user_daily_reorder_rate'].fillna(0,inplace = True)
final_test_data['user_hourly_reorder_rate'].fillna(0,inplace = True)
final_test_data['product_daily_reorder_rate'].fillna(0,inplace = True)
final_test_data['product_hourly_reorder_rate'].fillna(0,inplace = True)
final_test_data['user_product_daily_reorder_rate'].fillna(0,inplace = True)
final_test_data['user_product_hourly_reorder_rate'].fillna(0,inplace = True)
final_test_data['user_aisle_daily_reorder_rate'].fillna(0,inplace = True)
final_test_data['user_aisle_hourly_reorder_rate'].fillna(0,inplace = True)


(8605870, 157) (4908292, 155)


In [50]:
for var in final_train_data.columns:
    if final_train_data[var].dtype == 'float64':
        final_train_data[var] = final_train_data[var].astype(np.float32)
    elif final_train_data[var].dtype == 'int64':
        final_train_data[var] = final_train_data[var].astype(np.int32)
for var in final_test_data.columns:
    if final_test_data[var].dtype == 'float64':
        final_test_data[var] = final_test_data[var].astype(np.float32)
    elif final_test_data[var].dtype == 'int64':
        final_test_data[var] = final_test_data[var].astype(np.int32)

In [51]:
final_train_data['user_product_orders_since_last_order'] = (final_train_data['order_number'] - 
                                                            final_train_data['user_product_orders_since_last_order'] )
final_test_data['user_product_orders_since_last_order'] = (final_test_data['order_number'] - 
                                                           final_test_data['user_product_orders_since_last_order'] )
final_train_data['user_product_last_dow_diff'] = final_train_data['user_product_last_dow'] - final_train_data['order_dow']
final_train_data['user_product_last_dow_diff'] = final_train_data['user_product_last_dow_diff'].map(lambda x: min(abs(x), 7 - abs(x)))
final_test_data['user_product_last_dow_diff'] = final_test_data['user_product_last_dow'] - final_test_data['order_dow']
final_test_data['user_product_last_dow_diff'] = final_test_data['user_product_last_dow_diff'].map(lambda x: min(abs(x), 7 - abs(x)))

final_train_data['user_product_last_tod_diff'] = final_train_data['user_product_last_tod'] - final_train_data['order_hour_of_day']
final_train_data['user_product_last_tod_diff'] = final_train_data['user_product_last_tod_diff'].map(lambda x: min(abs(x), 8 - abs(x)))
final_test_data['user_product_last_tod_diff'] = final_test_data['user_product_last_tod'] - final_test_data['order_hour_of_day']
final_test_data['user_product_last_tod_diff'] = final_test_data['user_product_last_tod_diff'].map(lambda x: min(abs(x), 8 - abs(x)))



final_train_data['user_last_order_dow_diff'] = final_train_data['user_last_order_dow_diff'] - final_train_data['order_dow']
final_train_data['user_last_order_dow_diff'] = final_train_data['user_last_order_dow_diff'].map(lambda x: min(abs(x), 7 - abs(x)))
final_test_data['user_last_order_dow_diff'] = final_test_data['user_last_order_dow_diff'] - final_test_data['order_dow']
final_test_data['user_last_order_dow_diff'] = final_test_data['user_last_order_dow_diff'].map(lambda x: min(abs(x), 7 - abs(x)))

final_train_data['user_last_order_tod_diff'] = final_train_data['user_last_order_tod_diff'] - final_train_data['order_hour_of_day']
final_train_data['user_last_order_tod_diff'] = final_train_data['user_last_order_tod_diff'].map(lambda x: min(abs(x), 8 - abs(x)))
final_test_data['user_last_order_tod_diff'] = final_test_data['user_last_order_tod_diff'] - final_test_data['order_hour_of_day']
final_test_data['user_last_order_tod_diff'] = final_test_data['user_last_order_tod_diff'].map(lambda x: min(abs(x), 8 - abs(x)))


In [52]:
final_train_data['user_product_purchase_due'] = final_train_data[['user_product_days_since_last_order',
                'product_mean_days_between_orders','product_std_days_between_orders']].apply(lambda row : 1.0 if 
                row['user_product_days_since_last_order'] > row['product_mean_days_between_orders'] + 
                row['product_std_days_between_orders'] else 0.0, axis = 1)

final_test_data['user_product_purchase_due'] = final_test_data[['user_product_days_since_last_order',
                'product_mean_days_between_orders','product_std_days_between_orders']].apply(lambda row : 1.0 if 
                row['user_product_days_since_last_order'] > row['product_mean_days_between_orders'] + 
                row['product_std_days_between_orders'] else 0.0, axis = 1)

In [53]:
final_train_data['user_product_purchase_ratio'] = final_train_data['user_product_purchase_count']*1.0/final_train_data['order_number']
final_test_data['user_product_purchase_ratio'] = final_test_data['user_product_purchase_count']*1.0/final_test_data['order_number']


final_train_data['posterior_probability_product'] = (final_train_data['product_order_count']/final_train_data['product_user_count'])*(
                                                    final_train_data['user_product_purchase_ratio'])
final_test_data['posterior_probability_product'] = (final_test_data['product_order_count']/final_test_data['product_user_count'])*(
                                                    final_test_data['user_product_purchase_ratio'])

final_train_data['posterior_probability_aisle'] = final_train_data['aisle_reorder_frequency']*final_train_data[
                    'user_aisle_purchase_count']/final_train_data['order_number']
final_test_data['posterior_probability_aisle'] = final_test_data['aisle_reorder_frequency']*final_test_data[
                    'user_aisle_purchase_count']/final_test_data['order_number']

final_train_data['posterior_probability_cluster'] = final_train_data['cluster_reorder_frequency']*final_train_data[
                    'user_cluster_purchase_count']/final_train_data['order_number']
final_test_data['posterior_probability_cluster'] = final_test_data['cluster_reorder_frequency']*final_test_data[
                    'user_cluster_purchase_count']/final_test_data['order_number']

final_train_data['posterior_probability_department'] = final_train_data['department_reorder_frequency']*final_train_data[
                    'user_department_purchase_count']/final_train_data['order_number']
final_test_data['posterior_probability_department'] = final_test_data['department_reorder_frequency']*final_test_data[
                    'user_department_purchase_count']/final_test_data['order_number']

# final_train_data['posterior_probability_jj_daily'] = final_train_data['user_daily_order_rate']*final_train_data['product_daily_order_rate']
# final_train_data['posterior_probability_jj_hourly'] = final_train_data['user_hourly_order_rate']*final_train_data['product_hourly_order_rate']

# final_test_data['posterior_probability_jj_daily'] = final_test_data['user_daily_order_rate']*final_test_data['product_daily_order_rate']
# final_test_data['posterior_probability_jj_hourly'] = final_test_data['user_hourly_order_rate']*final_test_data['product_hourly_order_rate']


In [54]:
final_train_data['user_num_orders_with_no_reorder'] = final_train_data['order_number'] - final_train_data['user_num_orders_with_no_reorder']
final_train_data['user_fraction_orders_with_no_reorder'] = final_train_data['user_fraction_orders_with_no_reorder'].map(lambda x : 1.0-x)
final_test_data['user_num_orders_with_no_reorder'] = final_test_data['order_number'] - final_test_data['user_num_orders_with_no_reorder']
final_test_data['user_fraction_orders_with_no_reorder'] = final_test_data['user_fraction_orders_with_no_reorder'].map(lambda x : 1.0-x)

In [55]:
def get_similarity(row):
    return np.dot(wv_model.wv['p' + str(int(row.product_id))],wv_model.docvecs['u' + str(int(row.user_id))] )
final_train_data['user_product_similarity'] = final_train_data[['user_id', 'product_id']].apply(get_similarity, axis = 1)
final_test_data['user_product_similarity'] = final_test_data[['user_id', 'product_id']].apply(get_similarity, axis = 1)


In [56]:
product_none_similarity = pd.read_csv("../Data/product_none_similarity.csv")
final_train_data = final_train_data.merge(product_none_similarity, on = 'product_id', how = 'inner')
final_test_data = final_test_data.merge(product_none_similarity, on = 'product_id', how = 'inner')

In [57]:
del final_train_data['product_name']
del final_test_data['product_name']

In [58]:
final_train_data = final_train_data.merge(users, on = 'user_id', how = 'inner')
final_test_data = final_test_data.merge(users, on = 'user_id', how = 'inner')

In [59]:
final_train_data['product_std_days_between_orders'].fillna(0, inplace = True)
final_test_data['product_std_days_between_orders'].fillna(0, inplace = True)

In [60]:
for var in final_test_data.columns:
    null_count = final_test_data[var].isnull().sum()
    if null_count > 0:
        print var, null_count

In [61]:
final_train_data.to_csv("../Data/final_train_data.csv", index = False)
final_test_data.to_csv("../Data/final_test_data.csv", index = False)

In [62]:
final_train_data.shape

(8605870, 176)