In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
IDIR = "/content/drive/MyDrive/Intern-TB/data_csv/"   #The directory of file

In [None]:
'''Loading prior order data'''

print('loading prior')

# Compact integer dtype for each column of the prior-orders table; applied in a
# single astype call right after the CSV is read.
prior_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}
priors = pd.read_csv(IDIR + 'order_products__prior.csv').astype(prior_dtypes)


loading prior


In [None]:
priors.dtypes


order_id              int32
product_id           uint16
add_to_cart_order     int16
reordered              int8
dtype: object

In [None]:
'''Loading Training data'''
print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv').astype({
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
})
# The last column of this CSV is read under the name "reordered\r" (note the
# trailing carriage return); store an int8 copy under the clean name.
train["reordered"] = train["reordered\r"].astype(np.int8)

loading train


In [None]:
# Drop the raw "reordered\r" column; the cleanly named "reordered" column
# created when loading train carries the same values.
# errors="ignore" keeps this cell re-runnable once the column is already gone.
train = train.drop(columns=["reordered\r"], errors="ignore")

In [None]:
train.dtypes

order_id              int32
product_id           uint16
add_to_cart_order     int16
reordered              int8
dtype: object

In [None]:
'''Loading order data'''

print('loading orders')
# Explicit compact dtypes for the columns of orders.csv; eval_set is stored as
# a pandas category.
orders_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'eval_set': 'category',
    'order_number': np.int16,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
    'days_since_prior_order': np.float32,
}
orders = pd.read_csv(IDIR + 'orders.csv', dtype=orders_dtypes)

loading orders


In [None]:
'''Loading Products Data'''

print('loading products')
# Only the id columns are loaded (usecols); the dtype map therefore lists
# exactly those three columns. The former 'order_id' entry referred to a
# column that is never read here and has been removed.
products = pd.read_csv(
    IDIR + 'products.csv',
    usecols=['product_id', 'aisle_id', 'department_id'],
    dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8,
    },
)

loading products


In [None]:


# One summary line per loaded table: "<name> <shape>: <comma-separated columns>".
for label, frame in (('priors', priors), ('orders', orders), ('train', train)):
    print('{} {}: {}'.format(label, frame.shape, ', '.join(frame.columns)))


priors (498, 4): order_id, product_id, add_to_cart_order, reordered
orders (245, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (758, 4): order_id, product_id, add_to_cart_order, reordered


In [None]:

print('computing product f')
prods = pd.DataFrame()
print(prods)
# Group the prior rows by product once and derive both per-product aggregates
# from the same grouping: how many prior rows mention the product, and how many
# of those rows are flagged as reorders.
by_product = priors.groupby(priors.product_id)
prods['orders'] = by_product.size().astype(np.int32)
prods['reorders'] = by_product['reordered'].sum().astype(np.float32)

# prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)

# products = products.join(prods, on='product_id')
# products.set_index('product_id', drop=False, inplace=True)
# del prods


computing product f
Empty DataFrame
Columns: []
Index: []


In [None]:
prods

Unnamed: 0_level_0,orders,reorders
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
40577,2,1.0
40578,3,2.0
40579,6,5.0
40582,7,6.0
40583,5,4.0
...,...,...
40962,4,3.0
41256,1,0.0
41266,1,0.0
41275,1,0.0


In [None]:
# Share of a product's prior rows that were reorders.
prods['reorder_rate'] = prods['reorders'].div(prods['orders']).astype(np.float32)



In [None]:
# Attach the per-product aggregates to the product table. The merge is an
# inner join on product_id; the merged frame is then indexed by product_id
# while the column itself is kept as well.
products = (
    products.reset_index(drop=True)
    .merge(prods.reset_index(drop=False), on='product_id')
    .set_index("product_id", drop=False)
)

# The aggregates now live in `products`; release the temporary frame.
del prods

In [None]:
# Display only: the result of set_index is not assigned, so `products` itself
# is left unchanged by this cell.
products.set_index(["product_id"])

Unnamed: 0_level_0,aisle_id,department_id,orders,reorders,reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
40601,1,4,23,22.0,0.956522
40617,1,4,6,5.0,0.833333
40650,2,3,21,20.0,0.952381
40733,4,1,12,11.0,0.916667
40745,1,4,17,16.0,0.941176
...,...,...,...,...,...
40602,4,1,2,1.0,0.500000
40714,2,3,1,0.0,0.000000
40648,2,3,4,3.0,0.750000
40839,1,4,1,0.0,0.000000


In [None]:
print('add order info to priors')
# Index orders by order_id while keeping the column, then pull every order
# attribute onto the matching prior rows. Columns that clash get a trailing
# underscore; the resulting duplicate 'order_id_' is discarded.
orders = orders.set_index('order_id', drop=False)
priors = (
    priors.join(orders, on='order_id', rsuffix='_')
    .drop(columns='order_id_')
)

add order info to priors


The "computing user f" step below calculates, for each user, the average number of days between two orders and the total number of orders placed.

In [None]:
print('computing user f')
# Per-user statistics taken from the orders table: mean of
# days_since_prior_order and the number of order rows.
orders_by_user = orders.groupby('user_id')
usr = pd.DataFrame({
    'average_days_between_orders': orders_by_user['days_since_prior_order'].mean().astype(np.float32),
    'nb_orders': orders_by_user.size().astype(np.int16),
})
usr

computing user f


Unnamed: 0_level_0,average_days_between_orders,nb_orders
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
277,0.348361,245


Next, from priors we add, for each user, the total number of items bought, the set of all products bought, and the number of distinct items bought.

In [None]:
# Per-user aggregates over the prior rows, combined with the order statistics
# held in `usr`.
priors_by_user = priors.groupby('user_id')
users = (
    pd.DataFrame({
        'total_items': priors_by_user.size().astype(np.int16),
        'all_products': priors_by_user['product_id'].apply(set),
    })
    .assign(total_distinct_items=lambda frame: frame['all_products'].map(len).astype(np.int16))
    .join(usr)
)

# usr has been folded into users and is no longer needed.
del usr

# Average number of items per order for each user.
users['average_basket'] = users['total_items'].div(users['nb_orders']).astype(np.float32)
print('user f', users.shape)
users

user f (1, 6)


Unnamed: 0_level_0,total_items,all_products,total_distinct_items,average_days_between_orders,nb_orders,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
277,498,"{40962, 40577, 40578, 40579, 40582, 40583, 405...",82,0.348361,245,2.032653


In [None]:
users.columns

Index(['total_items', 'all_products', 'total_distinct_items',
       'average_days_between_orders', 'nb_orders', 'average_basket'],
      dtype='object')

In [None]:
print('compute userXproduct f - this is long...')
# Composite key packing (user_id, product_id) into one integer; product_id is
# uint16, so it always stays below the 100000 multiplier.
# NOTE(review): user_id is loaded as int32, so user_id * 100000 exceeds the
# int32 range once user_id > 21474 -- confirm the dtype of this expression
# before running on a dataset with larger user ids. features() rebuilds the
# same key with the same formula, so the two must be changed together.
priors['user_product'] = priors.product_id + priors.user_id * 100000

# An earlier groupby/apply version (kept below for reference) was too slow:
#def last_order(order_group):
#    ix = order_group.order_number.idxmax
#    return order_group.shape[0], order_group.order_id[ix],  order_group.add_to_cart_order.mean()
#userXproduct = pd.DataFrame()
#userXproduct['tmp'] = df.groupby('user_product').apply(last_order)


compute userXproduct f - this is long...


In [None]:
# Single pass over the prior rows, keeping per (user, product) key:
#   (number of rows, latest (order_number, order_id) pair, sum of cart positions)
d = dict()
for row in priors.itertuples():
    key = row.user_product           # composite key built in the previous cell
    latest = (row.order_number, row.order_id)
    entry = d.get(key)
    if entry is None:
        d[key] = (1, latest, row.add_to_cart_order)
    else:
        count, newest, pos_sum = entry
        d[key] = (count + 1, max(newest, latest), pos_sum + row.add_to_cart_order)

print('to dataframe (less memory)')
userXproduct = pd.DataFrame.from_dict(d, orient='index')
del d
userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart']
userXproduct = userXproduct.astype({'nb_orders': np.int16, 'sum_pos_in_cart': np.int16})
# Keep only the order_id part of the stored (order_number, order_id) pair.
userXproduct['last_order_id'] = userXproduct['last_order_id'].map(lambda x: x[1]).astype(np.int32)
print('user X product f', len(userXproduct))

# priors is not used after this point; free it.
del priors

to dataframe (less memory)
user X product f 82


In [None]:
### train / test orders ###
print('split orders : train, test')
# Partition the orders table by its eval_set marker.
test_orders = orders.loc[orders['eval_set'] == 'test']
train_orders = orders.loc[orders['eval_set'] == 'train']

# Index train by (order_id, product_id), keeping both columns, so that pair
# membership can be tested against train.index.
train = train.set_index(['order_id', 'product_id'], drop=False)


split orders : train, test


In [None]:
# Feature extractor usable for any subset of orders.
def features(selected_orders, labels_given=False):
    """Build the candidate (order, product) feature table for the given orders.

    For every row of ``selected_orders`` one candidate row is produced per
    product in that user's ``users.all_products`` set. The function reads the
    notebook-level frames ``users``, ``orders``, ``products``,
    ``userXproduct`` and, when ``labels_given`` is true, ``train``.

    Parameters
    ----------
    selected_orders : DataFrame
        Rows to build candidates for; ``order_id`` and ``user_id`` are read
        from each row.
    labels_given : bool, default False
        When true, each candidate is labelled by whether its
        ``(order_id, product_id)`` pair is present in ``train.index``.

    Returns
    -------
    tuple
        ``(df, labels)``: the feature DataFrame and an int8 array of labels
        (empty when ``labels_given`` is false).
    """
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    for row_number, order in enumerate(selected_orders.itertuples(), start=1):
        if row_number % 10000 == 0:
            print('order row', row_number)
        candidates = users.all_products[order.user_id]
        product_list.extend(candidates)
        order_list.extend([order.order_id] * len(candidates))
        if labels_given:
            labels.extend((order.order_id, product) in train.index for product in candidates)

    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list, product_list

    print('user related features')
    # user_id is only a lookup key here; it is kept as a local Series and never
    # becomes a column of the returned frame.
    user_id = df['order_id'].map(orders['user_id'])
    df['user_total_orders'] = user_id.map(users["nb_orders"])
    df['user_total_items'] = user_id.map(users['total_items'])
    df['total_distinct_items'] = user_id.map(users['total_distinct_items'])
    df['user_average_days_between_orders'] = user_id.map(users['average_days_between_orders'])
    df['user_average_basket'] = user_id.map(users['average_basket'])

    print('order related features')
    # (order_dow is not mapped in as a feature here.)
    df['order_hour_of_day'] = df['order_id'].map(orders['order_hour_of_day'])
    df['days_since_prior_order'] = df['order_id'].map(orders['days_since_prior_order'])
    df['days_since_ratio'] = df['days_since_prior_order'] / df['user_average_days_between_orders']

    print('product related features')
    df['aisle_id'] = df['product_id'].map(products['aisle_id'])
    df['department_id'] = df['product_id'].map(products['department_id'])
    df['product_orders'] = df['product_id'].map(products['orders']).astype(np.int32)
    df['product_reorders'] = df['product_id'].map(products['reorders'])
    df['product_reorder_rate'] = df['product_id'].map(products['reorder_rate'])

    print('user_X_product related features')
    # Same composite key formula as priors['user_product'], which indexes
    # userXproduct. NOTE(review): the two formulas must stay identical.
    up_key = user_id * 100000 + df['product_id']
    last_order_id = up_key.map(userXproduct['last_order_id'])
    df['UP_orders'] = up_key.map(userXproduct['nb_orders'])
    df['UP_orders_ratio'] = (df['UP_orders'] / df['user_total_orders']).astype(np.float32)
    df['UP_average_pos_in_cart'] = (up_key.map(userXproduct['sum_pos_in_cart']) / df['UP_orders']).astype(np.float32)
    # NOTE(review): same formula as UP_orders_ratio above -- confirm whether a
    # different definition was intended for this feature.
    df['UP_reorder_rate'] = (df['UP_orders'] / df['user_total_orders']).astype(np.float32)
    df['UP_orders_since_last'] = df['user_total_orders'] - last_order_id.map(orders['order_number'])
    # Circular hour distance: a gap of x hours counts as min(x, 24 - x).
    hour_gap = (df['order_hour_of_day'] - last_order_id.map(orders['order_hour_of_day'])).abs()
    df['UP_delta_hour_vs_last'] = hour_gap.map(lambda x: min(x, 24-x)).astype(np.int8)

    print(df.dtypes)
    print(df.memory_usage())
    return (df, labels)
    

In [None]:
df_train, labels = features(train_orders, labels_given=True)


build candidate list
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              uint8
department_id                         uint8
product_orders                        int32
product_reorders                    float32
product_reorder_rate                float32
UP_orders                             int16
UP_orders_ratio                     float32
UP_average_pos_in_cart              float32
UP_reorder_rate                     float32
UP_orders_since_last                  int

In [None]:
# Columns handed to the model. 'dow' and 'UP_same_dow_as_last_order' are not
# included.
f_to_use = [
    'user_total_orders',
    'user_total_items',
    'total_distinct_items',
    'user_average_days_between_orders',
    'user_average_basket',
    'order_hour_of_day',
    'days_since_prior_order',
    'days_since_ratio',
    'aisle_id',
    'department_id',
    'product_orders',
    'product_reorders',
    'product_reorder_rate',
    'UP_orders',
    'UP_orders_ratio',
    'UP_average_pos_in_cart',
    'UP_reorder_rate',
    'UP_orders_since_last',
    'UP_delta_hour_vs_last',
]


print('formating for lgb')
# aisle_id and department_id are declared as categorical features.
d_train = lgb.Dataset(
    df_train[f_to_use],
    label=labels,
    categorical_feature=['aisle_id', 'department_id'],
)
# The Dataset has been built from df_train; drop the frame.
del df_train

formating for lgb


In [None]:
# Number of boosting rounds passed to lgb.train.
ROUNDS = 100

# LightGBM settings: gradient-boosted trees on a binary objective, evaluated
# with binary log-loss.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'binary_logloss'},
    num_leaves=96,
    max_depth=10,
    feature_fraction=0.9,
    bagging_fraction=0.95,
    bagging_freq=5,
)

print('light GBM train :-)')
bst = lgb.train(params, d_train, num_boost_round=ROUNDS)
# lgb.plot_importance(bst, figsize=(9,20))
del d_train

light GBM train :-)




In [None]:
df_test, _ = features(test_orders)

print('light GBM predict')
preds = bst.predict(df_test[f_to_use])

df_test['pred'] = preds

TRESHOLD = 0.31  # guess, should be tuned with crossval on a subset of train data

# Collect, per order, the product ids whose predicted probability exceeds the
# threshold. setdefault replaces the former bare try/except, which would have
# silently swallowed any unrelated error raised inside the loop.
picked = {}
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        picked.setdefault(row.order_id, []).append(str(row.product_id))

# Submission format: one space-separated string of product ids per order.
d = {order_id: ' '.join(product_ids) for order_id, product_ids in picked.items()}

# Orders with no product above the threshold get the literal string 'None'.
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

# Built from explicit columns so that an empty result still yields a frame
# with the two expected columns.
sub = pd.DataFrame({'order_id': list(d.keys()), 'products': list(d.values())})
sub.to_csv('sub.csv', index=False)


build candidate list
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              uint8
department_id                         uint8
product_orders                        int32
product_reorders                    float32
product_reorder_rate                float32
UP_orders                             int16
UP_orders_ratio                     float32
UP_average_pos_in_cart              float32
UP_reorder_rate                     float32
UP_orders_since_last                  int

In [None]:
# pd.read_csv(IDIR+"")
df_test

Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_hour_of_day,days_since_prior_order,days_since_ratio,...,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last,pred
0,62408,40962,245,498,82,0.348361,2.032653,16,1.0,2.870588,...,4,3.0,0.750000,4,0.016327,12.500000,0.016327,191,0,0.001154
1,62408,40577,245,498,82,0.348361,2.032653,16,1.0,2.870588,...,2,1.0,0.500000,2,0.008163,2.000000,0.008163,169,8,0.010214
2,62408,40578,245,498,82,0.348361,2.032653,16,1.0,2.870588,...,3,2.0,0.666667,3,0.012245,2.666667,0.012245,184,1,0.250201
3,62408,40579,245,498,82,0.348361,2.032653,16,1.0,2.870588,...,6,5.0,0.833333,6,0.024490,5.666667,0.024490,171,7,0.054223
4,62408,40582,245,498,82,0.348361,2.032653,16,1.0,2.870588,...,7,6.0,0.857143,7,0.028571,1.142857,0.028571,171,7,0.413263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3357,63720,40870,245,498,82,0.348361,2.032653,9,0.0,0.000000,...,1,0.0,0.000000,1,0.004082,0.000000,0.004082,165,8,0.001057
3358,63720,40876,245,498,82,0.348361,2.032653,9,0.0,0.000000,...,1,0.0,0.000000,1,0.004082,30.000000,0.004082,221,2,0.000035
3359,63720,40924,245,498,82,0.348361,2.032653,9,0.0,0.000000,...,10,9.0,0.900000,10,0.040816,2.400000,0.040816,165,8,0.000479
3360,63720,40928,245,498,82,0.348361,2.032653,9,0.0,0.000000,...,1,0.0,0.000000,1,0.004082,1.000000,0.004082,197,9,0.000122


In [None]:
test_orders

Unnamed: 0_level_0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
62408,62408,277,test,205,6,16,1.0
62453,62453,277,test,206,6,16,0.0
62454,62454,277,test,207,6,12,0.0
62455,62455,277,test,208,6,9,0.0
62530,62530,277,test,209,7,13,1.0
62577,62577,277,test,210,1,14,1.0
62602,62602,277,test,211,1,14,0.0
62604,62604,277,test,212,1,12,0.0
62792,62792,277,test,213,3,12,2.0
62794,62794,277,test,214,3,13,0.0


In [None]:
# Load test.csv; its product_id values are grouped per order_id further down
# and compared with the predicted products.
test = pd.read_csv(IDIR + "test.csv")
test

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,62408,40588,3.0,1
1,62408,40589,3.0,1
2,62408,40594,80.0,1
3,62408,40630,5.0,1
4,62408,40690,10.0,1
...,...,...,...,...
224,63720,40630,10.0,1
225,63720,40643,40.0,1
226,63720,40690,15.0,1
227,63720,40729,1.0,1


In [None]:
order_id1 = list(test.order_id)

In [None]:
products1 = list(test.product_id)

In [None]:
# Distinct order ids present in test.csv.
order_id_test = set(order_id1)

# Map each order_id to the list of its product_ids with a single groupby
# instead of re-filtering the whole frame once per order. This also avoids
# using `id` as a loop variable, which shadowed the builtin for the rest of
# the notebook. Keys come out sorted by order_id.
test_dict = test.groupby("order_id")["product_id"].apply(list).to_dict()

     
    

In [None]:
test_dict

{63618: [40587, 40588, 40589, 40632, 40649, 40669, 40672, 40724, 40760],
 63619: [40617, 40643, 40773, 40801],
 62602: [40586, 40626, 40643, 40806, 40844, 40848],
 62859: [40586, 40590, 40635],
 62604: [40611, 40617, 40797],
 63249: [40611, 40773],
 62995: [40579,
  40583,
  40587,
  40588,
  40601,
  40629,
  40643,
  40672,
  40690,
  40760,
  40769],
 62996: [40745, 40769, 40773, 40923],
 63265: [40587, 40588, 40589, 40594, 40601, 40619, 40690, 40714, 40760, 40774],
 63138: [40587, 40632, 40648, 40658, 40724, 40729, 40760, 40806, 40923],
 63521: [40617, 40646, 40650, 40923],
 63522: [40647, 40649, 40733],
 63281: [40646, 40650, 40733, 40773],
 63030: [40962],
 63162: [40601, 40650, 40658],
 63420: [40579, 40587, 40588, 40589, 40629, 40630, 40648, 40690, 40760, 40773],
 63421: [40635, 40643],
 62913: [40617, 40646, 40649, 40650, 40773],
 62530: [40587,
  40588,
  40589,
  40627,
  40629,
  40631,
  40648,
  40690,
  40753,
  40760,
  40771,
  40774,
  40806,
  40841,
  40923],
 62915

In [None]:
order_id_test

{62408,
 62453,
 62454,
 62455,
 62530,
 62577,
 62602,
 62604,
 62792,
 62794,
 62859,
 62913,
 62915,
 62916,
 62995,
 62996,
 63030,
 63081,
 63082,
 63102,
 63138,
 63162,
 63203,
 63249,
 63265,
 63281,
 63337,
 63338,
 63420,
 63421,
 63461,
 63464,
 63521,
 63522,
 63580,
 63581,
 63584,
 63618,
 63619,
 63715,
 63720}

In [None]:
pred_dict = {}

In [None]:
# Copy the submission rows into pred_dict: order_id -> predicted products string.
for row_label in sub.index:
    pred_dict[sub.at[row_label, "order_id"]] = sub.at[row_label, "products"]

In [None]:
pred_dict

{62408: '40582 40587 40588 40589 40590 40594 40630 40632 40637 40643 40648 40649 40690 40696 40724 40760 40774',
 62453: '40586 40630',
 62454: '40587 40643 40760 40773',
 62455: '40760',
 62530: '40582 40587 40588 40589 40594 40613 40669 40760',
 62577: '40587 40594 40643 40760',
 62602: '40587 40643 40650 40760',
 62604: '40587 40643 40760 40773',
 62792: '40594',
 62794: '40586 40587 40643',
 62859: '40582 40588 40690 40760',
 62913: '40587 40590 40594 40630 40632 40648 40690 40760 40774 40796',
 62915: '40586 40630',
 62916: '40587 40601 40690 40760 40774',
 62995: '40643 40650 40724 40760',
 62996: '40586 40630',
 63030: '40579 40582 40587 40588 40589 40594 40613 40643 40760 40773 40841',
 63081: '40760',
 63082: '40760',
 63102: '40643 40650 40724 40760',
 63138: '40760',
 63162: '40586 40587 40643',
 63203: '40587 40594 40643 40760',
 63249: '40587 40643 40650 40760',
 63265: '40587 40588 40589 40594 40690 40760',
 63281: '40760',
 63337: '40586 40587 40643',
 63338: '40587 4059

In [None]:
def sim(test,pred):
    """Similarity score between the actual and the predicted product lists.

    Counts the items of ``test`` whose string form is contained in ``pred``
    and divides that count by ``len(pred)``.

    Parameters
    ----------
    test : iterable
        Actual product ids (compared via ``str(item)``).
    pred : sequence of str
        Predicted product ids as strings.

    Returns
    -------
    float
        ``matches / len(pred)``; 0.0 when ``pred`` is empty (this case
        previously raised ZeroDivisionError).
    """
    if len(pred) == 0:
        return 0.0
    count = sum(1 for item in test if str(item) in pred)
    return float(count) / float(len(pred))

# Score every predicted order against its actual products. A comprehension is
# used so that no loop variable named `id` (shadowing the builtin) is left
# behind in the notebook namespace.
pred_score = [
    sim(test_dict[order_id], predicted.split(" "))
    for order_id, predicted in pred_dict.items()
]

  
  
    

In [None]:
pred_score

[0.29411764705882354,
 0.0,
 0.5,
 0.0,
 0.5,
 1.0,
 0.25,
 0.0,
 0.0,
 0.3333333333333333,
 0.0,
 0.0,
 0.0,
 0.0,
 0.5,
 0.0,
 0.0,
 0.0,
 1.0,
 0.25,
 1.0,
 0.0,
 0.5,
 0.0,
 1.0,
 0.0,
 0.6666666666666666,
 0.5,
 0.25,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0]

In [None]:
sum(pred_score)/len(pred_score) # average per-order prediction score; it may change when the threshold value is varied

0.25717360114777615

We were provided with the data of a single customer. The model may not perform well on a single customer, but with a larger amount of user data it could perform better.