In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from catboost import Pool

## Merging the CSVs to create the main dataset

In [20]:
# Read the four raw CSV inputs from the local data folder:
# the train/test fact tables and the two lookup tables.
DATA_DIR = 'datasets'
train = pd.read_csv(f'{DATA_DIR}/train_file.csv')
test = pd.read_csv(f'{DATA_DIR}/test_file.csv')
fulfilment_center = pd.read_csv(f'{DATA_DIR}/fulfilment_center_info.csv')
meal_info = pd.read_csv(f'{DATA_DIR}/meal_info.csv')

In [21]:
# Preview the first rows of the raw training table (before any merge).
train.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders
0,1379560,1,55,1885,136.83,152.29,0,0,177
1,1466964,1,55,1993,136.83,135.83,0,0,270
2,1346989,1,55,2539,134.86,135.86,0,0,189
3,1338232,1,55,2139,339.5,437.53,0,0,54
4,1448490,1,55,2631,243.5,242.5,0,0,40


In [22]:
# Preview the first rows of the raw test table (before any merge).
test.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured
0,1028232,146,55,1885,158.11,159.11,0,0
1,1127204,146,55,1993,160.11,159.11,0,0
2,1212707,146,55,2539,157.14,159.14,0,0
3,1082698,146,55,2631,162.02,162.02,0,0
4,1400926,146,55,1248,163.93,163.93,0,0


In [23]:
# Preview the fulfilment-center lookup table, which is joined on center_id below.
fulfilment_center.head()

Unnamed: 0,center_id,city_code,region_code,center_type,op_area
0,11,679,56,TYPE_A,3.7
1,13,590,56,TYPE_B,6.7
2,124,590,56,TYPE_C,4.0
3,66,648,34,TYPE_A,4.1
4,94,632,34,TYPE_C,3.6


In [24]:
# Preview the meal lookup table, which is joined on meal_id below.
meal_info.head()

Unnamed: 0,meal_id,category,cuisine
0,1885,Beverages,Thai
1,1993,Beverages,Thai
2,2539,Beverages,Thai
3,1248,Beverages,Indian
4,2631,Beverages,Indian


In [25]:
# Attach fulfilment-center and meal attributes to every train/test row.
# Both lookups are expected to hold one row per key; validate='many_to_one'
# makes pandas raise a MergeError instead of silently duplicating rows if a
# lookup table ever contains a repeated center_id / meal_id.
# NOTE(review): these are inner joins (pandas default), so a row whose key is
# missing from a lookup table is dropped — confirm that is intended for `test`.
train = pd.merge(train, fulfilment_center, on='center_id', validate='many_to_one')
test = pd.merge(test, fulfilment_center, on='center_id', validate='many_to_one')

train = pd.merge(train, meal_info, on='meal_id', validate='many_to_one')
test = pd.merge(test, meal_info, on='meal_id', validate='many_to_one')

In [26]:
# Inspect column dtypes and non-null counts of the merged training table.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456548 entries, 0 to 456547
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     456548 non-null  int64  
 1   week                   456548 non-null  int64  
 2   center_id              456548 non-null  int64  
 3   meal_id                456548 non-null  int64  
 4   checkout_price         456548 non-null  float64
 5   base_price             456548 non-null  float64
 6   emailer_for_promotion  456548 non-null  int64  
 7   homepage_featured      456548 non-null  int64  
 8   num_orders             456548 non-null  int64  
 9   city_code              456548 non-null  int64  
 10  region_code            456548 non-null  int64  
 11  center_type            456548 non-null  object 
 12  op_area                456548 non-null  float64
 13  category               456548 non-null  object 
 14  cuisine                456548 non-nu

In [27]:
# Preview the merged training table (now carrying the center and meal columns).
train.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,category,cuisine
0,1379560,1,55,1885,136.83,152.29,0,0,177,647,56,TYPE_C,2.0,Beverages,Thai
1,1466964,1,55,1993,136.83,135.83,0,0,270,647,56,TYPE_C,2.0,Beverages,Thai
2,1346989,1,55,2539,134.86,135.86,0,0,189,647,56,TYPE_C,2.0,Beverages,Thai
3,1338232,1,55,2139,339.5,437.53,0,0,54,647,56,TYPE_C,2.0,Beverages,Indian
4,1448490,1,55,2631,243.5,242.5,0,0,40,647,56,TYPE_C,2.0,Beverages,Indian


In [28]:
# Outlier removal.
# Training rows whose num_orders exceeds 15000 are treated as outliers and
# dropped by index label; this runs before the target is log-transformed.
is_outlier = train['num_orders'] > 15000
train.drop(index=train.loc[is_outlier].index, inplace=True)

In [29]:
# Replace the training target with log(1 + num_orders).
# np.expm1 is the inverse transform for mapping values back to order counts.
# NOTE: re-running this cell applies the log a second time.
log_orders = np.log1p(train['num_orders'])
train['num_orders'] = log_orders

In [30]:
# Tag each frame with its origin so the combined table can be split again
# after feature engineering.
train['train_test'] = 'train'
test['train_test'] = 'test'

# Stack train on top of test, keep train's column order (test has no
# num_orders column, so it is NaN for test rows), then order the rows so each
# (center_id, meal_id) pair is contiguous and sorted by week.
total_data = (
    pd.concat([train, test], ignore_index=True)
    .loc[:, train.columns]
    .sort_values(by=['center_id', 'meal_id', 'week'])
    .reset_index(drop=True)
)


In [31]:
# Log-transform both price columns, then derive two discount features.
# NOTE(review): both derived features are computed from the log1p-transformed
# prices, not the raw prices — confirm this is intended.
# NOTE: re-running this cell applies log1p to the prices a second time.
log_checkout = np.log1p(total_data['checkout_price'])  # price after discount
log_base = np.log1p(total_data['base_price'])

total_data['checkout_price'] = log_checkout
total_data['base_price'] = log_base
total_data['discount_on_base'] = (log_base - log_checkout) / log_base
total_data['discount_ratio'] = log_base / log_checkout

In [32]:
# Relative change in (log) checkout price versus the preceding row of
# total_data. The shift is not group-aware, so the first row of each
# (center_id, meal_id) group is compared against the last row of the previous
# group; the very first row has no predecessor and evaluates to 1 / 1.
prev_checkout = total_data['checkout_price'].shift(1)
price_delta = (prev_checkout - total_data['checkout_price']).fillna(1)
total_data['price_last_curr_diff'] = price_delta / prev_checkout.fillna(1)

In [33]:
# For every (center_id, meal_id) pair, overwrite 'price_last_curr_diff' in the
# pair's first week with the mean of that pair's remaining weeks.
# Vectorised: one groupby/transform pass instead of re-scanning the whole frame
# once per pair.
pair_keys = [total_data['center_id'], total_data['meal_id']]

# First week of each pair, broadcast back onto every row of that pair.
first_week = total_data.groupby(['center_id', 'meal_id'])['week'].transform('first')
is_first_week = total_data['week'] == first_week

# Mean over the non-first-week rows of each pair: first-week values are masked
# to NaN so the (NaN-skipping) mean ignores them.
rest_mean = (
    total_data['price_last_curr_diff']
    .where(~is_first_week)
    .groupby(pair_keys)
    .transform('mean')
)
total_data.loc[is_first_week, 'price_last_curr_diff'] = rest_mean[is_first_week]

# Pairs that only have a single week have no "remaining weeks"; their mean is
# NaN and is replaced by 0.
total_data['price_last_curr_diff'] = total_data['price_last_curr_diff'].fillna(0)

In [34]:
# Lag features: num_orders from 10, 11 and 12 rows earlier within the same
# (center_id, meal_id) pair, with Gaussian noise (std 1.6) added to every value.
# num_orders is log1p-transformed for train rows and NaN for test rows here.
# NOTE(review): the lag is counted in rows, which equals weeks only when a pair
# has no missing weeks — confirm.
lags = [10, 11, 12]
# Seeded generator so the noisy features are identical on every Run All.
noise_rng = np.random.default_rng(42)
gpby = total_data.groupby(['center_id', 'meal_id'])
for i in lags:
    noise = noise_rng.normal(scale=1.6, size=len(total_data))
    total_data[f'num_orders_lag_{i}'] = gpby['num_orders'].shift(i).values + noise

In [35]:
# Exponentially weighted mean features built from lagged num_orders.
# For each shift s, num_orders is shifted by s rows within its
# (center_id, meal_id) pair, then smoothed with an EWM (alpha=0.5) computed
# separately per pair so that values never leak across pairs.
gpby = total_data.groupby(['center_id', 'meal_id'])
shift = [10, 11, 12, 13, 14, 15]
alpha = 0.5
for s in shift:
    lagged = gpby['num_orders'].shift(s)
    # transform() returns a Series aligned to the original row index, so no
    # index juggling is needed (groupby.apply's index shape depends on the
    # pandas version / group_keys setting).
    total_data[f'num_orders_lag_{s}_ewm_{alpha}'] = (
        lagged
        .groupby([total_data['center_id'], total_data['meal_id']])
        .transform(lambda x: x.ewm(alpha=alpha).mean())
    )


In [36]:
# Persist the engineered feature table, then preview it. head() must be the
# last expression in the cell for the notebook to display it.
total_data.to_csv('total_data.csv', index=False)
total_data.head()