# Kaggle Food Demand

## Imports

In [67]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import category_encoders as ce

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
import xgboost as xgb

%matplotlib inline
plt.style.use('seaborn-darkgrid')

In [2]:
def set_pd_options():
    options = {
        'display': {
            'max_columns': None,
            'max_colwidth': 35,
            'expand_frame_repr': False,  # Don't wrap to multiple pages
            'max_rows': 200,
            'max_seq_items': 50,         # Max length of printed sequence
            'precision': 6,
            'show_dimensions': False
        },
        # 'mode': {
        #     'chained_assignment': None   # Controls SettingWithCopyWarning
        # }
    }
    
    for category, option in options.items():
        for op, value in option.items():
            pd.set_option(f'{category}.{op}', value)  # Python 3.6+
    print('pandas options updated')

In [3]:
# to display slanted column headings
def display_df(df, level=1):
    """
    display dataframe with rotated column names
    """
    from IPython.core.display import display, HTML

    style = """
    <style>
    th.rotate {height: 140px; white-space: nowrap;
    }

    th.rotate > div {transform: translate(25px, 51px) rotate(315deg); width: 30px;
    }

    th.rotate > div > span {border-bottom: 1px solid #ccc;  padding: 5px 10px;
    }
    </style>
    """
    dfhtml = style + df.to_html()

    try:               colnames = df.columns.get_level_values(level).values
    except IndexError: colnames = df.columns.values

    for name in colnames:
        dfhtml = dfhtml.replace(f'<th>{name}</th>', f'<th class="rotate"><div><span>{name}</span></div></th>')

    display(HTML(dfhtml))

## Dataset

### Merge Datasets

In [4]:
def merge_dataframe(csv_file='train.csv', centerinfo_csv='fulfilment_center_info.csv', 
                    mealinfo_csv='meal_info.csv'):
    ### change csv_file input to 'test.csv' to clean the test data
    ### note that to use default inputs, the csv files must be in the same directory as the notebook
    df = pd.read_csv(csv_file)
    df_centerinfo = pd.read_csv(centerinfo_csv)
    df_mealinfo = pd.read_csv(mealinfo_csv)
    print('loaded files')
    
    combined_df = df.merge(df_centerinfo, how='left', on='center_id')
    combined_df = combined_df.merge(df_mealinfo, how='left', on='meal_id')
    print('combined_df:', combined_df.shape)

    combined_df.set_index(keys='id',inplace=True)

    print(combined_df.shape)
    return combined_df

In [5]:
train_df = merge_dataframe(csv_file='dat/kaggle/train.csv', centerinfo_csv='dat/kaggle/fulfilment_center_info.csv', mealinfo_csv='dat/kaggle/meal_info.csv')
train_df.head()

loaded files
combined_df: (453280, 15)
(453280, 14)


Unnamed: 0_level_0,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,category,cuisine
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1379560,1,55,1885,136.83,152.29,0,0,177,647,56,TYPE_C,2.0,Beverages,Thai
1466964,1,55,1993,136.83,135.83,0,0,270,647,56,TYPE_C,2.0,Beverages,Thai
1346989,1,55,2539,134.86,135.86,0,0,189,647,56,TYPE_C,2.0,Beverages,Thai
1338232,1,55,2139,339.5,437.53,0,0,54,647,56,TYPE_C,2.0,Beverages,Indian
1448490,1,55,2631,243.5,242.5,0,0,40,647,56,TYPE_C,2.0,Beverages,Indian


In [6]:
test_df = merge_dataframe(csv_file='dat/kaggle/test.csv', centerinfo_csv='dat/kaggle/fulfilment_center_info.csv', mealinfo_csv='dat/kaggle/meal_info.csv')
test_df.head()

loaded files
combined_df: (3268, 14)
(3268, 13)


Unnamed: 0_level_0,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,city_code,region_code,center_type,op_area,category,cuisine
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1336751,145,55,1885,156.23,156.23,0,0,647,56,TYPE_C,2.0,Beverages,Thai
1401715,145,55,1993,159.14,158.14,0,0,647,56,TYPE_C,2.0,Beverages,Thai
1428343,145,55,2539,158.11,160.11,0,0,647,56,TYPE_C,2.0,Beverages,Thai
1040648,145,55,2139,292.03,292.03,0,0,647,56,TYPE_C,2.0,Beverages,Indian
1482790,145,55,2631,165.93,165.93,0,0,647,56,TYPE_C,2.0,Beverages,Indian


In [7]:
train_df.dtypes

week                       int64
center_id                  int64
meal_id                    int64
checkout_price           float64
base_price               float64
emailer_for_promotion      int64
homepage_featured          int64
num_orders                 int64
city_code                  int64
region_code                int64
center_type               object
op_area                  float64
category                  object
cuisine                   object
dtype: object

In [8]:
test_df.dtypes

week                       int64
center_id                  int64
meal_id                    int64
checkout_price           float64
base_price               float64
emailer_for_promotion      int64
homepage_featured          int64
city_code                  int64
region_code                int64
center_type               object
op_area                  float64
category                  object
cuisine                   object
dtype: object

In [9]:
train_df.nunique()

week                      144
center_id                  77
meal_id                    51
checkout_price           1992
base_price               1905
emailer_for_promotion       2
homepage_featured           2
num_orders               1250
city_code                  51
region_code                 8
center_type                 3
op_area                    30
category                   14
cuisine                     4
dtype: int64

In [10]:
test_df.nunique()

week                       1
center_id                 77
meal_id                   51
checkout_price           606
base_price               589
emailer_for_promotion      2
homepage_featured          2
city_code                 51
region_code                8
center_type                3
op_area                   30
category                  14
cuisine                    4
dtype: int64

### Change Data Types 

In [11]:
# center_id, meal_id, city_code, region_code is numerical, so need to convert to category
train_df[['center_id', 'meal_id', 'city_code', 'region_code']] = train_df[['center_id', 'meal_id', 'city_code', 'region_code']].astype('category')
test_df[['center_id', 'meal_id', 'city_code', 'region_code']] = test_df[['center_id', 'meal_id', 'city_code', 'region_code']].astype('category')

In [12]:
train_df.dtypes

week                        int64
center_id                category
meal_id                  category
checkout_price            float64
base_price                float64
emailer_for_promotion       int64
homepage_featured           int64
num_orders                  int64
city_code                category
region_code              category
center_type                object
op_area                   float64
category                   object
cuisine                    object
dtype: object

In [13]:
test_df.dtypes

week                        int64
center_id                category
meal_id                  category
checkout_price            float64
base_price                float64
emailer_for_promotion       int64
homepage_featured           int64
city_code                category
region_code              category
center_type                object
op_area                   float64
category                   object
cuisine                    object
dtype: object

### Dummy Variables 

In [14]:
train_new_df = pd.get_dummies(train_df, prefix_sep='_', columns=['region_code', 'center_type', 'category', 'cuisine'], drop_first=True)

In [15]:
display_df(train_new_df.head())

Unnamed: 0_level_0,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,op_area,region_code_34,region_code_35,region_code_56,region_code_71,region_code_77,region_code_85,region_code_93,center_type_TYPE_B,center_type_TYPE_C,category_Biryani,category_Desert,category_Extras,category_Fish,category_Other Snacks,category_Pasta,category_Pizza,category_Rice Bowl,category_Salad,category_Sandwich,category_Seafood,category_Soup,category_Starters,cuisine_Indian,cuisine_Italian,cuisine_Thai
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
1379560,1,55,1885,136.83,152.29,0,0,177,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1466964,1,55,1993,136.83,135.83,0,0,270,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1346989,1,55,2539,134.86,135.86,0,0,189,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1338232,1,55,2139,339.5,437.53,0,0,54,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1448490,1,55,2631,243.5,242.5,0,0,40,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [16]:
train_new_df.shape

(453280, 35)

In [17]:
test_new_df = pd.get_dummies(test_df, prefix_sep='_', columns=['region_code', 'center_type', 'category', 'cuisine'], drop_first=True)

In [18]:
display_df(test_new_df.head())

Unnamed: 0_level_0,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,city_code,op_area,region_code_34,region_code_35,region_code_56,region_code_71,region_code_77,region_code_85,region_code_93,center_type_TYPE_B,center_type_TYPE_C,category_Biryani,category_Desert,category_Extras,category_Fish,category_Other Snacks,category_Pasta,category_Pizza,category_Rice Bowl,category_Salad,category_Sandwich,category_Seafood,category_Soup,category_Starters,cuisine_Indian,cuisine_Italian,cuisine_Thai
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
1336751,145,55,1885,156.23,156.23,0,0,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1401715,145,55,1993,159.14,158.14,0,0,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1428343,145,55,2539,158.11,160.11,0,0,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1040648,145,55,2139,292.03,292.03,0,0,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1482790,145,55,2631,165.93,165.93,0,0,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [19]:
test_new_df.shape

(3268, 34)

### Add & Drop Columns

In [20]:
# add new columns - percentage discount
train_new_df['pct_disct'] = (train_new_df['base_price'] - train_new_df['checkout_price']) / train_new_df['base_price']
test_new_df['pct_disct'] = (test_new_df['base_price'] - test_new_df['checkout_price']) / test_new_df['base_price']

In [21]:
# drop column due to high correlation
train_new_df = train_new_df.drop('checkout_price',axis=1)
test_new_df = test_new_df.drop('checkout_price',axis=1)

### Target Encoding

In [22]:
# left with center_id, meal_id, city_code - use target encoder (mean and sum)

In [23]:
for catg in ['center_id', 'meal_id', 'city_code']:
    train_new_df[catg+'_mean'] = train_new_df.groupby([catg])['num_orders'].transform('mean')
    train_new_df[catg+'_sum']  = train_new_df.groupby([catg])['num_orders'].transform('sum')

In [24]:
display_df(train_new_df.head())

Unnamed: 0_level_0,week,center_id,meal_id,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,op_area,region_code_34,region_code_35,region_code_56,region_code_71,region_code_77,region_code_85,region_code_93,center_type_TYPE_B,center_type_TYPE_C,category_Biryani,category_Desert,category_Extras,category_Fish,category_Other Snacks,category_Pasta,category_Pizza,category_Rice Bowl,category_Salad,category_Sandwich,category_Seafood,category_Soup,category_Starters,cuisine_Indian,cuisine_Italian,cuisine_Thai,pct_disct,center_id_mean,center_id_sum,meal_id_mean,meal_id_sum,city_code_mean,city_code_sum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
1379560,1,55,1885,152.29,0,0,177,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.101517,163.768147,837019,752.73182,8291341,276.787149,3252249
1466964,1,55,1993,135.83,0,0,270,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-0.007362,163.768147,837019,601.365975,6638479,276.787149,3252249
1346989,1,55,2539,135.86,0,0,189,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.007361,163.768147,837019,310.212261,3218142,276.787149,3252249
1338232,1,55,2139,437.53,0,0,54,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.224053,163.768147,837019,42.708121,377070,276.787149,3252249
1448490,1,55,2631,242.5,0,0,40,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,-0.004124,163.768147,837019,104.350024,1083675,276.787149,3252249


In [25]:
train_new_df.shape

(453280, 41)

In [26]:
# need to add new target encoded columns into test dataset

In [27]:
target_df = train_new_df[['center_id', 'meal_id', 'city_code', 'center_id_mean', 'center_id_sum', 'meal_id_mean', 'meal_id_sum', 'city_code_mean', 'city_code_sum']]
target_df.head()

Unnamed: 0_level_0,center_id,meal_id,city_code,center_id_mean,center_id_sum,meal_id_mean,meal_id_sum,city_code_mean,city_code_sum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1379560,55,1885,647,163.768147,837019,752.73182,8291341,276.787149,3252249
1466964,55,1993,647,163.768147,837019,601.365975,6638479,276.787149,3252249
1346989,55,2539,647,163.768147,837019,310.212261,3218142,276.787149,3252249
1338232,55,2139,647,163.768147,837019,42.708121,377070,276.787149,3252249
1448490,55,2631,647,163.768147,837019,104.350024,1083675,276.787149,3252249


In [28]:
# separate into each df
target_center_df = target_df[['center_id', 'center_id_mean', 'center_id_sum']]
target_meal_df   = target_df[['meal_id', 'meal_id_mean', 'meal_id_sum']]
target_city_df   = target_df[['city_code', 'city_code_mean', 'city_code_sum']]

In [29]:
# dropping duplicates on each df above
target_center_df = target_center_df.drop_duplicates()
target_meal_df   = target_meal_df.drop_duplicates()
target_city_df   = target_city_df.drop_duplicates()

In [30]:
target_center_df.shape

(77, 3)

In [31]:
target_center_df.sample(10)

Unnamed: 0_level_0,center_id,center_id_mean,center_id_sum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1287740,53,175.591868,1140118
1356543,64,180.32716,993242
1472084,109,264.128759,1651333
1280872,186,153.483607,842625
1368822,108,331.275431,2250354
1171094,13,610.156683,4268046
1138424,153,229.887485,1528292
1345877,101,246.040783,1508230
1379648,50,239.047749,1396756
1244454,14,198.127188,1188565


In [32]:
target_meal_df.shape

(51, 3)

In [33]:
target_meal_df.sample(10)

Unnamed: 0_level_0,meal_id,meal_id_mean,meal_id_sum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1044821,1971,518.120363,5712277
1137548,2569,351.447772,3297283
1325272,2704,164.498511,1602051
1107611,1770,28.495627,188983
1153657,2956,92.641354,303771
1446016,2290,880.737275,9741835
1058482,2826,461.238342,5064397
1379560,1885,752.73182,8291341
1338232,2139,42.708121,377070
1346989,2539,310.212261,3218142


In [34]:
target_city_df.shape

(51, 3)

In [35]:
target_city_df.sample(10)

Unnamed: 0_level_0,city_code,city_code_mean,city_code_sum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1230148,683,192.952309,1015508
1219520,700,406.020175,2777178
1212128,685,447.553939,3107367
1004691,632,177.61796,933560
1118999,526,211.887886,9156735
1379648,556,239.047749,1396756
1040403,614,279.686355,3144234
1034383,659,269.487683,1619082
1307163,675,236.666549,1342136
1205653,485,254.533686,1443206


In [36]:
# merge on center_id
test_new_df = test_new_df.merge(target_center_df, how='left', on='center_id')

In [37]:
# merge on meal_id
test_new_df = test_new_df.merge(target_meal_df, how='left', on='meal_id')

In [38]:
# merge on city_code
test_new_df = test_new_df.merge(target_city_df, how='left', on='city_code')

In [39]:
display_df(test_new_df.head())

Unnamed: 0,week,center_id,meal_id,base_price,emailer_for_promotion,homepage_featured,city_code,op_area,region_code_34,region_code_35,region_code_56,region_code_71,region_code_77,region_code_85,region_code_93,center_type_TYPE_B,center_type_TYPE_C,category_Biryani,category_Desert,category_Extras,category_Fish,category_Other Snacks,category_Pasta,category_Pizza,category_Rice Bowl,category_Salad,category_Sandwich,category_Seafood,category_Soup,category_Starters,cuisine_Indian,cuisine_Italian,cuisine_Thai,pct_disct,center_id_mean,center_id_sum,meal_id_mean,meal_id_sum,city_code_mean,city_code_sum
0,145,55,1885,156.23,0,0,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,163.768147,837019,752.73182,8291341,276.787149,3252249
1,145,55,1993,158.14,0,0,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-0.006324,163.768147,837019,601.365975,6638479,276.787149,3252249
2,145,55,2539,160.11,0,0,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.012491,163.768147,837019,310.212261,3218142,276.787149,3252249
3,145,55,2139,292.03,0,0,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,163.768147,837019,42.708121,377070,276.787149,3252249
4,145,55,2631,165.93,0,0,647,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,163.768147,837019,104.350024,1083675,276.787149,3252249


### Check Null Values

In [40]:
test_new_df.isna().sum()

week                     0
center_id                0
meal_id                  0
base_price               0
emailer_for_promotion    0
homepage_featured        0
city_code                0
op_area                  0
region_code_34           0
region_code_35           0
region_code_56           0
region_code_71           0
region_code_77           0
region_code_85           0
region_code_93           0
center_type_TYPE_B       0
center_type_TYPE_C       0
category_Biryani         0
category_Desert          0
category_Extras          0
category_Fish            0
category_Other Snacks    0
category_Pasta           0
category_Pizza           0
category_Rice Bowl       0
category_Salad           0
category_Sandwich        0
category_Seafood         0
category_Soup            0
category_Starters        0
cuisine_Indian           0
cuisine_Italian          0
cuisine_Thai             0
pct_disct                0
center_id_mean           0
center_id_sum            0
meal_id_mean             0
m

In [41]:
train_new_df.isna().sum()

week                     0
center_id                0
meal_id                  0
base_price               0
emailer_for_promotion    0
homepage_featured        0
num_orders               0
city_code                0
op_area                  0
region_code_34           0
region_code_35           0
region_code_56           0
region_code_71           0
region_code_77           0
region_code_85           0
region_code_93           0
center_type_TYPE_B       0
center_type_TYPE_C       0
category_Biryani         0
category_Desert          0
category_Extras          0
category_Fish            0
category_Other Snacks    0
category_Pasta           0
category_Pizza           0
category_Rice Bowl       0
category_Salad           0
category_Sandwich        0
category_Seafood         0
category_Soup            0
category_Starters        0
cuisine_Indian           0
cuisine_Italian          0
cuisine_Thai             0
pct_disct                0
center_id_mean           0
center_id_sum            0
m

In [55]:
# drop center_id, meal_id, city_code
train_new_df = train_new_df.drop(['center_id', 'meal_id', 'city_code'],axis=1)
test_new_df  = test_new_df.drop(['center_id', 'meal_id', 'city_code'],axis=1)

In [56]:
train_new_df.dtypes

week                       int64
base_price               float64
emailer_for_promotion      int64
homepage_featured          int64
num_orders                 int64
op_area                  float64
region_code_34             uint8
region_code_35             uint8
region_code_56             uint8
region_code_71             uint8
region_code_77             uint8
region_code_85             uint8
region_code_93             uint8
center_type_TYPE_B         uint8
center_type_TYPE_C         uint8
category_Biryani           uint8
category_Desert            uint8
category_Extras            uint8
category_Fish              uint8
category_Other Snacks      uint8
category_Pasta             uint8
category_Pizza             uint8
category_Rice Bowl         uint8
category_Salad             uint8
category_Sandwich          uint8
category_Seafood           uint8
category_Soup              uint8
category_Starters          uint8
cuisine_Indian             uint8
cuisine_Italian            uint8
cuisine_Th

In [57]:
test_new_df.dtypes

week                       int64
base_price               float64
emailer_for_promotion      int64
homepage_featured          int64
op_area                  float64
region_code_34             uint8
region_code_35             uint8
region_code_56             uint8
region_code_71             uint8
region_code_77             uint8
region_code_85             uint8
region_code_93             uint8
center_type_TYPE_B         uint8
center_type_TYPE_C         uint8
category_Biryani           uint8
category_Desert            uint8
category_Extras            uint8
category_Fish              uint8
category_Other Snacks      uint8
category_Pasta             uint8
category_Pizza             uint8
category_Rice Bowl         uint8
category_Salad             uint8
category_Sandwich          uint8
category_Seafood           uint8
category_Soup              uint8
category_Starters          uint8
cuisine_Indian             uint8
cuisine_Italian            uint8
cuisine_Thai               uint8
pct_disct 

## Final Train and Test Dataset

In [58]:
display_df(train_new_df.head())

Unnamed: 0_level_0,week,base_price,emailer_for_promotion,homepage_featured,num_orders,op_area,region_code_34,region_code_35,region_code_56,region_code_71,region_code_77,region_code_85,region_code_93,center_type_TYPE_B,center_type_TYPE_C,category_Biryani,category_Desert,category_Extras,category_Fish,category_Other Snacks,category_Pasta,category_Pizza,category_Rice Bowl,category_Salad,category_Sandwich,category_Seafood,category_Soup,category_Starters,cuisine_Indian,cuisine_Italian,cuisine_Thai,pct_disct,center_id_mean,center_id_sum,meal_id_mean,meal_id_sum,city_code_mean,city_code_sum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
1379560,1,152.29,0,0,177,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.101517,163.768147,837019,752.73182,8291341,276.787149,3252249
1466964,1,135.83,0,0,270,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-0.007362,163.768147,837019,601.365975,6638479,276.787149,3252249
1346989,1,135.86,0,0,189,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.007361,163.768147,837019,310.212261,3218142,276.787149,3252249
1338232,1,437.53,0,0,54,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.224053,163.768147,837019,42.708121,377070,276.787149,3252249
1448490,1,242.5,0,0,40,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,-0.004124,163.768147,837019,104.350024,1083675,276.787149,3252249


In [59]:
train_new_df.shape

(453280, 38)

In [60]:
display_df(test_new_df.head())

Unnamed: 0,week,base_price,emailer_for_promotion,homepage_featured,op_area,region_code_34,region_code_35,region_code_56,region_code_71,region_code_77,region_code_85,region_code_93,center_type_TYPE_B,center_type_TYPE_C,category_Biryani,category_Desert,category_Extras,category_Fish,category_Other Snacks,category_Pasta,category_Pizza,category_Rice Bowl,category_Salad,category_Sandwich,category_Seafood,category_Soup,category_Starters,cuisine_Indian,cuisine_Italian,cuisine_Thai,pct_disct,center_id_mean,center_id_sum,meal_id_mean,meal_id_sum,city_code_mean,city_code_sum
0,145,156.23,0,0,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,163.768147,837019,752.73182,8291341,276.787149,3252249
1,145,158.14,0,0,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-0.006324,163.768147,837019,601.365975,6638479,276.787149,3252249
2,145,160.11,0,0,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.012491,163.768147,837019,310.212261,3218142,276.787149,3252249
3,145,292.03,0,0,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,163.768147,837019,42.708121,377070,276.787149,3252249
4,145,165.93,0,0,2.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,163.768147,837019,104.350024,1083675,276.787149,3252249


In [61]:
test_new_df.shape

(3268, 37)

# Modelling

## Feature and Target Variables 

In [62]:
X_train = train_new_df.drop(['num_orders'], axis=1)
y_train = train_new_df['num_orders']

In [63]:
X_train.shape

(453280, 37)

### Train Test Split

In [64]:
# do a train-test(i.e. validate) split on X_train data 
(X_train_train, 
 X_train_valid, 
 y_train_train, 
 y_train_valid) = train_test_split(X_train, y_train, test_size=0.2)

print(X_train_train.shape, '\t', X_train_valid.shape)
print(y_train_train.shape, '\t', y_train_valid.shape)

(362624, 37) 	 (90656, 37)
(362624,) 	 (90656,)


## XGBoost

### Train and Validate on Train data

In [104]:
xgb_params = {'learning_rate':0.20, 'colsample_bytree':0.8, 'max_depth':7, 
              'n_estimators':1000, 'n_jobs':-1, 'random_state':0}

In [105]:
%%time
xgb_reg = xgb.XGBRegressor(**xgb_params)
xgb_reg.fit(X=X_train_train, y=y_train_train,
            eval_set=[(X_train_valid, y_train_valid)],  # this is tell the model to check with 'test' data, if test data is not any better, it will stop running
            early_stopping_rounds=20, verbose=50)

[0]	validation_0-rmse:407.33829
[50]	validation_0-rmse:153.55533
[100]	validation_0-rmse:142.65559
[150]	validation_0-rmse:136.79248
[200]	validation_0-rmse:133.92230
[250]	validation_0-rmse:132.18755
[300]	validation_0-rmse:131.00420
[350]	validation_0-rmse:129.94989
[400]	validation_0-rmse:128.96623
[450]	validation_0-rmse:128.00821
[500]	validation_0-rmse:127.50134
[550]	validation_0-rmse:126.70706
[600]	validation_0-rmse:126.47147
[611]	validation_0-rmse:126.47234
Wall time: 3min 59s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [106]:
y_train_train_pred = xgb_reg.predict(X_train_train)
y_train_valid_pred = xgb_reg.predict(X_train_valid)

In [107]:
print('rmse:', f'{mean_squared_error(y_train_train, y_train_train_pred, squared=False):,.2f}')
print('rmse:', f'{mean_squared_error(y_train_valid, y_train_valid_pred, squared=False):,.2f}')

rmse: 84.09
rmse: 126.44


### Test Data

#### Features 

In [108]:
X_test = test_new_df

In [110]:
xgb_params = {'learning_rate':0.20, 'colsample_bytree':0.8, 'max_depth':7, 
              'n_estimators':1000, 'n_jobs':-1, 'random_state':0}

In [111]:
%%time
xgb_reg = xgb.XGBRegressor(**xgb_params)
xgb_reg.fit(X=X_train, y=y_train)

Wall time: 7min 12s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [112]:
# prediction on test set
preds = xgb_reg.predict(X_test)

In [116]:
# feature importances
feat_imp = pd.DataFrame({'feat':X_train.columns,
                         'imp':xgb_reg.feature_importances_})
feat_imp = feat_imp.sort_values('imp', ascending=False).reset_index(drop=True)
feat_imp.style.background_gradient()

Unnamed: 0,feat,imp
0,homepage_featured,0.150465
1,meal_id_sum,0.136802
2,category_Rice Bowl,0.110887
3,cuisine_Indian,0.070522
4,emailer_for_promotion,0.064762
5,cuisine_Thai,0.062314
6,category_Sandwich,0.055254
7,center_id_sum,0.040528
8,meal_id_mean,0.040487
9,center_id_mean,0.039944


# Submission 

In [113]:
test_df['num_orders'] = preds

In [114]:
submission = test_df['num_orders']
submission

id
1336751    199.607101
1401715    229.828918
1428343    104.862457
1040648     24.091131
1482790     51.942398
              ...    
1271326     68.780495
1062036     44.252476
1110849    466.464050
1147725    456.533356
1361984    186.662750
Name: num_orders, Length: 3268, dtype: float32

In [115]:
# saving dataframe into csv to be exported 
submission.to_csv('submission3.csv')

In [96]:
# # to re-run the entire thing - need to drop 'num_orders' column from test set
# test_new_df = test_new_df.drop(['num_orders'], axis=1)