In [1]:
import sys
from pathlib import Path

# Make the project root importable so that `src.features` resolves on any
# machine, instead of relying on one developer's absolute Windows path.
# NOTE(review): assumes the notebook is run from a folder directly under the
# project root (the same assumption the '../data/...' paths in this notebook
# make) — confirm.
PROJECT_ROOT = Path.cwd().resolve().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Import standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotnine as p9
%matplotlib inline
color = sns.color_palette()

# Import local libraries
from src.features import build_features

import warnings

# Let pandas render up to 100 columns and 100 rows before truncating output.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [2]:
# Load data files
def _read_interim_csv(path):
    """Read an interim CSV and discard the stray 'Unnamed: 0' column.

    The interim training file carries an 'Unnamed: 0' column (the row index
    written out by an earlier save). It is not a real feature, so it is
    removed here before any features are built or written to disk.
    """
    frame = pd.read_csv(path)
    # errors='ignore' makes this a no-op for files that lack the column.
    return frame.drop(columns=['Unnamed: 0'], errors='ignore')


train = _read_interim_csv(r'../data/interim/train_data.csv')
val = _read_interim_csv(r'../data/interim/val_data.csv')
test = _read_interim_csv(r'../data/interim/test_data.csv')

full_original_train_data = _read_interim_csv(r'../data/interim/full_original_train_data.csv')
full_original_test_data = _read_interim_csv(r'../data/interim/full_original_test_data.csv')

### Build Features

#### Features on Center_id

In [3]:
# Work on independent copies so the frames loaded from disk stay untouched
train_df, val_df, test_df = (frame.copy() for frame in (train, val, test))

full_original_train_data_df, full_original_test_data_df = (
    frame.copy() for frame in (full_original_train_data, full_original_test_data)
)

In [4]:
# Preview the first two rows of the working training copy to sanity-check its columns
train_df.head(2)

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,category,cuisine,city_code,region_code,center_type,op_area
0,1379560,1,55,1885,136.83,152.29,0,0,177,Beverages,Thai,647,56,TYPE_C,2.0
1,1466964,1,55,1993,136.83,135.83,0,0,270,Beverages,Thai,647,56,TYPE_C,2.0


In [5]:
# First derived feature: discount = base_price - checkout_price,
# added in place as a new column on every working frame.
for frame in (
    train_df,
    val_df,
    test_df,
    full_original_train_data_df,
    full_original_test_data_df,
):
    frame['discount'] = frame['base_price'] - frame['checkout_price']

In [6]:
# For each week--center_id combination, derive the mean Checkout Price (CP)
# and mean discount (D) across all meal_ids (per build_features.features_by_center).
(
    train_df_1,
    val_df_1,
    test_df_1,
    full_original_train_data_df_1,
    full_original_test_data_df_1,
) = map(
    build_features.features_by_center,
    (
        train_df,
        val_df,
        test_df,
        full_original_train_data_df,
        full_original_test_data_df,
    ),
)

In [7]:
# Report (rows, columns) of every frame after the center-level features
for frame in (
    train_df_1,
    val_df_1,
    test_df_1,
    full_original_train_data_df_1,
    full_original_test_data_df_1,
):
    print(frame.shape)

(407243, 52)
(32929, 52)
(16376, 52)
(456548, 52)
(32573, 51)


In [8]:
# Add the total number of meals for each week--center_id combination
(
    train_df_2,
    val_df_2,
    test_df_2,
    full_original_train_data_df_2,
    full_original_test_data_df_2,
) = [
    build_features.total_meals_by_center(frame)
    for frame in (
        train_df_1,
        val_df_1,
        test_df_1,
        full_original_train_data_df_1,
        full_original_test_data_df_1,
    )
]

In [9]:
# One shape per line, in the order: train, val, test, full train, full test
print(
    train_df_2.shape,
    val_df_2.shape,
    test_df_2.shape,
    full_original_train_data_df_2.shape,
    full_original_test_data_df_2.shape,
    sep="\n",
)

(407243, 53)
(32929, 53)
(16376, 53)
(456548, 53)
(32573, 52)


#### Features on Emailer Promotion (EP) and Homepage Featured (HF)

In [10]:
# Count, by category and by cuisine, how many meal_ids were featured on the
# homepage and how many were promoted by emailers
(
    train_df_3,
    val_df_3,
    test_df_3,
    full_original_train_data_df_3,
    full_original_test_data_df_3,
) = map(
    build_features.features_by_ep_or_hf,
    (
        train_df_2,
        val_df_2,
        test_df_2,
        full_original_train_data_df_2,
        full_original_test_data_df_2,
    ),
)

In [11]:
# Report (rows, columns) of every frame after the EP / HF features
for frame in (
    train_df_3,
    val_df_3,
    test_df_3,
    full_original_train_data_df_3,
    full_original_test_data_df_3,
):
    print(frame.shape)

(407243, 57)
(32929, 57)
(16376, 57)
(456548, 57)
(32573, 56)


#### Features on City and Region

In [12]:
# Per region and per city: total and mean operating area, plus the ratio of
# each center's op area to its region total and to its city total
(
    train_df_4,
    val_df_4,
    test_df_4,
    full_original_train_data_df_4,
    full_original_test_data_df_4,
) = [
    build_features.features_by_city_or_region(frame)
    for frame in (
        train_df_3,
        val_df_3,
        test_df_3,
        full_original_train_data_df_3,
        full_original_test_data_df_3,
    )
]

In [13]:
# One shape per line, in the order: train, val, test, full train, full test
print(
    train_df_4.shape,
    val_df_4.shape,
    test_df_4.shape,
    full_original_train_data_df_4.shape,
    full_original_test_data_df_4.shape,
    sep="\n",
)

(407243, 61)
(32929, 61)
(16376, 61)
(456548, 61)
(32573, 60)


#### Temporal Features

In [14]:
# Flag whether each meal--center combination was email-promoted or
# homepage-featured last week or the week before, and add the cumulative sum
# of all its previous promotions and features.
# NOTE(review): each split is passed in on its own, so the earliest weeks of
# val/test presumably cannot see history from the training weeks — confirm
# against build_features.temporal_features_set_1.
(
    train_df_5,
    val_df_5,
    test_df_5,
    full_original_train_data_df_5,
    full_original_test_data_df_5,
) = map(
    build_features.temporal_features_set_1,
    (
        train_df_4,
        val_df_4,
        test_df_4,
        full_original_train_data_df_4,
        full_original_test_data_df_4,
    ),
)

In [15]:
# Report (rows, columns) of every frame after the first temporal feature set
for frame in (
    train_df_5,
    val_df_5,
    test_df_5,
    full_original_train_data_df_5,
    full_original_test_data_df_5,
):
    print(frame.shape)

(407243, 67)
(32929, 67)
(16376, 67)
(456548, 67)
(32573, 66)


In [16]:
# For each meal--center combination, add last week's checkout price and
# discount, and flag whether the current values are greater than last week's
(
    train_df_6,
    val_df_6,
    test_df_6,
    full_original_train_data_df_6,
    full_original_test_data_df_6,
) = [
    build_features.temporal_features_set_2(frame)
    for frame in (
        train_df_5,
        val_df_5,
        test_df_5,
        full_original_train_data_df_5,
        full_original_test_data_df_5,
    )
]

In [17]:
# One shape per line, in the order: train, val, test, full train, full test
print(
    train_df_6.shape,
    val_df_6.shape,
    test_df_6.shape,
    full_original_train_data_df_6.shape,
    full_original_test_data_df_6.shape,
    sep="\n",
)

(407243, 71)
(32929, 71)
(16376, 71)
(456548, 71)
(32573, 70)


In [18]:
# Add label-encoded features for the different sets of cuisines and categories
(
    train_df_7,
    val_df_7,
    test_df_7,
    full_original_train_data_df_7,
    full_original_test_data_df_7,
) = map(
    build_features.features_by_cui_or_cat,
    (
        train_df_6,
        val_df_6,
        test_df_6,
        full_original_train_data_df_6,
        full_original_test_data_df_6,
    ),
)

In [19]:
# Report (rows, columns) of every frame after the cuisine / category encodings
for frame in (
    train_df_7,
    val_df_7,
    test_df_7,
    full_original_train_data_df_7,
    full_original_test_data_df_7,
):
    print(frame.shape)

(407243, 76)
(32929, 76)
(16376, 76)
(456548, 76)
(32573, 75)


In [20]:
# Feature matrices: every column except the 'num_orders' target
train_features, val_features, test_features, full_original_train_features = (
    frame.drop(columns=['num_orders']).copy()
    for frame in (
        train_df_7,
        val_df_7,
        test_df_7,
        full_original_train_data_df_7,
    )
)

# The full original test frame is copied as-is (no 'num_orders' drop is applied to it)
full_original_test_features = full_original_test_data_df_7.copy()

In [21]:
# Target vectors: the 'num_orders' column of each frame that has one
train_target, val_target, test_target, full_original_train_target = (
    frame['num_orders'].copy()
    for frame in (
        train_df_7,
        val_df_7,
        test_df_7,
        full_original_train_data_df_7,
    )
)

In [22]:
# Save the features dataframes to disk
from pathlib import Path

feature_outputs = [
    (train_features, r'../data/processed/built_features/train_features.csv'),
    (val_features, r'../data/processed/built_features/val_features.csv'),
    (test_features, r'../data/processed/built_features/test_features.csv'),
    (full_original_train_features, r'../data/processed/built_features/full_original_train_features.csv'),
    (full_original_test_features, r'../data/processed/built_features/full_original_test_features.csv'),
]

for frame, csv_path in feature_outputs:
    # to_csv does not create missing folders; on a fresh checkout the output
    # directory may not exist yet, so create it before writing.
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(csv_path, index=False)

In [23]:
# Save the target series to disk
from pathlib import Path

target_outputs = [
    (train_target, r'../data/processed/target/train_target.csv'),
    (val_target, r'../data/processed/target/val_target.csv'),
    (test_target, r'../data/processed/target/test_target.csv'),
    (full_original_train_target, r'../data/processed/target/full_original_train_target.csv'),
]

for target, csv_path in target_outputs:
    # to_csv does not create missing folders; on a fresh checkout the output
    # directory may not exist yet, so create it before writing.
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
    target.to_csv(csv_path, index=False)