In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
for input_path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training table and the two lookup tables it is joined against.
df = pd.read_csv("/kaggle/input/food-demand-forecasting/train.csv")
center_df = pd.read_csv("/kaggle/input/food-demand-forecasting/fulfilment_center_info.csv")
meal_df = pd.read_csv("/kaggle/input/food-demand-forecasting/meal_info.csv")

# Left joins keep every training row: center attributes are attached first
# (key: center_id), then meal attributes (key: meal_id).
data = (
    df.merge(center_df, how="left", on="center_id")
      .merge(meal_df, how="left", on="meal_id")
)
data.head()

# Explore data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# num_orders against week on a 14x10 canvas; ci=None turns off the confidence band.
fig, ax = plt.subplots(figsize=(14, 10))
sns.lineplot(x="week", y="num_orders", data=data, ci=None, ax=ax)

In [None]:
# Total num_orders per week (rows) and region_code (columns).
# The string alias 'sum' selects pandas' built-in aggregation; newer pandas
# releases emit a FutureWarning when a NumPy reducer such as np.sum is passed.
region_table = pd.pivot_table(data, values='num_orders', index=['week'],
                              columns=['region_code'], aggfunc='sum')
plt.figure(figsize=(14, 10))
# Smooth every region's series with a 4-row rolling mean before plotting
# (the first three rows of the rolling result are NaN).
sns.lineplot(data=region_table.rolling(window=4).mean()).set(title='4 week moving average of number of orders by week and region code')
plt.ylabel('num_orders')

In [None]:
# Total num_orders per week (rows) and cuisine (columns).
# The string alias 'sum' selects pandas' built-in aggregation; newer pandas
# releases emit a FutureWarning when a NumPy reducer such as np.sum is passed.
cuisine_table = pd.pivot_table(data, values='num_orders', index=['week'],
                               columns=['cuisine'], aggfunc='sum')
plt.figure(figsize=(14, 10))
# Smooth every cuisine's series with a 4-row rolling mean before plotting
# (the first three rows of the rolling result are NaN).
sns.lineplot(data=cuisine_table.rolling(window=4).mean()).set(title='4 week moving average of number of orders by week and cuisine')
plt.ylabel('num_orders')

In [None]:
# Mean num_orders for each center_type, flattened into a two-column frame
# (center_type, num_orders) so seaborn can address both by column name.
center_type_data = (
    data.groupby(['center_type'])['num_orders']
        .mean()
        .reset_index()
)
sns.barplot(x="center_type", y="num_orders", data=center_type_data).set(title='Average num of orders by center type')

In [None]:
# Average num_orders per cuisine (rows), split by the emailer_for_promotion flag (columns).
# 'mean' uses pandas' built-in group mean, which skips missing values; np.average
# is an arbitrary callable to pandas and would be invoked group by group.
cuisine_table = pd.pivot_table(data, values='num_orders', index=['cuisine'],
                               columns=['emailer_for_promotion'], aggfunc='mean')
cuisine_table.plot(kind='bar', stacked=True, legend=True, title="avg num of orders with promotion")

In [None]:
# Average num_orders per cuisine (rows), split by the homepage_featured flag (columns).
# 'mean' uses pandas' built-in group mean, which skips missing values; np.average
# is an arbitrary callable to pandas and would be invoked group by group.
cuisine_table = pd.pivot_table(data, values='num_orders', index=['cuisine'],
                               columns=['homepage_featured'], aggfunc='mean')
cuisine_table.plot(kind='bar', stacked=True, legend=True, title="avg num of orders with homepage_featured")

In [None]:
# Per-city totals. Both Series come from the same groupby key on the same frame,
# so their rows are in the same city_code order and can be combined positionally.
# NOTE(review): op_area is summed over every row of `data`, so the total grows with
# the number of rows a city has -- confirm this is the intended bubble size.
operation_area = data.groupby(['city_code'])['op_area'].sum()
num_orders = data.groupby(['city_code'])['num_orders'].sum()
operation_area = pd.DataFrame({
    'city_code': operation_area.index,
    'operation_area': operation_area.values,
    'num_orders': num_orders.values,
})
# Attach region_code to each city. The lookup is de-duplicated first: if center_df
# lists the same city_code on more than one row, a left merge against the raw
# columns would repeat that city's row once per match.
operation_area = operation_area.merge(
    center_df[["city_code", "region_code"]].drop_duplicates(),
    on='city_code', how="left")
sns.relplot(x="city_code", y="num_orders", hue="region_code", size="operation_area",
            sizes=(1, 2000), palette="muted", height=8, data=operation_area).set(title="operaion area vs num of orders")

# Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

In [None]:
def one_hot_encode(features_to_encode, dataset):
    """Replace the given columns of ``dataset`` with dense one-hot indicator columns.

    Parameters
    ----------
    features_to_encode : list of str
        Names of the columns in ``dataset`` to encode.
    dataset : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the untouched columns of ``dataset`` followed by the
        indicator columns, on the same row index as ``dataset``. Indicator column
        names are whatever the fitted encoder reports.
    """
    # Nothing to encode: hand back a copy instead of fitting on zero columns.
    if len(features_to_encode) == 0:
        return dataset.copy()

    # The dense-output switch is called `sparse_output` on current scikit-learn
    # and `sparse` on older releases; an unknown keyword raises TypeError.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)

    encoded_values = encoder.fit_transform(dataset[features_to_encode])

    # Prefer get_feature_names_out(); fall back to get_feature_names() on
    # encoders that do not provide it.
    if hasattr(encoder, "get_feature_names_out"):
        encoded_names = encoder.get_feature_names_out()
    else:
        encoded_names = encoder.get_feature_names()

    # Reuse the caller's index so rows stay aligned even when it is not 0..n-1;
    # a fresh RangeIndex would align by label and fill the new columns with NaN.
    encoded_cols = pd.DataFrame(encoded_values, columns=encoded_names, index=dataset.index)

    # One concat instead of inserting columns one at a time.
    remaining = dataset.drop(columns=features_to_encode)
    return pd.concat([remaining, encoded_cols], axis=1)

In [None]:
# Categorical columns that get expanded into one-hot indicator columns.
features_to_encode = ['meal_id','city_code',"center_id",'center_type', 'category', 'cuisine']

# Drop the "id" column, then swap the categorical columns for their encodings.
data = one_hot_encode(features_to_encode, data.drop(columns=["id"]))

# Target is num_orders; every other remaining column is a feature.
X = data.drop(columns=["num_orders"])
y = data["num_orders"]

# Hold out 33% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# Baseline: standardise the features, then fit a random forest with default size.
# random_state pins the forest's internal randomness so the scores below are
# repeatable across runs; n_jobs=-1 builds the trees on all available cores.
RF_pipe = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42, n_jobs=-1))
RF_pipe.fit(X_train, y_train)
# Despite the variable name, these are predictions for the held-out test split.
RF_train_y_pred = RF_pipe.predict(X_test)
# R^2 on the test split, computed from the stored predictions; this is the value
# RF_pipe.score(X_test, y_test) reports, without predicting a second time.
print(metrics.r2_score(y_test, RF_train_y_pred))
# The printed figure is the root mean squared log error multiplied by 100.
print('RMSLE:', 100*np.sqrt(metrics.mean_squared_log_error(y_test, RF_train_y_pred)))

In [None]:
# Same pipeline with a 200-tree forest.
# random_state pins the forest's internal randomness so the scores below are
# repeatable across runs; n_jobs=-1 builds the trees on all available cores.
RF_pipe = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
)
RF_pipe.fit(X_train, y_train)
# Despite the variable name, these are predictions for the held-out test split.
RF_train_y_pred = RF_pipe.predict(X_test)
# R^2 on the test split, computed from the stored predictions; this is the value
# RF_pipe.score(X_test, y_test) reports, without predicting a second time.
print(metrics.r2_score(y_test, RF_train_y_pred))
# The printed figure is the root mean squared log error multiplied by 100.
print('RMSLE:', 100*np.sqrt(metrics.mean_squared_log_error(y_test, RF_train_y_pred)))