In [None]:
# try the best model performance on different training set sizes

In [None]:
# It is a meal delivery company which operates in multiple cities. 
# They have various fulfillment centers (packing warehouse)
# in these cities for dispatching meal orders to their customers. 
# The client wants you to help these centers with demand forecasting for upcoming weeks
# so that these centers will plan the stock of raw materials accordingly.

In [None]:
import numpy as np
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
import seaborn as sns
import pandas as pd
%matplotlib inline

# Load Data

In [None]:
##demand = demand.merge(fulfilment_center, on='center_id').merge(meal_info, on='meal_id')

In [None]:
# Food demand: predict num_orders
# Read the three source tables; the 'id' column of the demand table is dropped right away.
demand = pd.read_csv('../input/food-demand/demand.csv').drop(columns='id')
fulfilment_center = pd.read_csv('../input/food-demand/fulfilment_center_info.csv')
meal_info = pd.read_csv('../input/food-demand/meal_info.csv')

# Attach center and meal attributes to every demand row, then order the rows
# by week, meal and center.
demand = (
    demand
    .merge(fulfilment_center, on='center_id')
    .merge(meal_info, on='meal_id')
    .sort_values(['week', 'meal_id', 'center_id'])
)

# Data Explore

In [None]:
# show the data

In [None]:
demand.head(5)

In [None]:
# Check how many examples and how many features are in the dataset

In [None]:
demand.shape

In [None]:
# We have 456,548 examples and 14 columns (after dropping 'id'): 13 features and one label (num_orders)

In [None]:
# Check for missing values

In [None]:
demand.isna().mean()

In [None]:
# no missing values

## describing plots on the data

#### Label Distribution

In [None]:
from matplotlib import pyplot as py

# Set the default figure size and DPI through matplotlib's rcParams.
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})

# Histogram of the label (num_orders): 30 bins restricted to the 0-2000 range,
# so rows with more than 2000 orders are not drawn.
x = demand['num_orders']
plt.hist(x, 30, range=[0, 2000], facecolor='#86bf91', align='mid',zorder=2, rwidth=0.9)
#py.xticks(range(0,5000))
plt.gca().set(title='Label Histogram', ylabel='Frequency');

#### number of orders by checkout price

In [None]:
sns.jointplot(x='checkout_price',y='num_orders',data=demand)

#### number of orders by weeks 

In [None]:
demand.groupby('week').num_orders.mean().plot()

#### number of orders by cuisine type

In [None]:
# Bar per cuisine of num_orders (seaborn's default estimator is used).
# x and y are passed by keyword because recent seaborn releases reject positional data arguments.
sns.barplot(x='cuisine', y='num_orders', data=demand);

#### number of orders by meal category 

In [None]:
# Bar per meal category of num_orders; x and y are passed by keyword because
# recent seaborn releases reject positional data arguments.
sns.barplot(x='category', y='num_orders', data=demand)
# Rotation must be numeric (degrees); the category names are long, so draw them vertically.
plt.xticks(rotation=90);

#### corr table

In [None]:
plt.figure(figsize=(20,10))
# At this point `demand` still contains the object columns (center_type, category,
# cuisine), so restrict the correlation matrix to the numeric columns explicitly.
c = demand.select_dtypes(include='number').corr()
sns.heatmap(c, cmap="YlGnBu", annot=True);

#### corr of num_orders with other features

In [None]:
# Correlation of every numeric column with the label, sorted ascending.
# Object columns are excluded explicitly because `demand` is not one-hot encoded yet.
demand.select_dtypes(include='number').corr()['num_orders'].sort_values().plot.bar();

#### All the features corr plots

In [None]:
sns.pairplot(demand)

## Feature Engineering

In [None]:
# Show object columns

In [None]:
demand.dtypes

In [None]:
# cuisine, category, center_type are objects

In [None]:
# Show Unique values for object columns

In [None]:
demand['center_type'].value_counts()

In [None]:
demand['category'].value_counts()

In [None]:
demand['cuisine'].value_counts()

In [None]:
# get dummies for object type columns

In [None]:
demand = pd.get_dummies(demand)

In [None]:
# rename all the columns to lower case

In [None]:
demand = demand.rename(columns=str.lower)

In [None]:
# Show the new data frame

In [None]:
demand.sample(10)

In [None]:
demand.shape

In [None]:
# 32 columns (31 features & 1 label)

## Keep Exploring

In [None]:
# show the corr between the features

In [None]:
demand.corr()

In [None]:
# Corr plot for the decision factor (num_orders)

In [None]:
demand.corr()['num_orders'].sort_values()

# Train test Split

In [None]:
from sklearn.model_selection import train_test_split
# Separate the label (num_orders) from the feature columns.
X, y = demand.drop('num_orders', axis=1), demand.num_orders
# Fraction of rows held out for testing: 4566 out of 456548, i.e. about 1%.
train_test_ratio = 4566/456548
# shuffle=False keeps the existing row order (the frame was sorted by week when loaded),
# so the held-out rows are the last ones in the frame.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_test_ratio, random_state=0, shuffle=False)
# Cell output: the shapes of the four pieces.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# We took 0.01 of the data as the test set, keeping the time order (no shuffling)

In [None]:
X_train.head(451982)

In [None]:
X_test.head(4566)

# Helper function for evaluation

In [None]:
from sklearn.metrics import mean_absolute_error


def check_regressor(regressor, X_train, y_train, X_test, y_test):
    """Fit ``regressor`` and report its mean absolute error on both splits.

    The regressor is fitted on the training data and then used to predict the
    test set and the training set. A one-line summary is printed.

    Returns:
        tuple: ``(test MAE, train MAE, class name of the regressor)``.
    """
    regressor.fit(X_train, y_train)

    # Test error first, then the error on the data the model was fitted on.
    test_error = mean_absolute_error(y_test, regressor.predict(X_test))
    train_error = mean_absolute_error(y_train, regressor.predict(X_train))

    name = regressor.__class__.__name__
    summary = '{0} mean absolute error is {1:.4f} (mae train {2:.4f})'.format(name, test_error, train_error)
    print(summary)
    return test_error, train_error, name

In [None]:
models_errors = {}
models_errors_train = {}

In [None]:
# we will check the error between the test and the train to check the model and check overfit

# Models

## Benchmark

In [None]:
class Benchmark:
    """Baseline regressor that always predicts the mean of the training labels."""

    def fit(self, x, y):
        """Store the mean of ``y`` (``x`` is ignored) and return ``self``."""
        self.value = y.mean()
        return self

    def predict(self, x):
        """Return an array with one copy of the stored mean per row of ``x``."""
        return np.full(len(x), self.value, dtype=float)
    
# Evaluate the mean-value baseline and record its test/train MAE under the
# name returned by check_regressor.
benchmark = Benchmark()
model_mae, model_mae_train, model_name = check_regressor(benchmark, X_train, y_train, X_test, y_test)
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
# The mean absolute error for the testing set is 193.7072 and for the training set is 227.9756

## Linear Regressor

In [None]:
from sklearn.linear_model import LinearRegression


# Ordinary linear regression with default settings on the unscaled features;
# its test/train MAE are recorded under the name returned by check_regressor.
linear_regressor = LinearRegression()
model_mae, model_mae_train, model_name = check_regressor(linear_regressor, X_train, y_train, X_test, y_test)
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
# the error on the testing set is 151.2310 and on the training set is 162.3364

## Nearest Neighbors Regressor

In [None]:
#K nearest neighbors, try 3 different k values.

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Number of neighbours for this run.
k = 3
nearest_neighbors = KNeighborsRegressor(k)
model_mae, model_mae_train, model_name = check_regressor(nearest_neighbors, X_train, y_train, X_test, y_test)
# The key is the regressor's class name, so a result already stored under that
# name (e.g. from a run with another k) is replaced.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Number of neighbours for this run.
k = 5
nearest_neighbors = KNeighborsRegressor(k)
model_mae, model_mae_train, model_name = check_regressor(nearest_neighbors, X_train, y_train, X_test, y_test)
# The key is the regressor's class name, so a result already stored under that
# name (e.g. from a run with another k) is replaced.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Number of neighbours for this run.
k = 7
nearest_neighbors = KNeighborsRegressor(k)
model_mae, model_mae_train, model_name = check_regressor(nearest_neighbors, X_train, y_train, X_test, y_test)
# The key is the regressor's class name, so a result already stored under that
# name (e.g. from a run with another k) is replaced.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
# the error on the testing set for k=5 is 79.8184 and on the training set is 67.0602

## Scaling

In [None]:
# K nearest neighbors with scaled values, try 3 different k values.

In [None]:
from sklearn.preprocessing import StandardScaler
# The scaler is fitted on the training features only and then applied to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Number of neighbours for this run.
k = 3
nearest_neighbors = KNeighborsRegressor(k)
model_mae, model_mae_train, model_name = check_regressor(nearest_neighbors, X_train_scaled, y_train, X_test_scaled, y_test)
# Suffix the key so the scaled result is stored separately from the unscaled KNN result;
# scaled runs with other k values share this key and replace each other.
model_name += 'Scaled'
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
from sklearn.preprocessing import StandardScaler
# The scaler is fitted on the training features only and then applied to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Number of neighbours for this run.
k = 5
nearest_neighbors = KNeighborsRegressor(k)
model_mae, model_mae_train, model_name = check_regressor(nearest_neighbors, X_train_scaled, y_train, X_test_scaled, y_test)
# Suffix the key so the scaled result is stored separately from the unscaled KNN result;
# scaled runs with other k values share this key and replace each other.
model_name += 'Scaled'
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
from sklearn.preprocessing import StandardScaler
# The scaler is fitted on the training features only and then applied to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Number of neighbours for this run.
k = 7
nearest_neighbors = KNeighborsRegressor(k)
model_mae, model_mae_train, model_name = check_regressor(nearest_neighbors, X_train_scaled, y_train, X_test_scaled, y_test)
# Suffix the key so the scaled result is stored separately from the unscaled KNN result;
# scaled runs with other k values share this key and replace each other.
model_name += 'Scaled'
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
# the error on the testing set is 74.2631 and on the training set is 60.7128

## Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth limit for this run; random_state is fixed so the fitted tree is repeatable.
max_depth = 17
decision_tree = DecisionTreeRegressor(max_depth=max_depth, random_state=0)
model_mae, model_mae_train, model_name = check_regressor(decision_tree, X_train, y_train, X_test, y_test)
# Stored under the regressor's class name, so runs with other depths replace this entry.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth limit for this run; random_state is fixed so the fitted tree is repeatable.
max_depth = 16
decision_tree = DecisionTreeRegressor(max_depth=max_depth, random_state=0)
model_mae, model_mae_train, model_name = check_regressor(decision_tree, X_train, y_train, X_test, y_test)
# Stored under the regressor's class name, so this replaces an earlier decision tree entry.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Shallow tree. When the cells run in order, this is the `decision_tree` object that
# later cells see (e.g. the tree visualisation).
max_depth = 3
decision_tree = DecisionTreeRegressor(max_depth=max_depth, random_state=0)
model_mae, model_mae_train, model_name = check_regressor(decision_tree, X_train, y_train, X_test, y_test)
# Stored under the regressor's class name, so this replaces an earlier decision tree entry.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
# the error on the testing set (93.0577) and on the training set (66.9899) 
# 17 max depth will reduce the mae for training set and increase for testing set(overfit)

### Visualize Tree

In [None]:
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
def plot_tree(tree, features=X_train.columns, labels=['0', '1']):
    """Render a fitted tree inline as an SVG through Graphviz.

    ``features`` is passed to ``export_graphviz`` as the feature names; its default
    is bound to ``X_train.columns`` at the moment this function is defined.
    ``labels`` is accepted but not used in the body.
    """
    graph = Source(export_graphviz(tree, feature_names=features, filled = True))
    display(SVG(graph.pipe(format='svg')))
    
plot_tree(decision_tree)

## Random Forest Regressor

In [None]:
# random forest, 3 different max depth values on n_estimator = 100

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees, each limited to depth 3; random_state is fixed for repeatability.
n_estimators = 100
max_depth = 3
random_forest = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
model_mae, model_mae_train, model_name = check_regressor(random_forest, X_train, y_train, X_test, y_test)
# Stored under the regressor's class name, so runs with other depths replace this entry.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees, each limited to depth 25; random_state is fixed for repeatability.
n_estimators = 100
max_depth = 25
random_forest = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
# check_regressor returns mean absolute errors, so the results are named *_mae
# like in every other model cell (they were previously unpacked as *_mse).
model_mae, model_mae_train, model_name = check_regressor(random_forest, X_train, y_train, X_test, y_test)
# Stored under the regressor's class name, so this replaces an earlier random forest entry.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees, each limited to depth 20; random_state is fixed for repeatability.
n_estimators = 100
max_depth = 20
random_forest = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
# check_regressor returns mean absolute errors, so the results are named *_mae
# like in every other model cell (they were previously unpacked as *_mse).
model_mae, model_mae_train, model_name = check_regressor(random_forest, X_train, y_train, X_test, y_test)
# Stored under the regressor's class name, so this replaces an earlier random forest entry.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
# the error on the testing set is 81.7132 and on the training set is 47.4846

In [None]:
# higher error for 4 max depth

## AdaBoost Regressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with 100 boosting rounds over decision trees limited to depth 40.
n_estimators = 100
max_depth = 40
# The base tree gets no random_state of its own; only the booster is seeded.
base_estimator = DecisionTreeRegressor(max_depth=max_depth)
# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator` -
# confirm against the installed version.
ada_boost = AdaBoostRegressor(random_state=0, n_estimators=n_estimators, base_estimator=base_estimator)
model_mae, model_mae_train, model_name = check_regressor(ada_boost, X_train, y_train, X_test, y_test)
# Stored under the regressor's class name, so runs with other depths replace this entry.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with 100 boosting rounds over decision trees limited to depth 25.
n_estimators = 100
max_depth = 25
# The base tree gets no random_state of its own; only the booster is seeded.
base_estimator = DecisionTreeRegressor(max_depth=max_depth)
# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator` -
# confirm against the installed version.
ada_boost = AdaBoostRegressor(random_state=0, n_estimators=n_estimators, base_estimator=base_estimator)
model_mae, model_mae_train, model_name = check_regressor(ada_boost, X_train, y_train, X_test, y_test)
# Stored under the regressor's class name, so this replaces an earlier AdaBoost entry.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with 100 boosting rounds over decision trees limited to depth 10.
n_estimators = 100
max_depth = 10
# The base tree gets no random_state of its own; only the booster is seeded.
base_estimator = DecisionTreeRegressor(max_depth=max_depth)
# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator` -
# confirm against the installed version.
ada_boost = AdaBoostRegressor(random_state=0, n_estimators=n_estimators, base_estimator=base_estimator)
model_mae, model_mae_train, model_name = check_regressor(ada_boost, X_train, y_train, X_test, y_test)
# Stored under the regressor's class name, so this replaces an earlier AdaBoost entry.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
# the error on the testing set is 71.8249 and on the training set is 11.6979 for max depth 25

## Lasso

In [None]:
# Lasso Model on 3 different alphas

In [None]:
from sklearn.linear_model import Lasso


# Lasso regression with alpha=1 on the unscaled features.
lasso = Lasso(alpha=1)
model_mae, model_mae_train, model_name = check_regressor(lasso, X_train, y_train, X_test, y_test)
# Stored under the regressor's class name, so runs with other alphas replace this entry.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
from sklearn.linear_model import Lasso


# Lasso regression with alpha=2 on the unscaled features.
lasso = Lasso(alpha=2)
model_mae, model_mae_train, model_name = check_regressor(lasso, X_train, y_train, X_test, y_test)
# Stored under the regressor's class name, so this replaces an earlier Lasso entry.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
from sklearn.linear_model import Lasso


# Lasso regression with alpha=3. When the cells run in order, this is the `lasso`
# object whose coefficients are inspected later in the notebook.
lasso = Lasso(alpha=3)
model_mae, model_mae_train, model_name = check_regressor(lasso, X_train, y_train, X_test, y_test)
# Stored under the regressor's class name, so this replaces an earlier Lasso entry.
models_errors[model_name] = model_mae
models_errors_train[model_name] = model_mae_train

In [None]:
# the error on the testing set for alpha 1 is 148.2808 and on the training set is 161.2801

# Performance Comparison

In [None]:
import pandas as pd

def compare_performance(models_errors_train, models_errors):
    """Build a train/test error table with one row per model, ordered by test error.

    Both arguments map a model name to its error; the names become the index and
    the result has the columns ``'train'`` and ``'test'``, sorted ascending by ``'test'``.
    """
    table = pd.DataFrame({'train': models_errors_train, 'test': models_errors})
    return table.sort_values(by='test')

cofo = compare_performance(models_errors_train, models_errors)

In [None]:
cofo

In [None]:
# PLOT FOR THE COMPARISON (powerpoint)

In [None]:
cofo.plot.bar(rot=10,colormap='cool',figsize = (16,9));

# 7. Algorithms introspection 

### What are the weights of the lasso coefficients? 

In [None]:
lasso.coef_ , lasso.intercept_

### Lasso coef in table 

In [None]:
pd.DataFrame({'lasso_coef':lasso.coef_},index=X_train.columns).sort_values(by=['lasso_coef']).head(32)

### Random Forest feature importance

In [None]:
# Feature importances of the fitted random forest (this section is about the forest,
# not the single decision tree); the 14 largest values, in ascending order.
pd.DataFrame({'importance':random_forest.feature_importances_},index=X_train.columns).sort_values(by=['importance']).tail(14)

## 8. Hyperparameters

### Look for the hyper parameters of the algorithm that best improves the test performance, show a comparison in performance using their values.

### Check Hyper parameters - max depth, for Decision Tree

In [None]:
# NOTE: despite the *_acc names, these lists collect mean absolute errors, one per depth.
train_acc = []
test_acc = []

# Fit one tree per max_depth from 1 to 20 and record its train and test MAE.
for i in range(1,21):
    tree = DecisionTreeRegressor(random_state=0,max_depth=i)
    tree.fit(X_train, y_train)
    y_pred_train = tree.predict(X_train)
    mae_train = mean_absolute_error(y_train, y_pred_train)
    train_acc.append(mae_train)
    y_pred = tree.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    test_acc.append(mae)

# Scatter both error series against the depth values used in the loop.
fig = plt.figure(figsize=(16,9))
ax1 = fig.add_subplot(111)

ax1.scatter(range(1, 21), train_acc, s=10, c='b', marker="s", label='Training MAE')
ax1.scatter(range(1, 21) ,test_acc, s=10, c='r', marker="o", label='Testing MAE')
ax1.set_ylabel("MAE")
ax1.set_xlabel("Max Depth")
plt.legend(loc='upper left');
plt.show()

In [None]:
# The chosen depth in decision tree is 17 due to lowest mae

### Check Hyper parameters - max depth of the base tree, for AdaBoost

In [None]:
# NOTE: despite the *_acc names, these lists collect mean absolute errors, one per depth.
train_acc = []
test_acc = []

# Sweep the base tree's max_depth from 1 to 20; the booster uses 20 estimators here.
for i in range(1,21):
    # Base learner for this depth, wrapped in a freshly created booster.
    tree = DecisionTreeRegressor(random_state=0,max_depth=i)
    ada_boost = AdaBoostRegressor(random_state=0,base_estimator=tree,n_estimators=20)
    ada_boost.fit(X_train, y_train)
    y_pred_train = ada_boost.predict(X_train)
    mae_train = mean_absolute_error(y_train, y_pred_train)
    train_acc.append(mae_train)
    y_pred = ada_boost.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    test_acc.append(mae)

# Scatter both error series against the depth values used in the loop.
fig = plt.figure(figsize=(16,9))
ax1 = fig.add_subplot(111)

ax1.scatter(range(1, 21), train_acc, s=10, c='b', marker="s", label='Training MAE')
ax1.scatter(range(1, 21) ,test_acc, s=10, c='r', marker="o", label='Testing MAE')
ax1.set_ylabel("MAE")
ax1.set_xlabel("Max Depth")
plt.legend(loc='upper left');
plt.show()

In [None]:
# Re-draws whatever train_acc / test_acc currently hold (the AdaBoost sweep when the
# cells run in order) with larger markers and the test series in green.
fig = plt.figure(figsize=(16,9))
ax1 = fig.add_subplot(111)

ax1.scatter(range(1, 21), train_acc, s=100, c='b', marker="s", label='Training MAE')
ax1.scatter(range(1, 21) ,test_acc, s=100, c='g', marker="o", label='Testing MAE')
ax1.set_ylabel("MAE")
ax1.set_xlabel("Max Depth")
plt.legend(loc='lower left');
plt.show()

### Check Hyper parameters - different k , for Scaling KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# NOTE: despite the *_acc names, these lists collect mean absolute errors, one per k.
train_acc = []
test_acc = []

# Sweep the number of neighbours over 1..4 on the standardised features.
for i in range(1,5):
    # Use the loop value as k, so each iteration evaluates a different model.
    nearest_neighbors = KNeighborsRegressor(n_neighbors=i)
    nearest_neighbors.fit(X_train_scaled, y_train)
    y_pred_train = nearest_neighbors.predict(X_train_scaled)
    mae_train = mean_absolute_error(y_train, y_pred_train)
    train_acc.append(mae_train)
    # The model was fitted on scaled features, so it must be scored on the scaled test set.
    y_pred = nearest_neighbors.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    test_acc.append(mae)

# Scatter both error series against the k values used in the loop.
fig = plt.figure(figsize=(16,9))
ax1 = fig.add_subplot(111)

ax1.scatter(range(1, 5), train_acc, s=10, c='b', marker="s", label='Training MAE')
ax1.scatter(range(1, 5) ,test_acc, s=10, c='r', marker="o", label='Testing MAE')
ax1.set_ylabel("MAE")
ax1.set_xlabel("k (number of neighbours)")
plt.legend(loc='upper right');
plt.show()

### Check Hyper parameters - max depth (n_estimators fixed at 100), for Random Forest

In [None]:
# NOTE: despite the *_acc names, these lists collect mean absolute errors, one per depth.
train_acc = []
test_acc = []

# Sweep max_depth from 1 to 19 with 100 trees per forest; entry j of each list
# therefore belongs to max_depth j + 1.
for i in range(1,20):
    tree = RandomForestRegressor(random_state=0,n_estimators=100,max_depth=i)
    tree.fit(X_train, y_train)
    y_pred_train = tree.predict(X_train)
    mae_train = mean_absolute_error(y_train, y_pred_train)
    train_acc.append(mae_train)
    y_pred = tree.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    test_acc.append(mae)

# Scatter both error series against the depth values used in the loop.
fig = plt.figure(figsize=(16,9))
ax1 = fig.add_subplot(111)

ax1.scatter(range(1, 20), train_acc, s=10, c='b', marker="s", label='Training MAE')
ax1.scatter(range(1, 20) ,test_acc, s=10, c='r', marker="o", label='Testing MAE')
ax1.set_ylabel("MAE")
ax1.set_xlabel("Max Depth")
plt.legend(loc='upper right');
plt.show()

In [None]:
# checks the best max depth

In [None]:
# Position of the smallest test MAE in the sweep results. argmin returns the first
# minimum and always yields an index, including when the best entry is the first one
# (the manual loop left `w` unassigned in that case).
w = int(np.argmin(test_acc))
q = test_acc[w]
print(q)
print(w)
# The sweep starts at max_depth=1, so list position w corresponds to depth w + 1.
print('best max_depth:', w + 1)

In [None]:
# 18 is the best max depth for random forest with 39.35 mae

# 9. Additional analysis 

In [None]:
# Performance vs. amount of data Using the best performing algorithm. Show a graph describing the test performance of the algorithm when using [10%/30%/50%/70%/100%] of the train set for training the algorithm. Would you recommend collecting more data for the problem?

In [None]:
# Hist for Knn scaled different training set size 

In [None]:
from sklearn.neighbors import KNeighborsRegressor

length = len(X_train_scaled)
# Training-set sizes: 10%, 30%, 50%, 70% and 100% of the scaled training rows.
lst = [int(0.1*length),int(0.3*length),int(0.5*length),int(0.7*length),length]
avg_error_score = []
for item in lst:
    # Train on the first `item` rows only; the test set is the same for every size.
    q, w = X_train_scaled[:item], y_train[:item]
    knn = KNeighborsRegressor(n_neighbors = 3) #'best performed'
    knn.fit(q, w)
    y_pred = knn.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    avg_error_score.append(mae)

In [None]:
# Fractions of the training set matching the five sizes evaluated in the sweep.
# (range() only accepts integers, so the fractions are listed explicitly.)
train_fractions = [0.1, 0.3, 0.5, 0.7, 1.0]

fig = plt.figure(figsize=(16,9))
ax1 = fig.add_subplot(111)

ax1.scatter(train_fractions, avg_error_score, s=10, c='r', marker="o", label='Testing MAE')
ax1.set_ylabel("MAE")
ax1.set_xlabel("Training Size")
plt.legend(loc='upper right');
plt.show()

In [None]:
# ada boost mae on different training set size 

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error

length = len(X_train)
# Training-set sizes: 10%, 30%, 50%, 70% and 100% of the training rows.
lst = [int(0.1*length),int(0.3*length),int(0.5*length),int(0.7*length),length]
# One test MAE per training-set size, in the same order as `lst`.
avg_error_score = []

n_estimators = 100
max_depth = 17 # the best performed

for item in lst: 
    # Train on the first `item` rows only; the test set is the same for every size.
    q, w = X_train[:item], y_train[:item]
    # The base tree gets no random_state of its own; only the booster is seeded.
    base_estimator = DecisionTreeRegressor(max_depth=max_depth)
    ada_boost = AdaBoostRegressor(random_state=0, n_estimators=n_estimators, base_estimator=base_estimator)
    ada_boost.fit(q, w)
    y_pred = ada_boost.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    avg_error_score.append(mae)

In [None]:
# Hist for ada boost

# Plot the test MAE against the share of the training set that was used, so the
# "Training Size" axis shows the fractions (10%..100%) instead of ordinals 1..5.
train_fractions = [0.1, 0.3, 0.5, 0.7, 1.0]

fig = plt.figure(figsize=(16,9))
ax1 = fig.add_subplot(111)

ax1.scatter(train_fractions, avg_error_score, s=100, c='g', marker="o", label='Testing MAE')
ax1.set_ylabel("MAE")
ax1.set_xlabel("Training Size")
plt.legend(loc='upper right');
plt.show()