# INTRODUCTION
This Python program uses a random forest regressor to forecast demand for several types of beverages in different Greek stores.

# LIBRARIES

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
import datetime
sns.set()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# FUNCTION DEFINITIONS
The training and test data frames contain missing values. In most cases, either container capacity or container material is missing, but not both. I wrote two functions to fill these gaps, plus a plotting helper:
* ***map_container(x)*** takes container capacity as an input and outputs the container material
* ***map_capacity(x)*** takes container material as an input and outputs the container capacity
* ***graph_demand(df, brands, a)*** draws a price-versus-quantity scatter plot for each brand, colored by shop

In [None]:
def map_container(x):
    """Infer the container material from a container capacity label.

    '330ml' maps to 'can' and '500ml' maps to 'glass'; every other value
    (including a missing one) falls back to 'plastic'.
    """
    if x == '330ml':
        return 'can'
    if x == '500ml':
        return 'glass'
    return 'plastic'

def map_capacity(x):
    """Infer the container capacity label from a container material.

    'can' maps to '330ml' and 'glass' maps to '500ml'; every other value
    (including a missing one) falls back to '1.5lt'.
    """
    if x == 'can':
        return '330ml'
    if x == 'glass':
        return '500ml'
    return '1.5lt'

def graph_demand(df, brands, a):
    """Draw one price-versus-quantity scatter plot per brand, coloured by shop.

    Parameters
    ----------
    df : DataFrame providing the 'brand', 'quantity', 'price' and 'shop'
        columns that are plotted.
    brands : sequence of brand names; one subplot row is drawn per entry.
    a : opacity (alpha) applied to the scatter markers.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # squeeze=False keeps `axes` two-dimensional, so a single brand still
    # yields an indexable array instead of a bare Axes object.
    fig, axes = plt.subplots(nrows=len(brands), squeeze=False, figsize=(15, 25))
    for brand, axis in zip(brands, axes[:, 0]):
        # Plot from the frame that was passed in, not the global `train`.
        subset = df[df.brand == brand]
        sns.scatterplot(x=subset.quantity, y=subset.price,
                        hue=subset.shop, alpha=a, ax=axis)
        axis.set(title=brand, xlabel='Quantity', ylabel='Price')

    fig.tight_layout()
    plt.show()

# DATA IMPORTATION

In [None]:
# Load both splits, discard rows that are entirely empty, and index each
# frame by its parsed 'date' column.
train = pd.read_csv('/kaggle/input/predict-demand/train.csv')
test = pd.read_csv('/kaggle/input/predict-demand/test.csv')
for frame in (train, test):
    frame.dropna(how='all', inplace=True)
    frame['date'] = pd.to_datetime(frame['date'])
    frame.set_index('date', inplace=True)

# DATA EXPLORATION

In [None]:
# DataFrame.info() writes its report itself and returns None, so call it
# directly; wrapping it in print() echoed a stray "None" after each report.
train.info()
print()
test.info()
print()
print(train.tail())
print(test.head())

In [None]:
# Summary statistics for the training split, then the test split.
for frame in (train, test):
    print(frame.describe())

In [None]:
# Number of training rows for every (shop, longitude) pair, as a one-column frame.
shop_long_counts = train.groupby(['shop', 'long']).size()
pd.DataFrame({'frequency': shop_long_counts})

The training set is a *6480x12* data frame. It contains:
* 51 missing values (= 6480 - 6429) for the variable ***lat***
* 46 missing values (= 6480 - 6434) for the variable ***long***
* 16 missing values (= 6480 - 6464) for the variable ***container***
* 15 missing values (= 6480 - 6465) for the variable ***capacity***

The test set is a *1080x12* data frame. It contains:
* 8 missing values (= 1080 - 1072) for the variable ***lat***
* 13 missing values (= 1080 - 1067) for the variable ***long***
* 3 missing values (= 1080 - 1077) for the variable ***container***
* 4 missing values (= 1080 - 1076) for the variable ***capacity***

In [None]:
# Correlate only the numeric columns: on pandas 2.x, DataFrame.corr() no
# longer skips string columns silently and raises on them instead.
train.select_dtypes(include='number').corr()

# DATA VISUALIZATION

In [None]:
# Distribution of quantity (left) and price (right).
# sns.distplot is deprecated and scheduled for removal; histplot with
# stat='density' and kde=True draws the same normalised histogram plus KDE.
fig, axes = plt.subplots(ncols=2, figsize=(18, 7))
sns.histplot(train.quantity, kde=True, stat='density', ax=axes[0])
sns.histplot(train.price, kde=True, stat='density', ax=axes[1])

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18, 18))
# One bar chart of row counts per categorical column, filling the grid row by row.
for column, axis in zip(('city', 'shop', 'brand', 'container'), axes.flat):
    sns.countplot(x=column, data=train, ax=axis)

I constructed scatter plots of price and quantity of beverage demanded for each store. As is the convention in economics, price (the independent variable) is located on the vertical axis, whereas quantity demanded (the dependent variable) is located on the horizontal axis. 

The scatter plot for each beverage somewhat reflects the overall negative relationship between price and quantity demanded that economics posits. However, the relationship in this dataset is not very clear, since there are multiple variables that are not constant. Price is determined simultaneously by demand and supply. There might also be a temporal relationship that the scatterplots would ignore. Variables that change throughout the year, such as temperature, might be affecting sales.

In [None]:
# Collect every distinct brand, then draw its price/quantity scatter plot.
brands = np.array(train['brand'].unique())
graph_demand(df=train, brands=brands, a=0.8)

In [None]:
def _mean_quantity_by_price(brand):
    """Return the mean quantity sold at each price point for one brand in `train`."""
    # The string 'mean' selects pandas' own aggregation; passing np.mean for
    # this purpose is deprecated in recent pandas releases.
    return train[train.brand == brand].pivot_table(
        index=['price'], values=['quantity'], aggfunc='mean')


train_kincola_pivot = _mean_quantity_by_price('kinder-cola')
train_aducola_pivot = _mean_quantity_by_price('adult-cola')
train_orpow_pivot = _mean_quantity_by_price('orange-power')
train_gazoza_pivot = _mean_quantity_by_price('gazoza')
train_lemboost_pivot = _mean_quantity_by_price('lemon-boost')

# Overlay every brand's mean-quantity-per-price points on one set of axes.
for pivot in (train_kincola_pivot, train_aducola_pivot, train_orpow_pivot,
              train_gazoza_pivot, train_lemboost_pivot):
    sns.scatterplot(x=pivot.quantity, y=pivot.index)

The following time series do a much better job of describing how overall demand for beverages changes throughout the year. Prices have changed over time, but they tend to be fairly close to their mean value of 1.2, so changes in prices are likely not the best explanation for the large variation in sales throughout the year.

In [None]:
fig, ax = plt.subplots(nrows=4, figsize=(13, 13))
# Top two panels: overall quantity and price over time.
# Bottom two panels: the same series split by brand.
panels = (
    ('quantity', None),
    ('price', None),
    ('quantity', train.brand),
    ('price', train.brand),
)
for axis, (column, hue) in zip(ax, panels):
    sns.lineplot(x=train.index, y=train[column], hue=hue, ax=axis)
# Horizontal reference line at the mean price on the overall price panel.
ax[1].axhline(train['price'].mean())

# DATA TIDYING

In [None]:
# For each split, fill every missing container from its capacity first, then
# every missing capacity from the container column.
for frame in (train, test):
    no_container = frame.container.isnull()
    frame.loc[no_container, 'container'] = frame.loc[no_container, 'capacity'].apply(map_container)

    no_capacity = frame.capacity.isnull()
    frame.loc[no_capacity, 'capacity'] = frame.loc[no_capacity, 'container'].apply(map_capacity)

The variable ***id*** is the index for each data point, so it is redundant. Since there is a one-to-one correspondence between ***container*** and ***capacity***, only one of the two variables is necessary, so I dropped ***capacity***. Likewise, ***lat*** and ***long*** identify the geographic location where each sale took place, and each pair of values represents a specific shop. Thus, I dropped ***lat*** and ***long*** and retained the variable ***shop***, which implicitly contains the geographic information.

In [None]:
# Remove the same redundant columns from both splits, in place.
for frame in (train, test):
    frame.drop(columns=['id', 'capacity', 'lat', 'long'], inplace=True)

After applying the ***map_container*** and ***map_capacity*** functions and dropping the redundant variables, there are no more missing values in the training and test data sets.

In [None]:
# DataFrame.info() writes its report itself and returns None, so call it
# directly; wrapping it in print() echoed a stray "None" after each report.
train.info()
print()
test.info()
print()

I concatenated the training and test data sets, created dummy variables for the 'object' data types, and separated the data back into the original training and test data sets.

In [None]:
# Tag each row with its origin so the splits can be recovered after encoding.
train['label'] = 1
test['label'] = 2
# Encode both splits together so they end up with identical dummy columns.
temp = pd.get_dummies(pd.concat([train, test]))
# .copy() gives each split its own data, so later in-place edits do not
# operate on slices of `temp` (which raises SettingWithCopyWarning).
train = temp[temp.label == 1].copy()
test = temp[temp.label == 2].copy()

In [None]:
# Rebind instead of dropping in place: `train` and `test` were produced by
# boolean filtering, and in-place edits on such selections can trigger
# pandas' SettingWithCopyWarning.
train = train.drop(columns=['label'])
test = test.drop(columns=['label'])

# MODEL 1: RANDOM FOREST REGRESSOR
## This model does not account for temporal effects on demand for beverages.

In [None]:
# Features are every column except the target, `quantity`.
train_features = train.drop(columns=['quantity'])
train_target = train['quantity']

forest = RandomForestRegressor(random_state=42, n_estimators=500)
forest.fit(train_features, train_target)

In [None]:
# Predict the held-out quantities and score them against the actual values.
actual = test['quantity']
predictions = forest.predict(test.drop(columns=['quantity']))
residuals = actual - predictions

mae = metrics.mean_absolute_error(actual, predictions)
mse = metrics.mean_squared_error(actual, predictions)
# Mean absolute percentage error, kept as a fraction; scaled to % when printed.
mape = (residuals.abs() / actual.abs()).mean()

print('Random Forest Regressor:\n', 16 * '-')
print('Mean Absolute Error: ', mae)
print('Mean Squared Error: ', mse)
print('Mean Absolute Percentage Error: ', 100 * mape, '%')

In [None]:
# Residuals (actual minus predicted) against the predicted value, with a
# zero reference line.
fig, ax = plt.subplots()
residuals = test.quantity - predictions
ax.scatter(predictions, residuals, c='maroon', marker='.')
ax.axhline(y=0, xmin=0, c='r')
ax.set(title='Residual Plot', xlabel='Predicted Value', ylabel='Actual - Predicted')