In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import os
# List the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the raw train and test CSV files of the predict-demand dataset.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/predict-demand/train.csv',
        '/kaggle/input/predict-demand/test.csv',
    )
)

In [None]:
# Preview the first rows of the raw training data.
train.head()

In [None]:
# Inspect the last rows of the raw training data.
train.tail()

In [None]:
# Every row after row 6479 holds only NaN values, so discard those rows by label
# and check the new end of the frame.
empty_tail_labels = train.index[6480:]
train = train.drop(index=empty_tail_labels)
train.tail()

In [None]:
# Summary statistics of the training data.
train.describe()

In [None]:
# Column dtypes and non-null counts of the training data.
train.info()

In [None]:
# Number of rows (non-null ids) for every brand/container combination.
# `axis=0` and `as_index=True` are the groupby defaults, and the `axis` keyword
# is deprecated in recent pandas releases, so neither is passed explicitly.
train.groupby(['brand', 'container'])['id'].count().to_frame()

There are multiple brands, but for this task we will focus only on the Gazoza brand.

In [None]:
# Keep only the gazoza rows and renumber them from 0.
is_gazoza = train['brand'] == 'gazoza'
gazoza = train.loc[is_gazoza].reset_index(drop=True)

In [None]:
# Column dtypes and non-null counts of the gazoza subset.
gazoza.info()

### Clean up the data

There are some null values. The container null values might be easy to fill since there are only three possible values.

In [None]:
# Rows where the container column is null. The next step is to find out which
# of "glass", "plastic" or "can" is missing for each of them and fill it in.
container_missing = gazoza['container'].isna()
gazoza[container_missing]

In [None]:
# Show all rows for the date/city pairs below, to see which container types
# are already present for each pair and therefore which one is missing.
date_city_pairs = [
    ('31/07/13', 'Larisa'),
    ('30/09/15', 'Patra'),
    ('31/07/16', 'Larisa'),
    ('31/08/17', 'Thessaloniki'),
    ('30/09/17', 'Patra'),
]
pair_mask = pd.Series(False, index=gazoza.index)
for sale_date, sale_city in date_city_pairs:
    pair_mask |= (gazoza['date'] == sale_date) & (gazoza['city'] == sale_city)
gazoza[pair_mask]

In [None]:
# Fill in the missing container types (row label -> container type).
container_fixes = {
    340: 'can',
    798: 'glass',
    984: 'glass',
    1214: 'plastic',
    1229: 'glass',
}
for row_label, container_type in container_fixes.items():
    gazoza.at[row_label, 'container'] = container_type

In [None]:
# Confirm that no row is left with a missing container type.
gazoza.loc[gazoza['container'].isna()]

In [None]:
# Rows where the capacity value is missing.
capacity_missing = gazoza['capacity'].isna()
gazoza[capacity_missing]

In [None]:
# Fill in the missing capacity values (row label -> capacity).
capacity_fixes = {
    368: '1.5lt',
    648: '500ml',
    962: '500ml',
}
for row_label, capacity_value in capacity_fixes.items():
    gazoza.at[row_label, 'capacity'] = capacity_value

In [None]:
# Confirm that no row is left with a missing capacity value.
gazoza.loc[gazoza['capacity'].isna()]

Clean the test data.

In [None]:
# Build the gazoza-only test frame: keep the brand's rows, renumber them and
# drop the columns that are not used as features (chained, no in-place edit).
gazoza_test = (
    test[test['brand'] == 'gazoza']
    .reset_index(drop=True)
    .drop(columns=['id', 'lat', 'long', 'brand'])
)
# The training dates are written day-first (e.g. '31/07/13'); the test file is
# assumed to use the same layout -- TODO confirm. Passing dayfirst=True avoids
# relying on pandas' month-first default for ambiguous dates.
gazoza_test['date'] = pd.to_datetime(gazoza_test['date'], dayfirst=True)
gazoza_test.head()

In [None]:
# Column dtypes and non-null counts of the gazoza test subset.
gazoza_test.info()

In [None]:
# Test rows where the capacity value is missing.
test_capacity_missing = gazoza_test['capacity'].isna()
gazoza_test[test_capacity_missing]

In [None]:
# Fill in the missing test capacity values (row label -> capacity).
test_capacity_fixes = {
    208: '330ml',
    209: '500ml',
}
for row_label, capacity_value in test_capacity_fixes.items():
    gazoza_test.at[row_label, 'capacity'] = capacity_value

### Choose features to keep

As we can see below, the features *'lat'*, *'long'*, and *'shop'* practically replicate the same information, so we can reduce the dimensionality of the data by keeping only one of them.

In [None]:
# Number of rows (non-null ids) for every city/lat/long/shop combination.
# `axis=0` and `as_index=True` are the groupby defaults, and the `axis` keyword
# is deprecated in recent pandas releases, so neither is passed explicitly.
gazoza.groupby(['city', 'lat', 'long', 'shop'])['id'].count().to_frame()

Since the feature *'shop'* has no missing values, we'll drop *'lat'*, *'long'*.

We will also drop the brand column since all of them are just *'gazoza'*.

The *'id'* column does not add any value either.

In [None]:
# Drop the columns that are not used as features: 'id', the 'lat'/'long' pair
# (covered by 'shop') and the constant 'brand'.
# Rebinding instead of `inplace=True`, together with `errors='ignore'`, keeps
# the cell safe to re-run: a second run is a no-op rather than a KeyError.
gazoza = gazoza.drop(columns=['id', 'lat', 'long', 'brand'], errors='ignore')

In [None]:
# Change the date column to a datetime type.
# The strings are written day-first (e.g. '31/07/13'), so say so explicitly
# rather than relying on pandas' month-first default for ambiguous dates.
gazoza['date'] = pd.to_datetime(gazoza['date'], dayfirst=True)

In [None]:
# Preview the cleaned gazoza training data.
gazoza.head()

## Exploratory Data Analysis

In [None]:
# Line plot of the quantity sold against the date.
ax = sns.lineplot(data=gazoza, x='date', y='quantity')
ax.set_title('Quantity Sold Over Time')

The quantity sold looks very seasonal. Although the sales are gradually decreasing over time, there are persistent peaks and troughs that appear year after year.

In [None]:
# Scatter plot of the quantity sold against the population.
ax = sns.scatterplot(data=gazoza, x='pop', y='quantity')
ax.set_title('Quantity Sold vs. Population')

The scatter plot of the quantity sold vs. population does not seem to yield much information.

In [None]:
# Bar plot of the population for each city.
ax = sns.barplot(data=gazoza, x='city', y='pop')
ax.set_title('Population by City')

We see that the populations of the different cities are very different, so separating the populations by city may help us see some patterns.

Let's examine the data from Athens.

In [None]:
# Restrict the data to Athens and plot its population against the date.
in_athens = gazoza['city'] == 'Athens'
athens_gazoza = gazoza[in_athens]
ax = sns.lineplot(data=athens_gazoza, x='date', y='pop')
ax.set_title('Athens Population Over Time')

We can see that the population of Athens has been decreasing over the years.

Let's see how the population relates to the sales.

In [None]:
# Scatter plot of the quantity sold in Athens against its population.
ax = sns.scatterplot(data=athens_gazoza, x='pop', y='quantity')
ax.set_title('Quantity Sold vs. Athens Population')

The quantities sold seem to have a higher variance with high population numbers.

In [None]:
# Line plot of the quantity sold in Athens against the date.
ax = sns.lineplot(data=athens_gazoza, x='date', y='quantity')
ax.set_title('Athens Quantity Over Time')

The quantity sold in Athens also looks seasonal. The sales have been decreasing over time, but the peaks are still consistent. The decrease in quantities sold could be related to the population decrease.

In [None]:
# Bar plot of the quantity sold for each shop.
# Shop-to-city mapping:
#   Athens       - shop_3, shop_1
#   Irakleion    - shop_2
#   Larisa       - shop_5
#   Patra        - shop_6
#   Thessaloniki - shop_4
ax = sns.barplot(data=gazoza, x='shop', y='quantity')
ax.set_title('Comparing the Sales in Different Shops')

Let's examine the sales across the three different container types.

In [None]:
# Bar plot of the quantity sold per city, split by container type.
plt.figure(figsize=(8, 6), dpi=80)
ax = sns.barplot(data=gazoza, x='city', y='quantity', hue='container')
ax.set_title('Quantity Sold in Each City by Container Type')

In all cities, we see that the can is the highest selling container type and the plastic is the lowest selling container type.

In [None]:
# Scatter plot of the quantity sold against the price, coloured by container type.
plt.figure(figsize=(8, 6), dpi=80)
ax = sns.scatterplot(data=gazoza, x='price', y='quantity', hue='container')
ax.set_title('Quantity Sold vs Price')

We can notice a generally negative relationship between the price and quantity.

In [None]:
# Line plot of the price against the date, one line per container type.
plt.figure(figsize=(8, 6), dpi=80)
price_ax = sns.lineplot(x='date', y='price', hue='container', data=gazoza)
price_ax.set_title('Gazoza Price over Time by Container Type')

## Train-Test Preparation

In [None]:
# Separate the features (every column except 'quantity') from the target
# ('quantity') for both the training and the test data.
# `.copy()` gives each frame its own data: later cells add and overwrite
# columns on X_train/X_test, and without the copy pandas treats them as slices
# of gazoza/gazoza_test and emits SettingWithCopyWarning.
X_train = gazoza.loc[:, gazoza.columns != 'quantity'].copy()
X_test = gazoza_test.loc[:, gazoza_test.columns != 'quantity'].copy()
y_train = gazoza.loc[:, gazoza.columns == 'quantity'].copy()
y_test = gazoza_test.loc[:, gazoza_test.columns == 'quantity'].copy()

In [None]:
# Split the date feature into year, month and day columns.
# The vectorised `.dt` accessor replaces a per-row `apply(lambda ...)`, and
# `assign` returns a new frame instead of writing columns into a frame that
# may be a slice of another one.
X_train = X_train.assign(
    year=X_train['date'].dt.year,
    month=X_train['date'].dt.month,
    day=X_train['date'].dt.day,
)

X_test = X_test.assign(
    year=X_test['date'].dt.year,
    month=X_test['date'].dt.month,
    day=X_test['date'].dt.day,
)

In [None]:
# Preview the training features with the new year/month/day columns.
X_train.head()

In [None]:
# Drop the raw date feature from both feature frames.
# Rebinding instead of `inplace=True`, together with `errors='ignore'`, keeps
# the cell safe to re-run: a second run is a no-op rather than a KeyError.
X_train = X_train.drop(columns='date', errors='ignore')
X_test = X_test.drop(columns='date', errors='ignore')

In [None]:
def _strip_unit_suffix(value):
    """Return str(value) without its last two characters (the unit suffix)."""
    return str(value)[:-2]


# Remove the two-character suffix (e.g. 'ml', 'lt') from the capacity values,
# then convert the remaining text to numbers.
X_train['capacity'] = pd.to_numeric(X_train['capacity'].map(_strip_unit_suffix))
X_test['capacity'] = pd.to_numeric(X_test['capacity'].map(_strip_unit_suffix))

In [None]:
# Convert the litres to ml: a capacity of 1.5 (litres) becomes 1500 (ml).
# One vectorised replace per frame does the same job as looping over the
# matching rows and rewriting them one at a time with `.at`.
X_train['capacity'] = X_train['capacity'].replace(1.5, 1500.0)
X_test['capacity'] = X_test['capacity'].replace(1.5, 1500.0)

In [None]:
# Preview the training features after the capacity conversion.
X_train.head()

In [None]:
# Preview the test features after the capacity conversion.
X_test.head()

In [None]:
# One-hot encode the categorical features. The training frame drops the first
# level of each feature to avoid a redundant column.
categorical_cols = ['city', 'shop', 'container']
X_train = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)

# Encoding the two frames independently can give them different columns (a
# category absent from one split, or a different "first" level being dropped).
# Encode every level of the test frame, then align it to the training columns:
# extra columns are discarded, absent ones are filled with 0, and the column
# order matches what the scaler and the model are fitted on.
X_test = (
    pd.get_dummies(X_test, columns=categorical_cols)
    .reindex(columns=X_train.columns, fill_value=0)
)

In [None]:
# (rows, columns) of the encoded training features.
X_train.shape

## Create NN Model

In [None]:
# Scale all the variables with a min-max scaler.
# The scaler is fitted on the training features only.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)

In [None]:
# Apply the scaler fitted on the training features to both splits.
# NOTE(review): X_train/X_test are rebound to the scaled output, so running
# this cell a second time would scale data that is already scaled.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense

In [None]:
# Create the sequential model that will predict the quantity sold.
demand_model = Sequential()

In [None]:
# Fully connected regression network: four ReLU hidden layers (17, 10, 10 and
# 10 units) followed by a single output unit for the predicted quantity.
# The model is built from a layer list rather than extended with `.add`, so
# re-running this cell cannot stack a second copy of the layers onto it.
demand_model = Sequential([
    Dense(17, activation='relu'),
    Dense(10, activation='relu'),
    Dense(10, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1),
])

In [None]:
# Use the Adam optimizer and mean squared error as the training loss.
demand_model.compile(optimizer='adam', loss='mse')

In [None]:
# Train on the training features and target for 200 epochs, logging progress.
demand_model.fit(x=X_train, y=y_train, epochs=200, verbose = 1)

In [None]:
# Plot the values recorded in the model's training history for each epoch.
loss_history = pd.DataFrame(demand_model.history.history)
loss_history.plot().set_title('Loss over Epochs')

In [None]:
# Evaluate the trained model on the training and the test data (silent mode).
training_score = demand_model.evaluate(X_train, y_train, verbose=0)
test_score = demand_model.evaluate(X_test, y_test, verbose=0)

In [None]:
# Score returned by evaluate() on the training data.
training_score

In [None]:
# Score returned by evaluate() on the test data.
test_score