# Data add and merge

In this step we read the prepared data sets from several pickle files and merge them into one pandas DataFrame.

In addition, we merge in external datasets.

In [1]:
import pandas as pd

# Load the prepared frames that were exported as pickle files.
# NOTE: only unpickle files from a trusted source - unpickling can execute code.
prepared_sales_data, prepared_weather_data, prepared_kiwo_data = map(
    pd.read_pickle,
    (
        '../exported_data/prepared_sales_data.pkl',
        '../exported_data/prepared_weather_data.pkl',
        '../exported_data/prepared_kiwo_data.pkl',
    ),
)

# Frame with the 'Datum' column that all later left joins start from.
base_dateranges = pd.read_pickle('../exported_data/base_dateranges_data.pkl')

## Additional external data 

We use binary (0/1) notation to indicate whether an event took place on a given day.

These are the values we will add:
- Feiertage 
- Wahltage
- Schulferien

This value is added as a float; it is only available per month:
- Einzelhandel (retail) sales index (represents the general market development)

These values are calculated from the date:
- Wochentage

In [2]:
# Read every external file and convert its 'Datum' column to datetime right away.

# Public holidays (Feiertage): dates are written as day.month.year.
holyday_data = pd.read_csv('../bakery_sales_data/feiertage_sh.csv')
holyday_data['Datum'] = pd.to_datetime(holyday_data['Datum'], format='%d.%m.%Y')

# School holidays (Schulferien): no explicit format, pandas infers it.
schhol_data = pd.read_csv('../bakery_sales_data/schulferien.csv')
schhol_data['Datum'] = pd.to_datetime(schhol_data['Datum'])

# Election days (Wahltage): dates are written as month/day/year.
elec_data = pd.read_csv('../bakery_sales_data/wahltage.csv')
elec_data['Datum'] = pd.to_datetime(elec_data['Datum'], format='%m/%d/%Y')

# Retail sales index: no explicit format, pandas infers it.
sales_index = pd.read_csv('../bakery_sales_data/einzelhandels_index_by_date.csv')
sales_index['Datum'] = pd.to_datetime(sales_index['Datum'])

# Left-join all external frames onto the base date range so that every
# base date is kept; dates without a match get NaN in the joined columns.
pre_data = (
    base_dateranges
    .merge(holyday_data, on='Datum', how='left')
    .merge(schhol_data, on='Datum', how='left')
    .merge(elec_data, on='Datum', how='left')
    .merge(sales_index, on='Datum', how='left')
)

# Public-holiday name -> numeric code; codes run from 1 upwards in the order listed.
feiertage_dict = {
    name: code
    for code, name in enumerate(
        (
            'Neujahr',
            'Karfreitag',
            'Ostermontag',
            'Maifeiertag',
            'Christi Himmelfahrt',
            'Pfingstmontag',
            'Tag der Deutschen Einheit',
            '1. Weihnachtstag',
            '2. Weihnachtstag',
        ),
        start=1,
    )
}

# School-holiday name -> numeric code; 0 stands for "no school holiday".
# NOTE(review): 'Pfingsten' is coded 0 as well - confirm that this is intended.
holyday_dict = {
    'Sommerferien': 1,
    'Keine Ferien': 0,
    'Herbstferien': 2,
    'Weihnachtsferien': 3,
    'Osterferien': 4,
    'Pfingsten': 0,
}


def map_holyday_to_code(feiertag):
    """Return the code of a public holiday from ``feiertage_dict``.

    Any value that is not a key of the dictionary (including NaN) yields 0.
    """
    code = feiertage_dict.get(feiertag, 0)
    return code


def map_schoolhol_to_code(ferientag):
    """Return the code of a school holiday from ``holyday_dict``.

    Any value that is not a key of the dictionary (including NaN) yields 0.
    """
    code = holyday_dict.get(ferientag, 0)
    return code

# Numeric codes: HolyCode for public holidays, SchholCode for school holidays.
# Dates without an entry are NaN after the left join and therefore become 0.
pre_data['HolyCode'] = pre_data['Feiertag'].map(map_holyday_to_code)
pre_data['SchholCode'] = pre_data['Ferien'].map(map_schoolhol_to_code)

# Binary flags: 1 = yes, 0 = no.
pre_data['BinHoly'] = pre_data['HolyCode'].ne(0).astype(int)
pre_data['BinSchhol'] = pre_data['SchholCode'].ne(0).astype(int)
# Any non-null 'Wahl' entry marks an election day.
pre_data['BinElec'] = pre_data['Wahl'].map(lambda wahl: int(not pd.isnull(wahl)))

# The raw text columns are replaced by the codes/flags above.
pre_data = pre_data.drop(columns=['Feiertag', 'Ferien', 'Wahl'])

# Weekday name of every date.
pre_data['DayOfWeek'] = pre_data['Datum'].dt.day_name()
# Weekday name -> weekend flag (1 = Saturday/Sunday, 0 = any other day).
# The keys have to match the English day names that `.dt.day_name()` returns.
weekend_dict = {
    'Saturday': 1,
    'Sunday': 1,
    'Monday': 0,
    'Tuesday': 0,
    'Wednesday': 0,
    'Thursday': 0,
    'Friday': 0  # was misspelled 'Freiday' and only worked through the default
}


def map_days_to_code(wdays):
    """Return the weekend flag for a weekday name.

    Parameters
    ----------
    wdays : str
        English weekday name, e.g. 'Monday'.

    Returns
    -------
    int
        1 for 'Saturday' and 'Sunday', 0 for every other value
        (values that are not a key of ``weekend_dict`` also yield 0).
    """
    return weekend_dict.get(wdays, 0)

# New column 'weekend': flag looked up from the weekday name of each row.
pre_data['weekend'] = pre_data['DayOfWeek'].map(map_days_to_code)

# Show the resulting frame.
print(pre_data)

          Datum  Salesindex  HolyCode  SchholCode  BinHoly  BinSchhol  \
0    2013-07-01        92.2         0           1        0          1   
1    2013-07-02        92.2         0           1        0          1   
2    2013-07-03        92.2         0           1        0          1   
3    2013-07-04        92.2         0           1        0          1   
4    2013-07-05        92.2         0           1        0          1   
...         ...         ...       ...         ...      ...        ...   
2218 2019-07-28        99.3         0           0        0          0   
2219 2019-07-29        99.3         0           0        0          0   
2220 2019-07-30        99.3         0           0        0          0   
2221 2019-07-31        99.3         0           0        0          0   
2222 2019-08-01        99.5         0           0        0          0   

      BinElec  DayOfWeek  weekend  
0           0     Monday        0  
1           0    Tuesday        0  
2           0  

## Merge all datasets
We use the base date-range table and left join the sales, weather, Kieler Woche (kiwo) and pre_data frames onto it.

In [3]:

# Build the full data set: start from the base date range and left-join every
# prepared frame on 'Datum'. Because all joins are left joins, each base date
# is kept and dates missing from a joined frame get NaN in that frame's columns.
# (The former initial `pd.DataFrame(base_dateranges, columns=['Datum'])`
# assignment was overwritten immediately and has been removed.)
bakery_sales_full_data = (
    base_dateranges
    .merge(prepared_sales_data, on='Datum', how='left')    # sales data
    .merge(prepared_weather_data, on='Datum', how='left')  # weather data
    .merge(prepared_kiwo_data, on='Datum', how='left')     # Kieler Woche data
    .merge(pre_data, on='Datum', how='left')               # external event / index / weekday data
)

# Preview the merged frame: first 3 rows, last 3 rows and 5 random rows,
# each followed by two blank lines.
print(
    bakery_sales_full_data.head(3),
    bakery_sales_full_data.tail(3),
    bakery_sales_full_data.sample(5),
    sep="\n\n\n",
    end="\n\n\n",
)

# Column overview (dtypes and non-null counts).
bakery_sales_full_data.info()

       Datum  Warengruppe      Umsatz  Bewoelkung  Temperatur  \
0 2013-07-01          1.0  148.828353         6.0     17.8375   
1 2013-07-01          2.0  535.856285         6.0     17.8375   
2 2013-07-01          3.0  201.198426         6.0     17.8375   

   Windgeschwindigkeit  Wettercode Jahr_Monat  monthly_mean_temp  \
0                 15.0        20.0    2013-07           21.90121   
1                 15.0        20.0    2013-07           21.90121   
2                 15.0        20.0    2013-07           21.90121   

   monthly_mean_temp_diff  ...  snowday KielerWoche  Salesindex  HolyCode  \
0                -4.06371  ...        0           0        92.2         0   
1                -4.06371  ...        0           0        92.2         0   
2                -4.06371  ...        0           0        92.2         0   

   SchholCode  BinHoly  BinSchhol  BinElec  DayOfWeek  weekend  
0           1        0          1        0     Monday        0  
1           1        0     

Save bakery_sales_full_data as a pickle file to preserve the data types, so that it can later be imported easily as input for the models.

In [4]:
# Target file for the merged data set.
output_path = '../exported_data/bakery_sales_full_data.pkl'

# Write the frame as a pickle file and confirm where it went.
bakery_sales_full_data.to_pickle(path=output_path)
print(f'DataFrame saved to {output_path}')


DataFrame saved to ../exported_data/bakery_sales_full_data.pkl
