In [39]:
import numpy as np
import pandas as pd
import helpers
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

In [60]:
# Load both CSV files.
# StateHoliday is forced to be read as strings. The builtin `str` is used here
# because the `np.str` alias was deprecated in NumPy 1.20 and removed in 1.24.

store = pd.read_csv('data/store.csv')
train = pd.read_csv('data/train.csv', dtype={'StateHoliday': str})

# Merge the two tables on the store ID (pd.merge defaults to an inner join).

df = pd.merge(store, train, on='Store')

# NOTE(review): helpers.date_convert lives outside this notebook; presumably it
# converts the Date column to datetime -- confirm in the helpers module.
df = helpers.date_convert(df)


In [51]:
# Display the dtype of every column of df.
df.dtypes

PromoInterval                        object
Date                         datetime64[ns]
DayOfWeek                           float64
Sales                               float64
Customers                           float64
Open                                float64
Promo                               float64
StateHoliday                         object
SchoolHoliday                       float64
Year                                  int64
Quarter                               int64
Month                                 int64
Week                                  int64
Day                                   int64
Store                                 int64
StoreType                            object
Assortment                           object
CompetitionDistance                 float64
CompetitionOpenSinceMonth           float64
CompetitionOpenSinceYear            float64
Promo2                                int64
Promo2SinceWeek                     float64
Promo2SinceYear                 

In [61]:
# One-hot encoding: replaces each categorical column (e.g. with values a, b, c) by one
# indicator column per distinct value (e.g. col_a, col_b, col_c).

def cat_to_int(df, columnlist):
    """Return a new frame in which the given columns are one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columnlist : str or iterable of column labels
        Column(s) to encode. A single name may be passed as a plain string.
        Sets are processed in sorted order so that the order of the generated
        columns does not change from one run to the next.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the encoded columns, followed by the indicator columns
        named ``<column>_<value>``.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if isinstance(columnlist, str):
        columns = [columnlist]
    elif isinstance(columnlist, (set, frozenset)):
        # Set iteration order is arbitrary; sort to get a stable column layout.
        columns = sorted(columnlist)
    else:
        columns = list(columnlist)

    if not columns:
        return df.copy()

    # A single call encodes every column and drops the originals, instead of
    # re-concatenating the whole frame once per column.
    return pd.get_dummies(df, columns=columns)
        
# A list (not a set) fixes the order in which the indicator columns are appended,
# so the column layout of df is the same on every run.
df = cat_to_int(df, ['StateHoliday', 'StoreType', 'Assortment'])

In [64]:
# Replace NaN with zero and convert the columns to int.

def float_to_int(df, columnlist):
    """Fill missing values with 0 and cast the given columns to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. The listed columns are overwritten on this object.
    columnlist : iterable of column labels
        Columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.

    Raises
    ------
    KeyError
        If a listed column is not present in ``df``.
    """
    for column in columnlist:
        # Assign the filled column back explicitly. Calling
        # ``df[column].fillna(0, inplace=True)`` acts on a temporary selection,
        # which pandas warns about and which no longer updates ``df`` under
        # Copy-on-Write, so the int cast would then fail on the remaining NaN.
        df[column] = df[column].fillna(0).astype(int)
    return df

# After this call a 0 in these columns stands for "value was missing".
df = float_to_int(
    df,
    [
        'Promo2SinceYear',
        'Promo2SinceWeek',
        'CompetitionOpenSinceMonth',
        'CompetitionOpenSinceYear',
    ],
)

In [67]:
# Convert a promo year and promo week number to a datetime.

def year_week(y, w):
    """Return the Monday (at 00:00) of ISO week ``w`` in ISO year ``y``.

    ``y`` and ``w`` are rendered into a string and parsed with the ISO
    directives ``%G`` (year), ``%V`` (week) and ``%u`` (weekday), so both must
    format as plain integers, e.g. ``2010`` and ``13``.
    """
    iso_text = '{} {} 1'.format(y, w)  # the trailing 1 selects Monday
    return datetime.datetime.strptime(iso_text, '%G %V %u')

# Only rows with a Promo2 start year; 0 is the value the missing years were filled with.
df_subset = df.loc[df['Promo2SinceYear'] != 0, ['Promo2SinceYear', 'Promo2SinceWeek']]
# Assignment aligns on the index, so rows outside df_subset get NaT.
df['PromoStart'] = df_subset.apply(lambda row: year_week(row.Promo2SinceYear, row.Promo2SinceWeek), axis=1)

# Create the PromoDuration column: (Date - PromoStart) in days, negative when Date
# lies before PromoStart. Rows without a PromoStart get 0.
# The filled result is assigned directly; calling fillna(..., inplace=True) on
# df['PromoDuration'] acts on a temporary selection and does not update df under
# pandas Copy-on-Write.

df['PromoDuration'] = ((df['Date'] - df['PromoStart']) / np.timedelta64(1, 'D')).fillna(0)

In [45]:
# Sets RunningAnyPromo to 1 if
#   1.) the abbreviation of the row's month is a substring of PromoInterval, OR
#   2.) Promo == 1 (a promo runs on that day).

df['RunningAnyPromo'] = 0

# (month number, abbreviated month name) for January..December.
# NOTE(review): strftime('%b') follows the active locale; the match against
# PromoInterval assumes English abbreviations -- confirm the kernel's locale.
months_abbr = [(month_no, datetime.date(2008, month_no, 1).strftime('%b')) for month_no in range(1, 13)]

# The comparison is evaluated on its own: `|` binds tighter than `==`, so the
# unparenthesised `A & B | df['Promo']==1` was read as `((A & B) | df['Promo']) == 1`.
promo_today = df['Promo'] == 1

for month_no, abbr in months_abbr:
    interval_names_month = df['PromoInterval'].str.contains(abbr, na=False)
    mask = (interval_names_month & (df['Month'] == month_no)) | promo_today
    df.loc[mask, 'RunningAnyPromo'] = 1

In [75]:
# Sets RunningPromo2 to 1 on rows where the abbreviation of the row's month
# occurs as a substring of PromoInterval.

# (month number, abbreviated month name) for January..December
months_abbr = [(month_no, datetime.date(2008, month_no, 1).strftime('%b')) for month_no in range(1, 13)]

df['RunningPromo2'] = 0
for month_no, abbr in months_abbr:
    interval_names_month = df['PromoInterval'].str.contains(abbr, na=False)
    row_in_month = df['Month'] == month_no
    df.loc[interval_names_month & row_in_month, 'RunningPromo2'] = 1

# Drop PromoInterval; errors='ignore' makes the drop a no-op if the column is already gone.
df = df.drop(columns='PromoInterval', errors='ignore')

In [None]:
# TODO: check whether the definition of RunningPromo is correct

# df.loc[(df['Month'] == 12) & (df['RunningPromo'] != 0)][['RunningPromo','PromoInterval','Month','Date','Promo','Promo2','PromoStart','PromoDuration','PromoDuration2']]

In [None]:
# Count the missing values in each column of df.
df.isna().sum()

In [47]:
# Display the dtype of every column of df.
df.dtypes

PromoInterval                        object
Date                         datetime64[ns]
DayOfWeek                           float64
Sales                               float64
Customers                           float64
Open                                float64
Promo                               float64
SchoolHoliday                       float64
Year                                  int64
Quarter                               int64
Month                                 int64
Week                                  int64
Day                                   int64
Store                                 int64
CompetitionDistance                 float64
CompetitionOpenSinceMonth             int64
CompetitionOpenSinceYear              int64
Promo2                                int64
Promo2SinceWeek                       int64
Promo2SinceYear                       int64
StateHoliday_0                        uint8
StateHoliday_a                        uint8
StateHoliday_b                  