In [None]:
import numpy as np
import pandas as pd
import helpers
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [None]:
# Load both CSV files.
# StateHoliday is read as a string column. The builtin `str` is used because
# the `np.str` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
# NOTE(review): presumably StateHoliday mixes numeric and letter codes — confirm against the data.

store = pd.read_csv('data/store.csv')
train = pd.read_csv('data/train.csv', dtype={'StateHoliday': str})

# Merge the two files on the store id, then let helpers.date_convert
# convert the dates to datetime format.

df = pd.merge(store, train, on='Store')
df = helpers.date_convert(df)

In [None]:
def xgboost_data_transformation(df):
    """Derive promotion and competition features for the XGBoost model.

    The input frame is copied first, so the caller's ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns read below: ``Date`` (datetime-like),
        ``Month``, ``Promo``, ``Promo2``, ``PromoInterval``,
        ``Promo2SinceYear``, ``Promo2SinceWeek``,
        ``CompetitionOpenSinceYear`` and ``CompetitionOpenSinceMonth``.

    Returns
    -------
    pandas.DataFrame
        A new frame with these columns added:

        * ``PromoDuration``     - days between ``Date`` and the Promo2 start,
          0 where no Promo2 start is known.
        * ``CompetitionActive`` - 1 if the competition start is on or before
          ``Date``, else 0.
        * ``CompetitionDays``   - days between ``Date`` and the competition
          start (NaN where no competition start is known).
        * ``RunningPromo2``     - 1 if ``Promo2 == 1`` and the abbreviation of
          the row's ``Month`` occurs in ``PromoInterval``, else 0.
        * ``RunningAnyPromo``   - 1 if ``RunningPromo2`` is 1 or ``Promo == 1``.

        ``Date`` and the intermediate ``CompetitionStart`` / ``PromoStart``
        columns are dropped.
    """
    # Work on a copy: the column assignments below would otherwise leak into
    # the frame owned by the caller and make the notebook order-dependent.
    df = df.copy()

    # Replace NaN with zeros and convert to int (behaviour of the project
    # helper as described by the original comment).
    df = helpers.float_to_int(df, {'Promo2SinceYear', 'Promo2SinceWeek', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'})

    # Competition start = first day of the given month/year. Rows where either
    # part is 0 (i.e. was missing) get no start date (NaT after alignment).
    has_competition = (df['CompetitionOpenSinceYear'] != 0) & (df['CompetitionOpenSinceMonth'] != 0)
    competition_parts = pd.DataFrame({
        'year': df.loc[has_competition, 'CompetitionOpenSinceYear'],
        'month': df.loc[has_competition, 'CompetitionOpenSinceMonth'],
        'day': 1,
    })
    df['CompetitionStart'] = pd.to_datetime(competition_parts)

    # Promo2 start from year + week number.
    # NOTE(review): helpers.year_week presumably returns a datetime-like value
    # for (year, week) — confirm against the helper module.
    promo2_since = df.loc[df['Promo2SinceYear'] != 0, ['Promo2SinceYear', 'Promo2SinceWeek']]
    if promo2_since.empty:
        # apply(axis=1) on zero rows hands back an empty DataFrame, which
        # cannot be assigned to a single column.
        df['PromoStart'] = pd.NaT
    else:
        df['PromoStart'] = promo2_since.apply(
            lambda row: helpers.year_week(row.Promo2SinceYear, row.Promo2SinceWeek), axis=1
        )

    # PromoDuration: Date - PromoStart in days. Assign the filled result
    # instead of a chained inplace fillna, which may not write back to df.
    df['PromoDuration'] = ((df['Date'] - df['PromoStart']) / np.timedelta64(1, 'D')).fillna(0)

    # Whether competition is active, and for how many days. Comparisons
    # against NaT are False, so rows without a start date get 0.
    df['CompetitionActive'] = np.where(df['CompetitionStart'] <= df['Date'], 1, 0)
    df['CompetitionDays'] = (df['Date'] - df['CompetitionStart']) / np.timedelta64(1, 'D')

    # English month abbreviations are spelled out because strftime('%b')
    # depends on the process locale.
    month_abbr = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    promo_interval = df['PromoInterval']
    in_promo2_month = pd.Series(False, index=df.index)
    for month_no, abbr in enumerate(month_abbr, start=1):
        month_hit = promo_interval.str.contains(abbr, regex=False, na=False) & (df['Month'] == month_no)
        in_promo2_month = in_promo2_month | month_hit

    running_promo2 = in_promo2_month & (df['Promo2'] == 1)
    # Explicit parentheses: `|` binds tighter than `==`, so the unparenthesised
    # form compared the OR-ed result with 1 instead of OR-ing with Promo == 1.
    running_any_promo = running_promo2 | (df['Promo'] == 1)

    df['RunningAnyPromo'] = running_any_promo.astype('int64')
    df['RunningPromo2'] = running_promo2.astype('int64')

    return df.drop(columns=['Date', 'CompetitionStart', 'PromoStart'], errors='ignore')

In [None]:
def tree_data_transformation(df):
    """Derive promotion and competition features for the tree-based model.

    The input frame is copied first, so the caller's ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns read below: ``Sales``, ``Customers``,
        ``Open``, ``StateHoliday``, ``SchoolHoliday``,
        ``CompetitionDistance``, ``Date`` (datetime-like), ``Month``,
        ``Promo``, ``Promo2``, ``PromoInterval``, ``Promo2SinceYear``,
        ``Promo2SinceWeek``, ``CompetitionOpenSinceYear`` and
        ``CompetitionOpenSinceMonth``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which

        * missing ``Sales`` are 0,
        * missing ``Customers`` are the column mean, and 0 wherever
          ``Open == 0``,
        * ``PromoDuration``, ``CompetitionActive``, ``RunningAnyPromo`` and
          ``RunningPromo2`` have been added,
        * ``Date``, ``CompetitionStart``, ``PromoStart``, ``PromoInterval``,
          ``Promo``, ``Promo2``, ``CompetitionDays`` and ``DayOfWeek`` have
          been dropped,
        * rows with a missing ``Open``, ``StateHoliday``, ``SchoolHoliday``
          or ``CompetitionDistance`` have been removed.
    """
    # Work on a copy: the assignments below would otherwise leak into the
    # frame owned by the caller and make the notebook order-dependent.
    df = df.copy()

    # Replace NaN in Sales with zero.
    df['Sales'] = df['Sales'].fillna(0)

    # Replace NaN in Customers with the column mean; closed stores get 0.
    # `.mean()` must be called: passing the bound method `.mean` hands
    # fillna a function object instead of a number.
    df['Customers'] = df['Customers'].fillna(df['Customers'].mean())
    df.loc[df['Open'] == 0, 'Customers'] = 0

    # Replace NaN with zeros and convert to int (behaviour of the project
    # helper as described by the original comment).
    df = helpers.float_to_int(df, {'Promo2SinceYear', 'Promo2SinceWeek', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'})

    # Competition start = first day of the given month/year. Rows where either
    # part is 0 (i.e. was missing) get no start date (NaT after alignment).
    has_competition = (df['CompetitionOpenSinceYear'] != 0) & (df['CompetitionOpenSinceMonth'] != 0)
    competition_parts = pd.DataFrame({
        'year': df.loc[has_competition, 'CompetitionOpenSinceYear'],
        'month': df.loc[has_competition, 'CompetitionOpenSinceMonth'],
        'day': 1,
    })
    df['CompetitionStart'] = pd.to_datetime(competition_parts)

    # Promo2 start from year + week number.
    # NOTE(review): helpers.year_week presumably returns a datetime-like value
    # for (year, week) — confirm against the helper module.
    promo2_since = df.loc[df['Promo2SinceYear'] != 0, ['Promo2SinceYear', 'Promo2SinceWeek']]
    if promo2_since.empty:
        # apply(axis=1) on zero rows hands back an empty DataFrame, which
        # cannot be assigned to a single column.
        df['PromoStart'] = pd.NaT
    else:
        df['PromoStart'] = promo2_since.apply(
            lambda row: helpers.year_week(row.Promo2SinceYear, row.Promo2SinceWeek), axis=1
        )

    # PromoDuration: Date - PromoStart in days. Assign the filled result
    # instead of a chained inplace fillna, which may not write back to df.
    df['PromoDuration'] = ((df['Date'] - df['PromoStart']) / np.timedelta64(1, 'D')).fillna(0)

    # Whether competition is active. Comparisons against NaT are False, so
    # rows without a start date get 0. CompetitionDays is not computed here
    # because this function drops that column before returning.
    df['CompetitionActive'] = np.where(df['CompetitionStart'] <= df['Date'], 1, 0)

    # English month abbreviations are spelled out because strftime('%b')
    # depends on the process locale.
    month_abbr = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    promo_interval = df['PromoInterval']
    in_promo2_month = pd.Series(False, index=df.index)
    for month_no, abbr in enumerate(month_abbr, start=1):
        month_hit = promo_interval.str.contains(abbr, regex=False, na=False) & (df['Month'] == month_no)
        in_promo2_month = in_promo2_month | month_hit

    running_promo2 = in_promo2_month & (df['Promo2'] == 1)
    # Explicit parentheses: `|` binds tighter than `==`, so the unparenthesised
    # form compared the OR-ed result with 1 instead of OR-ing with Promo == 1.
    running_any_promo = running_promo2 | (df['Promo'] == 1)

    df['RunningAnyPromo'] = running_any_promo.astype('int64')
    df['RunningPromo2'] = running_promo2.astype('int64')

    df = df.drop(
        columns=['Date', 'CompetitionStart', 'PromoStart', 'PromoInterval',
                 'Promo', 'Promo2', 'CompetitionDays', 'DayOfWeek'],
        errors='ignore',
    )
    return df.dropna(how='any', subset=['Open', 'StateHoliday', 'SchoolHoliday', 'CompetitionDistance'])

In [None]:
# Build the XGBoost feature frame from the merged store/train data.
xg_df = xgboost_data_transformation(df)

In [None]:
# Build the tree-model feature frame from the merged store/train data.
tr_df = tree_data_transformation(df)

In [None]:
# Count the missing values per column of the tree-model frame.
tr_df.isnull().sum()

In [None]:
# Show the dtype of every column of the tree-model frame.
tr_df.dtypes

In [None]:
# NOTE(review): neither `enc` nor `X` is defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel. Define both in an
# earlier cell or remove this one.
enc.fit(X)

In [None]:
# TODO: check whether the definition of Running Promo is correct

# df.loc[(df['Month'] == 12) & (df['RunningPromo'] != 0)][['RunningPromo','PromoInterval','Month','Date','Promo','Promo2','PromoStart','PromoDuration','PromoDuration2']]

In [None]:
# Count the missing values per column of the merged frame.
df.isna().sum()

In [None]:
# Show (rows, columns) of the merged frame.
df.shape

In [None]:
# Show the first rows whose DayOfWeek is missing, next to their other date columns.
df.loc[df['DayOfWeek'].isnull(), ['DayOfWeek','Day','Week','Date']].head()

In [None]:
# Show the dtype of every column of the merged frame.
df.dtypes

In [None]:
# Count the missing values per column of the merged frame (isnull is an alias of isna).
df.isnull().sum()

In [None]:
# Show (rows, columns) before the row filter in the next cell.
df.shape

In [None]:
# Keep only the rows in which both Open and StateHoliday are present.
# dropna is used because np.isfinite raises TypeError on the string-typed
# StateHoliday column, and indexing a frame with a boolean DataFrame masks
# individual values to NaN instead of removing rows.
df = df.dropna(subset=['Open', 'StateHoliday'])

In [None]:
# Show (rows, columns) after the row filter in the previous cell.
df.shape

In [None]:
# Display the StateHoliday column of the merged frame.
df['StateHoliday']