In [15]:
import numpy as np
import pandas as pd
import helpers
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [2]:
# Load both CSV files.

store = pd.read_csv('data/store.csv')
# Force StateHoliday to be read as text. The builtin `str` is used because the
# `np.str` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
train = pd.read_csv('data/train.csv', dtype={'StateHoliday': str})

# Merge the two tables on the store id, then let the project helper convert
# the date column(s) to datetime.

df = pd.merge(store, train, on='Store')
df = helpers.date_convert(df)

In [3]:
# English month abbreviations in calendar order (position 0 is January).
# Spelled out instead of derived with strftime('%b') so the lookup does not
# depend on the process locale.
_MONTH_ABBREVIATIONS = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')


def _promo2_running_mask(df):
    """Return a boolean Series: Promo2 == 1 and PromoInterval names the row's Month."""
    interval_names_month = pd.Series(False, index=df.index, dtype=bool)
    for month_number, month_abbr in enumerate(_MONTH_ABBREVIATIONS, start=1):
        # Substring match, so an abbreviation also matches longer spellings
        # that start with it (e.g. 'Sep' matches 'Sept').
        interval_names_month |= (
            df['PromoInterval'].str.contains(month_abbr, na=False)
            & (df['Month'] == month_number)
        )
    return interval_names_month & (df['Promo2'] == 1)


def xgboost_data_transformation(df):
    """Derive competition and promotion features from the merged store/sales frame.

    Columns read: ``Date``, ``Month``, ``Promo``, ``Promo2``, ``PromoInterval``,
    ``Promo2SinceYear``, ``Promo2SinceWeek``, ``CompetitionOpenSinceYear`` and
    ``CompetitionOpenSinceMonth``.

    Columns added:
      * ``PromoDuration``     - days between ``Date`` and the Promo2 start, 0 if unknown.
      * ``CompetitionActive`` - 1 if the competition start is on or before ``Date``, else 0.
      * ``CompetitionDays``   - days between ``Date`` and the competition start (NaN if unknown).
      * ``RunningAnyPromo``   - 1 if ``Promo == 1`` or the Promo2 interval covers the row's month.
      * ``RunningPromo2``     - 1 if ``Promo2 == 1`` and ``PromoInterval`` names the row's month.

    Note: the derived columns (including the intermediate ``CompetitionStart``
    and ``PromoStart``) are written onto the frame that is passed in. The
    returned frame is a new object with ``Date``, ``CompetitionStart`` and
    ``PromoStart`` removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged store/sales data with the columns listed above.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the derived columns, minus the three date columns.
    """
    # --- Competition start: first day of the given year/month -----------------
    has_competition_start = (
        df['CompetitionOpenSinceYear'].notna() & df['CompetitionOpenSinceMonth'].notna()
    )
    # helpers.float_to_int presumably casts the listed columns to integers so the
    # string form is '2010' rather than '2010.0' - TODO confirm against helpers.
    competition_rows = helpers.float_to_int(
        df.loc[has_competition_start, ['CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth']],
        {'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'},
    )
    competition_start_text = (
        competition_rows['CompetitionOpenSinceYear'].astype(str)
        + '-'
        + competition_rows['CompetitionOpenSinceMonth'].astype(str)
        + '-01'
    )
    # Index alignment leaves NaT on rows without a known competition start.
    df['CompetitionStart'] = pd.to_datetime(competition_start_text)

    # --- Promo2 start: built from year + week number by helpers.year_week -----
    has_promo2_start = df['Promo2SinceYear'].notna() & df['Promo2SinceWeek'].notna()
    if has_promo2_start.any():
        promo2_rows = helpers.float_to_int(
            df.loc[has_promo2_start, ['Promo2SinceYear', 'Promo2SinceWeek']],
            {'Promo2SinceYear', 'Promo2SinceWeek'},
        )
        df['PromoStart'] = promo2_rows.apply(
            lambda row: helpers.year_week(row.Promo2SinceYear, row.Promo2SinceWeek),
            axis=1,
        )
    else:
        # A row-wise apply on an empty frame returns a DataFrame, which cannot be
        # assigned to a single column; fall back to an all-NaT column instead.
        df['PromoStart'] = pd.NaT

    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # df[...]: the chained in-place form does not update the frame under pandas
    # Copy-on-Write.
    df['PromoDuration'] = (
        (df['Date'] - df['PromoStart']) / np.timedelta64(1, 'D')
    ).fillna(0)

    # Comparisons against NaT are False, so rows without a start date get 0.
    df['CompetitionActive'] = np.where(df['CompetitionStart'] <= df['Date'], 1, 0)
    df['CompetitionDays'] = (df['Date'] - df['CompetitionStart']) / np.timedelta64(1, 'D')

    # The `Promo == 1` test is parenthesised explicitly. The previous unbracketed
    # `... | df['Promo']==1` was parsed as `(... | df['Promo']) == 1`.
    promo2_running = _promo2_running_mask(df)
    df['RunningAnyPromo'] = (promo2_running | (df['Promo'] == 1)).astype('int64')
    df['RunningPromo2'] = promo2_running.astype('int64')

    return df.drop(columns=['Date', 'CompetitionStart', 'PromoStart'], errors='ignore')

In [4]:
def tree_data_transformation(df):
    """Impute missing values and drop columns the tree model does not use.

    Steps, in order:
      1. NaN in the Promo2/competition "since" columns and in ``Sales`` becomes 0.
      2. NaN in ``Customers`` becomes the column mean; rows with ``Open == 0``
         then get ``Customers = 0``.
      3. ``Date``, ``CompetitionStart``, ``PromoStart``, ``PromoInterval``,
         ``Promo``, ``Promo2``, ``CompetitionDays`` and ``DayOfWeek`` are dropped
         (columns that are absent are ignored).
      4. Rows still missing ``Open``, ``StateHoliday``, ``SchoolHoliday`` or
         ``CompetitionDistance`` are removed.

    Note: steps 1 and 2 write to the frame that is passed in; steps 3 and 4
    produce the new frame that is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named in steps 1, 2 and 4.

    Returns
    -------
    pandas.DataFrame
        The imputed, reduced frame.
    """
    zero_filled_columns = (
        'Promo2SinceYear',
        'Promo2SinceWeek',
        'CompetitionOpenSinceMonth',
        'CompetitionOpenSinceYear',
        'Sales',
    )
    # Assign the filled Series back: `df[col].fillna(0, inplace=True)` operates
    # on a temporary and leaves `df` unchanged under pandas Copy-on-Write.
    for column in zero_filled_columns:
        df[column] = df[column].fillna(0)

    # Mean-impute Customers first, then force closed stores to zero customers.
    df['Customers'] = df['Customers'].fillna(df['Customers'].mean())
    df.loc[df['Open'] == 0, 'Customers'] = 0

    unused_columns = [
        'Date',
        'CompetitionStart',
        'PromoStart',
        'PromoInterval',
        'Promo',
        'Promo2',
        'CompetitionDays',
        'DayOfWeek',
    ]
    df = df.drop(columns=unused_columns, errors='ignore')
    return df.dropna(
        how='any',
        subset=['Open', 'StateHoliday', 'SchoolHoliday', 'CompetitionDistance'],
    )

In [5]:
# One hot encoding
def ohe(features):
    """One-hot encode the object-dtype columns of ``features``.

    Non-object columns are kept as they are and placed first; one indicator
    column per category of every object column is appended after them. Column
    labels of the result are positional integers (0..n-1) and the row index is
    reset to 0..len-1.

    The previous row index is discarded rather than inserted as a column, so it
    cannot end up in the feature matrix.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame with numeric and/or object (string) columns.

    Returns
    -------
    pandas.DataFrame
        Numeric columns followed by the one-hot indicator columns.
    """
    # drop=True: a plain reset_index() would add the old index as a feature column.
    numeric_part = features.select_dtypes(exclude=['object']).reset_index(drop=True)
    categorical_part = features.select_dtypes(include='object')

    if categorical_part.shape[1] == 0:
        # Nothing to encode; OneHotEncoder cannot be fitted on zero columns.
        numeric_part.columns = range(numeric_part.shape[1])
        return numeric_part

    # OneHotEncoder handles string categories directly, so no LabelEncoder pass
    # is needed beforehand.
    encoder = preprocessing.OneHotEncoder(categories='auto')
    onehotlabels = encoder.fit_transform(categorical_part).toarray()

    features_ohe = pd.concat(
        [numeric_part, pd.DataFrame(onehotlabels)],
        axis=1,
        ignore_index=True,
    )
    return features_ohe

In [7]:
xg_df = xgboost_data_transformation(df)
tr_df = tree_data_transformation(xg_df)
# The target column must not be part of the feature matrix: leaving 'Sales' in
# X would let the model read the answer directly (target leakage).
# NOTE(review): 'Customers' is still a feature - confirm it is available at
# prediction time.
X = ohe(tr_df.drop(columns=['Sales']))
y = tr_df['Sales'].values.ravel()

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)