In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

**Disclaimer:**

Originally I planned to learn sales forecasting with the neural network from the link below. However, things did not go well because the Keras [Merge] layer has been replaced by the [concatenate] function. I almost gave up after several attempts, until I found the revised embedding layers in another notebook (link provided).

Sadly, the Keras regression on its own only reaches a public score of 0.11155, compared with 0.10190 for XGBoost and 0.10377 for LightGBM without embedding layers (hopefully the network can be improved by fine-tuning its layers). Applying one-hot encoding to the main DataFrame produces a sparse matrix (especially since there are 1000+ stores), so both XGBoost and LightGBM in fact score worse on it than on the DataFrame without one-hot encoding.

Using the weights of the Keras embedding layers in place of one-hot encoding, which also acts as dimensionality reduction (again, the idea comes from the link below), both XGBoost and LightGBM return significantly better scores. Averaging the predictions of the two models gives only a minimal gain in the public score.

Here are the individual results (note that LightGBM gets a significant boost from the embedding layers while needing only half the training time of XGBoost):<br>
XgBoost - 0.10190 (w/o Embedding), 0.09890 (w Embedding, 2 hours training time)<br>
lightgbm - 0.10377 (w/o Embedding), 0.09761 (w Embedding, 1 hour training time)<br>

*PS: I did not use the full feature set of the original solution; the results should improve with more features. Kindly upvote if you find this useful :)*

**References:**<br>
Original solution & feature engineering (3rd place in leaderboard) [link](https://www.kaggle.com/competitions/rossmann-store-sales/discussion/17974)<br>
Published Article from the author: [link](https://arxiv.org/pdf/1604.06737.pdf)

Keras Weight & Lightgbm: [link](https://www.kaggle.com/code/smksett11/rossmann-entityembedding)

Other reference(Xgboost): [link](https://www.kaggle.com/code/danspace/rossmann-store-sales-xgboost)

# Import Data

## Competition data

In [None]:
# Load the three competition tables: per-store metadata, daily training rows and the rows to forecast.
# parse_dates selects a column by position (index 2 in train.csv, index 3 in test.csv) — presumably
# the 'Date' column that later cells access through .dt; TODO confirm against the CSV headers.
store = pd.read_csv("../input/rossmann-store-sales/store.csv")
train = pd.read_csv("../input/rossmann-store-sales/train.csv",parse_dates=[2])
test = pd.read_csv("../input/rossmann-store-sales/test.csv",parse_dates=[3])

Read this discussion on how to match each store with its state [link](https://www.kaggle.com/competitions/rossmann-store-sales/discussion/17048)

In [None]:
print(store.shape)
store.head()

In [None]:
print(train.shape)
train.head()

In [None]:
#Sales forecast until 2015-09-17
print(test.shape)
test.head()

## External data

Descriptions for external data & other important information [link](https://www.kaggle.com/competitions/rossmann-store-sales/discussion/17229)

In [None]:
import csv
import pickle
from datetime import datetime
import os
import glob

### Google Trend

In [None]:
# Collect the paths of every Google Trends CSV; the files are parsed in the preprocessing section.
csv_location = "../input/rossmann-google-trend/googletrend/"
google_trend_files = glob.glob(csv_location + '/*.csv')

# Preview one of the Google Trends files ("Woche" is German for "week").
pd.read_csv(google_trend_files[0]).head()

### Weather

In [None]:
# Collect the paths of every weather CSV; the files are parsed in the preprocessing section.
csv_location = '../input/weather/weather/'
german_states_weather = glob.glob(csv_location + '/*.csv')

# Preview one of the weather files (they are semicolon-delimited).
pd.read_csv(german_states_weather[0],delimiter=";").head()

### Store State

In [None]:
store_state = pd.read_csv('../input/store-state/store_states.csv')
store_state.head()

# Data Exploration

In [None]:
# check store nan rows
store.isnull().sum()

In [None]:
store.PromoInterval.value_counts()

In [None]:
store['StoreType'].value_counts()

In [None]:
store['Assortment'].value_counts()

In [None]:
train.isnull().sum()

In [None]:
#To confirm no closed store with sales value
train.groupby('Open')['Sales'].sum()

In [None]:
train['StateHoliday'].value_counts()

In [None]:
test.isnull().sum()

In [None]:
# Plot the daily sales of store 1 and store 10 over time (days without sales are left out).
import matplotlib.pyplot as plt

has_sales = train['Sales'] > 0
store_1 = train.loc[(train["Store"] == 1) & has_sales, ['Date', "Sales"]]
store_10 = train.loc[(train["Store"] == 10) & has_sales, ['Date', "Sales"]]

f = plt.figure(figsize=(18, 10))
ax1 = f.add_subplot(211)
ax2 = f.add_subplot(212)

# Both panels share the same layout; only the data and the store number in the title differ.
for axis, store_sales, store_id in ((ax1, store_1, 1), (ax2, store_10, 10)):
    axis.plot(store_sales['Date'], store_sales['Sales'], '-')
    axis.set_xlabel('Time')
    axis.set_ylabel('Sales')
    axis.set_title('Store {} Sales Distribution'.format(store_id))

In [None]:
# check stores open distribution on days of week
import seaborn as sns
sns.countplot(x = 'DayOfWeek', hue = 'Open', data = train)
plt.title('Store Daily Open Countplot')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Compare the distribution of raw Sales with log(Sales + 1) in the train set;
# each panel's legend shows the skewness of the plotted values.
fig = plt.figure(figsize=(12,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
log_sales = np.log1p(train['Sales'])

# sns.distplot is deprecated; histplot with a density-normalised histogram and a KDE overlay
# is its documented replacement. bins=50 and cut=3 approximate distplot's defaults.
hist_options = dict(kde=True, stat='density', bins=50, kde_kws={'cut': 3})

g1 = sns.histplot(train['Sales'], label='skewness:{:.2f}'.format(train['Sales'].skew()), ax=ax1, **hist_options)
g1.legend()
g1.set(xlabel = 'Sales', ylabel = 'Density', title = 'Sales Distribution')
g2 = sns.histplot(log_sales, label='skewness:{:.2f}'.format(log_sales.skew()), ax=ax2, **hist_options)
g2.legend()
g2.set(xlabel = 'log(Sales+1)',ylabel = 'Density', title = 'log(Sales+1) Distribution')
plt.show()

# Data Preprocessing

## Competition Data

In [None]:
def process_train_test(pd):
    """Fill missing ``Open`` flags and add calendar features to a frame, in place.

    Parameters
    ----------
    pd : DataFrame
        Frame with an ``Open`` column and a datetime ``Date`` column. The
        parameter keeps its historical name for backward compatibility; inside
        this function it is the DataFrame, NOT the pandas module.

    Returns
    -------
    None
        The frame is modified in place: ``Open`` has its missing values set to
        0, and the integer columns ``week_of_year`` (ISO week number), ``year``,
        ``month`` and ``day`` are added.
    """
    frame = pd
    # Missing Open values only occur in the test dataset (see the competition discussion thread).
    # Assign the result back: a chained fillna(inplace=True) on a column does not
    # reliably update the parent frame (and never does under copy-on-write).
    frame['Open'] = frame['Open'].fillna(value=0)

    dates = frame['Date'].dt
    # Series.dt.week was removed from pandas; isocalendar().week is the replacement.
    # It returns a nullable UInt32 column, so cast back to the int64 dtype .dt.week used to give.
    frame['week_of_year'] = dates.isocalendar().week.astype('int64')
    frame['year'] = dates.year
    frame['month'] = dates.month
    frame['day'] = dates.day

In [None]:
process_train_test(train)
print(train.isnull().sum().sum())
train.head()

In [None]:
process_train_test(test)
print(test.isnull().sum().sum())
test.head()

In [None]:
# Keep only the first letter of PromoInterval (it is mapped to an integer code later),
# then replace every remaining missing value in the store table with 0.
store['PromoInterval'] = store['PromoInterval'].str.get(0)
store.fillna(value=0, inplace=True)
print(store.isna().sum().sum())

In [None]:
store.head()

In [None]:
train = train.merge(store,how='left',on='Store')
print(train.shape)
print("train missing value ",train.isnull().sum().sum())
train.head()

In [None]:
test = test.merge(store,how='left',on='Store')
print(test.shape)
print("test missing value ",test.isnull().sum().sum())
test.head()

## External Data

### Store State

In [None]:
train = train.merge(store_state,how='left',on='Store')
print(train.shape)
print("train missing value ",train.isnull().sum().sum())
train.head()

In [None]:
test = test.merge(store_state,how='left',on='Store')
print(test.shape)
print("test missing value ",test.isnull().sum().sum())
test.head()

### Google Trend

Abbrev references for each state [link](https://kb.bullseyelocations.com/article/58-germany-state-codes)

In [None]:
# google_trend = {}

# for one_state in google_trend_files:
#     state_code = os.path.splitext(os.path.basename(one_state))[0][-2:]
#     #Extract last two string, i.e BE
#     if state_code == 'NI':
#         state_code = 'HB,NI'
#     print(state_code)
#     with open(one_state, 'r') as csvfile:
#         trends = csv.reader(csvfile, delimiter=',')
#         for row, trend in enumerate(trends):
#             #first 0 is header, therefore it will be skipped for processing
#             if row == 0:
#                     continue
#             # The sata is represented from Sunday till Saturday - take Saturday and check the week number
#             trend_value = int(trend[1])
#             end_day_of_range = trend[0].split(' - ')[1]
#             dt = datetime.strptime(end_day_of_range, '%Y-%m-%d')
#             year = dt.year
#             month = dt.month
#             day = dt.day
#             week_of_year = dt.isocalendar()[1]

#             key = (state_code, year, week_of_year)
#             google_trend[key] = trend_value / 100

# [(k,v) for k,v in google_trend.items()][:5]

In [None]:
def _load_google_trend(csv_path, value_name):
    """Read one Google Trends CSV and return its weekly values keyed by year and week.

    Parameters
    ----------
    csv_path : str
        Path of a trend file with a 'Woche' column (a date range whose last
        whitespace-separated token is the week's end day, formatted YYYY-MM-DD)
        and a 'Dez. 2012 - Sep. 2015' column holding the trend value.
    value_name : str
        Name of the output column that receives the trend value divided by 100.

    Returns
    -------
    DataFrame
        Columns ``year``, ``week_of_year`` (ISO week of the end day) and
        ``value_name``, one row per week in the file.
    """
    trend = pd.read_csv(csv_path)
    # `format=` is the argument that takes a strptime pattern;
    # infer_datetime_format is a boolean flag and silently ignored the pattern.
    end_day_of_range = pd.to_datetime(trend['Woche'].str.split().str[-1].str.strip(), format='%Y-%m-%d')
    return pd.DataFrame({
        'year': end_day_of_range.dt.year,
        # Series.dt.week was removed from pandas; keep the int64 dtype it used to return.
        'week_of_year': end_day_of_range.dt.isocalendar().week.astype('int64'),
        value_name: trend['Dez. 2012 - Sep. 2015'] / 100,
    })


list_df = []
DE_trend = None

for one_state in google_trend_files:
    # The last two characters of the file name are the state code, e.g. BE
    state_code = os.path.splitext(os.path.basename(one_state))[0][-2:]
    if state_code == 'NI':
        state_code = 'HB,NI'

    if state_code == 'DE':
        # DE is the Germany-wide trend: it is kept separate (no State column) so it can be
        # merged onto every state by (year, week_of_year) without duplicating rows.
        DE_trend = _load_google_trend(one_state, 'DE_trend')
    else:
        df = _load_google_trend(one_state, 'state_trend')
        df.insert(0, 'State', state_code)
        list_df.append(df)

if DE_trend is None:
    # Fail here with a clear message instead of a NameError in the merge cell.
    raise FileNotFoundError("No Germany-wide (DE) Google Trends file found in google_trend_files")

state_trend = pd.concat(list_df, ignore_index=True)

In [None]:
train = train.merge(state_trend,how='left',on=['State','year','week_of_year'])
train = train.merge(DE_trend,how='left',on=['year','week_of_year'])
print(train.shape)
print("train missing value ",train.isnull().sum().sum())
train.head()

In [None]:
test = test.merge(state_trend,how='left',on=['State','year','week_of_year'])
test = test.merge(DE_trend,how='left',on=['year','week_of_year'])
print(test.shape)
print("test missing value ",test.isnull().sum().sum())
test.head()

### Weather

In [None]:
# Every weather event label found in the data; a label's position in this list is its
# integer code, and the empty label (no event) is code 0.
event_list = [
    '', 'Fog-Rain', 'Fog-Snow', 'Fog-Thunderstorm', 'Rain-Snow-Hail-Thunderstorm',
    'Rain-Snow', 'Rain-Snow-Hail', 'Fog-Rain-Hail', 'Fog', 'Fog-Rain-Hail-Thunderstorm',
    'Fog-Snow-Hail', 'Rain-Hail', 'Rain-Hail-Thunderstorm', 'Fog-Rain-Snow',
    'Rain-Thunderstorm', 'Fog-Rain-Snow-Hail', 'Rain', 'Thunderstorm', 'Snow-Hail',
    'Rain-Snow-Thunderstorm', 'Snow', 'Fog-Rain-Thunderstorm',
]
event_list_map = {event: code for code, event in enumerate(event_list)}
# Show the first few entries to confirm the mapping
list(event_list_map.items())[:3]

In [None]:
def states_names_to_abbreviation(state_name):
    """Return the state code used in this notebook for a weather-file state name.

    Parameters
    ----------
    state_name : str
        State name as spelled in the weather file names, e.g. 'Bayern'.

    Returns
    -------
    str
        The matching code, e.g. 'BY'. Niedersachsen maps to the combined
        code 'HB,NI'.

    Raises
    ------
    KeyError
        If ``state_name`` is not one of the known names.
    """
    abbreviation_by_name = {
        'BadenWuerttemberg': 'BW',
        'Bayern': 'BY',
        'Berlin': 'BE',
        'Brandenburg': 'BB',  # does not exist in store_state
        'Bremen': 'HB',  # we use Niedersachsen instead of Bremen
        'Hamburg': 'HH',
        'Hessen': 'HE',
        'MecklenburgVorpommern': 'MV',  # does not exist in store_state
        'Niedersachsen': 'HB,NI',  # we use Niedersachsen instead of Bremen
        'NordrheinWestfalen': 'NW',
        'RheinlandPfalz': 'RP',
        'Saarland': 'SL',
        'Sachsen': 'SN',
        'SachsenAnhalt': 'ST',
        'SchleswigHolstein': 'SH',
        'Thueringen': 'TH',
    }
    return abbreviation_by_name[state_name]

In [None]:
#Original code
# weather = {}
# events = []
# for one_state in german_states_weather:
#     state_name = os.path.splitext(os.path.basename(one_state))[0]
#     state_code = states_names_to_abbreviation(state_name)
#     with open(one_state, 'r') as csvfile:
#         daily_weather = csv.reader(csvfile, delimiter=';')
#         for row_index, one_day in enumerate(daily_weather):
#             if row_index == 0:
#                 continue
#             date = one_day[0]
#             key = (state_code, date)
#             temperature = [int(one_day[1]), int(one_day[2]), int(one_day[3])]
#             temperature = [(x - 10) / 30 for x in temperature]  # normalize
#             humidity = [int(one_day[7]), int(one_day[8]), int(one_day[9])]
#             humidity = [(x - 50) / 50 for x in humidity]  # normalize
#             wind = [int(one_day[16]) / 50, int(one_day[17]) / 30]
#             if one_day[20] == 'NA':
#                 cloud = [0]
#             else:
#                 cloud = [int(one_day[20])]
#             event = [event2int(one_day[21])]
#             weather[key] = temperature + humidity + wind + cloud + event
#             events.append(one_day[21])
# [(k,v) for k,v in weather.items()][:3]

In [None]:
# Read every state's weather file, normalise the numeric columns with fixed constants,
# encode the event label as an integer and stack everything into one frame.
w_list = []
for one_state in german_states_weather:
    state_name = os.path.splitext(os.path.basename(one_state))[0]
    state_code = states_names_to_abbreviation(state_name)
    weather = pd.read_csv(one_state, delimiter=";", parse_dates=['Date'])
    weather['State'] = state_code

    for temp in ['Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC']:
        weather[temp] = (weather[temp] - 10) / 30

    for humi in ['Max_Humidity','Mean_Humidity', 'Min_Humidity']:
        weather[humi] = (weather[humi] - 50) / 50

    weather['Max_Wind_SpeedKm_h'] = weather['Max_Wind_SpeedKm_h'] / 50
    weather['Mean_Wind_SpeedKm_h'] = weather['Mean_Wind_SpeedKm_h'] / 30

    # Assign the filled column back: a chained fillna(inplace=True) on a column does not
    # reliably update the parent frame (and never does under copy-on-write).
    weather['CloudCover'] = weather['CloudCover'].fillna(0)
    # Labels missing from event_list_map (including a blank/missing event) become code 0,
    # which is the code of the empty label.
    weather['Events'] = weather['Events'].map(event_list_map).fillna(0)

    weather = weather[['Date','State','Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC','Max_Humidity','Mean_Humidity', 'Min_Humidity',
                       'Max_Wind_SpeedKm_h','Mean_Wind_SpeedKm_h','CloudCover','Events']]
    w_list.append(weather)

weather_all = pd.concat(w_list, ignore_index=True)

In [None]:
print(weather_all.isnull().sum().sum())
weather_all.head()

In [None]:
train = train.merge(weather_all,how='left',left_on=['State','Date'],right_on=['State','Date'])
print(train.shape)
print("train missing value ",train.isnull().sum().sum())
train.head()

In [None]:
test = test.merge(weather_all,how='left',left_on=['State','Date'],right_on=['State','Date'])
print(test.shape)
print("test missing value ",test.isnull().sum().sum())
test.head()

# Feature Engineering

## Competition Data

In [None]:
# Encoder shared by the StateHoliday, StoreType and Assortment columns
def abc2int(pd):
    """Map the letter codes '0', 'a', 'b', 'c', 'd' of a Series to 0..4.

    ``pd`` is the Series to encode (the name shadows the pandas module inside
    this function). Values that are not one of the five string keys come back
    as NaN.
    """
    letter_codes = {
        '0': 0,
        'a': 1,
        'b': 2,
        'c': 3,
        'd': 4,
    }
    encoded = pd.map(letter_codes)
    return encoded

In [None]:
for col in ['StateHoliday','StoreType','Assortment']:
    train[col] = abc2int(train[col]).fillna(0)
    test[col] = abc2int(test[col]).fillna(0)

In [None]:
# Encoder for the external store-state data
def state2int(pd):
    """Map the state codes of a Series to the integers 0..11.

    ``pd`` is the Series to encode (the name shadows the pandas module inside
    this function). Codes that are not listed below come back as NaN.
    """
    state_codes = {
        'HB,NI': 0,
        'HH': 1,
        'TH': 2,
        'RP': 3,
        'ST': 4,
        'BW': 5,
        'SN': 6,
        'BE': 7,
        'HE': 8,
        'SH': 9,
        'BY': 10,
        'NW': 11,
    }
    encoded = pd.map(state_codes)
    return encoded

In [None]:
train['State'] = state2int(train['State'])
test['State'] = state2int(test['State'])

In [None]:
# Convert the promo interval to an integer, using its first letter (J, F or M) as the key
def PromoInterval2int(pd):
    """Map the first-letter promo interval codes of a Series to 0..3.

    ``pd`` is the Series to encode (the name shadows the pandas module inside
    this function). Values that are not one of the four string keys come back
    as NaN.
    """
    interval_codes = {
        '0': 0,
        'J': 1,
        'F': 2,
        'M': 3,
    }
    encoded = pd.map(interval_codes)
    return encoded

In [None]:
train['PromoInterval'] = PromoInterval2int(train['PromoInterval']).fillna(0)
test['PromoInterval'] = PromoInterval2int(test['PromoInterval']).fillna(0)

In [None]:
# A CompetitionOpenSinceYear of 0 marks missing data
def hasCompetitionmonths(pd):
    """Return, per row, the number of 30-day months a competitor has been open, capped at 24.

    Parameters
    ----------
    pd : DataFrame
        Frame with ``Date`` (datetime), ``CompetitionOpenSinceMonth`` and
        ``CompetitionOpenSinceYear`` columns. The parameter keeps its historical
        name; inside this function it is the DataFrame, NOT the pandas module.

    Returns
    -------
    list of int
        One value per row: 0 when the opening year is 0 (missing) or the
        competitor opens after ``Date``; otherwise the whole number of 30-day
        periods between the 15th of the opening month and ``Date``, at most 24.
    """
    frame = pd
    result = []
    # Iterate over the columns directly rather than with iterrows() and positional
    # row[0]/row[1]/row[2]: integer keys on a label-indexed Series are deprecated
    # in pandas, and iterrows() builds a Series per row, which is slow on ~1M rows.
    rows = zip(frame.index,
               frame['Date'],
               frame['CompetitionOpenSinceMonth'],
               frame['CompetitionOpenSinceYear'])
    for index, date, open_month, open_year in rows:
        if index % 100000 == 0:
            print("processing row ",index)
        if open_year == 0:
            months_since_competition = 0
        else:
            # Only month and year are known, so the 15th is used as the opening day.
            dt_competition_open = datetime(year=int(open_year),
                                           month=int(open_month),
                                           day=15)
            # // keeps the result an integer; a competitor opening in the future counts as 0.
            months_since_competition = max((date - dt_competition_open).days // 30, 0)
        # Cap at 24 months
        result.append(min(months_since_competition, 24))
    return result

In [None]:
train['hasCompetitionmonths'] = hasCompetitionmonths(train)
print("Complete processing train dataset!")
test['hasCompetitionmonths'] = hasCompetitionmonths(test)
print("Complete processing test dataset!")

In [None]:
#Original code
# def hasPromo2weeks(date, Promo2SinceYear, Promo2SinceWeek):
#     if Promo2SinceYear == 0:
#         return 0
#     start_promo2 = Week(Promo2SinceYear, Promo2SinceWeek).monday()
#     weeks_since_promo2 = (date.date() - start_promo2).days // 7
#     if weeks_since_promo2 < 0:
#         return 0
#     return min(weeks_since_promo2, 25)

In [None]:
from isoweek import Week

# The result is capped at 25 weeks (about half a year)
def hasPromo2weeks(pd):
    """Return, per row, the number of whole weeks since the store joined Promo2, capped at 25.

    Parameters
    ----------
    pd : DataFrame
        Frame with ``Date`` (datetime), ``Promo2SinceWeek`` and
        ``Promo2SinceYear`` columns. The parameter keeps its historical name;
        inside this function it is the DataFrame, NOT the pandas module.

    Returns
    -------
    list of int
        One value per row: 0 when the start year is 0 or Promo2 starts after
        ``Date``; otherwise the whole weeks between the Monday of the start
        ISO week and ``Date``, at most 25.
    """
    frame = pd
    result = []
    # Iterate over the columns directly rather than with iterrows() and positional
    # row[0]/row[1]/row[2]: integer keys on a label-indexed Series are deprecated
    # in pandas, and iterrows() builds a Series per row, which is slow on ~1M rows.
    rows = zip(frame.index,
               frame['Date'],
               frame['Promo2SinceWeek'],
               frame['Promo2SinceYear'])
    for index, date, since_week, since_year in rows:
        if index % 100000 == 0:
            print("processing row ",index)
        if since_year == 0:
            weeks_since_promo2 = 0
        else:
            # isoweek returns the date of the Monday of that ISO week, e.g. 2010-03-29
            start_promo2 = Week(int(since_year), int(since_week)).monday()
            # A Promo2 start that lies after `date` gives a negative difference, clamped to 0.
            # TODO(author's open question): should a value other than 0 be used here? A 0 also
            # forces latest_promo2_months to 0 for the same row (see row 1017204).
            weeks_since_promo2 = max((date.date() - start_promo2).days // 7, 0)
        result.append(min(weeks_since_promo2, 25))
    return result

In [None]:
train['hasPromo2weeks'] = hasPromo2weeks(train)
print("Complete processing train dataset!")
test['hasPromo2weeks'] = hasPromo2weeks(test)
print("Complete processing test dataset!")

In [None]:
#Original code, return 0 when haspromo2week or promointerval is 0
#For example if the store promotion interval is Mar,Jun,Sep,Dec and date is Jan-2013, then it will return 1 (Jan minus Dec)
#For example if the store promotion interval is Jan,Apr,Jul,Oct and date is Jan-2013, then it will return 0 (Jan minus Jan)
# def latest_promo2_months(date, promointerval, Promo2SinceYear, Promo2SinceWeek):
#     promo2int = promointerval
#     if date.month < promo2int:
#         latest_promo2_start_year = date.year - 1
#         latest_promo2_start_month = promo2int + 12 - 3
#     else:
#         latest_promo2_start_year = date.year
#         latest_promo2_start_month = ((date.month - promo2int) // 3) * 3 + promo2int

#     latest_promo2_start_day = datetime(year=latest_promo2_start_year,
#                                        month=latest_promo2_start_month,
#                                        day=1)
#     weeks_since_latest_promo2 = (date - latest_promo2_start_day).days // 30
#     return weeks_since_latest_promo2

# num_row =2
# latest_promo2_months(train['Date'][num_row],int(train['PromoInterval'][num_row]),train['Promo2SinceYear'][num_row],train['Promo2SinceWeek'][num_row])

In [None]:
def latest_promo2_months(pd):
    """Return, per row, the 30-day months elapsed since the latest Promo2 round started.

    Parameters
    ----------
    pd : DataFrame
        Frame with ``Date`` (datetime), ``hasPromo2weeks`` and ``PromoInterval``
        columns, where ``PromoInterval`` is the integer code of the first promo
        month (0 meaning no interval). The parameter keeps its historical name;
        inside this function it is the DataFrame, NOT the pandas module.

    Returns
    -------
    list of int
        One value per row: 0 when ``hasPromo2weeks`` or ``PromoInterval`` is 0;
        otherwise the whole number of 30-day periods between the first day of
        the most recent promo month (promo months repeat every 3 months
        starting at ``PromoInterval``) and ``Date``.
    """
    frame = pd
    result = []
    # Iterate over the columns directly rather than with iterrows() and positional
    # row[0]/row[1]/row[2]: integer keys on a label-indexed Series are deprecated
    # in pandas, and iterrows() builds a Series per row, which is slow on ~1M rows.
    rows = zip(frame.index,
               frame['Date'],
               frame['hasPromo2weeks'],
               frame['PromoInterval'])
    for index, date, promo2_weeks, first_promo_month in rows:
        if index % 100000 == 0:
            print("processing row ",index)

        if promo2_weeks == 0 or first_promo_month == 0:
            months_since_latest_promo2 = 0
        else:
            if date.month < first_promo_month:
                # Before this year's first round: the latest round started in the
                # last promo month of the previous year.
                latest_promo2_start_year = date.year - 1
                latest_promo2_start_month = first_promo_month + 12 - 3
            else:
                latest_promo2_start_year = date.year
                latest_promo2_start_month = ((date.month - first_promo_month) // 3) * 3 + first_promo_month

            latest_promo2_start_day = datetime(year=int(latest_promo2_start_year),
                                               month=int(latest_promo2_start_month),
                                               day=1)
            months_since_latest_promo2 = (date - latest_promo2_start_day).days // 30
        result.append(months_since_latest_promo2)
    return result

In [None]:
train['latest_promo2_months'] = latest_promo2_months(train)
print("Complete processing train dataset!")
test['latest_promo2_months'] = latest_promo2_months(test)
print("Complete processing test dataset!")

In [None]:
#row 1017204 look like incorrect result
display(train[['Date','Promo2SinceWeek','Promo2SinceYear','PromoInterval','hasPromo2weeks','latest_promo2_months']].head())
train[['Date','Promo2SinceWeek','Promo2SinceYear','PromoInterval','hasPromo2weeks','latest_promo2_months']].tail()

In [None]:
def year2int(x):
    """Turn a competition opening year into a small integer code.

    Any year before 2000 (including the 0 used for missing data) becomes 1;
    a later year becomes ``x - 1998`` (2000 -> 2, 2001 -> 3, ...).
    """
    if x < 2000:
        return 1
    return x - 1998

train['CompetitionOpenSinceYear'] = train['CompetitionOpenSinceYear'].apply(year2int)
test['CompetitionOpenSinceYear'] = test['CompetitionOpenSinceYear'].apply(year2int)

In [None]:
# Shift the year columns to small non-negative codes: `year` counts from 2013 and
# Promo2SinceYear counts from 2008, with anything below 2008 (including the 0 used
# for missing data) collapsed to 0.
for frame in (train, test):
    frame['year'] = frame['year'] - 2013
    promo2_offset = frame['Promo2SinceYear'] - 2008
    frame['Promo2SinceYear'] = promo2_offset.apply(lambda x: 0 if x < 0 else x)

In [None]:
# Columns that are fed to an Embedding layer in the neural network later; they must be integers
int_col = [
    'Store', 'DayOfWeek', 'year', 'month', 'day', 'StateHoliday',
    'hasCompetitionmonths', 'hasPromo2weeks', 'latest_promo2_months', 'StoreType',
    'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'week_of_year', 'Events',
]

for frame in (train, test):
    frame[int_col] = frame[int_col].astype('int64')

In [None]:
def int_try_execpt(x):
    """Convert ``x`` with ``int()``, falling back to 0 when that raises ValueError.

    Other exception types raised by ``int()`` are not caught.
    """
    try:
        converted = int(x)
    except ValueError:
        converted = 0
    return converted

In [None]:
# Truncate the distance to an integer (0 when the conversion fails), then scale it as
# log(distance + 1) / 10; the +1 avoids log(0), which would be -infinity.
for frame in (train, test):
    distance_int = np.array(frame['CompetitionDistance'].apply(int_try_execpt))
    frame['CompetitionDistance'] = np.log(distance_int + 1) / 10

In [None]:
print(train.shape)
print(train.isnull().sum().sum())
train.head()

In [None]:
print(test.shape)
print(test.isnull().sum().sum())
test.head()

In [None]:
#Some of the shops is open but doesnot have revenue during the day...
train_inds = np.where((train['Open'] ==1) & (train['Sales'] >0) )[0]
len(train_inds)

In [None]:
# Build the training features and target from the rows selected by train_inds.
# drop() and .iloc[array] already return new objects, so no explicit copy of the full
# frame is needed. Both halves are selected positionally so they always refer to the same rows.
x_train = train.drop(columns = ['Sales','Date','Customers']).iloc[train_inds,:]
y_train = train['Sales'].iloc[train_inds]
x_train.shape, y_train.shape

In [None]:
# one_hot = ['DayOfWeek','day'\
# ,'StoreType','Assortment','PromoInterval','CompetitionOpenSinceYear','Promo2SinceYear','State']

# x_train = pd.get_dummies(data=x_train, columns=one_hot,drop_first=True)
# x_train.shape

In [None]:
#Sub dataset for recent model
# sub_inds = train['month'].isin([5,6,7,8,9]) & train['Sales'] > 0
# x_train_sub, y_train_sub = train.copy().drop(columns = ['Sales','Date','Customers']).loc[sub_inds,:], train.copy().loc[sub_inds,'Sales']
# x_train_sub.shape, y_train_sub.shape

In [None]:
print(np.isinf(np.log(y_train)).sum())

In [None]:
test_inds = np.where(test['Open'] ==1)[0]
test_inds0 = np.where(test['Open'] ==0)[0]
print("Test open stores: ",len(test_inds))
print("Test closed stores: ",len(test_inds0))

In [None]:
x_test = test.copy().drop(columns = ['Date','Id']).iloc[test_inds,:]
x_test.shape

In [None]:
# x_test = pd.get_dummies(data=x_test, columns=one_hot,drop_first=True)
# x_test.shape

# Model

## NN with Embedding Layers

Concept about Embedding layer [link](https://medium.com/analytics-vidhya/understanding-embedding-layer-in-keras-bbe3ff1327ce)

Key concept: Word embeddings can be thought of as an alternative to one-hot encoding that also provides dimensionality reduction.

Loss Function: https://towardsdatascience.com/deep-learning-which-loss-and-activation-functions-should-i-use-ac02f1c56aa8

https://towardsdatascience.com/7-popular-activation-functions-you-should-know-in-deep-learning-and-how-to-use-them-with-keras-and-27b4d838dfe6

In [None]:
embed_list = ['Store', 'DayOfWeek', 'month', 'day', 'week_of_year']

# These columns are shifted down by one so they can be used as zero-based embedding
# indices; without the shift the fit step raises an error.
# Note: running this cell twice shifts the columns twice.
for frame in (x_train, x_test):
    frame[embed_list] = frame[embed_list] - 1

In [None]:
# One entry per model input, in the order the inputs are declared on the model.
# A nested list groups several columns into a single multi-column input.
col_list = ['Store','DayOfWeek','Promo','year','month','day','StateHoliday','SchoolHoliday','hasCompetitionmonths','hasPromo2weeks',
            'latest_promo2_months','CompetitionDistance','StoreType','Assortment','PromoInterval','CompetitionOpenSinceYear','Promo2SinceYear',
            'State','week_of_year',['Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC'],['Max_Humidity', 'Mean_Humidity', 'Min_Humidity'],
            ['Max_Wind_SpeedKm_h','Mean_Wind_SpeedKm_h'],'CloudCover','Events','DE_trend','state_trend']


# The loop variable is deliberately not called `list`: that name would replace the
# builtin list() for every later cell of the notebook.
X_list = [np.array(x_train[columns]) for columns in col_list]

In [None]:
# Same per-input arrays as X_list, built from the test features.
# The loop variable is deliberately not called `list`, to avoid replacing the builtin.
Xtest_list = [np.array(x_test[columns]) for columns in col_list]

In [None]:
class EntitiyEmbedding:
    """Incrementally built Keras model with one input per feature.

    Categorical features are declared with :meth:`add` (an Embedding layer),
    numeric features with :meth:`dense` (a Dense layer). :meth:`concatenate`
    joins all per-feature outputs, stacks the hidden layers and compiles the
    model. The order of the ``add``/``dense`` calls defines the order of the
    model inputs, so the data passed to :meth:`fit` / :meth:`predict` must be a
    list of arrays in that same order.
    """

    def __init__(self):
        self.input_model = []   # Keras Input tensors, one per declared feature
        self.output_model = []  # per-feature output tensors fed to Concatenate
        self.features = []      # names of every declared feature (embedding and dense)
        self.embeddings = []    # names of the features declared through add()
        self.model = None       # compiled Keras model, set by concatenate()
        self.X_test = None      # optional default input for predict()

    def add(self, feature, input_shape, output_shape):
        """Declare a categorical feature backed by an Embedding layer.

        ``input_shape`` is the first Embedding argument (number of distinct
        integer codes) and ``output_shape`` the second (embedding width). The
        layer is named ``<feature>_out`` so its weights can be read back later.
        """
        self.features.append(feature)
        self.embeddings.append(feature)
        input_model = Input(shape=(1,),name=(feature + '_input'))
        output_model = Embedding(input_shape, output_shape,name=(feature + '_out'))(input_model)
        # Reshape the Embedding output to a flat vector of width output_shape
        output_model = Reshape(target_shape=(output_shape,))(output_model)
        self.input_model.append(input_model)
        self.output_model.append(output_model)

    def dense(self, feature, output_shape):
        """Declare a numeric feature of width ``output_shape`` passed through a Dense layer of the same width."""
        self.features.append(feature)
        input_model = Input(shape=(output_shape,),name=(feature + '_input'))
        output_model = Dense(output_shape,name=(feature + '_out'))(input_model)
        self.input_model.append(input_model)
        self.output_model.append(output_model)

    def concatenate(self):
        """Join all declared features, add the hidden layers and compile the model.

        The network is Concatenate -> Dense(1000, relu) -> Dense(500, relu) ->
        Dense(1, sigmoid), trained with mean absolute error and Adam. The
        sigmoid output means targets are expected to lie in [0, 1].
        """
        output_model = Concatenate()(self.output_model)
        output_model = Dense(1000, kernel_initializer="uniform")(output_model)
        output_model = Activation('relu')(output_model)
        output_model = Dense(500, kernel_initializer="uniform")(output_model)
        output_model = Activation('relu')(output_model)
        output_model = Dense(1)(output_model)
        output_model = Activation('sigmoid')(output_model)
        self.model = KerasModel(inputs=self.input_model, outputs=output_model)
        self.model.compile(loss='mean_absolute_error', optimizer='adam')

    def fit(self, X_train, y_train, epochs=12, batch_size=128):
        """Train the compiled model; ``X_train`` is a list of arrays in declaration order."""
        self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

    def predict(self, X=None):
        """Return the model's predictions for ``X``.

        When ``X`` is None the instance attribute ``X_test`` is used instead.

        Raises
        ------
        ValueError
            If ``X`` is None and ``X_test`` has not been set (previously this
            failed with an AttributeError on the undefined attribute).
        """
        if X is None:
            X = getattr(self, 'X_test', None)
        if X is None:
            raise ValueError("predict() needs X, or self.X_test to be set beforehand")
        pred = self.model.predict(X)
        return pred

    def summary(self):
        """Print the Keras summary of the compiled model."""
        self.model.summary()

    def get_weight(self):
        """Return the first weight matrix of every ``<feature>_out`` layer.

        Returns
        -------
        dict
            ``feature -> DataFrame``; columns are named ``<feature>_<i>`` and
            the index is named after the feature. For an embedding feature,
            row ``k`` is the learned vector of integer code ``k``.

        NOTE(review): this iterates over ``self.features``, so the Dense
        features are included too (their matrix is the Dense kernel);
        ``self.embeddings`` would restrict it to embeddings only. Kept as is so
        the returned keys do not change.
        """
        weights = {}
        for feature in self.features:
            w = self.model.get_layer(feature + '_out').get_weights()[0]
            columns = [feature + '_' + str(i) for i in range(w.shape[1])]
            w = pd.DataFrame(w, columns=columns)
            w.index.names = [feature]
            weights[feature] = w
        return weights

In [None]:
from keras.models import Model as KerasModel
from keras.layers import Input, Dense, Activation, Reshape, Dropout
from keras.layers import Concatenate
from keras.layers.embeddings import Embedding

In [None]:
# Declare one model input per entry of col_list, in the same order: fit() is later given
# X_list, which was built by iterating col_list, so the two orders must stay in sync.
# add()   -> categorical feature: Embedding(input_shape, output_shape), where input_shape
#            must exceed the largest integer code of the feature.
# dense() -> numeric feature(s): Dense layer whose width equals the number of input columns.
model = EntitiyEmbedding()
model.add('Store', input_shape=1115, output_shape=10)
model.add('DayOfWeek', input_shape=7, output_shape=6)
model.dense('Promo', output_shape=1)
model.add('year', input_shape=3, output_shape=2)
model.add('month', input_shape=12, output_shape=6)
model.add('day', input_shape=31, output_shape=10)
model.add('StateHoliday', input_shape=4, output_shape=3)
model.dense('SchoolHoliday', output_shape=1)
model.add('hasCompetitionmonths', input_shape=25, output_shape=2)
model.add('hasPromo2weeks', input_shape=26, output_shape=1)
model.add('latest_promo2_months', input_shape=4, output_shape=1)
model.dense('CompetitionDistance', output_shape=1)
model.add('StoreType', input_shape=5, output_shape=2)
model.add('Assortment', input_shape=4, output_shape=3)
model.add('PromoInterval', input_shape=4, output_shape=3)
model.add('CompetitionOpenSinceYear', input_shape=18, output_shape=4)
model.add('Promo2SinceYear', input_shape=8, output_shape=4)
model.add('State',input_shape=12, output_shape=6)
model.add('week_of_year', input_shape=53, output_shape=10)
# The next three inputs take the grouped column lists of col_list (3, 3 and 2 columns).
model.dense('temperature', output_shape=3)
model.dense('humidity', output_shape=3)
model.dense('windspeed', output_shape=2)
model.dense('CloudCover',  output_shape=1)
model.add('Events', input_shape=22, output_shape=4)
model.dense('DE_trend', output_shape=1)
model.dense('state_trend', output_shape=1)
# Join all inputs, add the hidden layers and compile.
model.concatenate()

In [None]:
model.summary()

In [None]:
from sklearn.model_selection import train_test_split
# X_train, X_ee, y_train, y_ee = train_test_split(X, y, test_size=200000, random_state=44)

In [None]:
# Train for 10 epochs (10 or 12 were tried). The target is log(Sales) divided by its
# maximum, so it lies within the range of the model's sigmoid output.
log_sales = np.log(np.array(y_train))
cmax = np.max(log_sales)
model.fit(X_list, log_sales / cmax, epochs=10)

In [None]:
we = model.get_weight()

In [None]:
emb_list = ['Store','DayOfWeek','year','month','day','StateHoliday','hasCompetitionmonths','hasPromo2weeks',\
'latest_promo2_months','StoreType','Assortment','PromoInterval','CompetitionOpenSinceYear','Promo2SinceYear',\
'State','week_of_year','Events']

In [None]:
# we['Store'].reset_index()

In [None]:
# Replace high-cardinality categorical codes with their learned embedding vectors.
# we[col] is indexed by embedding row number and reset_index() exposes that number as a
# column named `col`, so the join key is the integer code stored in x_train[col] / x_test[col]
# (the zero-based code for the columns that were shifted by one before training).
# NOTE(review): DataFrame.merge returns a frame with a fresh 0..n-1 index, so after this cell
# x_train no longer shares index labels with y_train; later cells must pair them positionally.

for col in emb_list:
    x_train = x_train.merge(we[col].reset_index(),how='left',on=[col])
    x_test = x_test.merge(we[col].reset_index(),how='left',on=[col])
    
x_train.head()


## Xgboost

In [None]:
# define eval metrics
def rmspe(y, yhat):
    """Root mean square percentage error of ``yhat`` against the true values ``y``.

    Rows whose true value is 0 are left out, because their percentage error is
    undefined (dividing by them yields inf/nan). Inputs are converted to float
    arrays and compared position by position.

    Returns
    -------
    float
        ``sqrt(mean((yhat / y - 1) ** 2))`` over the rows with ``y != 0``, or
        0.0 when no such row exists.
    """
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    scored = y != 0
    if not scored.any():
        return 0.0
    return np.sqrt(np.mean((yhat[scored] / y[scored] - 1) ** 2))

def rmspe_xg(yhat, y):
    """XGBoost custom evaluation function returning ``("rmspe", value)``.

    ``yhat`` holds predictions and ``y`` is an object whose ``get_label()``
    returns the targets; both are on the log1p scale and are converted back
    with expm1 before scoring.
    """
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)

In [None]:
# X_train1, X_val, y_train1, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [None]:
import xgboost as xgb

# #best round is 3436 w/o one-hot, 2528 with NN features
# params = {"objective": "reg:linear", # for linear regression
#           "booster" : "gbtree",   # use tree based models 
#           "eta": 0.03,   # learning rate
#           "max_depth": 10,    # maximum depth of a tree
#           "subsample": 0.9,    # Subsample ratio of the training instances
#           "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
#           "silent": 1,   # silent mode
#           "seed": 10   # Random number seed
#           }
# num_boost_round = 50000

# dtrain = xgb.DMatrix(X_train1, np.log1p(y_train1))
# dvalid = xgb.DMatrix(X_val,np.log1p(y_val))
# watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
# # train the xgboost model
# model = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
#   early_stopping_rounds= 20, feval=rmspe_xg, verbose_eval=True)

In [None]:
# The model is trained on log1p(Sales); predictions are converted back with expm1 later.
dtrain = xgb.DMatrix(x_train, np.log1p(y_train))
dtest = xgb.DMatrix(x_test)
# specify parameters via map
params = {"objective": "reg:squarederror", # squared-error regression (replaces the deprecated "reg:linear" alias)
          "booster" : "gbtree",   # use tree based models 
          "eta": 0.03,   # learning rate
          "max_depth": 10,    # maximum depth of a tree
          "subsample": 0.9,    # Subsample ratio of the training instances
          "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
          "verbosity": 0,   # silent mode (replaces the removed "silent" parameter)
          "seed": 10   # Random number seed
          }
# Number of boosting rounds found with early stopping on a validation split (with NN features)
num_round = 2528
model = xgb.train(params, dtrain, num_round)

In [None]:
pre_xg = model.predict(dtest)

## Lightgbm

In [None]:
import lightgbm as lgb

In [None]:
# X_train1, X_val, y_train1, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [None]:
# Training parameters; num_iterations is the fixed number of boosting rounds.
# NOTE(review): num_leaves (800) is larger than a tree of max_depth 8 can hold (2**8 = 256),
# so max_depth is presumably the binding limit here — confirm this is intended.
params = {"objective" : "rmse",
          "boosting" : "gbdt", 
          "metric" : "rmse",
          "num_iterations" : 15148,
          "top_k" : 30, 
          "max_depth" : 8, 
          "num_leaves" : 800, 
          "min_data_in_leaf" : 20, 
          "learning_rate" : 0.02,
          "bagging_fraction" : 0.7, 
          "bagging_seed" : 3,
          "bagging_freq" : 5, 
          "feature_fraction" : 0.5, 
          "num_threads" : 4
         }

# Parameters that control how the Dataset bins the features
dataset_params = {"max_bin" : 200, 
                  "min_data_in_bin" : 3 
                 }
# The model is trained on log1p(Sales); predictions are converted back with expm1 later.
lgb_train = lgb.Dataset(x_train, np.log1p(y_train), params=dataset_params)
# lgb_val = lgb.Dataset(X_val, np.log(y_val), params=dataset_params)
# model = lgb.train(params, lgb_train, verbose_eval=50, keep_training_booster=True,valid_sets=[lgb_val],callbacks=[lgb.early_stopping(stopping_rounds=100)])
# verbose_eval is not passed: it only controls logging of validation metrics (none are
# given here) and newer LightGBM releases no longer accept it as an argument of train().
model = lgb.train(params, lgb_train, keep_training_booster=True)

In [None]:
pre_lgb = model.predict(x_test)

In [None]:
pre_all = np.expm1(pre_xg)*0.5 + np.expm1(pre_lgb)*0.5
# pre_all = np.expm1(pre_xg)

In [None]:
# Closed stores get a sales forecast of 0; open stores get the blended model prediction.
# test_inds0 / test_inds are positional indices produced by np.where but are used here as
# .loc labels — this presumably relies on `test` having a default 0..n-1 index; verify if
# the index of `test` is ever changed.
test.loc[test_inds0,'Sales'] = 0
test.loc[test_inds,'Sales'] = pre_all

In [None]:
test[['Id','Sales']].head()

In [None]:
test['Sales'].describe()

# Submission

In [None]:
# make submission using best weight
result = pd.DataFrame({"Id": test["Id"],'Sales': test["Sales"]})
result.to_csv("submission_tf.csv", index=False)

Reference:  

1. [XGBoost documentation](http://xgboost.readthedocs.io/en/latest/parameter.html#)  
2. [Model documentation 1st place](http://www.kaggle.com/c/rossmann-store-sales/discussion/18024)
3. [XGBoost Feature Importance](https://www.kaggle.com/cast42/xgboost-in-python-with-rmspe-v2/code)
4. [Rossmann Sales Top1%](https://www.kaggle.com/xwxw2929/rossmann-sales-top1)


    