In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

![Step1](https://raw.githubusercontent.com/davyee100/kaggle/master/Step1.png)
![Step1_1](https://raw.githubusercontent.com/davyee100/kaggle/master/Step1_1.png)

This step will read the input file and show the basic information of the following:
1. Number of observations and features
1. Structure of the DataFrame
1. Sample Data
1. What's the Min and Max Date of both Training and Test Data

In [None]:
# Read the three competition CSV files. 'Date' is parsed while reading the
# train and test files; store.csv is read without any date parsing.
train_data = pd.read_csv(
    '../input/rossmann-store-sales/train.csv',
    low_memory=False,
    parse_dates=['Date'],
)
test_data = pd.read_csv(
    '../input/rossmann-store-sales/test.csv',
    low_memory=False,
    parse_dates=['Date'],
)
store_data = pd.read_csv(
    '../input/rossmann-store-sales/store.csv',
    low_memory=False,
)

# Report (rows, columns) for every frame that was just loaded.
print('Observations and Features Summary')
print('---------------------------------')
for caption, frame in (
    ('Training Data: ', train_data),
    ('Test Data: ', test_data),
    ('Store Data: ', store_data),
):
    print(caption, frame.shape)

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
train_data.info()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
test_data.info()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
store_data.info()

In [None]:
print('Display Sample Data')
# train_data.head()  # Show the head of DataFrame
# train_data.tail()  # Show the tail of DataFrame
train_data.sample(10)  # Show number of samples data E.g. 10 records

In [None]:
test_data.sample(10)  # Show number of samples data E.g. 10 records

In [None]:
store_data.sample(10)  # Show number of samples data E.g. 10 records

In [None]:
print('Min and Max of Date')
print('-------------------')
print('Training Data: Min Date:', train_data['Date'].min(), 'Max Date:', train_data['Date'].max())
print('Test Data    : Min Date:', test_data['Date'].min(), 'Max Date:', test_data['Date'].max())

![Step1_2](https://raw.githubusercontent.com/davyee100/kaggle/master/Step1_2.png)

This step cleans up inconsistencies in the data before it is used for modeling, so that data-quality problems do not affect the training process or negatively impact the results generated:
1. Convert Date Field to proper format
1. Number of Null Values
1. Replacing Null Values / Dropping Columns
1. Sample Data

### **_Training Data_**
![Section](https://raw.githubusercontent.com/davyee100/kaggle/master/Section.png)

In [None]:
# Normalise the date column and split it into calendar parts.
train_data['Date'] = pd.to_datetime(train_data['Date'], format='%Y-%m-%d')
train_data['Day'] = train_data['Date'].dt.day
train_data['Month'] = train_data['Date'].dt.month
train_data['Year'] = train_data['Date'].dt.year

# Average spend per customer. A row with zero customers yields NaN (0/0) or
# +/-inf (x/0); both are recorded as 0.0. The result is assigned back to the
# column instead of calling fillna(..., inplace=True) on a column selection,
# which does not update the frame when pandas Copy-on-Write is active, and the
# fill value is numeric rather than the string "0".
train_data['SalesPerCustomer'] = (
    (train_data['Sales'] / train_data['Customers'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.0)
    .astype(np.float64)
)

In [None]:
print(train_data.isnull().sum())

**Summary:** No missing values for Training DataSet, therefore no cleansing is required

### **_Test Data_**
![Section](https://raw.githubusercontent.com/davyee100/kaggle/master/Section.png)

In [None]:
# Normalise the date column, then derive the Day / Month / Year columns from it.
test_data['Date'] = pd.to_datetime(test_data['Date'], format='%Y-%m-%d')
for date_part in ('day', 'month', 'year'):
    test_data[date_part.capitalize()] = getattr(test_data['Date'].dt, date_part)

In [None]:
print(test_data.isnull().sum())

**Next Steps:** From the above observation, we can see that there are missing values in the Test data in the column "Open". We will analyze the records that contain missing values.

In [None]:
print('Distinct Values of "Open": ', test_data['Open'].unique())
test_data.loc[test_data.Open.isnull()]

**Next Steps:** Noticed that only Store = 622 has some missing values for column 'Open'. Let's check some sample data for Store = 622

In [None]:
test_data.loc[(test_data['Store'] == 622)].sort_values(by=['Date'], ascending=True)

**Next Steps:** Only the last few records in the Test dataset have those empty values. It seems that within that range 'Open' is consistently populated for DayOfWeek = 7 but not for the other days. We could either replace all NaN values with '1' or assume them to be '0'. Before doing that, let's confirm that the data in the Train dataset is consistent for Store = 622 in terms of 'Open', 'DayOfWeek' and 'StateHoliday'.

In [None]:
# Store = 622, it's a Sunday and Store is Open
train_data.loc[(train_data['Store'] == 622) & (train_data['DayOfWeek'] == 7) & (train_data['Open'] == 1)]

In [None]:
# Store = 622, it's not a Sunday and State Holiday
train_data.loc[(train_data['Store'] == 622) & (train_data['DayOfWeek'] != 7) & (train_data['StateHoliday'].isin(['a','b','c']))]

**Next Steps:** From the above, it is deduced that all data for Store = 622 is consistent in terms of 'Open', 'StateHoliday' and 'DayOfWeek'. Therefore we will replace all missing values in column 'Open' with value 1

In [None]:
# Treat the missing 'Open' flags as "open" (1). The fill value is numeric so the
# column never passes through object dtype before the integer cast.
test_data['Open'] = test_data['Open'].fillna(1).astype('int64')

In [None]:
print(test_data.isnull().sum())

In [None]:
test_data.loc[test_data['Store'] == 622].sort_values(by=['Date'], ascending=True)

### **_Store Data_**
![Section](https://raw.githubusercontent.com/davyee100/kaggle/master/Section.png)

In [None]:
print(store_data.isnull().sum())

**Next Steps:** There are a few missing values of Competition & Promo2 columns. We will start by processing those 'CompetitionDistance' that is null

In [None]:
store_data.loc[store_data['CompetitionDistance'].isnull() == True]

**Next Steps:** Since the stores that are missing 'CompetitionDistance' do not have any values for 'CompetitionOpenSinceMonth' or 'CompetitionOpenSinceYear' either, we will first default both 'CompetitionOpenSinceMonth' and 'CompetitionOpenSinceYear' to '0' for those stores, and then do the same for 'CompetitionDistance'.

In [None]:
# Rows without a competitor distance: zero the competitor opening month/year
# for exactly those rows first (the mask must be taken before the distance
# itself is filled), then zero the distance.
no_competitor = store_data['CompetitionDistance'].isnull()
since_cols = ['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear']
store_data.loc[no_competitor, since_cols] = store_data.loc[no_competitor, since_cols].fillna(0)

# Assign the result back instead of fillna(..., inplace=True) on a column
# selection, which does not update the frame under pandas Copy-on-Write.
store_data['CompetitionDistance'] = store_data['CompetitionDistance'].fillna(0)

In [None]:
print(store_data.isnull().sum())

**Next Steps:** Now let's cleanup both 'CompetitionOpenSinceMonth' and 'CompetitionOpenSinceYear'

In [None]:
store_data.loc[(store_data['CompetitionOpenSinceMonth'].isnull() == True) & (store_data['CompetitionOpenSinceYear'].isnull() == True)]

**Summary:** Since we are not able to determine the Year and Month since the competition started, therefore we will default it to '0'. If there is a need to explore other option, we will revisit this default value.

In [None]:
# Default the unknown competitor opening month/year to 0. The result is
# assigned back because fillna(..., inplace=True) on a column selection does
# not update the frame under pandas Copy-on-Write.
store_data['CompetitionOpenSinceMonth'] = store_data['CompetitionOpenSinceMonth'].fillna(0)
store_data['CompetitionOpenSinceYear'] = store_data['CompetitionOpenSinceYear'].fillna(0)

**Next Steps:** Now let's cleanup the Promo2 columns

In [None]:
print(store_data.isnull().sum())

In [None]:
store_data.loc[store_data['Promo2SinceWeek'].isnull()]

**Next Steps:** We will replace all these values with '0' for simplicity

In [None]:
# Default the missing Promo2 start week/year to 0. The result is assigned back
# because fillna(..., inplace=True) on a column selection does not update the
# frame under pandas Copy-on-Write.
store_data['Promo2SinceWeek'] = store_data['Promo2SinceWeek'].fillna(0)
store_data['Promo2SinceYear'] = store_data['Promo2SinceYear'].fillna(0)

**Next Steps:** Let's take a look at the unique values of the 'PromoInterval' column and replace its missing values with an empty string

In [None]:
store_data['PromoInterval'].unique()

In [None]:
# Missing promo intervals become an empty string. The result is assigned back
# because fillna(..., inplace=True) on a column selection does not update the
# frame under pandas Copy-on-Write.
store_data['PromoInterval'] = store_data['PromoInterval'].fillna('')

In [None]:
print(store_data.isnull().sum())

![Step1_3](https://raw.githubusercontent.com/davyee100/kaggle/master/Step1_3.png)

This step will replace all string representations of data with an appropriate integer encoding so that the models can be run on it.

### **_Train Data & Test Data_**
![Section](https://raw.githubusercontent.com/davyee100/kaggle/master/Section.png)

In [None]:
# Collect every object-typed column of the training frame except 'Date'.
train_object_cols = [
    column
    for column, dtype in train_data.dtypes.items()
    if dtype == 'object' and column != 'Date'
]

print("Categorical variables:")
print(train_object_cols)

**Next Steps:** Only 'StateHoliday' need to be cleaned up for both Training and Test Dataset

In [None]:
import gc
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoder that ignores categories unseen during fit.
# Newer scikit-learn releases name the dense-output switch `sparse_output`;
# older ones only accept `sparse`, so fall back when the keyword is rejected.
try:
    oneHot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    oneHot = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit the encoder on the training data, then reuse the same fitted encoder for the test data
OH_train_data = oneHot.fit_transform(train_data[train_object_cols])
OH_test_data = oneHot.transform(test_data[train_object_cols])

# The encoder returns a plain matrix, so fetch the generated column names.
# `get_feature_names` was superseded by `get_feature_names_out`; use whichever
# the installed scikit-learn provides.
feature_namer = getattr(oneHot, 'get_feature_names_out', None) or oneHot.get_feature_names
column_names = feature_namer(train_object_cols)

# Wrap the matrices in DataFrames that carry the source frames' index, so the
# column-wise concat below lines rows up correctly.
OH_train_data = pd.DataFrame(OH_train_data, columns=column_names, index=train_data.index)
OH_test_data = pd.DataFrame(OH_test_data, columns=column_names, index=test_data.index)

# Append the encoded columns to the original frames
train_data = pd.concat([train_data, OH_train_data], axis=1)
test_data = pd.concat([test_data, OH_test_data], axis=1)

del OH_train_data, OH_test_data
gc.collect()

# Convert all encoded fields to int32
data_cleaner = [train_data, test_data]

for dataset in data_cleaner:
    for cols in column_names:
        dataset[cols] = dataset[cols].astype(np.int32)

In [None]:
train_data.head()

In [None]:
test_data.head()

### **_Store Data_**
![Section](https://raw.githubusercontent.com/davyee100/kaggle/master/Section.png)

In [None]:
# Collect every object-typed column of the store frame except 'PromoInterval'.
store_object_cols = [
    column
    for column, dtype in store_data.dtypes.items()
    if dtype == 'object' and column != 'PromoInterval'
]

print("Categorical variables:")
print(store_object_cols)

**Next Steps:** There are 2 columns 'StoreType' and 'Assortment' that has string values that we need to encode

In [None]:
# Re-fit the existing encoder on the store-level categorical columns
OH_store_data = oneHot.fit_transform(store_data[store_object_cols])

# The encoder returns a plain matrix, so fetch the generated column names.
# `get_feature_names` was superseded by `get_feature_names_out`; use whichever
# the installed scikit-learn provides.
feature_namer = getattr(oneHot, 'get_feature_names_out', None) or oneHot.get_feature_names
column_names = feature_namer(store_object_cols)

# Wrap the matrix in a DataFrame that carries store_data's index, so the
# column-wise concat below lines rows up correctly.
OH_store_data = pd.DataFrame(OH_store_data, columns=column_names, index=store_data.index)

# Append the encoded columns to the store frame
store_data = pd.concat([store_data, OH_store_data], axis=1)

del OH_store_data
gc.collect()

# Convert all encoded fields to int32
for cols in column_names:
    store_data[cols] = store_data[cols].astype(np.int32)

In [None]:
store_data.head()

**Next Step:** Process the column 'PromoInterval' similar to OneHot Encoding but with each values of the month to each columns where it occurred

In [None]:
# Month labels used to name the Promo_<month> indicator columns, keyed 0..11.
month_data = np.array([
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec',
])
month_series = pd.Series(month_data, index=list(range(12)))

In [None]:
def promo_month(month_index,promointerval):
    """Return 1 when a month label is listed in a promo interval, else 0.

    Parameters
    ----------
    month_index : str
        Month label to look for (e.g. 'Jan').
    promointerval : str
        Comma-separated month labels (e.g. 'Jan,Apr,Jul,Oct'). Anything that
        is not a string (None, NaN, ...) is treated as "no promo months"
        instead of raising TypeError.

    Returns
    -------
    int
        1 if `month_index` is one of the listed labels, otherwise 0.
    """
    if not isinstance(promointerval, str):
        return 0
    # Compare whole labels rather than substrings, so a label can never match
    # inside a longer one.
    listed_months = [label.strip() for label in promointerval.split(',')]
    return 1 if month_index in listed_months else 0

# Add one 0/1 indicator column per month label, derived from 'PromoInterval'.
for month_label in month_series:
    store_data['Promo_' + month_label] = store_data['PromoInterval'].map(
        lambda interval: promo_month(month_label, interval)
    )

In [None]:
store_data.sample(10)

In [None]:
# Cast the Competition and Promo2 "since" columns to 64-bit integers,
# then show the resulting frame structure.
for since_column in (
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
):
    store_data[since_column] = store_data[since_column].astype('int64')
store_data.info()

![Step1_4](https://raw.githubusercontent.com/davyee100/kaggle/master/Step1_4.png)

We will do this with a left outer join on 'Store', which is the key. We will also create a separate DataFrame that still contains all the original fields, for use in the visualizations. Once all visualizations are complete, the fields that are no longer required can be dropped.

In [None]:
# Frame for the visualisations: keeps the raw (un-encoded) categorical columns.
train_data_visual = train_data[['Store','DayOfWeek','Date','Sales','Customers','SalesPerCustomer','Open','Promo','StateHoliday','SchoolHoliday','Year','Month','Day']].merge(store_data[['Store','StoreType','Assortment','CompetitionDistance','CompetitionOpenSinceMonth','CompetitionOpenSinceYear','Promo2','Promo2SinceWeek','Promo2SinceYear','PromoInterval']], on=['Store'], how='left')

# Raw string columns to drop from the modelling frames. dict.fromkeys removes
# duplicates while keeping order, so 'StateHoliday' is listed only once even
# when it is also present in train_object_cols.
object_cols = list(dict.fromkeys(['StateHoliday', 'PromoInterval'] + train_object_cols + store_object_cols))

# Left-join the store attributes onto every train/test row. validate= makes the
# merge fail loudly if 'Store' is not unique in store_data, instead of silently
# duplicating rows.
train_data = train_data.merge(store_data, on=['Store'], how='left', validate='many_to_one')
train_data = train_data.drop(object_cols, axis=1)

test_data = test_data.merge(store_data, on=['Store'], how='left', validate='many_to_one')
test_data = test_data.drop(object_cols, axis=1)

In [None]:
print(train_data_visual.shape)

In [None]:
print('New Train Data with Store: ', train_data.shape)
print('New Test Data with Store: ', test_data.shape)

In [None]:
print('Display Structure of DataFrame')
print('------------------------------')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
train_data.info()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
test_data.info()

In [None]:
train_data.sample(10)

In [None]:
test_data.sample(10)

![Step2](https://raw.githubusercontent.com/davyee100/kaggle/master/Step2.png)

Do an exploratory data analysis using matplotlib and seaborn to see what else we can discover through manual data discovery

In [None]:
print('Display Structure of Visual DataFrame')
print('-------------------------------------')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
train_data_visual.info()

### **_Analysis 1: "Sales", "Customers" and Store "Open" vs "Day of Week"_**

In [None]:
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
from pandas.plotting import register_matplotlib_converters
%matplotlib inline

register_matplotlib_converters()

# Totals per weekday, plus the spend per customer derived from those totals.
train_data_temp = train_data_visual.groupby(['DayOfWeek']).agg({'Sales':'sum', 'Customers':'sum', 'Open':'sum'}).reset_index()
train_data_temp['SalesPerCustomer'] = train_data_temp['Sales'] / train_data_temp['Customers']
sns.set_palette('Set2')

fig, qaxis = plt.subplots(2, 2, figsize = (22,12))

# One bar chart per metric, filling the 2x2 grid row by row.
for panel, metric in zip(qaxis.flat, ['Sales', 'Customers', 'SalesPerCustomer', 'Open']):
    sns.barplot(x='DayOfWeek', y=metric, data=train_data_temp, ax=panel)
    panel.set_title('{} vs Day of Week Comparison'.format(metric))

# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

**_Remarks:_** Above shows the comparison between 'DayOfWeek' against both 'Customers' and 'Sales' Measures. It shows that most of the stores closes on Sunday = '7' and both 'Customers' and 'Sales' are mainly recorded during the other days which is the weekday except 'Saturday'


### **_Analysis 2: "Sales" vs "Store Type" and "Assortment"_**

In [None]:
# Totals per store type / per assortment, plus the derived spend per customer.
train_data_temp = train_data_visual.groupby(['StoreType']).agg({'Sales':'sum', 'Customers':'sum'}).reset_index()
train_data_temp['SalesPerCustomer'] = train_data_temp['Sales'] / train_data_temp['Customers']

train_data_temp2 = train_data_visual.groupby(['Assortment']).agg({'Sales':'sum', 'Customers':'sum'}).reset_index()
train_data_temp2['SalesPerCustomer'] = train_data_temp2['Sales'] / train_data_temp2['Customers']

fig, qaxis = plt.subplots(3, 2, figsize = (22,20))

# Grid layout: one row per metric, one column per grouping.
# Each entry is (column to plot, label used in the panel title[, aggregated frame]).
metrics = [('Sales', 'Sales'), ('Customers', 'Customers'), ('SalesPerCustomer', 'SalesPerCustomer')]
groupings = [('StoreType', 'Store Type', train_data_temp), ('Assortment', 'Assortment', train_data_temp2)]

for row, (metric, metric_label) in enumerate(metrics):
    for col, (category, category_label, totals) in enumerate(groupings):
        panel = qaxis[row, col]
        sns.barplot(x=category, y=metric, data=totals, ax=panel)
        panel.set_title('{} vs {} Comparison'.format(metric_label, category_label))

# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

**_Remarks:_** From this we can see that Store Type = a has both highest values of 'Sales' and 'Customers'. But on the contrary in terms of 'Sales Per Customer' Store Type = d has the highest value / purchasing power per customer. It could be due to the expensive / exclusivity of the Store Type.

Assortment a = basic, b = extra, c = extended

### **_Analysis 3: "Sales" vs "Promo" and "Promo 2"_**

In [None]:
# Totals per Promo flag / per Promo2 flag, plus the derived spend per customer.
train_data_temp = train_data_visual.groupby(['Promo']).agg({'Sales':'sum', 'Customers':'sum'}).reset_index()
train_data_temp['SalesPerCustomer'] = train_data_temp['Sales'] / train_data_temp['Customers']

train_data_temp2 = train_data_visual.groupby(['Promo2']).agg({'Sales':'sum', 'Customers':'sum'}).reset_index()
train_data_temp2['SalesPerCustomer'] = train_data_temp2['Sales'] / train_data_temp2['Customers']

fig, qaxis = plt.subplots(3, 2, figsize = (22,20))

# Grid layout: one row per metric, one column per grouping.
# Each entry is (column to plot, label used in the panel title[, aggregated frame]).
metrics = [('Sales', 'Sales'), ('Customers', 'Customers'), ('SalesPerCustomer', 'Sales Per Customer')]
groupings = [('Promo', 'Promo', train_data_temp), ('Promo2', 'Promo2', train_data_temp2)]

for row, (metric, metric_label) in enumerate(metrics):
    for col, (category, category_label, totals) in enumerate(groupings):
        panel = qaxis[row, col]
        sns.barplot(x=category, y=metric, data=totals, ax=panel)
        panel.set_title('{} vs {} Comparison'.format(metric_label, category_label))

# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

**_Remarks:_** From the observations above, Promo2 does not seem to increase sales any further

### **_Analysis 4: "Sales" vs "School Holiday" and "State Holiday"_**

In [None]:
# Totals per school-holiday flag / per state-holiday code, plus the derived spend per customer.
train_data_temp = train_data_visual.groupby(['SchoolHoliday']).agg({'Sales':'sum', 'Customers':'sum'}).reset_index()
train_data_temp['SalesPerCustomer'] = train_data_temp['Sales'] / train_data_temp['Customers']

train_data_temp2 = train_data_visual.groupby(['StateHoliday']).agg({'Sales':'sum', 'Customers':'sum'}).reset_index()
train_data_temp2['SalesPerCustomer'] = train_data_temp2['Sales'] / train_data_temp2['Customers']

fig, qaxis = plt.subplots(3, 2, figsize = (22,20))

# Grid layout: one row per metric, one column per grouping.
# Each entry is (column to plot, label used in the panel title[, aggregated frame]).
metrics = [('Sales', 'Sales'), ('Customers', 'Customers'), ('SalesPerCustomer', 'Sales Per Customer')]
groupings = [('SchoolHoliday', 'School Holiday', train_data_temp), ('StateHoliday', 'State Holiday', train_data_temp2)]

for row, (metric, metric_label) in enumerate(metrics):
    for col, (category, category_label, totals) in enumerate(groupings):
        panel = qaxis[row, col]
        sns.barplot(x=category, y=metric, data=totals, ax=panel)
        panel.set_title('{} vs {} Comparison'.format(metric_label, category_label))

# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

**_Remarks:_** As both sets of graphs show, neither "School Holiday" nor "State Holiday" impacts "Sales" positively.

### **_Analysis 5: Monthly Sales Trend_**

In [None]:
# Sum sales per store type in calendar-month bins ('MS' = month start), so the
# plotted data matches the "Monthly" title and the one-tick-per-month axis.
# (The 7-day binning is what the weekly analysis uses.)
train_data_temp = train_data_visual.groupby(['StoreType',pd.Grouper(key='Date', freq='MS')]).Sales.sum().reset_index()

fig, ax = plt.subplots(figsize= (28,8))

# Draw on the axes created above instead of relying on the implicit current axes.
ax = sns.lineplot(data = train_data_temp, x = 'Date', y = 'Sales', hue = 'StoreType', ax = ax)
ax.set_title('Monthly Sales Trend by Store Type')

# Year/month tick labels with one major tick per month.
date_form = DateFormatter("%y/%m")
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(date_form)

plt.legend(['Store Type a', 'Store Type b', 'Store Type c', 'Store Type d'], title='Store Type')

**_Remarks :_** From the above visualization, we can deduce that Sales reach their highest peak around Christmas.

### **_Analysis 6: Weekly Sales Trend_**

In [None]:
# 7-day sales totals per store type, restricted to the rows of year 2013.
train_data_temp = (
    train_data_visual.loc[train_data_visual['Year'] == 2013]
    .groupby(['StoreType', pd.Grouper(key='Date', freq='7D')])
    .Sales.sum()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(28, 8))

# Month/day tick labels, placed by a weekday locator with interval 2.
date_form = DateFormatter("%m/%d")
ax.xaxis.set_major_formatter(date_form)
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))

ax = sns.lineplot(x='Date', y='Sales', hue='StoreType', data=train_data_temp)
ax.set_title('Weekly Sales Trend by Store Type')

plt.legend(['Store Type a', 'Store Type b', 'Store Type c', 'Store Type d'], title='Store Type')

### **_Remarks :_** To revisit the weekly trend later to display background

### **_Analysis 7: Correlation of Features_**

Let's take a look at the Correlation of features in training data

In [None]:
# Compute the correlation matrix on numeric/boolean columns only; the frame
# still carries the datetime 'Date' column, which corr() cannot correlate.
train_corr_all = train_data.select_dtypes(include=['number', 'bool']).corr()

# Mask the upper triangle (diagonal included). The builtin `bool` is used
# because the `np.bool` alias has been removed from NumPy.
mask = np.triu(np.ones(train_corr_all.shape, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize = (14, 12))
colormap = sns.diverging_palette(220, 10, as_cmap = True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(train_corr_all,
            mask = mask,
            square = True,
            linewidths = .1,
            vmax = 1.0,
            linecolor = 'white',
            cbar_kws = {'shrink':.9},
            ax = ax,
            cmap = colormap)
plt.title('Rossmann Store Sales Correlation of Features', size=15)
plt.show()

In [None]:
# Clean-up non-active Data Frame
del [[train_corr_all,train_data_temp,train_data_temp2]]
gc.collect()

![Step3](https://raw.githubusercontent.com/davyee100/kaggle/master/Step3.png)

In [None]:
# Common Model Helpers
from sklearn import feature_selection
from sklearn import metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Common Model Algorithms
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error

import xgboost as xgb

# Feature columns fed to the models, grouped by origin; the order is significant.
Data_X = (
    # per-day fields and calendar parts
    ['Store', 'DayOfWeek', 'Open', 'Promo', 'SchoolHoliday', 'Day', 'Month', 'Year']
    # one-hot encoded state holiday codes
    + ['StateHoliday_' + code for code in ('0', 'a', 'b', 'c')]
    # competition attributes
    + ['CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear']
    # Promo2 attributes
    + ['Promo2', 'Promo2SinceWeek', 'Promo2SinceYear']
    # one-hot encoded store type and assortment
    + ['StoreType_' + code for code in ('a', 'b', 'c', 'd')]
    + ['Assortment_' + code for code in ('a', 'b', 'c')]
    # per-month promo indicators
    + ['Promo_' + month for month in ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                                      'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec')]
)
# Name of the target column.
Target_Y = ['Sales']

def rmspe_xg(yhat, y):
    """Custom evaluation metric for XGBoost: RMSPE on the back-transformed scale.

    Parameters
    ----------
    yhat : array-like
        Predictions on the log1p scale.
    y : object exposing ``get_label()``
        Evaluation data whose labels are on the log1p scale.

    Returns
    -------
    tuple
        ``("rmspe", value)`` where ``value`` is computed after applying
        ``np.expm1`` to both labels and predictions.
    """
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)

def ToWeight(y):
    """Return the per-observation weights ``1 / y**2``, with weight 0 where ``y == 0``.

    Parameters
    ----------
    y : array-like
        Actual values (list, NumPy array or pandas Series).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``y``.
    """
    # Work on a plain float array so lists and Series behave alike.
    y = np.asarray(y, dtype=float)
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w

def rmspe(y, yhat):
    """Root mean square percentage error; observations with ``y == 0`` contribute 0.

    Parameters
    ----------
    y : array-like
        Actual values.
    yhat : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    float
        ``sqrt(mean(((y - yhat) / y) ** 2))`` with zero-valued actuals weighted 0.
    """
    # Compare positionally: converting to arrays prevents pandas from aligning
    # two Series on their index labels, which would pair the wrong rows.
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    w = ToWeight(y)
    return np.sqrt(np.mean( w * (y - yhat)**2 ))

# The target is modelled on the log1p scale because the sales values are widely
# dispersed; predictions are mapped back to the original scale with np.expm1
# (the inverse of np.log1p) before scoring.

X = train_data[Data_X]
y = np.log1p(train_data['Sales'])

# Hold out 20% of the rows for validation, with a fixed seed.
# NOTE(review): this is a random split over dated rows - confirm that a
# time-based hold-out is not required for this evaluation.
X_train,X_valid,y_train,y_valid = train_test_split(X, y, test_size=0.2, random_state=1)

### Decision Tree Model (Default)

In [None]:
# Fit a depth-limited decision tree on the log1p target and score it on the
# hold-out set, after mapping predictions back with expm1.
model_dt = DecisionTreeRegressor(max_depth=12, random_state=1).fit(X_train,y_train)
y_valid_pred = model_dt.predict(X_valid)

# The second figure is RMSPE (root mean square percentage error), not a mean squared error.
print("Validation - R2 Scoring                             : ", r2_score(np.expm1(y_valid),np.expm1(y_valid_pred)))
print("Validation - Root Mean Square Percentage Error (RMSPE) : ", rmspe(np.expm1(y_valid),np.expm1(y_valid_pred)))

In [None]:
# Score the decision tree on its own training rows to compare against the validation figures.
y_train_pred = model_dt.predict(X_train)

# The second figure is RMSPE (root mean square percentage error), not a mean squared error.
print("Train - R2 Scoring                             : ", r2_score(np.expm1(y_train),np.expm1(y_train_pred)))
print("Train - Root Mean Square Percentage Error (RMSPE) : ", rmspe(np.expm1(y_train),np.expm1(y_train_pred)))

### Random Forest Model (Default)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a small random forest. random_state is fixed, as for the decision tree,
# so the scores below are reproducible from run to run.
model_rfr = RandomForestRegressor(n_estimators=15, random_state=1).fit(X_train,y_train)
y_valid_pred = model_rfr.predict(X_valid)

# The second figure is RMSPE (root mean square percentage error), not a mean squared error.
print("Validation - R2 Scoring                             : ", r2_score(np.expm1(y_valid),np.expm1(y_valid_pred)))
print("Validation - Root Mean Square Percentage Error (RMSPE) : ", rmspe(np.expm1(y_valid),np.expm1(y_valid_pred)))

In [None]:
# Score the random forest on its own training rows to compare against the validation figures.
y_train_pred = model_rfr.predict(X_train)

# The second figure is RMSPE (root mean square percentage error), not a mean squared error.
print("Train - R2 Scoring                             : ", r2_score(np.expm1(y_train),np.expm1(y_train_pred)))
print("Train - Root Mean Square Percentage Error (RMSPE) : ", rmspe(np.expm1(y_train),np.expm1(y_train_pred)))

### XGBoost Model (Default)

In [None]:
import xgboost as xgb
print("XGBoost Version: {}". format(xgb.__version__))

In [None]:
# Booster configuration for the validation run.
# NOTE(review): "reg:linear" and "silent" look like the legacy spellings of
# "reg:squarederror" and "verbosity" - confirm against the XGBoost version
# printed by the previous cell before changing them.
params = {"objective": "reg:linear", # squared-error regression objective (legacy name)
          "booster" : "gbtree",   # use tree based models
          "eta": 0.03,   # learning rate
          "max_depth": 10,    # maximum depth of a tree
          "subsample": 0.9,    # Subsample ratio of the training instances
          "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
          "silent": 1,   # silent mode
          "seed": 10   # Random number seed
          }
# Upper bound on boosting rounds; early stopping below may end training sooner.
num_boost_round = 4000

# Wrap the train / validation splits (features + log1p target) for XGBoost.
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_valid, y_valid)
# Both sets are evaluated each round, labelled 'train' and 'eval'.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

# Train with the custom RMSPE metric, stopping after 100 rounds without improvement.
# NOTE(review): presumably early stopping tracks the last entry of `evals`
# ('eval') and newer XGBoost prefers `custom_metric` over `feval` - verify
# against the installed version.
model_xgb = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

In [None]:
# Score the boosted model on the hold-out set, after mapping predictions back with expm1.
y_valid_pred = model_xgb.predict(xgb.DMatrix(X_valid))

# The second figure is RMSPE (root mean square percentage error), not a mean squared error.
print("Validation - R2 Scoring                             : ", r2_score(np.expm1(y_valid),np.expm1(y_valid_pred)))
print("Validation - Root Mean Square Percentage Error (RMSPE) : ", rmspe(np.expm1(y_valid),np.expm1(y_valid_pred)))

In [None]:
# Score the boosted model on its own training rows to compare against the validation figures.
y_train_pred = model_xgb.predict(xgb.DMatrix(X_train))

# The second figure is RMSPE (root mean square percentage error), not a mean squared error.
print("Train - R2 Scoring                             : ", r2_score(np.expm1(y_train),np.expm1(y_train_pred)))
print("Train - Root Mean Square Percentage Error (RMSPE) : ", rmspe(np.expm1(y_train),np.expm1(y_train_pred)))

In [None]:
# RMSPE correction on the whole
def correction():
    """Scan multiplicative corrections of the validation predictions.

    Each weight from 0.98 up to (but excluding) 1.02 in steps of 0.005 is
    multiplied into the module-level ``y_valid_pred`` before ``np.expm1`` is
    applied, and the RMSPE against ``y_valid`` is computed. The errors are
    plotted against the weights and the best weight is printed.
    """
    weights = np.arange(0.98, 1.02, 0.005)
    actual = np.expm1(y_valid)
    errors = [rmspe(actual, np.expm1(y_valid_pred * weight)) for weight in weights]

    # RMSPE as a function of the correction weight
    plt.plot(weights, errors)
    plt.xlabel('weight')
    plt.ylabel('RMSPE')
    plt.title('RMSPE Curve')

    # report the weight with the smallest error (first one on ties)
    best_error = min(errors)
    best_position = errors.index(best_error)
    print('Best weight is {}, RMSPE is {:.4f}'.format(weights[best_position], best_error))
    
correction()

In [None]:
# Build the matrices for the final fit: the training split (features + log1p
# target) and the test set restricted to the model's feature columns.
# NOTE(review): the final model is fitted on X_train only, so the validation
# rows are not used here - confirm this is intended.
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(test_data[Data_X])

# Booster configuration, repeated from the validation run.
# NOTE(review): "reg:linear" and "silent" look like the legacy spellings of
# "reg:squarederror" and "verbosity" - confirm against the installed XGBoost.
params = {"objective": "reg:linear", # squared-error regression objective (legacy name)
          "booster" : "gbtree",   # use tree based models
          "eta": 0.03,   # learning rate
          "max_depth": 10,    # maximum depth of a tree
          "subsample": 0.9,    # Subsample ratio of the training instances
          "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
          "silent": 1,   # silent mode
          "seed": 10   # Random number seed
          }
# Fixed number of boosting rounds: no evaluation set or early stopping is used in this fit.
num_round = 3000
model_xgb = xgb.train(params, dtrain, num_round)

In [None]:
# make prediction
preds = model_xgb.predict(dtest)

In [None]:
# Build the submission: predictions are scaled by the correction weight
# (currently 1) on the log1p scale, then mapped back with expm1.
submission_sales = np.expm1(preds*1)
result = pd.DataFrame({"Id": test_data["Id"], 'Sales': submission_sales})
result.to_csv("submission_xgb.csv", index=False)

# Feature importance of the final model, limited to the top 10 features.
fig, ax = plt.subplots(figsize=(8, 8))
xgb.plot_importance(model_xgb, ax=ax, height=0.5, max_num_features=10)
plt.show()