In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Import the Data**

In [None]:
# Load the training CSV into `df` and preview the first rows.
# NOTE(review): 'StateHoliday' is later compared against the string '0', so it is
# presumably parsed here as an object / mixed-type column — confirm, and consider
# passing an explicit dtype for it.
df = pd.read_csv('../input/rossmann-store-sales/train.csv')
df.head()

In [None]:
# (rows, columns) of the raw training frame.
df.shape

In [None]:
# Load store.csv into `store`; it is joined onto the training rows by 'Store' below.
store = pd.read_csv('../input/rossmann-store-sales/store.csv')
store.head()

In [None]:
# (rows, columns) of the store frame.
store.shape

## Merge the Data

In [None]:
# Attach the store-level columns to every training row; the inner join keeps
# only rows whose 'Store' value is present in both frames.
data = pd.merge(df, store, how='inner', on='Store')
data.head()

In [None]:
# (rows, columns) of the merged frame.
data.shape

In [None]:
# Column dtypes of the merged frame.
data.dtypes

In [None]:
# Quick sanity summary of the merged frame.
# dropna=False keeps the count identical to len(Series.unique()), which also counts NaN.
print('Distinct number of Stores:', data['Store'].nunique(dropna=False))
# This line counts dates, so it is labelled as such (it was previously labelled "Stores").
print('Distinct number of Dates:', data['Date'].nunique(dropna=False))
print('Average daily sales of all Stores:', round(data['Sales'].mean(), 2))

In [None]:
# Number of rows per DayOfWeek value.
data['DayOfWeek'].value_counts()

## Create new columns related to Dates 

In [None]:
# Parse the date strings. `infer_datetime_format` is deprecated in pandas 2.x
# (format inference is now the default), so it is no longer passed.
data['Date'] = pd.to_datetime(data['Date'])

# Calendar parts used later as features.
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Quarter'] = data['Date'].dt.quarter
# `Series.dt.week` was removed in pandas 2.0; isocalendar().week is its replacement.
# Cast back to int64 so the column keeps the dtype the old accessor produced.
data['Week'] = data['Date'].dt.isocalendar().week.astype('int64')
data['Day'] = data['Date'].dt.day

# Meteorological season per month; anything that is not a month 1-12
# (e.g. a missing date) falls back to the string "None", as before.
SEASON_BY_MONTH = {
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Fall", 10: "Fall", 11: "Fall",
    12: "Winter", 1: "Winter", 2: "Winter",
}
data['Season'] = data['Month'].map(SEASON_BY_MONTH).fillna("None")

In [None]:
# Preview the last six columns: the Year/Month/Quarter/Week/Day/Season features just added.
data.iloc[:,-6:].head()

# **EDA**

In [None]:
# Distribution of the target. The x-axis carries the sales values and the y-axis
# the number of rows per bin (the labels were previously swapped).
fig, ax = plt.subplots()
ax.hist(data['Sales'])
ax.set_title('Histogram of Store Sales')
ax.set_xlabel('Sales')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Grid of histograms, one per column that DataFrame.hist can plot, on a 20x15 figure.
data.hist(figsize = (20,15))
plt.show()

## Handle the Missing Values 

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Replace missing competition distances with the column mean.
mean_competition_distance = data['CompetitionDistance'].mean()
data['CompetitionDistance'] = data['CompetitionDistance'].fillna(mean_competition_distance)

## Encode the Data 

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Column the model predicts.
target = ['Sales']
# Columns passed through to the feature matrix without one-hot encoding.
# 'StateHoliday' is listed here but is only turned into a 0/1 flag further down,
# after the feature frame `temp` has been assembled.
numeric_col = ['Customers', 'Open', 'Promo', 'Promo2', 'StateHoliday', 'SchoolHoliday', 'CompetitionDistance']
# Columns that are one-hot encoded via create_encode().
categorical_col = ['DayOfWeek', 'Quarter', 'Month', 'Year', 'StoreType', 'Assortment', 'Season']

def create_encode(df, col):
    """One-hot encode a single column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``col``.
    col : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        One float column (0.0 / 1.0) per distinct value of ``df[col]``, named
        ``"<col>_<value>"`` and ordered by sorted value. The result carries a
        fresh 0..n-1 index, so it lines up with ``df`` by row position.
    """
    # Encode the frame that was passed in; the previous version ignored `df`
    # and silently read the notebook-level `data` instead.
    # pd.get_dummies replaces the LabelEncoder + OneHotEncoder pair, whose
    # `sparse=` keyword no longer exists in current scikit-learn releases.
    dummies = pd.get_dummies(df[col], dtype=float)
    dummies.columns = [col + "_" + str(level) for level in dummies.columns]
    return dummies.reset_index(drop=True)

In [None]:
# Build the feature frame: the pass-through columns followed by the one-hot
# block of every categorical column, in `categorical_col` order.
# All pieces are concatenated in a single call, because concatenating inside
# the loop re-copies the whole growing frame on every iteration.
encoded_frames = [create_encode(data, col) for col in categorical_col]
temp = pd.concat([data[numeric_col], *encoded_frames], axis=1)

In [None]:
# Report the size of the feature frame and the distinct dtypes it contains.
feature_summary = (
    ('Shape of Data: ', temp.shape),
    ('Distinct Datatypes: ', temp.dtypes.unique()),
)
for label, value in feature_summary:
    print(label, value)

In [None]:
# Collapse StateHoliday to a 0/1 flag: 0 = no holiday, 1 = any holiday code.
# The column is cast to str first so that an integer 0 and the string '0' are
# both treated as "no holiday"; comparing the raw values against '0' marks an
# integer 0 as a holiday. The cast also makes the cell safe to re-run, because
# the already-binarised 0/1 values map back to themselves.
temp['StateHoliday'] = np.where(temp['StateHoliday'].astype(str) == '0', 0, 1)
temp.dtypes.unique()

## Split the Data 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, then take 10% of the remainder for
# validation (about 72% / 8% / 20% train / validation / test overall).
split_seed = 1000
features, labels = temp, data[target]

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=split_seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=split_seed
)

In [None]:
# Show the shape of every split, features first and targets second.
split_shapes = [
    ('Shape of X_train: ', X_train.shape),
    ('Shape of X_val: ', X_val.shape),
    ('Shape of X_test: ', X_test.shape),
    ('Shape of y_train: ', y_train.shape),
    ('Shape of y_val: ', y_val.shape),
    ('Shape of y_test: ', y_test.shape),
]
for label, shape in split_shapes:
    print(label, shape)

# **Create a DNN Model**

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Fully connected regressor: four hidden ReLU layers of 350 units each.
# The input width is taken from the training matrix instead of being hard-coded,
# so the model keeps working if the encoded feature set changes.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(350, input_dim=n_features, activation='relu'))
model.add(Dense(350, activation='relu'))
model.add(Dense(350, activation='relu'))
model.add(Dense(350, activation='relu'))
# Regression head: a single linear unit. Softmax over one unit always outputs
# 1.0, which would make every prediction identical and stop the net from learning.
model.add(Dense(1, activation='linear'))
model.summary()

In [None]:
# Optimise mean squared error with Adam and report mean absolute error alongside it.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)

# Ten epochs in mini-batches of 64, with the validation split passed as validation_data.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
)

In [None]:
# Score the held-out test split and print each reported metric rounded to two decimals.
result = model.evaluate(X_test, y_test)

for metric_name, metric_value in zip(model.metrics_names, result):
    print('Metric', metric_name, ':', str(round(metric_value, 2)))

In [None]:
# Predictions for X_test, i.e. the 20% hold-out carved from the training data above.
prediction = model.predict(X_test)

## Finally, make the submission file

In [None]:
# Load the sample submission; only its 'Id' column is used below.
sample = pd.read_csv('../input/rossmann-store-sales/sample_submission.csv')

In [None]:
# Put the predictions in a 'Sales' column and attach the sample file's 'Id' column.
# NOTE(review): `prediction` was computed on X_test, the hold-out split of the
# training data, not on the competition's test file, so these rows do not
# correspond to the Ids in the sample submission. The 'Id' assignment aligns on the
# row index, so any row without a matching index in `sample` gets NaN.
# NOTE(review): no file is written in this notebook as shown; a to_csv call would be
# needed to actually produce a submission.
submission = pd.DataFrame(prediction, columns=['Sales'])
submission['Id'] = sample['Id']
submission.head()

# Thank you! Please don't forget to upvote.