In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# EDA

## DATA SHAPE

In [None]:
# Load the sales records; this frame is later merged with the store table on "Store".
df = pd.read_csv("/kaggle/input/rossmann-store-sales/train.csv")

In [None]:
# Display the sales records frame.
df

In [None]:
# Load the store table; it is joined onto the sales records via the "Store" column.
storedf = pd.read_csv("/kaggle/input/rossmann-store-sales/store.csv")

In [None]:
# Display the store table.
storedf

### DATA TYPE

In [None]:
# Print the column dtypes and non-null counts of the store table.
storedf.info()

In [None]:
# For each store-table column: True if it contains at least one missing value.
storedf.isna().any()

In [None]:
df.isna().any() # True for every sales-record column that contains at least one missing value

In [None]:
# Print the column dtypes and non-null counts of the sales records.
df.info()

### Merge 
Merge the store data frame with the sales-record data frame to obtain a single, flat data frame.

In [None]:
# Inner-join the sales records with the store table on the shared "Store" key.
mergedf = pd.merge(df, storedf, how="inner", on="Store")

In [None]:
# Display the merged (sales + store) frame.
mergedf

### Statistical Data Visualization


#### Maximum, minimum and average sales

In [None]:
# Row(s) of the merged frame whose Sales value equals the overall maximum
peak_sales = mergedf["Sales"].max()
mergedf[mergedf["Sales"] == peak_sales]

In [None]:
# PLOT: all sales records of the store that holds the maximum single sale.
# The store ID is looked up from the data instead of being hard-coded (it used
# to be the literal 909), so the cell stays correct if the data changes.
# Series.argmax gives the position of the first maximum, so this works
# regardless of what the frame's index looks like.
max_sales_store = mergedf["Store"].iloc[mergedf["Sales"].argmax()]
df_max_store = mergedf[mergedf["Store"] == max_sales_store]


In [None]:
# Plot sales against the calendar date. Selecting the two columns and calling
# .plot() with no arguments never uses "Date" as the x axis, so the sales were
# drawn against the row index. Parsing the dates and sorting by them makes
# the line run chronologically.
(
    df_max_store
    .assign(Date=pd.to_datetime(df_max_store["Date"]))
    .sort_values("Date")
    .plot(x="Date", y="Sales")
)

In [None]:
import numpy as np

# Parse the date column once, then derive calendar features from it.
# `infer_datetime_format` is no longer passed: it is deprecated since
# pandas 2.0, where format inference became the default behaviour.
mergedf['Date'] = pd.to_datetime(mergedf['Date'])
mergedf['Month'] = mergedf["Date"].dt.month
mergedf['Quarter'] = mergedf["Date"].dt.quarter
mergedf["Year"] = mergedf["Date"].dt.year




In [None]:
mergedf["Day"] = mergedf["Date"].dt.day
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` yields the same ISO week number. It comes back as a
# nullable UInt32 column, so cast to int64 to keep the dtype `.dt.week` produced.
mergedf["Week"] = mergedf["Date"].dt.isocalendar().week.astype("int64")

# Season label per month; the first matching rule wins and any month that
# matches none of them gets the literal string "None".
month = mergedf["Month"]
season_conditions = [
    month.isin([3, 4, 5]),
    month.isin([6, 7, 8]),
    month.isin([9, 10, 11]),
    month.isin([12, 1, 2]),
]
season_labels = ["spring", "summer", "fall", "winter"]
mergedf["Season"] = np.select(season_conditions, season_labels, default="None")

In [None]:
# Preview the date column next to the calendar features derived from it.
calendar_columns = ["Date","Year","Month","Day","Week","Quarter","Season"]
print(mergedf[calendar_columns].head())

In [None]:
# Histogram of the "Sales" column of the merged frame.
plt.figure(figsize=(15,8))
plt.hist(mergedf["Sales"])
plt.title("Histogram for Store Sales")
plt.xlabel("bins")
# "Frequency" labels the y axis. It was previously set with a second
# plt.xlabel call, which replaced the x label and left the y axis unlabelled.
plt.ylabel("Frequency")
plt.show()


In [None]:
# Draw one histogram per column of the merged frame that pandas can plot, on a 20x10 figure.
mergedf.hist(figsize=(20,10))

In [None]:
# Percentage of missing values in each column of the merged frame.
missing_per_column = mergedf.isnull().sum()
missing_per_column / len(mergedf) * 100

In [None]:
import seaborn as sns 
# Switch seaborn to its white-grid style, then chart "Sales" per season
# (no estimator is passed, so seaborn's default aggregation is used).
sns.set(style="whitegrid")

ax = sns.barplot(data=mergedf, x="Season", y="Sales")

In [None]:
# "Sales" per assortment level, aggregated with seaborn's default estimator.
ax = sns.barplot(data=mergedf, x="Assortment", y="Sales")

In [None]:
# "Sales" per store type, aggregated with seaborn's default estimator.
ax = sns.barplot(data=mergedf, x="StoreType", y="Sales")

In [None]:
# Number of records per season: np.size as the estimator counts the "Sales"
# entries in each group instead of averaging them.
ax = sns.barplot(data=mergedf, x="Season", y="Sales", estimator=np.size)

In [None]:
# Number of records per assortment level (np.size counts the entries in each group).
ax = sns.barplot(data=mergedf, x="Assortment", y="Sales", estimator=np.size)

In [None]:
# Number of records per store type (np.size counts the entries in each group).
ax = sns.barplot(data=mergedf, x="StoreType", y="Sales", estimator=np.size)

# DATA PREPARATION

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Column the model predicts.
target = ["Sales"]

# Columns passed to the model as they are (not one-hot encoded).
numeric_columns = [
    "Customers",
    "Open",
    "Promo",
    "Promo2",
    "StateHoliday",
    "SchoolHoliday",
    "CompetitionDistance",
]

# Columns that get one-hot encoded before modelling.
categorical_columns = [
    "DayOfWeek",
    "Quarter",
    "Month",
    "Year",
    "StoreType",
    "Assortment",
    "Season",
]



In [None]:
# For each merged-frame column: True if it contains at least one missing value.
mergedf.isna().any()

In [None]:
# Replace missing competition distances with the column's most frequent value
# (mode() can return several values; the first one is used).
most_common_distance = mergedf["CompetitionDistance"].mode()[0]
mergedf["CompetitionDistance"] = mergedf["CompetitionDistance"].fillna(most_common_distance)

## ONE HOT ENCODING

In [None]:
def one_hot(df,column):
    """One-hot encode a single column of ``df``.

    One 0/1 indicator column is produced for every distinct value of
    ``df[column]``, in order of first appearance. Indicator columns are named
    ``"<column>_<k>"``, where ``k`` is the 1-based position of the value in
    that order (not the value itself).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : hashable
        Label of the column to encode.

    Returns
    -------
    pandas.DataFrame
        int64 indicator columns, one row per row of ``df``, carrying ``df``'s
        index so the result lines up with ``df`` when concatenated column-wise.
    """
    values = df[column]

    # Compare against the frame that was passed in. The comparison used to read
    # the notebook-level `mergedf`, which ignored `df` entirely and raised a
    # NameError whenever that global did not exist.
    indicators = {
        str(column) + "_" + str(position): (values == item).to_numpy().astype("int64")
        for position, item in enumerate(values.unique(), start=1)
    }

    return pd.DataFrame(indicators, index=df.index)

#mergedf["Year"].unique()

In [None]:
# One-hot encode every categorical column, then join all indicator frames side
# by side with a single concat. Re-concatenating inside the loop copied the
# accumulated frame once per column and needed a `first` flag to seed it.
encoded_frames = [one_hot(mergedf[[c]], c) for c in categorical_columns]

# pd.concat rejects an empty list, so keep the empty-frame result for that case.
ctemp = pd.concat(encoded_frames, axis=1) if encoded_frames else pd.DataFrame()

In [None]:
ctemp # display the one-hot encoded indicator columns to check the encoding

In [None]:
# Model input frame: the one-hot indicator columns followed by the numeric columns.
feature_parts = [ctemp, mergedf[numeric_columns]]
temp = pd.concat(feature_parts, axis="columns")

In [None]:
# Total number of columns in the assembled feature frame

temp.shape[1]

In [None]:
# Labels of all columns in the assembled feature frame

temp.columns

In [None]:
# Making StateHoliday numerical: 1 for any state-holiday code, 0 for "no holiday".
# The previous rule flagged only the code 'b' and silently mapped every other
# code to 0. In the Rossmann data description 'a', 'b' and 'c' are all holiday
# codes and 0 means none (NOTE(review): confirm against the data dictionary).
# Going through str also copes with a column that mixes the number 0 with the
# string "0", and keeps the cell safe to re-run on the already converted column.
temp["StateHoliday"] = temp["StateHoliday"].astype(str).str.strip().ne("0").astype(int)

In [None]:
from sklearn.model_selection import train_test_split

## SPLITTING TRAIN / TEST DATA

In [None]:

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
features, labels = temp, mergedf[target]
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)


In [None]:
# Carve 10% of the training rows off as a validation set. x_train / y_train are
# reassigned from themselves, so re-running this cell shrinks them again.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, random_state=42
)

# Report the shape of every split.
splits = (
    ("x_train", x_train),
    ("x_val", x_val),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
)
for split_name, split_frame in splits:
    print("Shape of " + split_name + ":", split_frame.shape)

In [None]:
# Column-wise mean of the training targets, printed for reference.
mean_sales = y_train.mean()
print("Average Sales :",mean_sales)

## CREATING MODEL

In [None]:
# check if  there are any non-numeric value


# i=0
# for a in x_train["StateHoliday"]:
#     print(i)
#     i+=1
#     print(int(a))

# x_train[x_train["StateHoliday"]=='b']["StateHoliday"] = 1

In [None]:

from keras.models import Sequential
from keras.layers import Dense, Dropout
# Regression network with one hidden layer: 150 ReLU units feeding a single
# linear output, trained with Adam to minimise mean absolute error.
model = Sequential()
# Take the input width from the training frame so the network keeps matching
# the feature matrix if the set of encoded columns changes (it used to be the
# literal 44).
model.add(Dense(150,input_dim = x_train.shape[1],activation="relu"))

model.add(Dense(1,activation = "linear"))

model.compile(optimizer='adam',loss="mean_absolute_error",
metrics=["mean_absolute_error"])

# Also report the metrics on the validation split (x_val / y_val) after every
# epoch; that split exists in the notebook but was never used. Validation data
# is only evaluated, it does not take part in the weight updates.
model.fit(
    x_train.astype(np.float32),
    y_train.astype(np.float32),
    validation_data=(x_val.astype(np.float32), y_val.astype(np.float32)),
    epochs=10,
    batch_size=64,
)

In [None]:
# Score the trained network on the test split, then print each reported metric
# rounded to two decimals.
test_features = x_test.astype(np.float32)
test_targets = y_test.astype(np.float32)
result = model.evaluate(test_features, test_targets)

for metric_name, metric_value in zip(model.metrics_names, result):
    print("Metric ", metric_name, ":", str(round(metric_value, 2)))

## Conclusion

We will try to improve this performance next time. Thanks for reading.

## References
* Jojo Moolayil, *Learn Keras for Deep Neural Networks*, 2019