In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted Kaggle input tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="whitegrid")
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.callbacks import History
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
df = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2022/train.csv")
df.shape

In [None]:
df.head()

In [None]:
df=df.drop("row_id",axis=1)
df.head()

In [None]:
print("Distinct number of Stores :", len(df["store"].unique()))

In [None]:
print("Distinct number of Products :", len(df["product"].unique()))

In [None]:
df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)

In [None]:
# date features
df["Month"] = df["date"].dt.month
df["Quarter"] = df["date"].dt.quarter
df["Year"] = df["date"].dt.year
df["Day"] = df["date"].dt.day
df['day_of_week'] = df['date'].dt.dayofweek
df["Week"] = df["date"].dt.week
df["Season"] = np.where(df["Month"].isin([3,4,5]),"Spring",np.where(df["Month"].isin([6,7,8]), "Summer",np.where(df["Month"].isin ([9,10,11]),"Fall",np.where(df["Month"].isin ([12,1,2]),"Winter","None"))))

In [None]:
print(df[["date","Year","Month","Day","Week","Quarter","Season","day_of_week"]].head())

In [None]:
#Create a histogram to study the Daily Sales for the stores
plt.figure(figsize=(15,8))
plt.hist(df["num_sold"])
plt.title("Histogram for Store Sales")
plt.xlabel("bins")
# The original called plt.xlabel twice, overwriting "bins" with "Frequency"
# and leaving the y axis unlabelled; the count axis is the y axis.
plt.ylabel("Frequency")
plt.show()

In [None]:
df.isnull().sum()/df.shape[0] * 100

In [None]:
ax = sns.barplot(x="Season", y="num_sold", data=df)

In [None]:
ax = sns.barplot(x="Month", y="num_sold", data=df)

In [None]:
ax = sns.barplot(x="day_of_week", y="num_sold", data=df)

In [None]:
ax = sns.barplot(x="product", y="num_sold", data=df)

In [None]:
ax = sns.barplot(x="store", y="num_sold", data=df)

In [None]:
ax = sns.barplot(x="country", y="num_sold", data=df)

In [None]:
# Column(s) used as the regression target; kept as a list so df[target] is a DataFrame.
target = ['num_sold']

In [None]:
# Columns that get one-hot encoded; this order is the order in which the
# encoded column groups are concatenated.
categorical_columns = [
    "day_of_week",
    "Quarter",
    "Month",
    "store",
    "product",
    "Season",
    "country",
]

In [None]:
def create_one_hot_encoding(df, col):
    """One-hot encode a single column of ``df``.

    The previous implementation chained scikit-learn's ``LabelEncoder`` and
    ``OneHotEncoder(sparse=False)``. The ``sparse`` keyword was renamed to
    ``sparse_output`` and removed in scikit-learn 1.4, so that call raises a
    ``TypeError`` on current releases. This version produces the same result
    with NumPy only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    col : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        One float column per distinct value of ``df[col]``, named
        ``"<col>_<value>"`` and ordered by sorted value. Each row holds 1.0 in
        the column matching its value and 0.0 elsewhere. The result carries a
        fresh 0..n-1 index, not the index of ``df``.
    """
    values = np.asarray(df[col])
    # np.unique returns the sorted distinct values plus, for every row, the
    # position of its value within that sorted array.
    classes, codes = np.unique(values, return_inverse=True)
    column_names = [col + "_" + str(label) for label in classes]
    # Row i of the identity matrix is the one-hot vector for class i.
    one_hot = np.eye(len(classes), dtype=float)[codes.reshape(-1)]
    return pd.DataFrame(one_hot, columns=column_names)

In [None]:
temp = pd.DataFrame()
for column in categorical_columns:
 temp_df = create_one_hot_encoding(df,column)
 temp = pd.concat([temp,temp_df],axis=1)

In [None]:
temp

In [None]:
temp.shape

In [None]:
x_train, x_test, y_train, y_test = train_test_split(temp, df[target],test_size=0.2,random_state=2018)
#Further divide training dataset into train and validation dataset with an 90:10 split
x_train, x_val, y_train, y_val = train_test_split(x_train, 
y_train,test_size=0.1,random_state=2018)
#Check the sizes of all newly created datasets
print("Shape of x_train:",x_train.shape)
print("Shape of x_val:",x_val.shape)
print("Shape of x_test:",x_test.shape)
print("Shape of y_train:",y_train.shape)
print("Shape of y_val:",y_val.shape)
print("Shape of y_test:",y_test.shape)

In [None]:
y_train.mean()

In [None]:
history = History()

In [None]:
model = Sequential()
model.add(Dense(250,input_dim = 35,activation="relu"))
model.add(Dense(250,activation="relu"))
model.add(Dense(1,activation = "linear"))

In [None]:
#Configure the model
model.compile(optimizer='adam',loss="mean_absolute_error",metrics=["mean_absolute_error"])
#Train the model
model.fit(x_train.values,y_train.values, validation_data= (x_val,y_val),epochs=20,batch_size=32,callbacks=[history])

In [None]:
result = model.evaluate(x_test.values,y_test.values)
for i in range(len(model.metrics_names)):
 print("Metric ",model.metrics_names[i],":",str(round(result[i],2)))

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title("Model's Training & Validation loss across epochs")
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
y_test["Prediction"] = model.predict(x_test)
print(y_test.head(10))

In [None]:
print("MSE :",mean_squared_error(y_test["num_sold"].
values,y_test["Prediction"].values))
print("MAE :",mean_absolute_error(y_test["num_sold"].
values,y_test["Prediction"].values))

In [None]:
testdata = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2022/test.csv")

In [None]:
testdata.head()

In [None]:
testdata=testdata.drop('row_id',axis=1)
testdata.head()

In [None]:
testdata['date'] = pd.to_datetime(testdata['date'], infer_datetime_format=True)

In [None]:
testdata["Month"] = testdata["date"].dt.month

In [None]:
testdata["Quarter"] = testdata["date"].dt.quarter
testdata["Year"] = testdata["date"].dt.year
testdata["Day"] = testdata["date"].dt.day
testdata['day_of_week'] = testdata['date'].dt.dayofweek
testdata["Week"] = testdata["date"].dt.week
testdata["Season"] = np.where(testdata["Month"].isin([3,4,5]),"Spring",np.where(testdata["Month"].isin([6,7,8]), "Summer",np.where(testdata["Month"].isin ([9,10,11]),"Fall",np.where(testdata["Month"].isin ([12,1,2]),"Winter","None"))))

In [None]:
testdata.head()

In [None]:
temptest = pd.DataFrame()
for column in categorical_columns:
 temp_df = create_one_hot_encoding(testdata,column)
 temptest = pd.concat([temptest,temp_df],axis=1)

In [None]:
temptest

In [None]:
temptest.shape

In [None]:
temp.shape

In [None]:
testdata["Prediction"] = model.predict(temptest)
print(testdata.head(10))

In [None]:
submissiondf=pd.read_csv("/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv")
submissiondf.head()

In [None]:
submissiondf["num_sold"] = model.predict(temptest)
print(submissiondf.head(10))

In [None]:
submissiondf.to_csv("submission.csv",index=False)