In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the three competition files used by this notebook.
sales_train_path = "/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv"
test_path = "/kaggle/input/competitive-data-science-predict-future-sales/test.csv"
sample_submission_path = "/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv"

# df: training sales records; test_df: rows to predict; submission_example: output template.
df = pd.read_csv(sales_train_path)
test_df = pd.read_csv(test_path)
submission_example = pd.read_csv(sample_submission_path)

# Preview the first rows of the training data.
df.head()

In [None]:
## Sales over time: total `item_cnt_day` summed per `date_block_num`
sales_per_block = df.groupby('date_block_num')['item_cnt_day'].sum()
sales_per_block.plot(kind='area', title='Sales Over Time')
plt.show()
## Observation: total sales spike roughly every 12 months,
## so the month of the sale is added to the features below.

In [None]:
# Show the sales frame as loaded, before the 'Month' feature is added.
df

In [None]:
## Extract the month (1-12) from the date column.
## The dates in sales_train.csv are written day-first (dd.mm.yyyy). Without an
## explicit format pandas guesses month-first, which swaps day and month for
## ambiguous dates such as 02.01.2013 (and newer pandas versions raise on the
## first day > 12). Passing the format removes the guess and is also much
## faster than per-element inference.
## `.dt.month` is the vectorised equivalent of applying `lambda x: x.month`.
df['Month'] = pd.to_datetime(df['date'], format='%d.%m.%Y').dt.month

In [None]:
# Show the frame again to confirm the new 'Month' column.
df

# **Simple Neural Network**

In [None]:
# Simple neural network
#
# Feed-forward regressor that predicts `item_cnt_day` from the three features
# (Month, shop_id, item_id). It also defines `data`, `label` and the train/test
# split that the XGBoost cells reuse.

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split

data = df[['Month', 'shop_id', 'item_id']]
label = df['item_cnt_day']

# Split features and labels in ONE call so every feature row stays paired with
# its own label. Two separate, unseeded calls shuffle `data` and `label`
# independently, which trains and evaluates on mismatched (row, label) pairs.
# `random_state` makes the split reproducible across runs.
train_data, test_data, train_label, test_label = train_test_split(
    data, label, test_size=.25, random_state=123)

# Dropout layers randomly zero a fraction of the previous layer's outputs
# during training to reduce overfitting.

model = Sequential()
model.add(Flatten(input_shape=(3,)))  # declares the 3-feature input; no reshaping needed
model.add(Dense(64, activation='relu'))
model.add(Dropout(.25))
model.add(Dense(128, activation='relu'))
model.add(Dropout(.4))
model.add(Dense(128, activation='relu'))
# Linear output for regression. Softmax over a single unit normalises to
# exactly 1.0 for every input, so the network could never fit the target.
model.add(Dense(1, activation='linear'))

# NOTE(review): learning_rate=0.1 is high for Adam and the inputs are unscaled
# raw ids -- kept as in the original, but worth tuning.
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.1),
              loss='mean_squared_error')

# `hist.history['loss']` holds the per-epoch training MSE.
hist = model.fit(train_data, train_label, epochs=5, batch_size=50000)


# **XGBoost**

In [None]:
## XGBoost
##
## Gradient-boosted regressor trained on the same split as the neural network
## and scored by mean squared error on the held-out rows.

import xgboost as xgb
from sklearn.metrics import mean_squared_error

# DMatrix over the full dataset; consumed by `xgb.cv` in the cross-validation cell.
data_dmatrix = xgb.DMatrix(data=data, label=label)

# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias and triggers a warning.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                max_depth=5, alpha=10, n_estimators=10)

xg_reg.fit(train_data, train_label)

# Held-out error; `mse` is reused later for the model comparison chart.
preds = xg_reg.predict(test_data)
mse = mean_squared_error(test_label, preds)
print("MSE: %f" % (mse))

In [None]:
## K-Fold Cross Validation
##
## 3-fold CV over the full DMatrix, up to 50 boosting rounds, stopping early if
## the RMSE has not improved for 10 rounds. `seed` fixes the fold assignment.

# 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same loss).
params = {"objective": "reg:squarederror", 'colsample_bytree': 0.3, 'learning_rate': 0.1,
          'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10, metrics="rmse", as_pandas=True, seed=123)

# One row per boosting round with train/test RMSE mean and std.
cv_results.head()

In [None]:
## Feature importance of the fitted XGBoost regressor
importance_ax = xgb.plot_importance(xg_reg)
plt.show()

In [None]:
# Error comparison between the XGBoost regressor and the neural network.
#
# Both models are scored as mean squared error on the SAME held-out split so
# the bars are comparable. Averaging the network's per-epoch training loss
# would mix in the early, un-converged epochs and compare a training metric
# against XGBoost's test metric.

# The model was compiled with an MSE loss and no extra metrics, so `evaluate`
# returns a single scalar: the test MSE.
tf_loss = model.evaluate(test_data, test_label, batch_size=50000, verbose=0)
xgb_loss = mse

plt.barh(['Tensorflow Loss', 'XGBLoss'], [tf_loss, xgb_loss])
plt.xlabel('Test MSE')
plt.show()

In [None]:
# Look at the sample submission to see the expected output layout.
submission_example

In [None]:
# Feature frame for the submission: a copy of the test rows plus a constant
# 'Month' column, so it has the same three features the models were trained on.
# NOTE(review): Month is fixed at 11 for every row -- presumably the month the
# competition asks to predict; confirm against the competition description.
submission_df = test_df.assign(Month=11)
submission_df

In [None]:
# Predict the submission values with the XGBoost regressor.
# (To submit the neural network instead, call `model.predict` on the same columns.)
feature_cols = ['Month', 'shop_id', 'item_id']
preds = xg_reg.predict(submission_df[feature_cols])

# The predictions are assigned positionally.
# NOTE(review): this assumes sample_submission.csv lists its rows in the same
# order as test.csv -- confirm.
submission_example['item_cnt_month'] = preds

In [None]:
# Completed prediction: the submission template with 'item_cnt_month' now
# holding the model's predictions.

submission_example

In [None]:
# How it finally looks: summary statistics of the submission's numeric columns,
# as a quick sanity check on the range of the predicted values.
submission_example.describe()

In [None]:
# Write the submission CSV to the current working directory; index=False keeps
# the DataFrame's row index out of the file.
submission_example.to_csv('ibrahim_altay_submission.csv', index=False)