# Gelecek Satış Tahmini
## 1. Verilerin okunması

In [None]:
import pandas as pd
import numpy as np

# Load every competition table from the shared Kaggle input directory.
DATA_DIR = "../input/competitive-data-science-predict-future-sales"

item_cat = pd.read_csv(f"{DATA_DIR}/item_categories.csv")
items = pd.read_csv(f"{DATA_DIR}/items.csv")
sales_train = pd.read_csv(f"{DATA_DIR}/sales_train.csv")
sample_sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
shops = pd.read_csv(f"{DATA_DIR}/shops.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

## 2. Verilerin gözlenmesi

In [None]:
# First 10 rows of the item-category table.
item_cat.head(10)

In [None]:
# (rows, columns) of the item-category table.
item_cat.shape

In [None]:
# First 10 rows of the items table.
items.head(10)

In [None]:
# (rows, columns) of the items table.
items.shape

In [None]:
# First 10 rows of the training sales records.
sales_train.head(10)

In [None]:
# (rows, columns) of the training sales records.
sales_train.shape

In [None]:
# First 10 rows of the sample submission file.
sample_sub.head(10)

In [None]:
# (rows, columns) of the sample submission file.
sample_sub.shape

In [None]:
# First 10 rows of the shops table.
shops.head(10)

In [None]:
# (rows, columns) of the shops table.
shops.shape

In [None]:
# First 10 rows of the test table.
test.head(10)

In [None]:
# (rows, columns) of the test table.
test.shape

## 3. Veri ön işleme

In [None]:
# Enrich each sales record with its item, category and shop attributes.
# All three joins use pandas' default join type on the shared key column.
df = (
    sales_train
    .merge(items, on="item_id")
    .merge(item_cat, on="item_category_id")
    .merge(shops, on="shop_id")
)
df

In [None]:
# Parse the raw date strings with an explicit day-first format. Without it,
# pandas may read ambiguous values such as "02.01.2013" month-first, which
# would put sales into the wrong monthly bucket further down the notebook.
# NOTE(review): assumes the CSV stores dates as DD.MM.YYYY — confirm against
# sales_train.csv.
df["date"] = pd.to_datetime(df["date"], format="%d.%m.%Y")

In [None]:
# Print column dtypes and non-null counts of the merged frame.
df.info()

In [None]:
# Order the records chronologically (ascending is the default direction).
df = df.sort_values(by='date')

In [None]:
# Remove exact duplicate rows but keep the first copy of each one.
# keep=False would also delete the original record, dropping that sale from
# the data entirely instead of just de-duplicating it.
df = df.drop_duplicates(keep="first")

## 4. Keşifçi veri analizi

In [None]:
# Summary statistics of the merged frame.
df.describe()

In [None]:
# Count missing values per column.
df.isna().sum() # no null data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap of the numeric columns.
f, ax = plt.subplots(figsize=(10, 8))
# Select numeric columns explicitly: recent pandas no longer drops text and
# datetime columns silently inside corr() and raises on them instead.
corr = df.select_dtypes(include="number").corr()
# The all-False mask hides no cell. The builtin bool replaces the np.bool
# alias, which newer NumPy releases no longer provide.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

In [None]:
# Horizontal bar chart: number of sales rows per item category.
plt.figure(figsize=(10, 16))
sns.countplot(data=df, y="item_category_name")

In [None]:
# Horizontal bar chart: number of sales rows per shop.
plt.figure(figsize=(10, 16))
sns.countplot(data=df, y="shop_name")

In [None]:
# Distribution of total units sold per month block.
plt.figure(figsize=(12, 8))
# Select the column before aggregating so only item_cnt_day is summed;
# summing every column (text, datetime) is wasteful and can raise on
# recent pandas versions.
monthly_units = df.groupby('date_block_num')['item_cnt_day'].sum()
# NOTE(review): distplot is deprecated in recent seaborn releases;
# histplot(..., kde=True) is the documented successor.
sns.distplot(monthly_units)

### Uç değerlerin tespiti

In [None]:
# Box plot of daily sold quantities to expose outliers.
plt.figure(figsize=(12, 8))
# Pass the series by keyword: the first positional argument of boxplot is
# interpreted differently across seaborn versions (x vs. data).
sns.boxplot(x=df["item_cnt_day"])

In [None]:
# Box plot of item prices to expose outliers.
plt.figure(figsize=(12, 8))
# Pass the series by keyword: the first positional argument of boxplot is
# interpreted differently across seaborn versions (x vs. data).
sns.boxplot(x=df["item_price"])

In [None]:
# Remove outliers: keep only rows inside the 15th-85th percentile band
# (bounds inclusive), first for quantity and then for price. The price
# bounds are computed on the rows that survived the quantity filter.
# NOTE(review): each band can discard up to ~30% of the rows; confirm that
# such aggressive trimming is intended.
for column in ("item_cnt_day", "item_price"):
    lower, upper = df[column].quantile([.15, .85])
    df = df[df[column].between(lower, upper)]
df.shape

In [None]:
# One-element shape tuple holding the number of distinct dates (days).
total_day = (df["date"].nunique(dropna=False),)
total_day

In [None]:
# Number of days that make up the first 90% (90% train / 10% test).
n_days = total_day[0]
train_i = int(n_days * 0.9)
train_i

In [None]:
# Earliest and latest calendar date remaining in the data.
date_col = df['date']
date_col.min().date(), date_col.max().date()

In [None]:
# First rows of the frame after the outlier filter.
df.head()

In [None]:
# Summary statistics after the outlier filter.
df.describe()

In [None]:
# Total number of units sold by each shop.
df.groupby("shop_id")[["item_cnt_day"]].sum()

In [None]:
# Total number of units sold by each shop on each day.
df.groupby(["shop_id", "date"])[["item_cnt_day"]].sum()

In [None]:
# Smallest id, largest id and number of distinct items.
item_ids = df["item_id"]
item_ids.min(), item_ids.max(), len(item_ids.unique())

In [None]:
# Month label ('MM-YYYY') of the first row. Positional access is used because
# index label 0 is not guaranteed to survive the earlier de-duplication and
# outlier filtering, in which case df["date"][0] raises KeyError.
df["date"].iloc[0].strftime('%m-%Y')

In [None]:
# Independent copy of df; the monthly relabelling is applied to this copy.
df_m = df.copy()

In [None]:
# Replace each timestamp with its 'MM-YYYY' month label. The vectorised .dt
# accessor avoids calling a Python lambda once per row.
df_m["date"] = df_m["date"].dt.strftime('%m-%Y')
df_m

In [None]:
# Per month and item: sum of the price column and of the units sold.
df_m.groupby(["date", "item_id"])[["item_price", "item_cnt_day"]].sum()

In [None]:
# First rows of the month-labelled frame.
df_m.head()

In [None]:
# Units sold per (month, item, shop). Only item_cnt_day is aggregated; summing
# every column first would also process the text columns for no benefit and
# can fail on recent pandas versions.
df_n = (
    df_m.groupby(["date", "item_id", "shop_id"], as_index=False)["item_cnt_day"]
    .sum()
)
df_n

In [None]:
# Clamp the monthly counts to [0, 20]. Assign the result back explicitly:
# an in-place clip on a column selection is not guaranteed to update df_n
# (pandas Copy-on-Write), which would silently leave the values unclipped.
df_n["item_cnt_day"] = df_n["item_cnt_day"].clip(0., 20.)
df_n

In [None]:
# Wide layout: one row per (item, shop), one column per month label,
# with 0 where a pair has no entry for a month.
df_n = pd.pivot_table(
    df_n,
    values='item_cnt_day',
    index=['item_id', 'shop_id'],
    columns='date',
    fill_value=0,
).reset_index()
df_n

In [None]:
# Attach the monthly sales history to every test (item, shop) pair; pairs
# without history get zeros. The id columns are then removed so that only
# the month columns remain.
df_test = test.merge(df_n, how='left', on=['item_id', 'shop_id']).fillna(0)
df_test = df_test.drop(columns=['ID', 'item_id', 'shop_id'])
df_test

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scaler that maps values linearly into the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Fit the scaler on the price column (as an n x 1 array) and scale it.
# NOTE(review): item_price is not referenced again in the visible part of
# the notebook — the price features built below use the unscaled values.
prices_2d = df_m[["item_price"]].to_numpy()
item_price = scaler.fit_transform(prices_2d)
item_price

In [None]:
# Mean price per (month, item, shop). Only item_price is averaged: taking the
# mean of every column would hit the text columns, which recent pandas
# versions reject with a TypeError.
df2 = (
    df_m.groupby(["date", "item_id", "shop_id"], as_index=False)["item_price"]
    .mean()
)
# Wide layout: one row per (item, shop), one column per month label,
# with 0 where a pair has no price for a month.
df2 = df2.pivot_table(index=['item_id','shop_id'], columns='date',values='item_price',fill_value=0).reset_index()
df2

In [None]:
# Attach the monthly price history to every test (item, shop) pair; pairs
# without history get zeros. The id columns are then removed so that only
# the month columns remain.
price = test.merge(df2, how='left', on=['item_id', 'shop_id']).fillna(0)
price = price.drop(columns=['ID', 'item_id', 'shop_id'])
price

In [None]:
# Display the per-pair monthly sales matrix.
df_test

In [None]:
def _chronological(frame):
    """Return *frame* with its 'MM-YYYY' month columns in calendar order."""
    ordered = sorted(
        frame.columns,
        key=lambda label: pd.to_datetime(label, format="%m-%Y"),
    )
    return frame[ordered]


# The month labels are 'MM-YYYY' strings, so the pivoted columns are sorted as
# text ("01-2013", "01-2014", "01-2015", "02-2013", ...), not by calendar date.
# Put both frames into chronological order so the time axis fed to the
# sequence model is a real timeline. The reordered frames are rebound here
# so the cell that builds the test arrays uses the same order.
df_test = _chronological(df_test)
price = _chronological(price)

# Target: sales of the final month; inputs: every other month.
y_train = df_test["10-2015"]
x_sales = df_test.drop(labels=["10-2015"], axis=1)
x_sales = x_sales.values.reshape((x_sales.shape[0], x_sales.shape[1], 1))
x_prices = price.drop(labels=["10-2015"], axis=1)
x_prices = x_prices.values.reshape((x_prices.shape[0], x_prices.shape[1], 1))
# Stack sales and price as two features per time step: (pairs, months, 2).
X = np.concatenate((x_sales, x_prices), axis=2)
y = y_train.values.reshape((y_train.shape[0], 1))

In [None]:
# Shapes of the feature tensor and the target array.
X.shape, y.shape

In [None]:
# Inference window: drop the month labelled "01-2013" so the window keeps the
# same number of time steps as the training input.
df_test = df_test.drop(columns=["01-2013"])
x_test_sales = np.expand_dims(df_test.to_numpy(), axis=2)
x_test_prices = price.drop(columns=["01-2013"])
x_test_prices = np.expand_dims(x_test_prices.to_numpy(), axis=2)
# From here on df_test is the stacked (pairs, months, 2) array, not a frame.
df_test = np.concatenate((x_test_sales, x_test_prices), axis=2)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out the last 10% of the rows for validation; no shuffling, so the
# split follows the existing row order.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    shuffle=False,
    random_state=1,
    test_size=0.10,
)

In [None]:
import tensorflow as tf
print(tf.__version__)

# Turn on memory growth for every physical GPU that TensorFlow can see.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The growth setting has to be identical on all GPUs.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Raised when the GPUs were already initialised before this call.
        print(err)

In [None]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, Bidirectional, TimeDistributed, ConvLSTM2D
from keras.optimizers import SGD
from keras.losses import Huber

# Sequence model: causal 1-D convolution -> bidirectional LSTM -> dense layers
# applied per time step. The network emits one value for every time step.
model = Sequential()
model.add(
    Conv1D(
        filters=32,
        kernel_size=5,
        strides=1,
        padding="causal",
        activation="relu",
        input_shape=[X_train.shape[1], X_train.shape[2]],
    )
)
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.3))
model.add(TimeDistributed(Dense(64, activation="relu")))
model.add(Dropout(0.3))
model.add(TimeDistributed(Dense(32, activation="relu")))
model.add(TimeDistributed(Dense(1)))

# Train on Huber loss with Adam and report MSE as an additional metric.
model.compile(optimizer="adam", loss=Huber(), metrics=["mse"])
model.summary()

In [None]:
# Import plot_model from the public keras.utils namespace; the private
# keras.utils.vis_utils module is not available in newer Keras releases.
from keras.utils import plot_model

# Write a diagram of the network (layer names and tensor shapes) to disk.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
%%time
with tf.device('/device:GPU:0'):
    hist = model.fit(X_train, y_train,validation_data=(X_valid, y_valid), 
                     batch_size=256, 
                     verbose=1, epochs=300)

In [None]:
# Predict on the test windows (df_test is the stacked sales/price array here).
y_sub = model.predict(df_test)

In [None]:
# Predict on the validation split.
y_pred = model.predict(X_valid)

In [None]:
# Shape of the validation targets.
y_valid.shape

In [None]:
from sklearn.metrics import mean_squared_error

# The network outputs one prediction per time step. Score the last step —
# the same step that is written to the submission file — rather than the
# first one, so the validation RMSE describes the values actually submitted.
rmse = np.sqrt(mean_squared_error(y_valid, y_pred[:, -1]))
rmse

In [None]:
# Display the sample submission for reference.
sample_sub

In [None]:
# Shapes of the validation targets, validation predictions and test predictions.
y_valid.shape, y_pred.shape, y_sub.shape

In [None]:
# Training vs. validation loss per epoch. The model is compiled with Huber
# loss, so the y-axis is labelled accordingly (the curves are not MSE).
plt.plot(hist.history['loss'], label='train')
plt.plot(hist.history['val_loss'], label='validation')
plt.xlabel("Epochs")
plt.ylabel("Huber Loss")
plt.legend()
plt.show()

In [None]:
# Use the prediction of the last time step as the monthly forecast. The
# training targets were clamped to [0, 20], so clamp the predictions to the
# same range; values outside it can only add error.
# NOTE(review): the competition is understood to clip targets to [0, 20] as
# well — confirm against the evaluation rules.
monthly_pred = np.clip(y_sub[:, -1], 0, 20)
submission = pd.DataFrame(monthly_pred, columns=['item_cnt_month'])
submission.to_csv('submission.csv', index_label='ID')

In [None]:
# Display the submission frame.
submission