In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file under the Kaggle input directory, then print one per line.
input_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Demand Forecasting
### İş Problemi
Bir mağaza zinciri, 10 farklı mağazası ve 50 farklı ürünü için 3 aylık bir talep tahmini istemektedir.

### Veri Seti
Bir mağaza zincirinin 5 yıllık verilerinde 10 farklı mağazası ve 50 farklı ürünün bilgileri yer almaktadır.

### Değişkenler

<table>
  <tr >
    <th>Değişken</th>
    <th>Açıklama</th> 
  </tr>
    <tr>
    <td>date</td>
    <td>Satış verilerinin tarihi</td> 
  </tr>
  
  <tr>
    <td>Store</td>
    <td>Mağaza ID'si</td> 
  </tr>
  <tr>
    <td>Item</td>
    <td>Ürün ID'si</td> 
  </tr>
  <tr>
    <td>Sales</td>
    <td>Satılan ürün sayıları</td> 
  </tr>
</table>

### Projede yapılanlar :

▪ Random Noise <br>
▪ Lag/Shifted Features<br>
▪ Rolling Mean Features<br>
▪ Exponentially Weighted Mean Features <br>
kullanarak zaman serileri için trend, seasonality gibi değişkenler eklenmiştir.<br>
▪ Custom Cost Function (SMAPE)<br>
Lgbm için custom cost fonksiyon tanımlanmıştır.<br>
▪ LightGBM ile Model Validasyonu<br>


In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV

# Show every column and use a wide console so printed frames are not truncated or wrapped.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
# NOTE(review): this silences ALL warnings, including library deprecation warnings —
# consider narrowing the filter so API removals are noticed before they break the notebook.
warnings.filterwarnings('ignore')

### VERİ İNCELEME

In [None]:
# Competition files; `date` is parsed to datetime on load.
train, test = (
    pd.read_csv(path, parse_dates=['date'])
    for path in ('/kaggle/input/demand-forecasting-kernels-only/train.csv',
                 '/kaggle/input/demand-forecasting-kernels-only/test.csv')
)
sample_sub = pd.read_csv('/kaggle/input/demand-forecasting-kernels-only/sample_submission.csv')
# Stack train and test so that features are engineered over one continuous frame.
df = pd.concat([train, test], sort=False)

# External USD exchange-rate table; only the date and the rate column are kept.
usd_kur = pd.read_csv('/kaggle/input/usd-information/usd.csv', parse_dates=['date'])
usd_kur.info()
usd_kur = usd_kur[["date", "dolar_kuru"]]
# Left-join the rate onto each frame (the join key is whatever columns the frames share).
df, train, test = (frame.merge(usd_kur, how="left") for frame in (df, train, test))

In [None]:
# Preview the first rows of the combined train+test frame after the exchange-rate merge.
df.head()

In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of a DataFrame.

    Prints, in order: shape, column dtypes, the first and last ``head`` rows,
    per-column missing-value counts, and selected quantiles of the numeric
    columns.

    Args:
        dataframe: The pandas DataFrame to inspect.
        head: Number of rows shown in the head and tail previews.

    Returns:
        None. Everything is written to stdout.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # Restrict to numeric columns explicitly: since pandas 2.0 `numeric_only`
    # defaults to False, so object/datetime columns would raise a TypeError.
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

In [None]:
# Print the structural overview (shape, dtypes, head/tail, NA counts, quantiles) of the combined frame.
check_df(df)

Dolar kuru object olarak görünmekte. Bunu float tipine çevirmek gerekli.

In [None]:
# The rate column is loaded as text; swap ',' for '.' (presumably a decimal comma) and cast to float64.
df["dolar_kuru"] = df["dolar_kuru"].str.replace(',', '.').astype("float64")

In [None]:
# Confirm the dtypes after the conversion above.
df.info()

In [None]:
# Report the covered date span and the range of the exchange rate.
date_min, date_max = df["date"].min(), df["date"].max()
print(f' Min Date : {date_min}, Max Date: {date_max}')

rate_min, rate_max = df["dolar_kuru"].min(), df["dolar_kuru"].max()
print(f' Min Dolar Kuru : {rate_min}, Max Dolar Kuru: {rate_max}')

### Satış dağılımı nasıl?


In [None]:
# Summary statistics of sales with extra percentiles to inspect the upper tail.
df["sales"].describe([0.10, 0.30, 0.50, 0.70, 0.80, 0.90, 0.95, 0.99])

### Kaç store var?


In [None]:
# Number of distinct stores.
df[["store"]].nunique()

### Kaç item var?


In [None]:
# Number of distinct items.
df[["item"]].nunique()

### Her store'da eşit sayıda mı eşsiz item var?

In [None]:
# Distinct item count per store.
df.groupby(["store"])["item"].nunique()

### Peki her store'da eşit sayıda mı sales var?

In [None]:
# Total sales for every (store, item) pair.
df.groupby(["store", "item"]).agg({"sales": ["sum"]})


### Mağaza-item kırılımında satış istatistikleri nasıl?


In [None]:
# Sum, mean, median and standard deviation of sales for every (store, item) pair.
df.groupby(["store", "item"]).agg({"sales": ["sum", "mean", "median", "std"]})

### ÖZELLİK MÜHENDİSLİĞİ

#### 1. TARİHTEN ÖZELLİKLER OLUŞTURULMASI

In [None]:
def create_date_features(df,date_column):
    """Add calendar-derived feature columns to ``df`` and return it.

    The frame is modified in place (new columns are assigned onto ``df``) and
    the same object is returned for convenience.

    Args:
        df: DataFrame containing a datetime64 column.
        date_column: Name of the datetime column to derive features from.

    Returns:
        The same DataFrame with these added columns: month, day_of_month,
        day_of_year, week_of_year, day_of_week, year, is_wknd, quarter and the
        0/1 flags is_month_start/end, is_quarter_start/end, is_year_start/end.
    """
    dates = df[date_column].dt
    df['month'] = dates.month
    df['day_of_month'] = dates.day
    df['day_of_year'] = dates.dayofyear
    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` yields the same ISO week number (cast from UInt32 to int).
    df['week_of_year'] = dates.isocalendar().week.astype(int)
    df['day_of_week'] = dates.dayofweek
    df['year'] = dates.year
    # weekday is 0=Monday..6=Sunday, so `// 4` yields 1 for Friday, Saturday and Sunday.
    df["is_wknd"] = dates.weekday // 4
    df['is_month_start'] = dates.is_month_start.astype(int)
    df['is_month_end'] = dates.is_month_end.astype(int)
    df['quarter'] = dates.quarter
    df['is_quarter_start'] = dates.is_quarter_start.astype(int)
    df['is_quarter_end'] = dates.is_quarter_end.astype(int)
    df['is_year_start'] = dates.is_year_start.astype(int)
    df['is_year_end'] = dates.is_year_end.astype(int)
    return df

# Derive the calendar features from the `date` column (the function also mutates `df` in place).
df = create_date_features(df,"date")

In [None]:
# Preview the frame with the new calendar columns.
df.head()

#### 2. LAG, Rolling Mean, Exponentially Weighted Mean Özelliklerinin Oluşturulması

In [None]:
##########################################################
# Random Noise: Oluşturulacak zaman serisi özelliklerine 
# rastgele gürültüler eklenecektir.
#########################################################

def random_noise(dataframe):
    """Return one Gaussian noise value (mean 0, std 1.6) per row of ``dataframe``.

    The draw uses NumPy's global random state, so results are only
    reproducible if the caller seeds ``np.random`` beforehand.
    """
    n_rows = len(dataframe)
    return np.random.normal(loc=0.0, scale=1.6, size=n_rows)

In [None]:
####################################################
# Lag Features : Mevsimselliği yakalamak için 
# Lag değişkenleri oluşturulacaktır.
####################################################

def lag_features(dataframe, lags):
    """Add noisy lagged-sales columns, computed separately per (store, item) series.

    For each value in ``lags`` a column ``sales_lag_<lag>`` is written onto
    ``dataframe``: the ``sales`` value ``lag`` rows earlier within the same
    (store, item) group, plus noise from ``random_noise``. Rows without enough
    history are NaN.

    Args:
        dataframe: Frame with ``store``, ``item`` and ``sales`` columns.
        lags: Iterable of row offsets to shift by.

    Returns:
        The same frame, modified in place.
    """
    sales_by_series = dataframe.groupby(["store", "item"])['sales']
    for lag in lags:
        shifted = sales_by_series.transform(lambda series: series.shift(lag))
        dataframe[f"sales_lag_{lag}"] = shifted + random_noise(dataframe)
    return dataframe

In [None]:
# Every lag is at least 91 rows — presumably so the features stay available across the
# 3-month forecast horizon (verify the rows are daily and date-ordered within each series).
df = lag_features(df, [91, 98, 105, 112, 119, 126, 182, 364, 546, 728])
df.head()

In [None]:
#################################################
# Rolling Mean Features : Trendi yakalamak için 
# hareketli ortalama değişkenleri oluşturulacaktır.
#################################################

def roll_mean_features(dataframe, windows):
    """Add noisy triangular-weighted rolling means of past sales per (store, item).

    For each ``window`` a column ``sales_roll_mean_<window>`` is written onto
    ``dataframe``. The series is shifted by one row first, so the current
    row's own sales never enters its window. At least 10 observations are
    required, otherwise the value is NaN. Noise from ``random_noise`` is added.

    Args:
        dataframe: Frame with ``store``, ``item`` and ``sales`` columns.
        windows: Iterable of rolling window sizes (in rows).

    Returns:
        The same frame, modified in place.
    """
    sales_by_series = dataframe.groupby(["store", "item"])['sales']
    for window in windows:
        def _lagged_triangular_mean(series, width=window):
            # Shift by one row so only strictly earlier sales are averaged.
            past_sales = series.shift(1)
            return past_sales.rolling(window=width, min_periods=10, win_type="triang").mean()

        rolling_mean = sales_by_series.transform(_lagged_triangular_mean)
        dataframe[f"sales_roll_mean_{window}"] = rolling_mean + random_noise(dataframe)
    return dataframe

In [None]:
# Rolling means over 365- and 546-row windows; the tail shows the values on the last rows of the frame.
df = roll_mean_features(df, [365, 546])
df.tail()

In [None]:
#########################################
# Exponentially Weighted Mean Features
#########################################
def ewm_features(dataframe, alphas, lags):
    """Add noisy exponentially weighted means of lagged sales per (store, item).

    For every (alpha, lag) pair a column named
    ``sales_ewm_alpha_<alpha without the dot>_lag_<lag>`` is written onto
    ``dataframe``: sales shifted by ``lag`` rows within the group, smoothed
    with ``ewm(alpha=alpha).mean()``, plus noise from ``random_noise``.

    Args:
        dataframe: Frame with ``store``, ``item`` and ``sales`` columns.
        alphas: Iterable of smoothing factors.
        lags: Iterable of row offsets applied before smoothing.

    Returns:
        The same frame, modified in place.
    """
    sales_by_series = dataframe.groupby(["store", "item"])['sales']
    for alpha in alphas:
        # e.g. 0.95 -> "095", so the column name contains no dot.
        alpha_tag = str(alpha).replace(".", "")
        for lag in lags:
            column = f"sales_ewm_alpha_{alpha_tag}_lag_{lag}"
            smoothed = sales_by_series.transform(
                lambda series: series.shift(lag).ewm(alpha=alpha).mean())
            dataframe[column] = smoothed + random_noise(dataframe)
    return dataframe

In [None]:
# Smoothing factors and lags for the EWM features; one column is created per (alpha, lag) pair.
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [91, 98, 105, 112, 180, 270, 365, 546, 728]

df = ewm_features(df, alphas, lags)
df.tail()

#### VERİ ÖN İŞLEME

In [None]:
########################
# One-Hot Encoding
########################

# Replace the categorical id/calendar columns with one indicator column per distinct value.
df = pd.get_dummies(df, columns=['store', 'item', 'day_of_week', 'month'])

In [None]:
########################
# Converting sales to log(1+sales)
########################

# The target is modelled on the log(1 + sales) scale; predictions are mapped back with np.expm1 later.
# NOTE(review): this cell is not idempotent — re-running it applies the transform twice.
df['sales'] = np.log1p(df["sales"].values)

In [None]:
# Preview the frame after encoding and the target transform.
df.head()

### MODEL

#### Custom Cost Fonksiyonu : 
#### SMAPE (Symmetric Mean Absolute Percentage Error, adjusted MAPE)

In [None]:
def smape(preds, target):
    """Compute the symmetric mean absolute percentage error, in percent (0-200).

    Pairs where both prediction and target are exactly zero are excluded from
    the sum (they would be 0/0) but still count in the denominator ``n``, so
    they contribute an error of zero.

    Inputs are converted to NumPy arrays first, so comparison is strictly
    positional: pandas Series with different indexes are not re-aligned by
    label, and plain Python lists are accepted.

    Args:
        preds: Array-like of predicted values.
        target: Array-like of true values, same length as ``preds``.

    Returns:
        The SMAPE value as a float; NaN when the inputs are empty.
    """
    preds = np.asarray(preds, dtype=float)
    target = np.asarray(target, dtype=float)
    n = len(preds)
    if n == 0:
        # Nothing to average over; make the NaN explicit instead of relying on 0/0.
        return np.nan
    keep = ~((preds == 0) & (target == 0))
    preds, target = preds[keep], target[keep]
    num = np.abs(preds - target)
    denom = np.abs(preds) + np.abs(target)
    smape_val = (200 * np.sum(num / denom)) / n
    return smape_val

smape fonksiyonunu lgbm'de kullanabilecek hale getirelim.

In [None]:
def lgbm_smape(preds, train_data):
    """Custom LightGBM evaluation function reporting SMAPE on the original sales scale.

    Both predictions and labels are mapped back with ``np.expm1`` before
    scoring, since the model is trained on ``log1p(sales)``.

    Args:
        preds: Model predictions on the log1p scale.
        train_data: Object exposing ``get_label()`` for the true labels.

    Returns:
        A ``('SMAPE', value, False)`` tuple; the final ``False`` marks the
        metric as lower-is-better.
    """
    actual_sales = np.expm1(train_data.get_label())
    predicted_sales = np.expm1(preds)
    return 'SMAPE', smape(predicted_sales, actual_sales), False

Eğitim ve validasyon setlerini oluşturalım.

In [None]:
# Time-based split: everything before 2017 trains, the first three months of 2017 validate.
in_train_period = df["date"] < "2017-01-01"
in_val_period = (df["date"] >= "2017-01-01") & (df["date"] < "2017-04-01")

train = df.loc[in_train_period, :]
val = df.loc[in_val_period, :]

# Identifier, raw date, target and year are not used as model inputs.
excluded = ['date', 'id', "sales", "year"]
cols = [col for col in train.columns if col not in excluded]

X_train, Y_train = train[cols], train['sales']
X_val, Y_val = val[cols], val['sales']

Y_train.shape, X_train.shape, Y_val.shape, X_val.shape

In [None]:
######################################################
# Base Models
######################################################

# NOTE(review): lightgbm is already imported as `lgb` at the top; this second alias is redundant.
import lightgbm as lgbm

# Baseline: default-parameter regressor fitted on the training period, scored on validation.
lgbm_model = lgbm.LGBMRegressor()
lgbm_model.fit(X_train, Y_train)
y_pred = lgbm_model.predict(X_val)

In [None]:
# Both metrics are computed on the original sales scale (log1p undone with expm1).
val_pred_sales = np.expm1(y_pred)
val_true_sales = np.expm1(Y_val)

# Validation RMSE
val_rmse = np.sqrt(mean_squared_error(val_pred_sales, val_true_sales))
print(f' Validasyon rmse: {val_rmse}')
# Validation SMAPE
val_smape = smape(val_pred_sales, val_true_sales)
print(f' Validasyon smpe: {val_smape}')

In [None]:
############################################
# LightGBM Model : LGBM'in daha 
# çok özelliğini kullanabilmek için 
# ve özel cost fonksiyonunu kullanabilmek için
# bu şekilde kullanacağız. Fit 
# ile değil train ile eğiteceğiz.
############################################

# LightGBM parameters for the validation run.
# 'num_boost_round' and 'early_stopping_rounds' are training-loop controls stored here
# so the training cell can read them back from this dict.
lgb_params = {'metric': {'mae'},
              'num_leaves': 10,
              'learning_rate': 0.01,
              'feature_fraction': 1,
              'max_depth': None,
              'verbose': 0,
              'num_boost_round': 10000,
              'early_stopping_rounds': 200,
              'nthread': -1}

In [None]:
# Wrap the splits as LightGBM Datasets; the validation set references the training set.
lgbtrain = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)
lgbval = lgb.Dataset(data=X_val, label=Y_val, reference=lgbtrain, feature_name=cols)

In [None]:
# Training-loop controls are passed explicitly below, so strip them from the booster
# params to avoid LightGBM's "found in params" alias handling and duplicate early stopping.
train_params = {key: value for key, value in lgb_params.items()
                if key not in ('num_boost_round', 'early_stopping_rounds')}

# The `early_stopping_rounds=` and `verbose_eval=` keyword arguments were removed from
# `lgb.train` in LightGBM 4.0; the callbacks below are the supported equivalent.
model = lgb.train(train_params, lgbtrain,
                  valid_sets=[lgbtrain, lgbval],
                  num_boost_round=lgb_params['num_boost_round'],
                  feval=lgbm_smape,
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=lgb_params['early_stopping_rounds']),
                      lgb.log_evaluation(period=100),
                  ])

# Predict the validation period with the best iteration found by early stopping.
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

In [None]:
# Validation SMAPE of the tuned model on the original sales scale.
smape(np.expm1(y_pred_val), np.expm1(Y_val))

In [None]:
########################
# Değişken önem düzeyleri
########################

def plot_lgb_importances(model, plot=False, num=10):
    """Show the ``num`` most important features of a trained LightGBM model.

    Features are ranked by gain, expressed as a percentage of the total gain;
    the split count is included alongside it.

    Args:
        model: Object exposing ``feature_importance(kind)`` and
            ``feature_name()`` (e.g. a LightGBM Booster).
        plot: If True, draw a horizontal bar chart of the gain percentages;
            otherwise print the table.
        num: Number of top features to show in either mode.

    Returns:
        None.
    """
    gain = model.feature_importance('gain')
    feat_imp = pd.DataFrame({'feature': model.feature_name(),
                             'split': model.feature_importance('split'),
                             'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
    # Honour `num` in both modes (the plot previously always showed 25 rows).
    top_features = feat_imp.head(num)
    if plot:
        plt.figure(figsize=(10, 10))
        sns.set(font_scale=1)
        sns.barplot(x="gain", y="feature", data=top_features)
        plt.title('feature')
        plt.tight_layout()
        plt.show()
    else:
        print(top_features)

In [None]:
# Plot the gain-based feature importances of the validation model.
plot_lgb_importances(model, num=30, plot=True)

Oluşturduğumuz değişkenlerden en önemli değişken sales_roll_mean_546 olarak görülmektedir.

## Final Model

En iyi iterasyon : 10000'dir. Bunu kullanarak final modeli kuralım.


In [None]:
# Missing values per column; rows with missing `sales` are selected as the test set in the next cell.
df.isna().sum()


In [None]:
# Rows without a sales value are the ones to forecast; show how many there are.
test = df.loc[df.sales.isna()]
len(test)

In [None]:
# Final fit uses every labelled row; the unlabelled rows are the ones to forecast.
has_label = df["sales"].notna()

train = df.loc[has_label]
X_train, Y_train = train[cols], train['sales']

test = df.loc[~has_label]
X_test = test[cols]

In [None]:
# Same settings as the validation run, without early stopping; the number of rounds is
# fixed to the best iteration of the validation model.
# NOTE(review): this reads `model.best_iteration`, and `model` is rebound by the final
# training cell — re-running this cell afterwards would pick up the final model's value.
lgb_params = {'metric': {'mae'},
              'num_leaves': 10,
              'learning_rate': 0.01,
              'feature_fraction': 1,
              'max_depth': None,
              'verbose': 0,
              'nthread': -1,
              "num_boost_round": model.best_iteration}

In [None]:
# LightGBM dataset built from all labelled rows.
lgbtrain_all = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)

# Refit on all labelled rows with no validation set.
# NOTE(review): this rebinds `model`, discarding the validation model — a distinct name
# (e.g. `final_model`) would keep the cell safely re-runnable.
model = lgb.train(lgb_params, lgbtrain_all, num_boost_round=model.best_iteration)

In [None]:
# Predict the test rows (still on the log1p scale).
# NOTE(review): `model` was trained without early stopping here, so confirm what
# `best_iteration` holds; presumably all trees are used.
test_preds = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# Take an explicit copy so the assignments below never write onto a slice of `df`.
submission_df = test.loc[:, ['id', 'sales']].copy()
# Map predictions back from the log1p scale to sales units.
submission_df['sales'] = np.expm1(test_preds)
submission_df['id'] = submission_df.id.astype(int)
submission_df.to_csv('submission.csv', index=False)
# Kept as the last expression so the preview is actually displayed by the notebook.
submission_df.head()