In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings

# Display options: show every column and use a wide console when printing frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
# NOTE: this silences every warning for the rest of the session,
# including deprecation notices raised by pandas and LightGBM.
warnings.filterwarnings('ignore')

# Raw competition files.
train = pd.read_csv("../input/demand-forecasting-kernels-only/train.csv")
test = pd.read_csv("../input/demand-forecasting-kernels-only/test.csv")
sample = pd.read_csv("../input/demand-forecasting-kernels-only/sample_submission.csv")

# Stack train and test so that features can be computed over both at once.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of `train` and `test` overlap and label-based operations on
# `df` become ambiguous.
df = pd.concat([train, test], sort=False, ignore_index=True)
# Explicit item assignment instead of attribute assignment (`df.date = ...`),
# which only works when the column already exists.
df["date"] = pd.to_datetime(df["date"])


In [None]:

def degisken_tiplerine_ayirma(data, cat_th, car_th):
    """
    Split the columns of ``data`` into categorical, numeric and cardinal groups
    and print a short summary of the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset whose columns are classified.
    cat_th : int
        A non-object column with fewer than ``cat_th`` unique values is
        treated as categorical.
    car_th : int
        An object column with more than ``car_th`` unique values is treated
        as cardinal.

    Returns
    -------
    cat_deg : list
        Object columns that are not cardinal, followed by the non-object
        columns that fell under ``cat_th``.
    num_deg : list
        Remaining non-object columns.
    car_deg : list
        Object columns with more than ``car_th`` unique values.

    Examples
    --------
    cat, num, car = degisken_tiplerine_ayirma(df, 10, 20)

    Notes
    -----
    Every column ends up in exactly one of the three lists, so
    len(cat_deg) + len(num_deg) + len(car_deg) equals the number of columns.
    """
    # First separate the columns by dtype, keeping the original column order.
    object_cols, other_cols = [], []
    for col in data.columns:
        bucket = object_cols if data[col].dtypes == "O" else other_cols
        bucket.append(col)

    # Then apply the cardinality thresholds inside each dtype group.
    num_but_cat = [col for col in other_cols if data[col].nunique() < cat_th]
    car_deg = [col for col in object_cols if data[col].nunique() > car_th]
    num_deg = [col for col in other_cols if col not in num_but_cat]
    cat_deg = [col for col in object_cols if col not in car_deg] + num_but_cat

    n_rows, n_cols = data.shape
    separator = "********************************************"
    print(f"Dataset kolon/değişken sayısı: {n_cols}")
    print(f"Dataset satır/veri sayısı: {n_rows}")
    print(separator)
    print(f"Datasetin numeric değişken sayısı: {len(num_deg)}")
    print(f"Datasetin numeric değişkenler: {num_deg}")
    print(separator)
    print(f"Datasetin categoric değişken sayısı: {len(cat_deg)}")
    print(f"Datasetin categoric değişkenler: {cat_deg}")
    print(separator)
    print(f"Datasetin cardinal değişken sayısı: {len(car_deg)}")
    print(f"Datasetin cardinal değişkenler: {car_deg}")
    print(separator)

    return cat_deg, num_deg, car_deg


def dataset_ozet(data, head=5):
    """
    Print a quick overview of a DataFrame.

    The report contains, in order: shape, dtypes, the first and last ``head``
    rows, null counts per column, selected quantiles and ``describe()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to summarise.
    head : int, default 5
        Number of rows shown in the head and tail sections.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print("##################### Shape #####################")
    print(f"Satır sayısı: {data.shape[0]}")
    print(f"Kolon sayısı: {data.shape[1]}")

    print("##################### Types #####################")
    print(data.dtypes)

    print("##################### Head #####################")
    print(data.head(head))

    print("##################### Tail #####################")
    print(data.tail(head))

    print("##################### NA Kontrolü #####################")
    print(data.isnull().sum())

    print("##################### Quantiles #####################")
    # numeric_only=True restricts the computation to numeric columns. Since
    # pandas 2.0 the default is False, which makes quantile() raise on frames
    # that contain string/object columns.
    print(data.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

    print("##################### Describe Tablosu #####################")
    print(data.describe().T)


def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """
    Compute IQR-based outlier limits for one column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    col_name : str
        Column to analyse.
    q1, q3 : float, default 0.25 and 0.75
        Lower and upper quantiles that define the inter-quantile range.

    Returns
    -------
    (low_limit, up_limit) : tuple
        ``quantile(q1) - 1.5 * range`` and ``quantile(q3) + 1.5 * range``.
    """
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name, q1=0.25, q3=0.75):
    """
    Report whether a column holds any value outside its outlier limits.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    col_name : str
        Column to check.
    q1, q3 : float, default 0.25 and 0.75
        Quantiles forwarded to ``outlier_thresholds``.

    Returns
    -------
    bool
        True when at least one value is below the lower or above the upper limit.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name, q1, q3)
    values = dataframe[col_name]
    # Test the row mask itself. Checking `.any()` on the filtered frame asks
    # whether the outlier rows contain a truthy value, so an outlier equal to 0
    # (with other falsy cells) would be reported as "no outlier".
    is_outlier = (values > up_limit) | (values < low_limit)
    return bool(is_outlier.any())


def categoric_ozet(data, degisken, plot=False, null_control=False):
    """
    Print and return the frequency table of a categorical column.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset that contains the column.
    degisken : str
        Name of the categorical column.
    plot : bool, default False
        When True, also draw a seaborn count plot of the column.
    null_control : bool, default False
        When True, also print the number of null values in the column.

    Returns
    -------
    tablo : pandas.DataFrame
        One row per distinct value with its count (column named after
        ``degisken``) and its share of all rows in percent (``"Ratio"``).

    Examples
    --------
    cat_deg, num_deg, car_deg = degisken_tiplerine_ayirma(df, 10, 20)
    for i in cat_deg:
        tablo = categoric_ozet(df, i, True, True)
    """
    counts = data[degisken].value_counts()
    # The ratio is relative to the total row count, nulls included.
    tablo = pd.DataFrame({degisken: counts,
                          "Ratio": 100 * counts / len(data)})
    print(tablo)
    print("##########################################")

    if plot:
        sns.countplot(x=data[degisken], data=data)
        plt.show()

    if null_control:
        print(f"Null veri sayısı: {data[degisken].isnull().sum()}")

    return tablo


def numeric_ozet(data, degisken, plot=False, null_control=False):
    """
    Print descriptive statistics for a numeric column.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset that contains the column.
    degisken : str
        Name of the numeric column.
    plot : bool, default False
        When True, also draw a 20-bin histogram of the column.
    null_control : bool, default False
        When True, also print the number of null values in the column.

    Returns
    -------
    None
        The statistics are written to stdout.

    Examples
    --------
    cat_deg, num_deg, car_deg = degisken_tiplerine_ayirma(df, 10, 20)
    for i in num_deg:
        numeric_ozet(df, i, plot=True, null_control=True)
    """
    column = data[degisken]
    percentiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    print(column.describe(percentiles).T)

    if plot:
        column.hist(bins=20)
        plt.xlabel(degisken)
        plt.title(degisken)
        plt.show(block=True)
    print("##########################################")

    if null_control:
        print(f"Null veri sayısı: {column.isnull().sum()}")



def target_analyser(dataframe, target, num_deg, cat_deg):
    """
    Print how the target relates to each categorical and numeric column.

    For every column listed in ``cat_deg`` the number of distinct values is
    printed, followed by a table with the count, the share of rows and the mean
    of ``target`` per category. For every column listed in ``num_deg`` the mean
    of that column per distinct ``target`` value is printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    target : str
        Name of the target column.
    num_deg : list
        Numeric column names.
    cat_deg : list
        Categorical column names.
    """
    total_rows = len(dataframe)
    for column in dataframe.columns:
        if column in cat_deg:
            counts = dataframe[column].value_counts()
            print(column, ":", len(counts))
            category_table = pd.DataFrame({
                "COUNT": counts,
                "RATIO": counts / total_rows,
                "TARGET_MEAN": dataframe.groupby(column)[target].mean(),
            })
            print(category_table, end="\n\n\n")
        if column in num_deg:
            numeric_table = pd.DataFrame({
                "TARGET_MEAN": dataframe.groupby(target)[column].mean()})
            print(numeric_table, end="\n\n\n")


def outlier_threshold(data, degisken):
    """
    Compute the 1.5 * IQR outlier limits of one column.

    Parameters
    ----------
    data : pandas.DataFrame
    degisken : str
        Column to analyse.

    Returns
    -------
    (alt_limit, ust_limit) : tuple
        Lower limit ``Q1 - 1.5 * IQR`` and upper limit ``Q3 + 1.5 * IQR``.
    """
    Q1 = data[degisken].quantile(0.25)
    Q3 = data[degisken].quantile(0.75)
    Q_Inter_Range = Q3 - Q1
    alt_limit = Q1 - 1.5 * Q_Inter_Range
    ust_limit = Q3 + 1.5 * Q_Inter_Range
    return alt_limit, ust_limit


def threshold_degisimi(data, degisken):
    """
    Cap the outliers of one column at its IQR limits.

    Values below the lower limit are raised to it and values above the upper
    limit are lowered to it. Only ``degisken`` is modified; the other columns of
    the affected rows are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Modified in place (the column is replaced) and also returned.
    degisken : str
        Column to cap.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    alt_limit, ust_limit = outlier_threshold(data, degisken)
    # `data[mask] = limit` would overwrite every column of the matching rows;
    # clipping restricts the change to the target column.
    data[degisken] = data[degisken].clip(lower=alt_limit, upper=ust_limit)
    return data


def missing_values_table(dataframe, na_name=False):
    """
    Print the count and percentage of missing values per column.

    Only columns with at least one null are listed, sorted by descending
    number of nulls.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    na_name : bool, default False
        When True, return the names of the columns that contain nulls.

    Returns
    -------
    list or None
        Column names with nulls when ``na_name`` is True, otherwise None.
    """
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().any()]

    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    return na_columns if na_name else None


def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """
    One-hot encode the given columns with ``pandas.get_dummies``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    categorical_cols : list
        Columns to replace with dummy columns.
    drop_first : bool, default True
        Forwarded to ``get_dummies``; drops the first level of each column.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)



def create_date_features(df):
    """
    Add calendar features derived from the ``date`` column.

    New columns: ``month``, ``day_of_month``, ``day_of_year``,
    ``week_of_year`` (ISO week), ``day_of_week`` (Monday=0 ... Sunday=6),
    ``year``, ``is_wknd``, ``is_month_start``, ``is_month_end`` and one 0/1
    flag per weekday (``is_Mon`` ... ``is_Sun``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime64 column named ``date``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the extra columns.
    """
    dates = df["date"].dt

    df['month'] = dates.month
    df['day_of_month'] = dates.day
    df['day_of_year'] = dates.dayofyear
    # `Series.dt.weekofyear` was removed in pandas 2.0; isocalendar().week is
    # the replacement. It returns a nullable UInt32 column, so cast it back to
    # a plain NumPy dtype (float only when missing dates are present).
    iso_week = dates.isocalendar().week
    df['week_of_year'] = iso_week.astype("int64") if iso_week.notna().all() else iso_week.astype("float64")
    df['day_of_week'] = dates.dayofweek
    df['year'] = dates.year
    # weekday // 4 is 1 for Friday, Saturday and Sunday (weekday 4, 5, 6).
    # NOTE(review): Friday is counted as weekend here - confirm this is intended.
    df["is_wknd"] = dates.weekday // 4
    df['is_month_start'] = dates.is_month_start.astype(int)
    df['is_month_end'] = dates.is_month_end.astype(int)

    # dayofweek is 0-based (Monday=0 ... Sunday=6). Comparing against 1..7
    # shifted every flag by one day and left is_Sun permanently 0.
    for day_number, day_name in enumerate(("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")):
        df['is_' + day_name] = np.where(df['day_of_week'] == day_number, 1, 0)
    return df


def lag_features(dataframe, lags):
    """
    Add noisy lagged-sales columns, one per entry of ``lags``.

    For each lag, ``sales`` is shifted by that many rows within every
    (store, item) group, the output of ``random_noise`` is added, and the
    result is stored as ``sales_lag_<lag>``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs ``store``, ``item`` and ``sales`` columns. Modified in place.
        The shift is row-based, so rows are assumed to be in chronological
        order within each group - TODO confirm at the call site.
    lags : iterable of int
        Shift sizes in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    for lag in lags:
        shifted_sales = dataframe.groupby(["store", "item"])['sales'].transform(
            lambda group: group.shift(lag))
        dataframe[f"sales_lag_{lag}"] = shifted_sales + random_noise(dataframe)
    return dataframe


def roll_mean_features(dataframe, windows):
    """
    Add noisy rolling-mean columns of past sales, one per window size.

    Within every (store, item) group, ``sales`` is shifted by one row so the
    current row is excluded, then a triangular-weighted rolling mean over
    ``window`` rows (at least 10 observations required) is taken. The output
    of ``random_noise`` is added and the result is stored as
    ``sales_roll_mean_<window>``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs ``store``, ``item`` and ``sales`` columns. Modified in place.
    windows : iterable of int
        Rolling window sizes in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    for window in windows:
        rolled_sales = dataframe.groupby(["store", "item"])['sales'].transform(
            lambda group: group.shift(1)
                               .rolling(window=window, min_periods=10, win_type="triang")
                               .mean())
        dataframe[f"sales_roll_mean_{window}"] = rolled_sales + random_noise(dataframe)
    return dataframe


def random_noise(dataframe, scale=1.6, random_state=None):
    """
    Draw zero-mean Gaussian noise with one value per row of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Only its length is used.
    scale : float, default 1.6
        Standard deviation of the noise.
    random_state : int, numpy.random.Generator or None, default None
        When None the noise comes from NumPy's global generator (seed it with
        ``np.random.seed`` for reproducible runs). Otherwise the value is passed
        to ``np.random.default_rng`` so the draw is reproducible on its own.

    Returns
    -------
    numpy.ndarray
        1-D array of shape ``(len(dataframe),)``.
    """
    size = (len(dataframe),)
    if random_state is None:
        return np.random.normal(scale=scale, size=size)
    return np.random.default_rng(random_state).normal(scale=scale, size=size)


def ewm_features(dataframe, alphas, lags):
    """
    Add exponentially weighted mean features of lagged sales.

    For every (alpha, lag) pair, ``sales`` is shifted by ``lag`` rows within
    each (store, item) group and smoothed with ``ewm(alpha=alpha).mean()``.
    The column is named ``sales_ewm_alpha_<alpha without dot>_lag_<lag>``,
    e.g. ``sales_ewm_alpha_05_lag_91``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs ``store``, ``item`` and ``sales`` columns. Modified in place.
    alphas : iterable of float
        Smoothing factors.
    lags : iterable of int
        Shift sizes in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    for alpha in alphas:
        alpha_tag = str(alpha).replace(".", "")
        for lag in lags:
            column_name = 'sales_ewm_alpha_' + alpha_tag + "_lag_" + str(lag)
            sales_by_series = dataframe.groupby(["store", "item"])['sales']
            dataframe[column_name] = sales_by_series.transform(
                lambda group: group.shift(lag).ewm(alpha=alpha).mean())
    return dataframe


def smape(preds, target):
    """
    Symmetric mean absolute percentage error, in percent (0-200).

    Pairs where both prediction and target are 0 contribute an error of 0
    but still count in the denominator ``n``.

    Parameters
    ----------
    preds : array-like
        Predicted values.
    target : array-like
        True values, same length as ``preds``.

    Returns
    -------
    float
        ``200 / n * sum(|preds - target| / (|preds| + |target|))``;
        NaN when the inputs are empty.
    """
    # Work on plain arrays so the comparison is positional. With pandas inputs
    # the arithmetic would otherwise align on index labels and silently pair
    # the wrong rows when the two indexes differ.
    preds = np.asarray(preds, dtype=float)
    target = np.asarray(target, dtype=float)

    n = len(preds)
    if n == 0:
        return np.nan

    # Drop 0/0 pairs before dividing to avoid NaN terms.
    keep = ~((preds == 0) & (target == 0))
    preds, target = preds[keep], target[keep]
    num = np.abs(preds - target)
    denom = np.abs(preds) + np.abs(target)
    return (200 * np.sum(num / denom)) / n


def lgbm_smape(preds, train_data):
    """
    Custom LightGBM evaluation function reporting SMAPE on the original scale.

    Both the predictions and the labels read from ``train_data.get_label()``
    are passed through ``np.expm1`` before ``smape`` is computed.

    Parameters
    ----------
    preds : array-like
        Model predictions.
    train_data : object with a ``get_label()`` method
        Dataset being evaluated.

    Returns
    -------
    tuple
        ``('SMAPE', value, False)``; the final ``False`` is the
        higher-is-better flag of LightGBM's ``feval`` convention.
    """
    actual = np.expm1(train_data.get_label())
    predicted = np.expm1(preds)
    return 'SMAPE', smape(predicted, actual), False

In [None]:
# lag_features and roll_mean_features add Gaussian noise drawn from NumPy's
# global generator. Seed it here so that Restart & Run All reproduces the same
# features, validation score and submission.
SEED = 42
np.random.seed(SEED)

# Calendar features derived from the date column.
df = create_date_features(df)

# Lagged and rolling sales features per (store, item) series.
df = lag_features(df, [91, 98, 182, 273, 364, 546])
df = roll_mean_features(df, [182, 365])

# Exponentially weighted means of lagged sales.
# NOTE(review): this list uses 365 where the lag list above uses 364 - confirm
# the difference is intentional.
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [91, 98, 182, 273, 365, 546]
df = ewm_features(df, alphas, lags)

# One-hot encode the low-cardinality columns; "year" is kept out of the encoding.
cat_deg, num_deg, car_deg = degisken_tiplerine_ayirma(df, 10, 20)
cat_deg = [i for i in cat_deg if i != "year"]
df = pd.get_dummies(df, columns=cat_deg)

# Model the target on the log1p scale; predictions are mapped back with expm1.
# NOTE: this cell is not idempotent - running it twice applies log1p twice.
df['sales'] = np.log1p(df["sales"].values)

In [None]:
# Time-based split: everything before 2017 is used for training, the first
# quarter of 2017 is held out for validation.
is_train_period = df["date"] < "2017-01-01"
is_val_period = (df["date"] >= "2017-01-01") & (df["date"] < "2017-04-01")

train = df.loc[is_train_period, :]
val = df.loc[is_val_period, :]

# Every column except identifiers, the raw date, the year and the target is a feature.
non_feature_cols = ['date', 'id', "sales", "year"]
cols = [col for col in train.columns if col not in non_feature_cols]

X_train, Y_train = train[cols], train['sales']
X_val, Y_val = val[cols], val['sales']

Y_train.shape, X_train.shape, Y_val.shape, X_val.shape

In [None]:
# Hyper-parameters for the validation run. 'num_boost_round' and
# 'early_stopping_rounds' are also read back from this dict when lgb.train is called.
lgb_params = dict(
    metric={'mae'},
    num_leaves=8,
    learning_rate=0.03,
    feature_fraction=0.9,
    max_depth=3,
    verbose=0,
    num_boost_round=20000,
    early_stopping_rounds=300,
    nthread=-1,
)

# LightGBM datasets; the validation set references the training set.
lgbtrain = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)
lgbval = lgb.Dataset(data=X_val, label=Y_val, feature_name=cols, reference=lgbtrain)

In [None]:


# The round budget and the early-stopping patience are passed explicitly below,
# so keep them out of the params dict to avoid duplicate settings.
train_params = {key: value for key, value in lgb_params.items()
                if key not in ('num_boost_round', 'early_stopping_rounds')}

# The `early_stopping_rounds=` and `verbose_eval=` keyword arguments of
# lgb.train were removed in LightGBM 4.0; the equivalent callbacks (available
# since LightGBM 3.3) are used instead.
model = lgb.train(train_params, lgbtrain,
                  valid_sets=[lgbtrain, lgbval],
                  num_boost_round=lgb_params['num_boost_round'],
                  feval=lgbm_smape,
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=lgb_params['early_stopping_rounds']),
                      lgb.log_evaluation(period=100),
                  ])

y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

# Validation SMAPE on the original sales scale. It is printed explicitly
# because a bare expression in the middle of a cell is never displayed.
val_smape = smape(np.expm1(y_pred_val), np.expm1(Y_val))
print(f"Validation SMAPE: {val_smape:.4f}")

def plot_lgb_importances(model, plot=False, num=10):
    """
    Show the most important features of a trained LightGBM model.

    Features are ranked by their share of total gain (in percent).

    Parameters
    ----------
    model : object
        Trained booster exposing ``feature_importance(kind)`` and
        ``feature_name()``.
    plot : bool, default False
        When True draw a horizontal bar chart, otherwise print a table with
        the ``feature``, ``split`` and ``gain`` columns.
    num : int, default 10
        Number of top features to show. It applies to both the table and the
        chart.

    Returns
    -------
    None
    """
    gain = model.feature_importance('gain')
    feat_imp = pd.DataFrame({'feature': model.feature_name(),
                             'split': model.feature_importance('split'),
                             'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
    # Honour `num` in both branches; the chart previously always showed a
    # hard-coded 25 features regardless of the argument.
    top_features = feat_imp.head(num)
    if plot:
        plt.figure(figsize=(10, 10))
        sns.set(font_scale=1)
        sns.barplot(x="gain", y="feature", data=top_features)
        plt.title('feature')
        plt.tight_layout()
        plt.show()
    else:
        print(top_features)


# Top-30 features: first as a printed table, then as a bar chart.
plot_lgb_importances(model, plot=False, num=30)
plot_lgb_importances(model, plot=True, num=30)

# LightGBM's built-in gain importance plot for comparison.
lgb.plot_importance(model, importance_type="gain", max_num_features=20, figsize=(10, 10))
plt.show()


In [None]:

# Refit on every labelled row (sales is NaN only for the test rows) and
# predict the test period.
train = df.loc[df["sales"].notna()]
Y_train = train['sales']
X_train = train[cols]

test = df.loc[df["sales"].isna()]
X_test = test[cols]

# Number of trees selected by early stopping on the validation run. It is stored
# under its own name, and the refit model is named `final_model`, so re-running
# this cell never reads `best_iteration` from a model trained without early
# stopping. The fallback covers boosters whose best_iteration is unset or 0.
best_iteration = model.best_iteration or model.current_iteration()

# Same hyper-parameters as the validation run. The number of rounds is passed to
# lgb.train directly, and a separate name keeps the validation settings in
# `lgb_params` intact.
final_params = {'metric': {'mae'},
                'num_leaves': 8,
                'learning_rate': 0.03,
                'feature_fraction': 0.9,
                'max_depth': 3,
                'verbose': 0,
                'nthread': -1}

# LightGBM dataset
lgbtrain_all = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)

final_model = lgb.train(final_params, lgbtrain_all, num_boost_round=best_iteration)

# All trees of the refit model are used for prediction.
test_preds = final_model.predict(X_test)

# Build the submission; predictions are mapped back from the log1p scale.
# .copy() makes the frame independent of `test` before columns are assigned.
submission_df = test.loc[:, ['id', 'sales']].copy()
submission_df['sales'] = np.expm1(test_preds)
submission_df['id'] = submission_df['id'].astype(int)

submission_df.to_csv('submission.csv', index=False)
submission_df.head(20)