In [1]:
"""
Store Item Demand Forecasting Challenge (Talep Tahmin)

Bir mağaza zinciri, 10 farklı mağazası ve 50 farklı ürünü için 3 aylık bir talep tahmini istemektedir.
"""

'\nStore Item Demand Forecasting Challenge (Talep Tahmin)\n\nBir mağaza zinciri, 10 farklı mağazası ve 50 farklı ürünü için 3 aylık bir talep tahmini istemektedir.\n'

In [2]:
import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 500)
warnings.filterwarnings("ignore")

# Loading the data
train = pd.read_csv("train.csv", parse_dates=['date'])
test = pd.read_csv("test.csv", parse_dates=['date'])

df = pd.concat([train, test], sort=False)

In [3]:
def check_df(dataframe, head=5):
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

### EDA (Keşifçi Veri Analizi) ##
df['date'].min(), df['date'].max()

(Timestamp('2013-01-01 00:00:00'), Timestamp('2018-03-31 00:00:00'))

In [4]:
check_df(df)

##################### Shape #####################
(958023, 5)
##################### Types #####################
date     datetime64[ns]
store             int64
item              int64
sales           float64
id              float64
dtype: object
##################### Head #####################
        date  store  item  sales  id
0 2013-01-01      1     1   13.0 NaN
1 2013-01-02      1     1   11.0 NaN
2 2013-01-03      1     1   14.0 NaN
3 2013-01-04      1     1   13.0 NaN
4 2013-01-05      1     1   10.0 NaN
##################### Tail #####################
            date  store  item  sales       id
44995 2018-03-27     10    50    NaN  44995.0
44996 2018-03-28     10    50    NaN  44996.0
44997 2018-03-29     10    50    NaN  44997.0
44998 2018-03-30     10    50    NaN  44998.0
44999 2018-03-31     10    50    NaN  44999.0
##################### NA #####################
date          0
store         0
item          0
sales     45000
id       913023
dtype: int64
##################

In [5]:
df.groupby(['store'])['item'].nunique()

store
1     50
2     50
3     50
4     50
5     50
6     50
7     50
8     50
9     50
10    50
Name: item, dtype: int64

In [6]:
df.groupby(['store', 'item']).agg({"sales": ["sum"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,sales
Unnamed: 0_level_1,Unnamed: 1_level_1,sum
store,item,Unnamed: 2_level_2
1,1,36711.0
1,2,97050.0
1,3,60638.0
1,4,36440.0
1,5,30335.0
...,...,...
10,46,120601.0
10,47,45204.0
10,48,105570.0
10,49,60317.0


In [7]:
# Çeşitli ürünlerde mağaza performansı
df.groupby(['store', 'item']).agg({"sales": ["sum","mean","median","std"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,sales,sales,sales,sales
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,median,std
store,item,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,1,36711.0,19.854516,19.0,6.788943
1,2,97050.0,53.148959,52.0,15.005779
1,3,60638.0,33.208105,33.0,10.072529
1,4,36440.0,19.956188,20.0,6.640618
1,5,30335.0,16.612815,16.0,5.672102
...,...,...,...,...,...
10,46,120601.0,66.046550,65.0,18.114991
10,47,45204.0,24.755750,24.0,7.924820
10,48,105570.0,57.814896,57.0,15.898538
10,49,60317.0,33.032311,32.0,10.091610


In [8]:
## FEATURE ENGINEERING ###

# Mevsimsellik alanında feature'lar ürettik
def create_date_features(df):
    df['month'] = df.date.dt.month
    df['day_of_month'] = df.date.dt.day
    df['day_of_year'] = df.date.dt.dayofyear
    df['week_of_year'] = df['date'].apply(lambda x: x.isocalendar()[1])
    df['day_of_week'] = df.date.dt.dayofweek
    df['year'] = df.date.dt.year
    df['is_wknd'] = df.date.dt.weekday // 4
    df['is_month_start'] = df.date.dt.is_month_start.astype(int)
    df['is_month_end'] = df.date.dt.is_month_end.astype(int)
    return df

create_date_features(df)

Unnamed: 0,date,store,item,sales,id,month,day_of_month,day_of_year,week_of_year,day_of_week,year,is_wknd,is_month_start,is_month_end
0,2013-01-01,1,1,13.0,,1,1,1,1,1,2013,0,1,0
1,2013-01-02,1,1,11.0,,1,2,2,1,2,2013,0,0,0
2,2013-01-03,1,1,14.0,,1,3,3,1,3,2013,0,0,0
3,2013-01-04,1,1,13.0,,1,4,4,1,4,2013,1,0,0
4,2013-01-05,1,1,10.0,,1,5,5,1,5,2013,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,2018-03-27,10,50,,44995.0,3,27,86,13,1,2018,0,0,0
44996,2018-03-28,10,50,,44996.0,3,28,87,13,2,2018,0,0,0
44997,2018-03-29,10,50,,44997.0,3,29,88,13,3,2018,0,0,0
44998,2018-03-30,10,50,,44998.0,3,30,89,13,4,2018,1,0,0


In [9]:
# Mağaza, ürün ve ay kırılımında satışların istatistikleri
df.groupby(['store','item','month']).agg({"sales": ["sum", "mean", "median", "std"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,sales,sales,sales
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,median,std
store,item,month,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,1,1,2368.0,13.303371,13.0,4.389007
1,1,2,2063.0,14.631206,14.0,4.668146
1,1,3,2728.0,17.600000,17.0,4.545013
1,1,4,3118.0,20.786667,20.0,4.894301
1,1,5,3448.0,22.245161,22.0,6.564705
...,...,...,...,...,...,...
10,50,8,13108.0,84.567742,85.0,15.676527
10,50,9,11831.0,78.873333,79.0,15.207423
10,50,10,11322.0,73.045161,72.0,14.209171
10,50,11,11549.0,76.993333,77.0,16.253651


In [10]:
## Random Noise (Rastgele Gürültü) ##
def random_noise(dataframe):
    return np.random.normal(scale=1.6, size=(len(dataframe),))

## lag/Shifted Features (Gecikme/Değiştirilmiş Özellikler) ##

# Satış değişkenini göre featurelar üreteceğim, bu yüzden veriyi sıralıyorum.
df.sort_values(by=["store", "item", "date"], axis=0, inplace=True)

pd.DataFrame({"sales": df["sales"].values[0:10],
              "lag1": df["sales"].shift(1).values[0:10],
              "lag2": df["sales"].shift(2).values[0:10],
              "lag3": df["sales"].shift(3).values[0:10],
              "lag4": df["sales"].shift(4).values[0:10]})


Unnamed: 0,sales,lag1,lag2,lag3,lag4
0,13.0,,,,
1,13.0,13.0,,,
2,11.0,13.0,13.0,,
3,11.0,11.0,13.0,13.0,
4,14.0,11.0,11.0,13.0,13.0
5,14.0,14.0,11.0,11.0,13.0
6,13.0,14.0,14.0,11.0,11.0
7,13.0,13.0,14.0,14.0,11.0
8,10.0,13.0,13.0,14.0,14.0
9,10.0,10.0,13.0,13.0,14.0


In [11]:
df.groupby(["store", "item"])['sales'].transform(lambda x: x.shift(1))

0         NaN
24       13.0
1        13.0
25       11.0
2        11.0
         ... 
44995     NaN
44996     NaN
44997     NaN
44998     NaN
44999     NaN
Name: sales, Length: 958023, dtype: float64

In [12]:
def lag_features(dataframe, lags):
    for lag in lags:
        dataframe['sales_lag_' + str(lag)] = dataframe.groupby(["store","item"])["sales"].transform(lambda x: x.shift(lag)) + random_noise(dataframe)
    return dataframe 

df = lag_features(df, [91,98,105,112,119,126,182, 364, 546, 728])

In [13]:
## Rolling Mean Features (Hareketli Ortalama Özellikleri) ## 
pd.DataFrame({"sales": df["sales"].values[0:10],
              "roll2": df["sales"].shift(1).rolling(window=2).mean().values[0:10],
              "roll3": df["sales"].shift(1).rolling(window=3).mean().values[0:10],
              "roll5": df["sales"].shift(1).rolling(window=5).mean().values[0:10]})

def roll_mean_features(df,windows):
    for window in windows:
        df['sales_roll_mean_' + str(window)] = df.groupby(["store", "item"])['sales']. \
            transform(lambda x: x.shift(1).rolling(window=window, min_periods=10, win_type="triang").mean()) + random_noise(df)
    return df

df = roll_mean_features(df,[365, 546])

In [14]:
## Exponentially Weighted Mean Features (Üstel Ağırlıklı Ortalama) ##
def ewm_features(df, alphas, lags):
    for alpha in alphas: # Alphalar en yakın değere ne kadar önem vermem gerekiyordu onu belirtiyor.
        for lag in lags:
            df['sales_ewm_alpha_' + str(alpha).replace(".", "") + "_lag_" + str(lag)] = \
            df.groupby(["store", "item"])["sales"].transform(lambda x: x.shift(lag).ewm(alpha=alpha).mean())
    return df       

alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [91,98,105,112,180,270,365,546,728]
df = ewm_features(df, alphas, lags)


In [15]:
## One-Hot Encoding ##
df = pd.get_dummies(df, columns=['store','item','day_of_week','month'])

In [16]:
## Bağımlı Değişkenin Logaritması ( Log of Dependent Variable ) ##
df['sales'] = np.log1p(df["sales"].values)

In [17]:
## Time-Based Validation Sets (Zamana Dayalı Doğrulama Kümeleri) ##

# 2017'nin başına kadar train seti
train = df.loc[df["date"] < "2017-01-01", :]
#2017'nin ilk 3'ayı validation seti.
val = df.loc[(df["date"] >= "2017-01-01") & (df["date"] < "2017-04-01"), :]

In [18]:
cols = [col for col in train.columns if col not in ['date', 'id', 'sales', 'year']]

Y_train = train["sales"]
X_train = train[cols]

Y_val = val['sales']
X_val = val[cols]

In [20]:
def lgbm_smape(y_pred, dataset):
    y_true = dataset.get_label()
    y_pred = np.clip(y_pred, 1e-9, None)
    y_true = np.clip(y_true, 1e-9, None)
    smape = 2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true))
    return 'smape', np.mean(smape), False

lgb_params = {
    'num_leaves': 10,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'max_depth': 5,
    'verbose': 0,
    'num_boost_round': 1000,
    'early_stopping_rounds': 200,
    'nthread': -1
}

lgbtrain = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)
lgbval = lgb.Dataset(data=X_val, label=Y_val, reference=lgbtrain, feature_name=cols)

model = lgb.train(
    lgb_params, 
    lgbtrain,
    valid_sets=[lgbtrain, lgbval],
    num_boost_round=lgb_params['num_boost_round'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=100)  # This replaces verbose_eval
    ],
    feval=lgbm_smape
)

Training until validation scores don't improve for 200 rounds
[100]	training's l2: 0.0513518	training's smape: 0.0482267	valid_1's l2: 0.0531025	valid_1's smape: 0.0494567
[200]	training's l2: 0.0352637	training's smape: 0.0401336	valid_1's l2: 0.0372308	valid_1's smape: 0.0418604
[300]	training's l2: 0.0324563	training's smape: 0.0386317	valid_1's l2: 0.0341275	valid_1's smape: 0.0403582
[400]	training's l2: 0.031405	training's smape: 0.0380743	valid_1's l2: 0.0332799	valid_1's smape: 0.0399294
[500]	training's l2: 0.0307321	training's smape: 0.0377232	valid_1's l2: 0.0326962	valid_1's smape: 0.0395975
[600]	training's l2: 0.0302619	training's smape: 0.0374707	valid_1's l2: 0.0322191	valid_1's smape: 0.0393132
[700]	training's l2: 0.0299261	training's smape: 0.0372825	valid_1's l2: 0.0319068	valid_1's smape: 0.0391223
[800]	training's l2: 0.0296327	training's smape: 0.0371271	valid_1's l2: 0.0316736	valid_1's smape: 0.0389807
[900]	training's l2: 0.029374	training's smape: 0.0369897	v

In [22]:
def smape(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-9, None)
    y_true = np.clip(y_true, 1e-9, None)
    smape_value = 2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true))
    return np.mean(smape_value)

# Validasyon verisi üzerinde tahmin yapın
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

# SMAPE hesaplama
smape_value = smape(np.expm1(Y_val), np.expm1(y_pred_val))
print("SMAPE değeri: ", smape_value)

SMAPE değeri:  0.13839122972217047


In [30]:
## Değişken Önem Düzeyi ##
def plot_lgb_importances(model, plot=False, num=10):
    gain = model.feature_importance('gain')
    feat_imp = pd.DataFrame({'feature':model.feature_name(),
                             'split':model.feature_importance('split'),
                             'gain':100 * gain / gain.sum()}).sort_values('gain',ascending=False)
    if plot:
        plt.figure(figsize=(10,10))
        sns.set(font_scale=1)
        sns.barplot(x="gain", y="feature", data=feat_imp[0:25])
        plt.title('feature')
        plt.tight_layout()
        plt.show()
    else:
        print(feat_imp.head(num))
    return feat_imp
    
feat_imp = plot_lgb_importances(model, num=200)

imortance_zero = feat_imp[feat_imp["gain"]==0]['feature'].values

important_feature = [col for col in cols if col not in imortance_zero]
len(important_feature)

                        feature  split       gain
17          sales_roll_mean_546    916  54.363157
13                sales_lag_364   1216  13.182979
16          sales_roll_mean_365    643   9.882718
60   sales_ewm_alpha_05_lag_365    403   4.875986
18   sales_ewm_alpha_095_lag_91     92   2.188422
..                          ...    ...        ...
105                     item_33      0   0.000000
65                      store_3      0   0.000000
107                     item_35      0   0.000000
111                     item_39      0   0.000000
71                      store_9      0   0.000000

[142 rows x 3 columns]


114

In [32]:
## Final Model ##
train = df.loc[~df.sales.isna()]
Y_train = train['sales']
X_train = train[cols] # Buradaki ools yerine important_feature kullanılabilir daha iyi olur 

test = df.loc[df.sales.isna()]
X_test = test[cols]

lgb_params = {
    'num_leaves': 10,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'max_depth': 5,
    'verbose': 0,
    'nthread': -1,
    'num_boost_round': model.best_iteration
}

lgbtrain_all = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)

final_model = lgb.train(lgb_params, lgbtrain_all, num_boost_round=model.best_iteration)

test_pred = final_model.predict(X_test, num_iteration=model.best_iteration)

In [34]:
## Submission File ##
submission_df = test.loc[:, ["id","sales"]]
submission_df['sales'] = np.expm1(test_pred)
submission_df['id'] = submission_df.id.astype(int)
submission_df.to_csv("submission_demand.csv",index=False)