# Import modules

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import lightgbm as lgb
import optuna

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.feature_selection import RFE
from plotly.offline import iplot, init_notebook_mode
from plotly.subplots import make_subplots
init_notebook_mode()

import warnings
warnings.filterwarnings('ignore')

In [None]:
def path_join(name):
    '''
    return the path of `name` inside the competition's Kaggle input directory
    '''
    return os.path.join('/kaggle/input/store-sales-time-series-forecasting', name)


def describe(df):
    '''
    make a summary dataframe with one row per column of `df`

    Prints the shape of `df`, then returns a frame with these columns:
    Feature (column name), DataType, null count, unique count and the
    values found in the first three rows of `df`.

    The first three rows are taken by position, so the function also works
    when the index does not contain the labels 0, 1 and 2 (for example
    after filtering or sorting). If `df` has fewer than three rows the
    missing "... value" columns are filled with NaN.
    '''
    print(f'Shape : {df.shape}')
    summary = pd.DataFrame(df.dtypes, columns=['DataType']).reset_index()
    summary = summary.rename(columns={'index': 'Feature'})
    summary['null count'] = df.isnull().sum().values
    summary['unique count'] = df.nunique().values

    sample_labels = ('First value', 'Second value', 'Third value')
    for position, label in enumerate(sample_labels):
        if position < len(df):
            # positional access: independent of the index labels
            summary[label] = df.iloc[position].values
        else:
            summary[label] = np.nan

    return summary

In [None]:
# Read the six competition tables from the Kaggle input directory.
train, test, oil, holidays_events, stores, transactions = (
    pd.read_csv(path_join(csv_name))
    for csv_name in (
        'train.csv',
        'test.csv',
        'oil.csv',
        'holidays_events.csv',
        'stores.csv',
        'transactions.csv',
    )
)

# 1st, EDA

## We need to explore each DataFrame and try to figure out the key patterns the data has!

# Train

In [None]:
display(describe(train))

In [None]:
# Show the ten least frequent values of each numeric column.
print(train['sales'].value_counts().sort_values()[:10])
print(train['onpromotion'].value_counts().sort_values()[:10])
print('\n\n')

_, axes = plt.subplots(2, 2, figsize=(18, 10), facecolor='lightgray')
plt.suptitle('Check the numeric distribution', color='blue', fontsize=30)

# sns.distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is its documented replacement. The bin count is fixed so the long
# tail of `sales` cannot blow up the number of bins.
sns.histplot(x='sales', data=train, bins=50, stat='density', kde=True, ax=axes[0, 0])
axes[0, 0].set_title('sales displot', fontsize=25)

sns.boxplot(x='sales', data=train, ax=axes[0, 1])
axes[0, 1].set_title('sales boxplot', fontsize=25)

sns.histplot(x='onpromotion', data=train, bins=20, ax=axes[1, 0])
axes[1, 0].set_title('onpromotion hist', fontsize=25)

sns.boxplot(x='onpromotion', data=train, ax=axes[1, 1])
axes[1, 1].set_title('onpromotion boxplot', fontsize=25)

plt.tight_layout()
plt.show()

<div style="background-color:lightblue;padding:18px;text-align:center">
    <h3> We found that the sales column has outliers, but <span style="color:red">in this case such values can really happen</span> in real data.</h3>
    <h3> So I don't remove any outliers from train. </h3>
</div>

In [None]:
# train plotly for my skill, haha

# Create subplots: use 'domain' type for Pie subplot
# Count the rows per store and per family once, then draw both pies side by side
# ('domain' is the subplot type required for Pie traces).
store_counts = train['store_nbr'].value_counts()
family_counts = train['family'].value_counts()

pie_specs = [[{'type': 'domain'}, {'type': 'domain'}]]
fig = make_subplots(rows=1, cols=2, specs=pie_specs)
fig.add_trace(go.Pie(labels=store_counts.index, values=store_counts), 1, 1)
fig.add_trace(go.Pie(labels=family_counts.index, values=family_counts), 1, 2)

<div style="background-color:lightblue;padding:10px;text-align:center">
    <h3>Each store_nbr appears the same number of times, and so does each family!</h3>
</div>

# transactions

In [None]:
display(describe(transactions))
sns.displot(x='transactions', data=transactions)

# holidays_events

In [None]:
display(describe(holidays_events))

for var in holidays_events['locale_name'].unique():
    print(var, holidays_events.query('locale_name==@var')['locale'].unique())

### It is enough to use only 'locale_name' instead of using it together with 'locale', because locale_name already determines locale
#### e.g. Santo Domingo is always 'Local', Ecuador is always 'National', etc.

In [None]:
# description can't be used for the analysis in this case, so drop it (together with locale)
holidays_events = holidays_events.drop(['locale', 'description'], axis=1)

In [None]:
# Pie chart of how many holiday/event rows each locale_name has.
locale_name = holidays_events['locale_name'].value_counts()
# The counts come from holidays_events, so they are passed to px.pie directly;
# handing it the unrelated `stores` frame as data_frame was wrong (its length
# does not match the counts).
fig = px.pie(values=locale_name.values, names=locale_name.index)

fig.update_layout(
    title_font_color="#fff", paper_bgcolor="#283747", title_font_size=20,
    title_x=.5, font_color="#bbb", plot_bgcolor="#D6EAF8")

fig.show()

del locale_name

In [None]:
# train to use plotly.
specs = [[{'type':'domain'}, {'type':'domain'}]]
fig = make_subplots(rows=1, cols=2, specs=specs, subplot_titles=['type_holiday', 'transferred'])
type_holiday = holidays_events['type'].value_counts()
transferred = holidays_events['transferred'].value_counts()

fig.add_trace(go.Pie(labels=type_holiday.index, values=type_holiday),
              row=1, col=1)
fig.add_trace(go.Pie(labels=transferred.index, values=transferred),
              row=1, col=2)

fig.update_layout(
title_font_color="#fff",paper_bgcolor="#283747",title_font_size=20,title_x=.5,font_color="#bbb",
    plot_bgcolor="#D6EAF8")
fig = go.Figure(fig)
fig.show()

del type_holiday
del transferred

In [None]:
# encode transferred as 0/1
holidays_events['transferred'] = holidays_events['transferred'].apply(lambda x: 1 if x else 0)

# stores

In [None]:
display(describe(stores))

In [None]:
# For every city, print the distinct values of state, type and cluster,
# with a separator line between the three listings.
cities = stores['city'].unique()
for position, column in enumerate(['state', 'type', 'cluster']):
    if position > 0:
        print('-'*100)
    for var in cities:
        print(var, stores.query('city==@var')[column].unique())

### From the output above, it's enough to use only 'city' instead of 'state'

In [None]:
stores = stores.drop('state', axis=1)
city = stores['city'].value_counts()
cluster = stores['cluster'].value_counts()

specs = [[{'type':'domain'}, {'type':'domain'}]]
fig = make_subplots(rows=1, cols=2, specs=specs, subplot_titles=['city', 'cluster'])

fig.add_trace(go.Pie(labels=city.index, values=city), row=1, col=1)
fig.add_trace(go.Pie(labels=cluster.index, values=cluster), row=1, col=2)

fig.update_layout(
title_font_color="#fff",paper_bgcolor="#283747",title_font_size=20,title_x=.5,font_color="#bbb",
    plot_bgcolor="#D6EAF8")
fig = go.Figure(fig)
fig.show()

del city
del cluster

In [None]:
sns.countplot(x='type', data=stores)

# oil

In [None]:
display(describe(oil))
sns.displot(x='dcoilwtico', data=oil)

<div style="background-color:lightgreen; padding:18px">
<h3>I fill nulls with 0! <br>
    One common approach is to fill nulls with a representative value (for example the mean or the median).
    <br><br>
    But in this case, I think a null means there was no transaction or no official data. <br>
    Hence filling in some other value could be misleading.
</h3>
</div>

In [None]:
oil['dcoilwtico'] = oil['dcoilwtico'].fillna(0)

# data merge

In [None]:
# merge data
# holidays_events may hold more than one row for the same date; joining it
# as-is would then duplicate every train row of that date. Only whether a date
# has a holiday row is used later (the holiday flag), so one row per date is
# kept for the join.
holidays_by_date = holidays_events.drop_duplicates(subset='date')

merge_data = train.merge(oil, on='date', how='left')
merge_data = merge_data.merge(holidays_by_date, on='date', how='left')
merge_data = merge_data.merge(stores, on='store_nbr', how='left')
merge_data = merge_data.merge(transactions, on=['date', 'store_nbr'], how='left')

In [None]:
# copy
merge_copy = merge_data.copy()

# change dtype and get the date col
# The date column is parsed once and every calendar field is derived from that
# single parse, instead of re-parsing the whole column for each field.
parsed_date = pd.to_datetime(merge_copy['date'])
merge_copy['date'] = parsed_date.dt.date
merge_copy['year'] = parsed_date.dt.year
merge_copy['month'] = parsed_date.dt.month
merge_copy['day'] = parsed_date.dt.day

describe(merge_copy)

<div style="background-color:lightgreen; padding:10px; text-align:center;">
<h4>I fill null with 0 in transaction, too!
</h4>
</div>

In [None]:
merge_copy['transactions'] = merge_copy['transactions'].fillna(0)
merge_copy['dcoilwtico'] = merge_copy['dcoilwtico'].fillna(0)

In [None]:
display(describe(merge_copy))

### The holiday columns are mostly null, so I will create a new holiday column as a flag: 1 = the day is a holiday, 0 = not a holiday

### We don't need to care about which holiday it is.

In [None]:
# holiday_flag is 1 where the merge found a holiday row for the date (type_x is
# not null) and 0 otherwise; the raw holiday columns are then dropped and the
# stores' type column gets an explicit name.
has_holiday_row = merge_copy['type_x'].notnull()
merge_copy['holiday_flag'] = has_holiday_row.astype('int64')
merge_copy = (
    merge_copy
    .drop(['type_x', 'locale_name', 'transferred'], axis=1)
    .rename(columns={'type_y': 'stores_type'})
)
display(describe(merge_copy))

## check moving avg
### We look at how the 'sales' trend moved over time.

In [None]:
df = merge_copy.copy()
df = df.sort_values('date')
df_g = df[['date', 'sales']].groupby('date').agg(date_sum=('sales', np.mean))
# month avg 
df_g['moving_avg'] = df_g.date_sum.rolling(30, min_periods=3).mean()

plt.figure(figsize=(20, 5))
plt.plot(df_g['moving_avg'])
plt.show()

del df

Sales increase roughly as the years go by.

## Check each term and how distributions are!

In [None]:
_, axes = plt.subplots(1, 2, figsize=(15, 8))
df = merge_copy.groupby('year').agg(sales_mean=('sales', np.mean), dcoilwtico_mean=('dcoilwtico', np.mean))
sns.barplot(x=df.index, y='sales_mean', data=df, ax=axes[0])
axes[0].set_title('Mean sales each year', fontsize=20)

# df = merge_copy.groupby('year').agg(dcoilwtico_mean=('dcoilwtico', np.mean))
axes[1].set_title('Mean dcoilwtico each year', fontsize=20)
sns.barplot(x=df.index, y='dcoilwtico_mean', data=df, ax=axes[1])

In [None]:
df = merge_copy.groupby('month').agg(sales_mean=('sales', np.mean))
plt.figure(figsize=(15, 5))
sns.barplot(x=df.index, y='sales_mean', data=df)

December is higher than others.

In [None]:
df = merge_copy.groupby(['year', 'month'], as_index=False).agg(sales_mean=('sales', np.mean))
plt.figure(figsize=(20, 5))
plt.title('Mean sales each year-month', fontsize=20)
sns.barplot(x='month', y='sales_mean', data=df, hue='year')
plt.show()

#### December is very important for sales, but the 2017 data doesn't include it.
#### So, I think the 'month' column can be removed.

In [None]:
df = merge_copy.groupby('day').agg(sales_mean=('sales', np.mean))
plt.figure(figsize=(15, 5))
plt.title('Mean sales each day', fontsize=20)
sns.barplot(x=df.index, y='sales_mean', data=df)
plt.show()

### Sales at the beginning and end of month tends to increase!!

In [None]:
df = merge_copy.groupby('cluster').agg(sales_mean=('sales', np.mean))
plt.figure(figsize=(15, 5))
plt.title('Mean sales each cluster', fontsize=20)
sns.barplot(x=df.index, y='sales_mean', data=df)
plt.show()

### Check the Correlation

In [None]:
plt.figure(figsize=(10, 10))
# merge_copy still holds text columns (e.g. family, city). Selecting the numeric
# columns explicitly keeps this working on pandas versions where
# DataFrame.corr() no longer drops non-numeric columns by itself.
corr = merge_copy.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True)

#### The id-year correlation is strong, and the sales-onpromotion correlation is also strong!
#### So onpromotion may be a key feature for the analysis?

In [None]:
del df
del merge_data

# Analysis the data!

<div style="background-color:lightblue; padding:15px;">
    <h2>Let's move on to the analysis. This time we try the approaches below</h2>
    <ol>
        <h3><li><span style="color:red;">Simple LinearRegression</span>: <br>we need to think whether simple model works well or not before randomforest, lightgbm or some complicated and high model. <br>If it works, we can get the shortcut and can introduce practically and simply!</li></h3>
    <h3><li><span style="color:red;">Use Pipeline</span>: <br>Next, we use pipeline including pca to cut down dimensions then we figure out what eigens say about.</li></h3>
        <h3><li><span style="color:red;">Use RFE and get important cols</span>: <br>RFE can tell us what cols are important.</li></h3>
        <h3><li><span style="color:red;">RandomForest</span>: <br>Use RandomForest and this feature_importance tell us how important cols are like RFE!!</li></h3>
        <h3><li><span style="color:red;">LightGBM</span>: <br>LightGBM can analyze quickly and precisely! It can be a really good model, but I feel this model doesn't talk to me, haha</li></h3>
        <h3><li><span style="color:red;">Stacking</span>: <br>Finally, we stack the models above (pipeline, RandomForest, LightGBM) under a meta-model and see whether the combination does better than each single model.</li></h3>
    </ol>

</div>

### Prepare the data

In [None]:
# Build the modelling table: drop the identifier and the raw date, then one-hot
# encode the remaining text columns (the first level of each is dropped).
data = merge_copy.copy().drop(['id', 'date'], axis=1)
data = pd.get_dummies(data, drop_first=True)
# Features are every column except the target 'sales'.
X = data.drop('sales', axis=1)
y = data['sales']

# Standardise the features.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# the held-out rows contribute to the scaling statistics — confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=12)

<div style="background-color:lightgray; padding:5px;">
<h2>1st Simple LinearRegression</h2>
</div>

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)))

<div style="background-color:lightgray; padding:5px;">
<h2>2nd Use Pipeline</h2>
</div>

In [None]:
# Fit PCA + LinearRegression with 3 and then 10 components and print the number
# of components followed by the RMSE on the held-out split.
for n in (3, 10):
    pipe = Pipeline([('pca', PCA(n_components=n)), ('lr', LinearRegression())])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(n, np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)))

#### PCA seems to cut the dimensions down too much, so in this case it doesn't help.

<div style="background-color:lightgray; padding:5px;">
<h2>3rd Use RFE and get important cols</h2>
</div>

### We figure out the top 10 important columns!
(I chose 10, but any number of features is fine.)

In [None]:
# features=10
rfe = RFE(estimator=LinearRegression(), n_features_to_select=10)
rfe.fit(X_train, y_train)
y_pred = rfe.predict(X_test)

print(np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)))
print('\n----top10 cols----\n')
for boolean, col in zip(rfe.support_, X.columns):
    if boolean:
        print(col)

<div style="background-color:lightgreen; padding:10px; text-align:center;">
<h3>As we expected, onpromotion looks important!
</h3>
</div>

<div style="background-color:lightgray; padding:5px;">
<h2>4th RandomForest</h2>
</div>

In [None]:
rf = RandomForestRegressor(n_estimators=30, random_state=123, max_leaf_nodes=50, max_depth=30)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)))

In [None]:
importances = np.array(rf.feature_importances_)
forest_importances = pd.Series(importances, index=X.columns).sort_values(ascending=False)[:16]
print(forest_importances[:16])
# std = np.std([
#     tree.feature_importances_ for tree in rf.estimators_], axis=0)

fig, ax = plt.subplots(1, 1, figsize=(20, 10))
# forest_importances.plot.bar(yerr=std, ax=ax)
forest_importances.plot.bar(ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

<div style="background-color:lightgreen; padding:10px;">
    <h2>Now we know which features are helpful for a better analysis,<br> according to RFE and RandomForest!!!</h2>
    <h3>We cut the useless columns, then we try LinearRegression again</h3>
</div>

In [None]:
important_cols = ['store_nbr', 'onpromotion', 'cluster', 'transactions', 'year', 'family_BEVERAGES', 'family_CLEANING', 'family_DAIRY', 'family_GROCERY I', 'family_PRODUCE']
X_importance = X[important_cols]
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_importance, y, test_size=0.2, random_state=123)

model_importance = LinearRegression()
model_importance.fit(X_train2, y_train2)
y_pred = model_importance.predict(X_test2)

print(np.sqrt(mean_squared_error(y_true=y_test2, y_pred=y_pred)))

<div style="background-color:lightgreen; padding:10px; text-align:center;">
<h2>The result is close to that of the 1st try.
    <br>This means that it's enough to use only 'important_cols'!!
</h2>
</div> 

## 5th lightGBM

In [None]:
# Preparing dataset for LightGBM
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test)

params = {'metric' : 'rmse', 'seed': 123, 'verbosity':-1}

# train data
gbm = lgb.train(params, lgb_train, num_boost_round=500, valid_sets=[lgb_test])

In [None]:
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
print(np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)))

<div style="background-color:lightgreen; padding:8px; text-align:center;">
    <h4>Exciting result!</h4>
    <h3>This model is pretty good for now. <br>However, I want more accuracy, so I try 'optuna' to tune the hyperparameters!</h3>
</div> 

## Optuna

In [None]:
# prepare validation data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=12)

def objective(trial):
    '''
    Optuna objective: train LightGBM with the hyperparameters suggested by
    `trial` and return the RMSE on the validation split.

    Reads the module-level X_train / y_train (fitting) and X_val / y_val
    (early stopping and scoring). The held-out X_test / y_test are not
    touched here, so they stay unseen while the hyperparameters are tuned.
    '''
    param = {
        'metric' : 'rmse',
        'verbosity': -1,
        # registered under its real name so study.best_params exposes it as
        # 'boosting_type'
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-7, 1.0)
    }

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments of
    # older LightGBM releases; newer releases expect callbacks instead —
    # confirm against the installed version.
    gbm = lgb.train(param, lgb_train, valid_sets=lgb_val, verbose_eval=False, early_stopping_rounds=30)

    # Score on the validation split: choosing hyperparameters by their test-set
    # error would leak the test set into the model selection.
    y_pred = gbm.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))

    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

In [None]:
print(study.best_value)
print(study.best_params)

# Check the feature importance

In [None]:
_ = merge_copy.copy().drop(['id', 'date'], axis=1)
_ = pd.get_dummies(_, drop_first=True)
_X = _.drop('sales', axis=1)
_y = _['sales']

scaler = StandardScaler()
_X_scaled = scaler.fit_transform(_X)

_X_train, _X_test, _y_train, _y_test = train_test_split(_X_scaled, _y, test_size=0.2, random_state=12)
_X_train2, _X_val2, _y_train2, _y_val2 = train_test_split(_X_train, _y_train, test_size=0.2, random_state=12)

In [None]:
# Retrain LightGBM with the hyperparameters found by the Optuna study.
best = study.best_params
param = {
    'metric' : 'rmse',
    'verbosity': -1,
    # num_leaves and learning_rate were tuned together with a boosting type, so
    # the tuned one is reused instead of forcing 'gbdt'. The objective registers
    # it under the trial name 'hoge'; 'boosting_type' is accepted as well.
    'boosting_type': best.get('boosting_type', best.get('hoge', 'gbdt')),
    'num_leaves': best['num_leaves'],
    'learning_rate': best['learning_rate']
}

lgb_train = lgb.Dataset(_X_train2, _y_train2)
lgb_val = lgb.Dataset(_X_val2, _y_val2, reference=lgb_train)

gbm = lgb.train(param, lgb_train, valid_sets=lgb_val, verbose_eval=False, early_stopping_rounds=30)

In [None]:
importance = pd.DataFrame(gbm.feature_importance(), index=_X.columns, columns=['importance'])

# plt.figure()
importance.sort_values(by='importance', ascending=False).plot.bar(figsize=(20, 8))

In [None]:
del _X_scaled
del _
del _X
del _y

## The code below is still being fixed ~~

## Last, stacking model
### before this, we remake the data which has only importance cols

In [None]:
# Rebuild the modelling frame with only the columns kept as important: drop the
# unused columns, add one 0/1 indicator per selected product family, then
# remove the raw 'family' column.
unused_cols = ['id', 'date', 'dcoilwtico', 'holiday_flag', 'month', 'day', 'city', 'stores_type']
data_important_cols = merge_copy.copy().drop(unused_cols, axis=1)

family_flags = {
    'family_BEVERAGES': 'BEVERAGES',
    'family_CLEANING': 'CLEANING',
    'family_DAIRY': 'DAIRY',
    'family_GROCERY': 'GROCERY I',
    'family_PRODUCE': 'PRODUCE',
}
for flag_col, family_name in family_flags.items():
    is_family = data_important_cols['family'] == family_name
    data_important_cols[flag_col] = is_family.astype('int64')
data_important_cols = data_important_cols.drop(['family'], axis=1)

describe(data_important_cols)

In [None]:
X = data_important_cols.drop('sales', axis=1)
y = data_important_cols['sales']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=123)

In [None]:
# # prepare the metrics list
# pipe_pred = []
# rf_pred = []
# gbm_pred = []

# # prepare the metrics list
# pipe_true = []
# rf_true = []
# gbm_true = []

# # pipeline
# model1 = Pipeline([('pca', PCA(n_components=10)), ('lr', LinearRegression())])
# # random forest
# model2 = RandomForestRegressor(n_estimators=50, random_state=123, max_leaf_nodes=100, max_depth=30)
# # lightgbm
# params = {'metric' : 'rmse', 'seed': 123, 'verbosity':200}
# model3 = lgb

# kfold = KFold(n_splits=5).split(X_train, y_train)

# for (train, val) in kfold:
#     X_train_cv = X_train[train]
#     y_train_cv = y_train.iloc[train]
#     X_val_cv = X_train[val]
#     y_val_cv = y_train.iloc[val]
    
#     lgb_train = lgb.Dataset(X_train_cv, y_train_cv)
#     lgb_val = lgb.Dataset(X_val_cv, y_val_cv)
    
#     # train
#     model1.fit(X_train_cv, y_train_cv)
#     model2.fit(X_train_cv, y_train_cv)
#     model_3 = model3.train(params, lgb_train, num_boost_round=200)
    
#     y_cv_pred1 = model1.predict(X_val_cv)
#     y_cv_pred2 = model2.predict(X_val_cv)
#     y_cv_pred3 = model_3.predict(X_val_cv, num_iteration=gbm.best_iteration)
    
#     pipe_pred.append(y_cv_pred1)
#     rf_pred.append(y_cv_pred2)
#     gbm_pred.append(y_cv_pred3)
    
#     # append y data
#     pipe_true.append(y_val_cv.values)
#     rf_true.append(y_val_cv.values)
#     gbm_true.append(y_val_cv.values)

In [None]:
# pipe_pred = np.concatenate(pipe_pred)
# rf_pred = np.concatenate(rf_pred)
# gbm_pred = np.concatenate(gbm_pred)
# pipe_true = np.concatenate(pipe_true)

# df = pd.DataFrame({'true': pipe_true, 'pipe': pipe_pred, 'rf': rf_pred, 'lgb': gbm_pred})

In [None]:
# X_stack = df.drop('true', axis=1)
# y_stack = df['true']

# X_stack_train, X_stack_test, y_stack_train, y_stack_test = train_test_split(X_stack, y_stack, test_size=0.2, random_state=123)

# meta_model = LinearRegression()
# meta_model.fit(X_stack_train, y_stack_train)

# meta_val_pred = meta_model.predict(X_stack_test)
# print ("stacking model: {:.4f}".format(np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))))

## I think something is wrong with this model. I'll fix it later, so for now I use LightGBM for this submission

## ~~

# Adjust test data for lightgbm and submission

In [None]:
col = importance.query('importance>100').index

test_copy = test.copy()
# merge data
# holidays_events may hold more than one row for the same date; one row per date
# is kept so the left join cannot duplicate test rows (the predictions are
# written row by row into the sample submission afterwards).
holidays_by_date = holidays_events.drop_duplicates(subset='date')
test_copy = test_copy.merge(oil, on='date', how='left')
test_copy = test_copy.merge(holidays_by_date, on='date', how='left')
test_copy = test_copy.merge(stores, on='store_nbr', how='left')
test_copy = test_copy.merge(transactions, on=['date', 'store_nbr'], how='left')
# change dtype and get the date col (the column is parsed only once)
parsed_date = pd.to_datetime(test_copy['date'])
test_copy['date'] = parsed_date.dt.date
test_copy['year'] = parsed_date.dt.year
test_copy['month'] = parsed_date.dt.month
test_copy['day'] = parsed_date.dt.day
# fillna with 0
test_copy['transactions'] = test_copy['transactions'].fillna(0)
test_copy['dcoilwtico'] = test_copy['dcoilwtico'].fillna(0)
# create new col as I did above: 1 where a holiday row matched the date, else 0
test_copy['holiday_flag'] = test_copy['type_x'].notnull().astype('int64')
# drop the raw holiday columns, exactly as was done for the training frame
test_copy = test_copy.drop(['type_x', 'locale_name', 'transferred'], axis=1)
test_copy = test_copy.rename(columns={'type_y': 'stores_type'})

test_copy = test_copy.drop(['id', 'date'], axis=1)
test_copy = pd.get_dummies(test_copy, drop_first=True)

In [None]:
describe(test_copy)

In [None]:
# Train the final LightGBM model on the features that received a non-zero
# importance, then predict the prepared test frame.
col = importance.query('importance>0').index
X2 = data[col]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X2)
# the target is taken from the same frame as X2 so rows are guaranteed to align
X_train, X_test, y_train, y_test = train_test_split(X_scaled, data['sales'], test_size=0.2, random_state=12)

# best param!
best = study.best_params
params = {
    'metric' : 'rmse',
    'verbosity': -1,
    'seed': 123,
    # reuse the boosting type chosen by the study (the objective registers it
    # under the trial name 'hoge'; 'boosting_type' is accepted as well)
    'boosting_type': best.get('boosting_type', best.get('hoge', 'gbdt')),
    'num_leaves': best['num_leaves'],
    'learning_rate': best['learning_rate']
}

# Preparing dataset for LightGBM
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test)
# train data
gbm = lgb.train(params, lgb_train, num_boost_round=500, valid_sets=[lgb_test], verbose_eval=False)


# choose importance cols
# reindex yields exactly the training columns in the training order; a dummy
# column that does not occur in the test frame is filled with 0 instead of
# raising a KeyError
test_copy = test_copy.reindex(columns=col, fill_value=0)
# scale with the statistics learned from the training features: refitting a new
# scaler on the test frame would put train and test on different scales
test_scaled = scaler.transform(test_copy)
prediction= gbm.predict(test_scaled, num_iteration=gbm.best_iteration)

del X2
del lgb_train
del lgb_test

In [None]:
# Load the sample submission file; its 'sales' column is overwritten below.
submission = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv')

# Store the predictions, replacing every negative prediction with 0.
submission['sales'] = [pred if pred >= 0 else 0 for pred in prediction]
submission.to_csv('submission.csv', index=False)
# Read the written file back and show it as the cell output.
submission = pd.read_csv("submission.csv")
submission

<div style="background-color:lightgreen; padding:10px; text-align:center;">
    <h1>Conclusion</h1>
    <h2>I tried EDA, visualization, and several modelling methods.
        If you want to try more, let's try!!
    </h2>
<h3 style="color:red">Thank you for visiting my notebook. Feel free to upvote or comment if you like it!!</h3>
</div> 