In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from matplotlib import pyplot as plt
import seaborn as sns

# Reading the Data

In [None]:
# Reading the data
# Both CSVs live in the same competition directory; keep the path in one place.
DATA_DIR = '/kaggle/input/demand-forecasting-kernels-only'
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Stack train and test so that feature engineering is applied to both at once.
# Columns present in only one of the two files are filled with NaN by concat.
merged = pd.concat([train_df, test_df], axis=0, ignore_index=True)
print("Concatenated shape: ", merged.shape)
train_df.head()

# Basic Data Checkup

### Total Records In The Data

In [None]:
# (rows, columns) of the train, test and concatenated frames, in that order
tuple(frame.shape for frame in (train_df, test_df, merged))

### Missing Values Check

In [None]:
# Number of missing entries in each training column
train_df.isna().sum()

### Outlier Check

In [None]:
# Outlier detection: box plot of the sales column of the concatenated frame
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(x=merged['sales'], ax=ax)
plt.show()

# Transformation Steps

In [None]:
# Transformation
remove_outlier = True  # notebook-level switch for outlier removal


def transform_data(df, remove_outlier=True, outlier_threshold=230):
    """Add calendar features derived from ``date`` and optionally drop sales outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column and, when ``remove_outlier`` is true, a
        ``sales`` column. An ``id`` column is dropped if present. The input
        frame is not modified.
    remove_outlier : bool, default True
        When true, rows whose ``sales`` exceed ``outlier_threshold`` are
        removed. Rows with missing ``sales`` are always kept.
    outlier_threshold : float, default 230
        Largest ``sales`` value that is kept when ``remove_outlier`` is true.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns ``day``, ``month``, ``year``,
        ``week``, ``weekday``, ``is_wknd``, ``is_month_start`` and
        ``is_month_end``. The original index labels are retained, so the
        index has gaps after outlier rows are removed.
    """
    df = df.copy()
    df = df.drop(columns=['id'], errors='ignore')

    # Unparseable dates become NaT instead of raising.
    # (`infer_datetime_format` is deprecated in pandas 2.x and no longer needed.)
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    dates = df['date'].dt
    df['day'] = dates.day
    df['month'] = dates.month
    df['year'] = dates.year

    # `Series.dt.week` was removed in pandas 2.0; `isocalendar().week` is the
    # replacement. It returns a nullable UInt32 column, so cast it back to a
    # plain NumPy dtype (float only when NaT dates produced missing weeks).
    iso_week = dates.isocalendar().week
    df['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    df['weekday'] = dates.weekday
    # weekday is 0 (Monday) .. 6 (Sunday); integer division by 4 therefore
    # yields 1 for Friday through Sunday and 0 for Monday through Thursday.
    df['is_wknd'] = dates.weekday // 4
    df['is_month_start'] = dates.is_month_start.astype(int)
    df['is_month_end'] = dates.is_month_end.astype(int)

    if remove_outlier:
        # Keep rows with missing sales: the comparison alone would drop them.
        keep = (df['sales'] <= outlier_threshold) | df['sales'].isnull()
        df = df[keep]

    return df
        
print("Before Transforming the data: ", merged.shape)
# Pass the notebook-level flag explicitly; otherwise changing `remove_outlier`
# in this cell has no effect because the function falls back to its default.
df = transform_data(merged, remove_outlier=remove_outlier)
print("After Transforming the data: ", df.shape)

In [None]:
# First five rows of the transformed frame
df.head(n=5)

### Outlier Check Now

In [None]:
# Outlier detection: box plot of the sales column of the transformed frame
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(x=df['sales'], ax=ax)
plt.show()

# Feature Creation

In [None]:
# Lag offsets in days: weekly steps from 91 to 126, then 182, 364, 546 and 728.
lags = [*range(91, 127, 7), 182, 364, 546, 728]


def lag_features(df, lags):
    """Append one ``sales_lag_<n>`` column per requested lag.

    For each lag ``n`` the (date, store, item, sales) rows are moved ``n``
    days forward and left-joined back on (date, store, item). A row therefore
    receives the sales recorded ``n`` days earlier for the same store and
    item, or NaN when no such row exists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``date`` (datetime), ``store``, ``item`` and ``sales``.
    lags : iterable of int
        Lag sizes in days.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the lag columns, in the order given by ``lags``.
    """
    join_keys = ['date', 'store', 'item']
    # Snapshot of the columns needed for shifting, taken before any lag is added.
    history = df[join_keys + ['sales']].copy()

    result = df
    for offset in lags:
        shifted = history.assign(date=history['date'] + pd.Timedelta(days=offset))
        # Only `sales` collides between the two sides; the right-hand copy
        # receives the suffix and becomes `sales_lag_<offset>`.
        result = result.merge(
            shifted,
            how='left',
            on=join_keys,
            suffixes=("", f"_lag_{offset}"),
        )

    return result

df = lag_features(df, lags)

# One hot encoding of Item and Stores
from sklearn.preprocessing import OneHotEncoder

# Use the encoder's default (sparse) output and densify it explicitly.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# later removed, so avoiding the keyword keeps this cell version-independent.
en = OneHotEncoder()
enc_features = pd.DataFrame(
    en.fit_transform(df[['store', 'item']]).toarray(),
    columns=en.get_feature_names_out(),
    # Reuse df's index so the column-wise concat below aligns row by row even
    # when df's index is not a clean 0..n-1 range.
    index=df.index,
)

# Replace the raw store/item columns with their one-hot indicators.
df = pd.concat([enc_features, df.drop(['store', 'item'], axis=1)], axis=1)

In [None]:
# Last five rows of the feature frame
df.tail(n=5)

# Train/Test Split

In [None]:
# Rows with a known target are used for training/validation; rows without
# one are the rows to predict.
# Filling Null With 0 (done without `inplace` so no filtered slice is mutated)
X = df[~df['sales'].isnull()].fillna(0)
test = df[df['sales'].isnull()].fillna(0)

# Time-based hold-out: everything after this date is used for validation.
split_date = '2017-09-01'
x_train = X[X['date'] <= split_date]
x_test = X[X['date'] > split_date]

y_train = x_train['sales'].copy()
y_test = x_test['sales'].copy()

# The target and the raw timestamp are not model inputs.
non_features = ['sales', 'date']
x_train = x_train.drop(non_features, axis=1)
x_test = x_test.drop(non_features, axis=1)
test = test.drop(non_features, axis=1)

x_train.shape, y_train.shape, x_test.shape, y_test.shape, test.shape

# Scaling of Features

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max from the training rows only, then apply the
# same scaling to the training, validation and prediction frames.
scale = MinMaxScaler()
scale.fit(x_train)

# Each result is a new frame with the original column names and a fresh
# 0..n-1 index.
x_train, x_test, test = (
    pd.DataFrame(scale.transform(part), columns=part.columns)
    for part in (x_train, x_test, test)
)

x_train.head()

# Model Training & Model Testing

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baselines tried earlier, kept for reference:
# glm = LinearRegression()
# glm.fit(x_train, y_train)

# print("RMSE score")
# print(np.sqrt(mean_squared_error(y_test, glm.predict(x_test))))

# from sklearn.ensemble import RandomForestRegressor
# rf = RandomForestRegressor(n_estimators=100, max_depth=6)
# rf.fit(x_train, y_train)
# print("RMSE score")
# print(np.sqrt(mean_squared_error(y_test, rf.predict(x_test))))
import lightgbm as lgb
lgb_params = {'metric': {'mae'},
              'num_leaves': 12,
              'learning_rate': 0.02,
              'feature_fraction': 0.8,
              'max_depth': 5,
              'verbose': 0,
              'num_boost_round': 1000,
              'early_stopping_rounds': 500,
              'nthread': -1}

# The two training-control entries are passed to lgb.train explicitly below,
# so hand the booster only the remaining parameters. This avoids LightGBM's
# "found in params" warnings and a duplicate early-stopping registration.
training_control = ('num_boost_round', 'early_stopping_rounds')
booster_params = {key: value for key, value in lgb_params.items()
                  if key not in training_control}

lgbtrain = lgb.Dataset(data=x_train, label=y_train)
lgbval = lgb.Dataset(data=x_test, label=y_test, reference=lgbtrain)

# `early_stopping_rounds=` and `verbose_eval=` are no longer accepted by
# lgb.train in LightGBM 4.x; the callback API is the supported equivalent.
model = lgb.train(booster_params, lgbtrain,
                  valid_sets=[lgbtrain, lgbval],
                  num_boost_round=lgb_params['num_boost_round'],
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=lgb_params['early_stopping_rounds']),
                      lgb.log_evaluation(period=100),
                  ])

# Preparing the Submission Files

In [None]:
# Pair every test id with its predicted sales and write the submission file.
sub = test_df[['id']].assign(sales=model.predict(test))
sub.to_csv('submission.csv', index=False)
sub.head()

In [None]:
# Summary statistics of the submission (count, mean, std, min, quartiles, max)
sub.describe(percentiles=[0.25, 0.5, 0.75])