In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays
import multiprocessing

from scipy.stats import randint, uniform
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import confusion_matrix, plot_roc_curve, accuracy_score, classification_report, r2_score

# Notebook-wide display settings: show every column of wide frames and
# silence all library warnings for the rest of the session.
import warnings
pd.set_option('display.max_columns', None)
warnings.simplefilter('ignore')

In [None]:
# Load the competition training data from a path relative to the notebook.
df= pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv")

In [None]:
# Writes the frame just loaded (index included) to a file named "test" in the
# current working directory.
# NOTE(review): no visible cell reads this file back - looks like a leftover
# debug export; confirm before removing.
df.to_csv("test")

# EDA

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df[["num_sold"]].describe().T

In [None]:
df.isna().mean().sort_values(ascending=False)

In [None]:
n_unique = df.apply('nunique')
n_unique.index[n_unique == 1].to_list()

In [None]:
def describing_data(var):
    """Summarise ``num_sold`` per group of the notebook-level frame ``df``.

    Parameters
    ----------
    var : str or list of str
        Column name(s) handed to ``DataFrame.groupby``.

    Returns
    -------
    pandas.DataFrame
        ``describe()`` statistics of ``num_sold``, one row per group.
    """
    sales_by_group = df.groupby(var)["num_sold"]
    return sales_by_group.describe()

In [None]:
describing_data(["store", "product", "country"])

In [None]:
sns.kdeplot(x="num_sold", hue ="country", data = df, fill = True);

In [None]:
p = sns.FacetGrid(df, row="product", col="store", height=3.5, sharey=False, sharex=True, xlim=[0,2000])
p.map_dataframe(sns.kdeplot,x="num_sold", hue ="country",  fill = True);

In [None]:
# Compare the raw target with its natural log to motivate modelling log(num_sold).
fig, ax = plt.subplots(2,1, figsize=[8,8])
# `bw=` has been deprecated since seaborn 0.11; `bw_method=` is the
# documented replacement for passing the bandwidth factor.
sns.kdeplot(ax = ax[0], x= df["num_sold"], bw_method=0.3)
sns.kdeplot(ax = ax[1], x=np.log(df["num_sold"]), bw_method=0.3)
ax[0].set_title("N sold (target)")
ax[1].set_title("ln of N sold")
# Keep the lower panel's title from colliding with the upper panel's x-axis.
fig.tight_layout()
plt.show()

# Feature engineering

In [None]:
def feature_engineering(df):
    """Build the numeric model matrix from a raw competition frame.

    The input frame is not modified, so the function can be called repeatedly
    on the same frame (e.g. when a notebook cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column that ``pd.to_datetime`` can parse.
        Every other object-dtype column is one-hot encoded.

    Returns
    -------
    pandas.DataFrame
        Columns ``year, month, day, weekday, no_holiday, se_holiday,
        fi_holiday`` followed by the dummy columns (first level of each
        categorical dropped), indexed like ``df``.
    """
    # Parse into a local Series rather than overwriting df["date"]: the caller's
    # frame keeps its original dtype and columns.
    dates = pd.to_datetime(df["date"])

    # errors="ignore": if the caller already parsed the dates, "date" is not an
    # object column and there is nothing to drop.
    categorical = df.select_dtypes("object").drop(columns="date", errors="ignore")
    dummies = pd.get_dummies(categorical, drop_first=True)

    features = pd.DataFrame(
        {
            "year": dates.dt.year,
            "month": dates.dt.month,
            "day": dates.dt.day,
            "weekday": dates.dt.weekday,  # not colinear with day, so include
        },
        index=df.index,
    )

    # One 0/1 flag per national calendar; every row gets all three flags
    # regardless of the row's own country. Insertion order fixes column order.
    calendars = {
        "no_holiday": holidays.NO(),
        "se_holiday": holidays.SE(),
        "fi_holiday": holidays.FI(),
    }
    for column, calendar in calendars.items():
        features[column] = dates.map(lambda day, cal=calendar: day in cal).astype(int)

    # axis must be passed by keyword: pandas 2.x rejects the positional form.
    return pd.concat([features, dummies], axis=1)

In [None]:
# Model matrix for the training data: calendar and holiday features plus
# one-hot encoded categoricals, as returned by feature_engineering.
df_reg = feature_engineering(df)

In [None]:
# Quick look at the engineered columns.
df_reg.head()

# Split into training and test set

In [None]:
y = np.log(df["num_sold"])
X = df_reg

In [None]:
idx_train = X[X["year"].isin([2015, 2016, 2017])].index
idx_test = X[X["year"].isin([2018])].index

In [None]:
X_train = X.iloc[idx_train]
y_train = y.iloc[idx_train]
X_test = X.iloc[idx_test]
y_test = y.iloc[idx_test]

# Evaluate various models

In [None]:
def evaluate_model(model, **model_kwargs):
    """Fit ``model`` on the training split and score it on the hold-out split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; all error metrics are on the scale of ``y_test``.

    Parameters
    ----------
    model : callable
        Estimator class (or factory) whose instances expose ``fit(X, y)`` and
        ``predict(X)``.
    **model_kwargs
        Optional keyword arguments forwarded to ``model`` (for example
        ``random_state`` to make a stochastic estimator reproducible).
        Omitting them instantiates the estimator with its defaults.

    Returns
    -------
    dict
        Keys ``r2``, ``mae``, ``mse`` and ``rmse``.
    """
    mod = model(**model_kwargs)
    mod.fit(X_train, y_train)
    y_pred = mod.predict(X_test)

    # Plain arrays keep the subtraction positional even if an estimator
    # returns a pandas object carrying its own index.
    residuals = np.asarray(y_test) - np.asarray(y_pred)
    mse = float(np.mean(residuals ** 2))

    return dict(
        r2=r2_score(y_test, y_pred),
        mae=float(np.mean(np.abs(residuals))),
        mse=mse,
        rmse=float(np.sqrt(mse)),
    )

In [None]:
models = [
    LinearRegression,
    RandomForestRegressor, 
    XGBRegressor,   
    LGBMRegressor
]

scores = list()
for m in models:
    scores.append(evaluate_model(m))

In [None]:
df_scores = pd.DataFrame(scores, index = [m.__name__ for m in models]).sort_values("rmse")
df_scores

In [None]:
metrics = ["rmse", "mae"]
fig, ax = plt.subplots(1,2, figsize=(15,5))
for idx, m in enumerate(metrics):
    df_scores[m].plot.bar(ax=ax[idx])
    ax[idx].set_title(m)
plt.show()

# XGBoost hyperparameter tuning

In [None]:
xgb_reg = XGBRegressor(n_jobs=1, n_estimators = 500)

In [None]:
params = {
 "learning_rate" : [0.05,0.10,0.15,0.20,0.25,0.30],
 "max_depth" : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma": [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
}

In [None]:
tscv = TimeSeriesSplit(n_splits=5)

In [None]:
xgb_model = RandomizedSearchCV(estimator = xgb_reg, param_distributions = params, 
                                     scoring='neg_mean_squared_error', n_iter = 50, cv = tscv, verbose=0, 
                                     random_state=123, n_jobs = -1)

In [None]:
%%time
xgb_model.fit(X_train, y_train)

In [None]:
y_pred_xgb = xgb_model.predict(X_test)

$r^2$ score (higher = better)

In [None]:
r2_score(y_test, y_pred_xgb) #0.969139

MSE-score (lower = better)

In [None]:
((y_test - y_pred_xgb)**2).mean() #0.012923

In [None]:
cv_results_xgb = pd.DataFrame(xgb_model.cv_results_).sort_values("rank_test_score")

In [None]:
param_cols = [col for col in cv_results_xgb.columns if "param_" in col]
param_cols.extend(["rank_test_score"])

Best params:

In [None]:
cv_results_xgb[param_cols].head()

# LightGBM hyperparameter tuning

In [None]:
lgbm_reg = LGBMRegressor(n_jobs=1, n_estimators=1000)

In [None]:
params = {
    "learning_rate" : [0.05,0.10,0.15,0.20,0.25,0.30],
    "max_depth" : randint(3, 30),
    "num_leaves" : randint(6, 50),
    "min_child_weight" : [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1],
    'subsample': uniform(loc=0.2, scale=0.8),
    "colsample_bytree" : uniform(loc=0.4, scale=0.6)
}

In [None]:
lgbm_model = RandomizedSearchCV(estimator = lgbm_reg, param_distributions = params, 
                                     scoring='neg_mean_squared_error', n_iter = 100, cv = tscv, verbose=0, 
                                     random_state=123, n_jobs = -1)

In [None]:
%%time
lgbm_model.fit(X_train, y_train)

In [None]:
y_pred_lgbm = lgbm_model.predict(X_test)

$r^2$ score

In [None]:
r2_score(y_test, y_pred_lgbm) #0.971418

MSE-score

In [None]:
((y_test - y_pred_lgbm)**2).mean() #0.011969

Best params:

In [None]:
cv_results_lgbm = pd.DataFrame(lgbm_model.cv_results_).sort_values("rank_test_score")
param_cols = [col for col in cv_results_lgbm.columns if "param_" in col]
param_cols.extend(["rank_test_score"])
cv_results_lgbm[param_cols].head()

# Applying the best model to the test set

In [None]:
X_test = pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv")
X_test.head()

In [None]:
idx = X_test.row_id

In [None]:
X_test = feature_engineering(X_test)

In [None]:
y_pred = lgbm_model.predict(X_test)

In [None]:
pred_output = pd.DataFrame({'row_id': idx,'num_sold': np.round(np.exp(y_pred))})

In [None]:
pred_output.head()

In [None]:
pred_output.to_csv('submission.csv', index=False)