In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
import plotly.express as px
import time
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
# Short alias so the reference scores below can be written as mse(y_true, y_pred).
mse = mean_squared_error

def downcast(df: pd.DataFrame) -> pd.DataFrame:
    """Shrink the 64-bit numeric columns of ``df`` in place to save memory.

    ``float64`` columns become ``float32``. ``int64`` columns become ``int16``
    when every value fits, ``int32`` when that is the smallest width that fits,
    and are left as ``int64`` otherwise. A blind cast to ``int16`` would
    silently wrap any value outside [-32768, 32767].

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shrink. It is modified in place; callers in this notebook
        rely on that when they loop over several frames.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, so the call can be chained.
    """
    float_cols = [c for c in df if df[c].dtype in ["float64"]]
    int_cols = [c for c in df if df[c].dtype in ["int64"]]

    for col in float_cols:
        df[col] = df[col].astype("float32")

    for col in int_cols:
        values = df[col]
        if values.empty:
            # No values to protect: min()/max() would be NaN, so cast directly.
            df[col] = values.astype("int16")
            continue
        lowest, highest = values.min(), values.max()
        # Try the narrowest type first and stop at the first one that fits.
        for candidate in (np.int16, np.int32):
            limits = np.iinfo(candidate)
            if limits.min <= lowest and highest <= limits.max:
                df[col] = values.astype(candidate)
                break
    return df

In [None]:
print("Loading chained predictions - basic model. If you do not want to use those check variable 'use_meta_feats' below")
if 1:
    print("Loading GBDT (Classification)+RF chained predictions - LB score is about 1.02 and a slightly different feature set has been used")
    # NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
    chained_rf_test_preds = pd.read_pickle("../input/classification-rf/tme_chained_predictions.pkl")
    print("Check the errors (MAE)")

    # Absolute error between the clipped RF estimate (missing estimates count as 0)
    # and the clipped target.
    chained_rf_test_preds["diff"] = (chained_rf_test_preds["meta_est_sales_rf"].fillna(0).clip(0,20) - chained_rf_test_preds["target"].clip(0,20)).abs()
    condition = "date_block_num>18 and date_block_num<33"
    rf_window = chained_rf_test_preds.query(condition)
    # Plot a 25% sample; the fixed seed makes the figure reproducible across runs.
    # NOTE(review): recent seaborn releases deprecate `ci` in favour of `errorbar=("ci", 90)` -
    # confirm the installed version before switching.
    sns.lineplot(
        data=rf_window.sample(int(0.25 * len(rf_window)), random_state=2022),
        x="date_block_num", y="diff",
        markers=True, dashes=False, ci=90,
    )

In [None]:
# Load chained predictions - for a fast benchmarking
if 1:
    print("Loading chained predictions from optimized GBDT model - uses autoregressive features only. Optimization ran on the GBDT model parameters")
    chained_gbdt_test_preds = pd.read_csv("../input/lgb-simple-opt/chained_predictions.csv")
    # Reference score for several starting months, all ending before month 33.
    for d in [18,30,31,32]:
        window = chained_gbdt_test_preds.query(f"date_block_num>={d} and date_block_num<33")
        # Square root of the MSE on clipped values. The previous code squared
        # the MSE and labelled the result "MSE".
        rmse = mse(window["target"].clip(0,20), window["est_sales"].clip(0,20)) ** 0.5

        print(f"RMSE on the chained test set, from {d}: {rmse}")

    condition = "date_block_num>18 and date_block_num<33"
    # Absolute error between the clipped estimate and the clipped target.
    chained_gbdt_test_preds["diff"] = (chained_gbdt_test_preds["est_sales"].clip(0,20) - chained_gbdt_test_preds["target"].clip(0,20)).abs()
    gbdt_window = chained_gbdt_test_preds.query(condition)
    # Plot a 25% sample; the fixed seed makes the figure reproducible across runs.
    sns.lineplot(
        data=gbdt_window.sample(int(0.25 * len(gbdt_window)), random_state=2022),
        x="date_block_num", y="diff",
        markers=True, dashes=False, ci=90,
    )

In [None]:
# Input tables read from the "data-preprocessing" input folder
# (presumably slightly improved versions of the raw competition files - confirm).
items = pd.read_csv('../input/data-preprocessing/items.csv')
shops = pd.read_csv('../input/data-preprocessing/shops.csv')
cats = pd.read_csv('../input/data-preprocessing/item_categories.csv')
train = pd.read_csv('../input/data-preprocessing/sales_train.csv')
# The test table is indexed by its "ID" column and is not downcast below.
test = pd.read_csv('../input/data-preprocessing/test.csv').set_index("ID")

print("Downcasting data to save kernel memory")
dataframes = [train, shops, items, cats]
for frame in dataframes:
    # downcast() rewrites the columns of the frame it receives, so the
    # variables above end up with the smaller dtypes.
    downcast(frame)

# Load the feature engineering

The engineered features should already be available at this point.
They are produced by the companion notebooks:
1. [general data processing](https://www.kaggle.com/code/andreawr/data-preprocessing)
2. [time (lag-based) features](https://www.kaggle.com/code/andreawr/time-feature-eng)

The tasks have been split across notebooks to make the most of the kernel memory.

In [None]:
# First month (date_block_num) kept in the modelling frame.
start_train = 13
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
tme = downcast(pd.read_pickle("../input/time-feature-eng/tme_with_feature_eng_and_text.pkl").query(f"date_block_num>={start_train}"))
tme["item_cnt_month"] =  tme["item_cnt_month"].clip(0,50)

# Quick look at target vs. days_with_sales for rows with sales.
# The sample size is capped at the number of available rows so the cell
# cannot fail on a small frame, and the seed keeps the figure reproducible.
plot_rows = tme.query("item_cnt_month>0 and days_with_sales>=0")
sns.scatterplot(data=plot_rows.sample(n=min(8000, len(plot_rows)), random_state=2022),
                x="days_with_sales", y="target", alpha=.5)

In [None]:
print("Groups of features so far")

import re

# Columns whose name starts with "shop_cl".
shop_cl_col = [name for name in tme.columns if name.startswith("shop_cl")]

# Time-based features, from notebook "time-feature-eng": names containing "_lag".
pattern = re.compile(r"_lag\d*")
lag_cols = [name for name in tme.columns if pattern.search(name) is not None]

# Names containing "_prev".
pattern = re.compile(r"_prev")
prev_cols = [name for name in tme.columns if pattern.search(name) is not None]

# Names containing "txt_".
pattern = re.compile(r"txt_")
txt_cols = [name for name in tme.columns if pattern.search(name) is not None]

# Hand-maintained list of the remaining model features.
other_feats = [
    'avg_item_3mo', 'diff_1yr',
    'roc_1_2', 'roc_2_3', 'roc_1_4', 'roc_1_12',
    'diff_12_34',
    'month_num', 'daydiff',
    'item_age', 'item_age_in_shop',
    'price_greater_than80', 'price_var_within_month', 'price_max_avg_within_month',
    'shop_city_id', 'shop_is_internet',
    'days_no_sales_beginning',
]


In [None]:
# Columns that belong to none of the feature groups defined so far,
# kept in their original column order.
already_grouped = set(lag_cols) | set(prev_cols) | set(txt_cols) | set(other_feats) | set(shop_cl_col)
remainder = [name for name in tme.columns if name not in already_grouped]

In [None]:
# Show which columns were picked up by the "shop_cl" prefix.
print(shop_cl_col)

In [None]:
# Sanity check: the label column must not have been captured by any feature group.
assert "target" in remainder

In [None]:
# meta features from GBDT (LightGBM), optimized with Optuna
use_meta_feats = True
merge_keys = ["item_id","shop_id","date_block_num"]
if not use_meta_feats:
    tme["meta_est_sales"] = np.nan # features from GDBT
else:
    print("using meta features")

    # Drop leftovers from a previous run of this cell so the merge below
    # cannot produce suffixed duplicate columns.
    tme = tme.drop(columns=["est_sales", "meta_est_sales"], errors="ignore")
    gbdt_preds = (
        chained_gbdt_test_preds[merge_keys + ["est_sales"]]
        .rename(columns={"est_sales":"meta_est_sales"})
    )
    # NOTE(review): this is an inner join, so rows of tme without a GBDT
    # prediction are dropped - confirm that this is intended.
    tme = tme.merge(gbdt_preds, on=merge_keys)

In [None]:
# Both branches of the previous cell must leave a "meta_est_sales" column behind.
assert "meta_est_sales" in tme.columns

In [None]:
# meta features from GBDT Classifier + RF regressor. No hyperparam optimization
if use_meta_feats:
    print("Using meta features from classifier + regressor")
    rf_keys = ["item_id","shop_id","date_block_num"]
    rf_cols = ["meta_has_sales_rf","meta_est_sales_rf"]
    # Drop columns left by a previous run of this cell; otherwise the merge
    # would create "_x"/"_y" suffixed copies and the lookups below would fail.
    tme = tme.drop(columns=rf_cols, errors="ignore")
    # Left join: rows without an RF prediction are kept with NaN in the new columns.
    tme = tme.merge(chained_rf_test_preds[rf_keys + rf_cols],
                    on=rf_keys, how="left")

    # Gap between the GBDT and RF estimates, zeroed where the classifier
    # score is below 0.5.
    tme["gbdt_rf_diff"] = (tme["meta_has_sales_rf"]>=0.5).astype(int) * (tme["meta_est_sales"]-tme["meta_est_sales_rf"])
else:
    tme["gbdt_rf_diff"] = np.nan
    tme["meta_has_sales_rf"] = np.nan
    tme["meta_est_sales_rf"] = np.nan

In [None]:
# Both branches of the previous cell must create these three columns.
assert "gbdt_rf_diff" in tme.columns
assert "meta_has_sales_rf" in tme.columns
assert "meta_est_sales_rf" in tme.columns

In [None]:
if use_meta_feats:
    # Compare the two meta estimates on rows where both exist, clipped to [0, 2].
    meta_pair = tme[["meta_est_sales","meta_est_sales_rf"]].dropna().clip(0,2)
    # Cap the sample at the available rows (sample() raises when asked for more)
    # and fix the seed so the figure is reproducible.
    sns.histplot(meta_pair.sample(n=min(6000, len(meta_pair)), random_state=2022))

In [None]:
# Store every column whose name starts with "txt" as bool, one column at a
# time, then shrink the remaining 64-bit numeric columns.
# NOTE(review): astype(bool) maps NaN to True - confirm these columns have no missing values.
txt_flag_cols = [name for name in tme.columns if name.startswith("txt")]
for name in txt_flag_cols:
    tme[name] = tme[name].astype(bool)
tme = downcast(tme)

In [None]:
# Candidate model features: ids plus every feature group (text features are left out).
use_feats = ["shop_id", "item_id"] + other_feats + prev_cols + lag_cols + shop_cl_col # + txt_cols

print("List of low importance feature is manually mantained")
low_imp_feats = ['price_max_avg_within_month','shop_city_id','shop_is_internet','txt_adventur','txt_audiobook','txt_creed','txt_digipack','txt_figur','txt_firm','txt_sat','txt_tale',
                 'txt_classic','txt_collect','txt_dvd','txt_figurin','txt_mp3','txt_delivery','txt_best','txt_2cd','txt_box','txt_ps3','txt_black']
use_feats = [u for u in use_feats if u not in low_imp_feats]

if use_meta_feats:
    use_feats = use_feats + ["meta_est_sales_rf","gbdt_rf_diff","meta_has_sales_rf","meta_est_sales"]
# De-duplicate while keeping first-seen order. list(set(...)) would shuffle the
# features differently on every kernel start (string hashes are randomised),
# making the column order fed to the model non-reproducible.
use_feats = list(dict.fromkeys(use_feats))

In [None]:
# Settings for the LightGBM model trained below. The dict is passed to
# lgb.train as-is, and a few entries are also read directly by this notebook.
params = {
    "lr": 0.12,  # copied into "learning_rate" below
    "max_depth": 9,
    "n_estimators": 55,
    "bagging_fraction": 0.6,
    "min_data_in_leaf": 20,
    "max_bin": 130,
    "initial_sales_clipping": 20,  # upper clip applied to the training target by the notebook
    "bagging_freq": 5,
    "objective": "mse",
    "verbose": -1,
    "force_col_wise": True,
    # "eval_metric" is not a parameter of the native LightGBM API, so on its
    # own it never selected RMSE; the key is kept for any code that reads it.
    "eval_metric": "rmse",
    # Native parameter name for the evaluation metric.
    "metric": "rmse",
}

# Leaf budget derived from the tree depth, capped at 4096.
params["num_leaves"] = min(2 ** params["max_depth"] - 2, 4096)
params["learning_rate"] = params["lr"]
import lightgbm as lgb

# Train - checking CV scores

In [None]:
# Expanding-window training. Each entry of `val` is the first month a fold
# holds out: the fold trains on [start_train, validate_to) and predicts from
# validate_to up to the month before the next entry; the last fold predicts
# through the final month of `tme`.
last_block = tme.date_block_num.max()
val = [28, last_block]  # a longer schedule used before: [18, 22, 25, 28, 30, last_block]

tme["est_sales"] = np.nan
scores = {}  # per-fold RMSE values, keyed by the fold's first held-out month

for i, validate_to in enumerate(val):
    is_last_fold = (i + 1) == len(val)
    # The fold that starts on the final month is not scored.
    # NOTE(review): presumably that month is the submission month and has no usable target - confirm.
    has_test_score = validate_to != last_block

    train_rows = tme.query(f"date_block_num>={start_train} and date_block_num < {validate_to}")
    X_train = train_rows[use_feats]
    y_train = train_rows["target"].clip(0, params["initial_sales_clipping"])
    train_rows = None

    # 10% random hold-out from the training months, used for early stopping.
    X_val = X_train.sample(int(X_train.shape[0]*.1), random_state=2022)
    y_val = y_train.loc[X_val.index]

    X_train = X_train.drop(X_val.index)
    y_train = y_train.drop(y_val.index)

    lgb_train_dataset = lgb.Dataset(X_train, y_train)
    lgb_eval_dataset = lgb.Dataset(X_val, y_val)

    # NOTE(review): verbose_eval / early_stopping_rounds are accepted by lgb.train in
    # LightGBM 3.x; LightGBM 4 expects callbacks instead - confirm the installed version.
    model = lgb.train(
                      params,
                      lgb_train_dataset,
                      valid_sets=(lgb_train_dataset, lgb_eval_dataset),
                      feature_name = use_feats,
                      verbose_eval=5,
                      early_stopping_rounds = 10
                     )

    if validate_to == last_block or is_last_fold:
        next_val = last_block
    else:
        next_val = val[i+1] - 1

    train_rmse = mean_squared_error(y_train.clip(0,20), model.predict(X_train).clip(0,20)) ** 0.5
    val_rmse = mean_squared_error(y_val.clip(0,20), model.predict(X_val).clip(0,20)) ** 0.5
    scores[validate_to] = {"train_rmse": train_rmse, "val_rmse": val_rmse}

    # memory cleanup
    lgb_train_dataset = lgb_eval_dataset = None
    X_train = y_train = X_val = y_val = None

    # generate chained predictions
    print(f"Generating test predictions from {validate_to} to {next_val}")
    condition = f"date_block_num >= {validate_to} and date_block_num<={next_val}"
    test_rows = tme.query(condition)
    X_test = test_rows[use_feats]

    test_preds = model.predict(X_test)
    tme.loc[X_test.index, "est_sales"] = test_preds # no clipping

    print(f"Train rmse: {train_rmse:.3f}")
    print(f"Val   rmse: {val_rmse:.3f}")
    if has_test_score:
        y_test = test_rows["target"]
        test_rmse = mean_squared_error(y_test.clip(0,20), test_preds.clip(0,20)) ** 0.5
        scores[validate_to]["test_rmse"] = test_rmse
        print(f"Test  rmse: {test_rmse:.3f}")

    test_rows = None
    test_preds = None

    if not is_last_fold:
        # Only the model of the last fold is kept: later cells read its
        # feature importances and save it to disk.
        model = None

In [None]:
# Training log line kept for reference:
#[100]	training's l2: 0.725106	valid_1's l2: 0.827131

# Absolute errors of the meta estimate and of this notebook's estimate,
# both measured against the target after clipping everything to [0, 20].
clipped_target = tme["target"].clip(0,20)
tme["diff_meta"] = (tme["meta_est_sales"].clip(0,20) - clipped_target).abs()
tme["diff_est"] = (tme["est_sales"].clip(0,20) - clipped_target).abs()

# Rows strictly after the first validation month and before month 33,
# restricted to those where both errors are available.
in_window = (tme["date_block_num"] > val[0]) & (tme["date_block_num"] < 33)
data = tme.loc[in_window, ["date_block_num","diff_meta","diff_est"]].dropna()
print(len(data))

In [None]:
print("Feature importance is better evaluated on the test set..")
# Pair each feature name with the importance reported by the model kept from training.
importance_pairs = list(zip(model.feature_name(), model.feature_importance()))
for name, imp in importance_pairs:
    print(f"{name} -> {imp}")

# Features whose importance is at most 30.
lowest_imp_feats = [name for name, imp in importance_pairs if imp <= 30]

In [None]:
# Display the features collected in the previous cell (importance <= 30).
lowest_imp_feats

# Submit (and serialize model)

In [None]:
# Disabled by default (`if 0`). When enabled, plots the two absolute-error
# columns held in `data` against the month, using a 35% sample of the rows.
if 0: # a sanity check on the meta features
    sns.lineplot(
            data=data.sample(int(0.35*data.shape[0])).melt(id_vars="date_block_num"),
            x="date_block_num", y="value", hue="variable",
            markers=True, dashes=False, ci=90,
        )

In [None]:
# Rows of month 33 hold the predictions that go into the submission.
test_month = tme.query("date_block_num==33")
print(test_month[["est_sales"]].clip(0,20).mean())
print(test_month[["item_cnt_month"]].clip(0,20).mean())

# assign() builds a new frame, so the clipped values are never written into a
# slice of `tme` (which would trigger a SettingWithCopyWarning).
preds = test_month[["shop_id","item_id","est_sales"]].assign(
    est_sales=lambda frame: frame["est_sales"].clip(0, 20)
)

print(preds.mean())
len_before = test.shape[0]

# Attach the predictions to the test pairs; pairs without a prediction get NaN.
sample_submission = (
    test.reset_index()
    .merge(preds, on=["shop_id","item_id"], how="left")
    .set_index("ID")
    .rename(columns={"est_sales":"item_cnt_month"})
)

num_nans = sample_submission["item_cnt_month"].isna().sum()
if num_nans>0:
    print(f"Pay attention - there are {num_nans} NaNs in your item_cnt_month. Filling with zero")
    sample_submission["item_cnt_month"] = sample_submission["item_cnt_month"].fillna(0)

# The merge must not add or remove test rows (it would if `preds` had
# duplicated shop/item pairs).
assert sample_submission.shape[0] == len_before

# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(sample_submission["item_cnt_month"].mean())
sample_submission[["item_cnt_month"]].to_csv('../working/submission.csv')

In [None]:
# Write the model kept from the training loop to a text file, limited to its
# best iteration.
# NOTE(review): the file name says "classifier" although the training objective
# above is 'mse' - confirm the name is intentional.
model.save_model('lgb_classifier.txt', num_iteration=model.best_iteration) 

# Load by using:
#model = lgb.Booster(model_file='lgb_classifier.txt')