In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
import plotly.express as px
import time
import matplotlib.pyplot as plt

def downcast(df: pd.DataFrame) -> pd.DataFrame:
    """Shrink 64-bit numeric columns of ``df`` in place to save memory.

    * ``float64`` columns become ``float32`` (precision is reduced accordingly).
    * ``int64`` columns become ``int16`` when every value fits, ``int32`` when
      that is enough, and are otherwise left as ``int64``. Casting blindly to
      ``int16`` would silently wrap any value outside [-32768, 32767].

    Columns of any other dtype are left untouched.

    Args:
        df: frame to shrink; it is modified in place.

    Returns:
        The same ``df`` object, so the call can be chained.
    """
    int16_limits = np.iinfo(np.int16)
    int32_limits = np.iinfo(np.int32)

    for col in list(df.columns):
        dtype = df[col].dtype
        if dtype == "float64":
            df[col] = df[col].astype("float32")
        elif dtype == "int64":
            if df[col].empty:
                # Nothing can overflow; keep the smallest integer type.
                df[col] = df[col].astype("int16")
                continue
            lowest, highest = df[col].min(), df[col].max()
            if int16_limits.min <= lowest and highest <= int16_limits.max:
                df[col] = df[col].astype("int16")
            elif int32_limits.min <= lowest and highest <= int32_limits.max:
                df[col] = df[col].astype("int32")
            # else: values need the full 64 bits, leave the column as is.
    return df


# Why

This notebook achieves a score of 1.04771 on the public leaderboard with minimal training time.
It is designed to create out-of-fold (OOF) meta features for subsequent models.

Data preprocessing is done in the companion notebook.

In [None]:
# The input tables were already cleaned by the companion preprocessing notebook.
items = pd.read_csv('../input/data-preprocessing/items.csv')
shops = pd.read_csv('../input/data-preprocessing/shops.csv')
cats = pd.read_csv('../input/data-preprocessing/item_categories.csv')
train = pd.read_csv('../input/data-preprocessing/sales_train.csv')
test = pd.read_csv('../input/data-preprocessing/test.csv').set_index("ID")

print("Downcasting data to save kernel memory")
# downcast() rewrites the columns of the frame it receives, so its return value
# does not need to be stored. `test` is intentionally not part of this list.
dataframes = [train, shops, items, cats]
for frame in dataframes:
    downcast(frame)

In [None]:
# Quick visual check of the items lookup table read from items.csv.
items

In [None]:
start_train = 13  # first date_block_num kept for modelling

# Load the engineered feature table, keep blocks from start_train onwards and
# shrink its numeric dtypes.
# NOTE: read_pickle can execute arbitrary code stored in the file - only load trusted inputs.
tme = downcast(
    pd.read_pickle("../input/time-feature-eng/tme_with_feature_eng_and_text.pkl")
    .query(f"date_block_num>={start_train}")
)
# do this after lagging
tme["item_cnt_month"] = tme["item_cnt_month"].clip(lower=0, upper=50)

tme.sample(4)  # not the last expression of the cell, so nothing is rendered for it
sns.scatterplot(
    data=tme.query("item_cnt_month>0 and days_with_sales>=0").sample(8000),
    x="days_with_sales",
    y="target",
    alpha=.5,
)

In [None]:
print("Groups of features so far")
import re

# Group the columns of tme by naming convention.
# "_lag" optionally followed by digits:
lag_cols = [name for name in tme.columns if re.search(r"_lag\d*", name) is not None]
# names containing "_prev":
prev_cols = [name for name in tme.columns if re.search(r"_prev", name) is not None]
# names containing "txt_":
txt_cols = [name for name in tme.columns if re.search(r"txt_", name) is not None]

# Hand-picked features that follow none of the naming patterns above.
other_feats = [
    'avg_item_3mo', 'diff_1yr',
    'roc_1_2', 'roc_2_3', 'roc_1_4', 'roc_1_12',
    'diff_12_34',
    'month_num', 'daydiff',
    'item_age', 'item_age_in_shop',
    'price_greater_than80', 'price_var_within_month', 'price_max_avg_within_month',
    'shop_city_id', 'shop_is_internet',
    'days_no_sales_beginning',
]

In [None]:
# Pairwise relationships on a random sample of 2000 rows after the first kept block,
# coloured by item_age.
pairplot_cols = ["target", "item_price_avg", "price_max_avg_within_month",
                 "item_age_in_shop", "roc_1_12", "item_age"]
pairplot_sample = tme.query(f"date_block_num>{start_train}").sample(2000)
sns.pairplot(pairplot_sample[pairplot_cols], hue="item_age")


In [None]:
# Ratio of the 1-month to the 5-month category/shop mean.
# A zero denominator produces inf (or NaN for 0/0) rather than raising.
tme["mean_ratio"] = tme["mean_item_cnt_category_shop_prev_1mo"].div(
    tme["mean_item_cnt_category_shop_prev_5mo"]
)

In [None]:
# Attach shop attributes. The default inner join also rebuilds the row index.
# NOTE(review): unlike the items merge, there is no row-count check here - rows with a
# shop_id missing from `shops` would be dropped silently; confirm that cannot happen.
tme = pd.merge(tme, shops, on="shop_id")

In [None]:
# Attach item attributes and verify the join neither dropped nor duplicated rows.
rows_before_merge = len(tme)
tme = pd.merge(tme, items, on="item_id")
assert len(tme) == rows_before_merge

In [None]:
# Columns whose name starts with "txt", listed after the merges above.
txt_cols = list(filter(lambda col: col.startswith("txt"), tme.columns))
print(txt_cols)

In [None]:
# Features fed to both models.
# NOTE(review): "price_var_within_monthp_lag1" looks like a typo, but the name must match
# whatever column exists in the feature table - confirm before changing it.
use_feats = [
    "days_with_sales",
    "item_cnt_month",
    "avg_item_3mo",
    "days_no_sales_beginning",
    "item_cnt_month_lag1",
    "item_cnt_month_lag2",
    "item_cnt_month_lag12",
    "month_num",
    "daydiff",
    "mean_item_cnt_category_prev_3mo",
    "mean_item_cnt_category_shop_prev_1mo",
    "mean_item_cnt_category_shop_prev_5mo",
    "mean_ratio",
    "mean_item_cnt_shop_prev_6mo",
    "item_age",
    "item_age_in_shop",
    "price_var_within_monthp_lag1",
    "price_var_within_month",
    "revenue_month",
    "shop_is_internet",
    "item_cnt_month",
] + [f"txt_{n}" for n in range(1, 21)]

# Drop duplicates while keeping the order above. list(set(...)) would reorder the
# features differently on every kernel start (string hashing is randomised), which
# changes the column order the models are trained on.
use_feats = list(dict.fromkeys(use_feats))

In [None]:
# Report how many distinct features are in use_feats.
print(f"Using {len(use_feats)} features")

In [None]:


from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import mean_squared_error

# Two-stage, walk-forward ("chained") validation:
#   1. a classifier predicts whether a row has any sales at all (target >= y_limit);
#   2. a random forest regresses the clipped target, trained and applied only on rows
#      whose predicted sales probability exceeds `thres`.
# Each fold trains on blocks [start_train, validate_to) and predicts the blocks from
# validate_to up to the next fold boundary; predictions are written back into tme.

# Make the features model-ready: +/-inf -> NaN -> column mean (0 if a column is all NaN).
# NOTE(review): the means are computed over every row of tme, including blocks that are
# later used for validation.
tme[use_feats] = tme[use_feats].replace([np.inf, -np.inf], np.nan)
tme[use_feats] = tme[use_feats].fillna(tme[use_feats].mean()).fillna(0)

print("A very simple validation loop")

tme["meta_has_sales"] = np.nan
# Float from the start: the column later receives float predictions, and writing floats
# into an integer column is deprecated in pandas.
tme["est_sales"] = 0.0

y_limit = 1   # target >= y_limit counts as "has sales"
thres = 0.5   # threshold used for the classification problem
scores = {}
start_train = 13
SEED = 42     # fixed so both models give the same result on every run

blocks = tme["date_block_num"]
last_block = blocks.max()
# Fold boundaries: each entry is the first block of a validation window.
val = [18, 26, 28, 30, 32, last_block]

for i, validate_to in enumerate(val):
    scores[validate_to] = {}
    train_mask = (blocks >= start_train) & (blocks < validate_to)
    X_train = tme.loc[train_mask, use_feats]
    y_train = tme.loc[train_mask, "target"]
    has_sales_train = y_train >= y_limit

    print(f">Fitting for fold validating to block num {validate_to}")
    clf = HistGradientBoostingClassifier(max_depth=14, max_bins=40, max_iter=15,
                                         learning_rate=0.15, random_state=SEED)
    clf.fit(X_train, has_sales_train)

    print(f"  train score: {clf.score(X_train, has_sales_train):.2f}")
    if validate_to == min(val):
        # The earliest training window never appears as a test window, so it gets
        # in-sample probabilities; every later block gets out-of-fold ones below.
        tme.loc[train_mask, "meta_has_sales"] = clf.predict_proba(X_train)[:, 1]

    if validate_to == last_block or i == (len(val) - 1):
        next_val = last_block
    else:
        next_val = val[i + 1] - 1
    test_mask = (blocks >= validate_to) & (blocks <= next_val)
    X_test = tme.loc[test_mask, use_feats]
    y_test = tme.loc[test_mask, "target"]

    print(f"  test score: {clf.score(X_test, y_test >= y_limit):.2f}")
    tme.loc[test_mask, "meta_has_sales"] = clf.predict_proba(X_test)[:, 1]

    # Level 2 only sees rows the classifier considers likely to have sales.
    likely_sales = tme["meta_has_sales"] > thres
    lvl2_train_mask = train_mask & likely_sales
    lvl2_test_mask = test_mask & likely_sales

    X_train_lvl2 = tme.loc[lvl2_train_mask, use_feats]
    y_train_lvl2 = tme.loc[lvl2_train_mask, "target"].clip(0, 20)
    X_test_lvl2 = tme.loc[lvl2_test_mask, use_feats]
    y_test_lvl2 = tme.loc[lvl2_test_mask, "target"].clip(0, 20)

    #reg = LinearRegression().fit(X_train_lvl2, y_train_lvl2)
    print(f"Fitting regressor on dataset with shape: {X_train_lvl2.shape}")
    reg = RandomForestRegressor(n_estimators=40,
                                min_samples_split=10,
                                max_features=10,
                                max_depth=11,
                                random_state=SEED).fit(X_train_lvl2, y_train_lvl2)

    est_sales_test = reg.predict(X_test_lvl2)   # predicted once, reused below
    y_hat_test = est_sales_test.clip(0, 20)
    # The "LR" labels are left over from the LinearRegression variant commented out above;
    # the numbers are for the random forest.
    lr_mse_train = mean_squared_error(reg.predict(X_train_lvl2).clip(0, 20), y_train_lvl2)
    print(f"    LR mse (train): {lr_mse_train:.2f}")
    scores[validate_to]["train_mse"] = lr_mse_train

    # print and log
    scores[validate_to]["test_mse"] = np.nan
    # Block 33 is skipped when scoring.
    # NOTE(review): presumably it has no usable target - confirm.
    if validate_to != 33:
        lr_mse_test = mean_squared_error(y_hat_test, y_test_lvl2)
        print(f"    LR mse (test): {lr_mse_test:.2f}")
        mean_fill_mse = mean_squared_error(np.full_like(y_hat_test, y_train_lvl2.mean()), y_test_lvl2)
        print(f"    consistency check, scoring when filling with mean (test): {mean_fill_mse:.2f}")
        scores[validate_to]["test_mse"] = lr_mse_test

    # Stored unclipped; rows below the threshold keep the initial 0.
    tme.loc[lvl2_test_mask, "est_sales"] = est_sales_test

    if i == 0 or validate_to == 33:
        print("output fimp here")

In [None]:
print("Sending meta features to output")
# Keep only the keys, both model outputs and the target; the outputs get an "_rf" suffix.
meta_columns = ["item_id","shop_id","date_block_num","est_sales","meta_has_sales","target"]
meta_renames = {"est_sales":"meta_est_sales_rf", "meta_has_sales":"meta_has_sales_rf"}
tme[meta_columns].rename(columns=meta_renames).to_pickle("tme_chained_predictions.pkl")

In [None]:
# Persist the full feature table, including the columns added in this notebook.
tme.to_pickle("tme_with_meta_features.pkl") 

# Submission
You can also submit from here as a sanity check.

In [None]:
# Compare the model with a persistence baseline (this month's count) on the blocks
# strictly between min(val) and 33, all clipped to [0, 20].
chained_test = tme.query(f"date_block_num>{min(val)} and date_block_num<33")
y_hat_model = chained_test["est_sales"].clip(0,20)
y_hat_persistent = chained_test["item_cnt_month"].clip(0,20)
y_true = chained_test["target"].clip(0,20)

print(f"MSE model (on chained test): {mean_squared_error(y_true,y_hat_model):.3f} ")
print(f"MSE persistent (on chained test): {mean_squared_error(y_true,y_hat_persistent):.3f} ")

In [None]:
print("Naive grid search to find the blending coefficient - can be skipped by setting the value of the var best_alpha to a value between 0 and 1 ")
alpha = [0, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
best_alpha = 0
best_mse = 1e10

# Tuning window: blocks strictly between min(val) and 31. The rows and the two baseline
# scores do not depend on alpha, so they are computed once outside the loop.
tune_rows = tme.query(f"date_block_num>{min(val)} and date_block_num<31")
y_hat_model = tune_rows["est_sales"]
y_hat_persistent = tune_rows["item_cnt_month"]
y_true = tune_rows["target"].clip(0,20)
model_mse = mean_squared_error(y_true, y_hat_model.clip(0,20))
persistent_mse = mean_squared_error(y_true, y_hat_persistent.clip(0,20))

for a in alpha:
    y_blended = (y_hat_model * a + y_hat_persistent * (1-a)).clip(0,20)
    blended_mse = mean_squared_error(y_true, y_blended)

    # A blend is accepted only if it beats both pure predictors and the best blend so
    # far; if none does, best_alpha stays 0 (pure persistence).
    if blended_mse < model_mse and blended_mse < persistent_mse and blended_mse < best_mse:
        print(f"Found a better alpha: {a}")
        best_alpha = a
        best_mse = blended_mse

# Hold-out check on block 32, which was not part of the tuning window.
holdout_rows = tme.query(f"date_block_num>31 and date_block_num<33")
y_hat_model = holdout_rows["est_sales"]
y_hat_persistent = holdout_rows["item_cnt_month"]
y_true = holdout_rows["target"].clip(0,20)

# Use the selected coefficient - the loop variable `a` is just the last grid value (1).
y_blended = (y_hat_model * best_alpha + y_hat_persistent * (1-best_alpha)).clip(0,20)
blended_mse = mean_squared_error(y_true, y_blended)
model_mse = mean_squared_error(y_true, y_hat_model.clip(0,20))
persistent_mse = mean_squared_error(y_true, y_hat_persistent.clip(0,20))
print(f"Blended MSE at best alpha on meta test: {blended_mse}")
print(f"Model MSE on meta test: {model_mse}")
print(f"Persistent MSE on meta test: {persistent_mse}")

In [None]:
# Hard 0/1 label from the classifier probability, then the mean target per label.
tme["classification"] = tme["meta_has_sales"].gt(thres).astype(int)
tme["target"].groupby(tme["classification"]).mean()

In [None]:
# Clipped copies of target and prediction, both limited to [0, 20].
tme["clipped_target"] = tme["target"].clip(lower=0, upper=20)
tme["clipped_est_sales"] = tme["est_sales"].clip(lower=0, upper=20)

In [None]:
# Predicted vs. actual (both clipped) on a random sample of 10000 rows with sales probability above 0.5.
likely_sales_sample = tme.query("meta_has_sales>0.5").sample(10000)
likely_sales_sample.plot.scatter(x="clipped_target", y="clipped_est_sales", alpha=.1)

In [None]:
# Same view, unclipped, for a random sample of 10000 rows with sales probability below 0.3.
unlikely_sales_sample = tme.query("meta_has_sales<0.3").sample(10000)
unlikely_sales_sample.plot.scatter(x="target", y="est_sales", alpha=0.1)

In [None]:
# Distribution of the clipped monthly item count in block 33.
sns.histplot(tme.loc[tme["date_block_num"] == 33, "item_cnt_month"].clip(0,20))

In [None]:
# Mean clipped prediction vs. mean clipped current-month count for block 33.
block_33 = tme.loc[tme["date_block_num"] == 33]
print(block_33[["est_sales"]].clip(0,20).mean())
print(block_33[["item_cnt_month"]].clip(0,20).mean())

In [None]:
# Final prediction for block 33: blend of the model estimate and the current-month
# count, weighted by best_alpha and clipped to [0, 20].
preds = (
    tme.query("date_block_num==33")[["shop_id","item_id","est_sales","item_cnt_month"]]
    .assign(
        est_sales=lambda rows: (
            best_alpha * rows["est_sales"] + (1-best_alpha) * rows["item_cnt_month"]
        ).clip(0, 20)
    )
    .drop(columns=["item_cnt_month"])
)

In [None]:
# Column means of the prediction frame, as a quick plausibility check.
preds.mean()

In [None]:
# Attach the predictions to the test pairs and make sure every test row got exactly one.
len_before = len(test)

sample_submission = (
    test.reset_index()
    .merge(preds, on=["shop_id","item_id"])
    .set_index("ID")
    .rename(columns={"est_sales":"item_cnt_month"})
)

assert len(sample_submission) == len_before

In [None]:
# Mean predicted item count over the submission rows.
sample_submission.item_cnt_month.mean()

In [None]:
# Visual check of the submission frame before writing it out.
sample_submission

In [None]:
# Write the ID index and the item_cnt_month column to the submission file.
sample_submission[["item_cnt_month"]].to_csv('../working/submission.csv')

In [None]:
# Highest date_block_num present in the raw training data.
train.date_block_num.max()