In [None]:
!pip install -U scikit-learn ray[tune] bayesian-optimization

In [None]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import mean_squared_error
# Short alias for sklearn's mean_squared_error.
mse = mean_squared_error

In [None]:
# Load the ready-made dataset from the parent notebook.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
tme = pd.read_pickle("../input/classification-linr-experiment/tme_with_meta_features.pkl")

In [None]:
# Peek at 12 random rows (no random_state is given, so the rows differ between runs).
tme.sample(12)

In [None]:
# List the columns available in the loaded frame.
tme.columns

In [None]:
# Number of boosting rounds handed to every lgb.train call as num_boost_round.
# NOTE(review): the params dicts used below also carry "n_estimators"; LightGBM
# documents that name as an alias of the iteration count, so confirm which of
# the two values actually takes effect.
num_boost_round = 100

In [None]:
# Month-of-cycle feature: the running month counter folded onto 1..12.
tme["month_num"] = tme["date_block_num"].mod(12).add(1)
tme["month_num"]

In [None]:
# Relative difference between the lag-1 and lag-12 values of item_cnt_month,
# expressed as a fraction of the lag-12 value.
# NOTE(review): rows where item_cnt_month_lag12 is 0 yield +/-inf (or NaN when
# the numerator is 0 as well); confirm the model is expected to see those values.
tme["diff_1yr"] = (tme["item_cnt_month_lag1"] - tme["item_cnt_month_lag12"]) / tme["item_cnt_month_lag12"]

In [None]:
from ray import tune
from operator import itemgetter

# Columns fed to the model, in the order LightGBM will see them.
use_feats = [
    # identifiers and current-month count
    "shop_id",
    "item_id",
    "item_cnt_month",
    # lagged monthly counts
    "item_cnt_month_lag1",
    "item_cnt_month_lag2",
    "item_cnt_month_lag3",
    "item_cnt_month_lag12",
    # meta features prepared upstream
    "daydiff",
    "num_item_category_shop_prev_1mo",
    "mean_ratio",
    "mean_item_cnt_category_prev_3mo",
    "num_item_category_shop_prev_5mo",
    "mean_item_cnt_category_shop_prev_12mo",
    "item_age",
    "item_age_in_shop",
    # features engineered in this notebook
    "month_num",
    "diff_1yr",
]

# Subset of use_feats that LightGBM should treat as categorical.
categorical_features = ["item_id", "shop_id"]

# First date_block_num included in any training window.
start_train = 13

class LightGbmRegressorTrainable(tune.Trainable):# https://docs.ray.io/en/latest/tune/api_docs/trainable.html#function-api
    """Ray Tune trainable that scores one LightGBM configuration fold by fold.

    Every ``step`` call handles one entry of ``data["val"]``: it trains a model
    on the rows with ``start_train <= date_block_num < val[k]`` (``k`` being the
    number of steps already taken), holds out a random 15% of those rows for
    early stopping, and reports the running mean of the hold-out RMSE as
    ``score``.

    Reads the module-level names ``start_train``, ``use_feats``,
    ``categorical_features`` and ``num_boost_round``.
    """

    def setup(self, config:dict, data:dict=None):
        """Store hyperparameters and data; invoked once when the trial starts.

        Args:
            config: sampled hyperparameters. Recognised keys are ``lr``,
                ``max_depth``, ``max_bin``, ``n_estimators``,
                ``min_data_in_leaf``, ``bagging_fraction`` and
                ``initial_sales_clipping``; missing keys fall back to defaults.
            data: dict with ``"tme"`` (the feature frame) and ``"val"`` (a
                sequence of ``date_block_num`` cut-offs, one per step).

        Raises:
            TypeError: if ``data`` is not supplied.
        """
        if data is None:
            # Fail early with a readable message instead of "'NoneType' is not subscriptable".
            raise TypeError("LightGbmRegressorTrainable needs data={'tme': ..., 'val': ...}")

        self.x = 0  # index of the next fold to evaluate
        self.total_loss = 0
        self.average_loss = 0
        self.losses = {"train": [], "val": []}

        # Hyperparams to tune
        self.lr = config.get("lr", 0.15)
        self.max_depth = config.get("max_depth", None)
        self.max_bin = config.get("max_bin", 255)
        self.n_estimators = config.get("n_estimators", None)
        self.min_data_in_leaf  = int(config.get("min_data_in_leaf", 20))
        self.bagging_fraction = config.get("bagging_fraction", .75)
        self.initial_sales_clipping = int(config.get("initial_sales_clipping", 50))

        self.tme = data["tme"]
        self.val = data["val"]
        print(f"Validating to {self.val}")

    def step(self):  # This is called iteratively.
        """Train and score the next fold.

        Returns:
            dict with ``score`` (mean hold-out RMSE over the folds seen so far),
            ``train_score`` / ``cv_score`` (RMSE of the fold just trained, or
            ``None`` when every fold has already been evaluated) and ``losses``
            (the per-fold history).
        """
        train_score = None
        cv_score = None

        # Strict comparison: self.val[len(self.val)] does not exist.
        if self.x < len(self.val):
            validate_to = self.val[self.x]

            # Select the training window once and derive features/target from it.
            window = self.tme.query(f"date_block_num>={start_train} and date_block_num < {validate_to}")
            X_train = window[use_feats].copy(deep=True)
            y_train = window["target"].clip(0, self.initial_sales_clipping) # arbitrary

            X_val = X_train.sample(int(X_train.shape[0]*.15), random_state=2022) # used for early stopping
            y_val = y_train.loc[X_val.index]

            X_train = X_train.drop(X_val.index)
            y_train = y_train.drop(y_val.index)

            lgb_train_dataset = lgb.Dataset(X_train, y_train)
            lgb_eval_dataset = lgb.Dataset(X_val, y_val)

            params = {
                'objective': 'mse',
                'metric': 'rmse',
                "max_bin": self.max_bin,
                'learning_rate': self.lr,
                "min_data_in_leaf": self.min_data_in_leaf,
                'bagging_fraction': self.bagging_fraction,
                'bagging_freq': 5,
                'seed': 2022,
                'verbose': 0,
            }
            # Optional settings are forwarded only when configured, so the
            # defaults (None) fall back to LightGBM's own defaults instead of
            # failing on ``2 ** None`` or passing None as an iteration count.
            if self.n_estimators is not None:
                params['n_estimators'] = self.n_estimators
            if self.max_depth is not None:
                params["max_depth"] = self.max_depth
                if self.max_depth > 0:
                    # Slightly fewer leaves than a full tree of this depth, capped
                    # at 4096 and never below LightGBM's minimum of 2.
                    params['num_leaves'] = max(2, min(2 ** self.max_depth - 2, 4096))

            model = lgb.train(
                              params,
                              lgb_train_dataset,
                              num_boost_round=num_boost_round,
                              valid_sets=(lgb_train_dataset, lgb_eval_dataset),
                              feature_name = use_feats,
                              categorical_feature = categorical_features,
                              early_stopping_rounds = 10
                             )

            # Predictions and targets are both clipped to [0, 20] before scoring.
            train_score = mean_squared_error(model.predict(X_train).clip(0,20), y_train.clip(0,20)) ** 0.5
            cv_score = mean_squared_error(model.predict(X_val).clip(0,20), y_val.clip(0,20)) ** 0.5

            self.losses["train"].append(train_score)
            self.losses["val"].append(cv_score)

            self.total_loss = self.total_loss + cv_score
            self.average_loss = self.total_loss / (1+self.x)

        else:
            # All folds consumed: nothing to train, the running average is kept.
            print("Do noting")
        self.x += 1

        return {"score": self.average_loss, "train_score": train_score, "cv_score": cv_score, "losses": self.losses}

In [None]:
from ray.tune.schedulers import ASHAScheduler
import os
checkpoint_dir = "../working/exp-ray"
SMOKE_TEST = False  # True -> only 2 sampled configurations
RUN_OPT = True      # False -> skip the search entirely

if RUN_OPT:
    os.makedirs(checkpoint_dir, exist_ok=True)

    # One training iteration is run per cut-off in this list.
    validations_starting_points = [28, 30, 32]

    # Successive-halving scheduler minimising the reported "score".
    asha_scheduler = ASHAScheduler(
        time_attr='training_iteration',
        metric='score',
        mode='min',
        max_t=99999,
        grace_period=2,  # a trial runs at least this many iterations before it can be stopped
        reduction_factor=2,
        brackets=1,
    )

    # Hyperparameter search space sampled by Tune.
    search_space = {
        "lr": tune.choice([0.016,0.12]),
        "max_depth": tune.choice([5,6,7,10,14]),
        "n_estimators": tune.choice([50, 100]),
        "bagging_fraction": tune.uniform(.5, .8),
        "min_data_in_leaf": tune.choice([10, 20, 50, 120, 250]),
        "max_bin": tune.choice([200]),
        "initial_sales_clipping": tune.choice([20, 50, 100]),
    }

    # Bind the data frame and the fold cut-offs to the trainable.
    trainable = tune.with_parameters(
        LightGbmRegressorTrainable,
        data={"tme": tme, "val": validations_starting_points},
    )

    analysis = tune.run(
        trainable,
        num_samples=2 if SMOKE_TEST else 10,
        max_concurrent_trials=1,
        stop={"training_iteration": len(validations_starting_points)},
        config=search_space,
        scheduler=asha_scheduler,
        resources_per_trial={"cpu": 2, "gpu": 0},
        time_budget_s=60*60,  # one hour for the whole experiment
        name="experiment_3",
        local_dir=checkpoint_dir,
        resume=False,  # start a fresh experiment rather than resuming
        verbose=0,
    )

In [None]:
# Show the configuration with the lowest reported "score".
if RUN_OPT:
    print(analysis.get_best_config(metric="score", mode="min"))

In [None]:
# Per-trial summary: scores, timing and the sampled configuration, best cv_score first.
if RUN_OPT:
    add_cols = [col for col in analysis.results_df if col.startswith("config.")]
    report_cols = ["cv_score", "train_score", "experiment_tag", "time_total_s", "training_iteration"] + add_cols
    display(analysis.results_df[report_cols].sort_values("cv_score", ascending=True))

In [None]:
# Draw the cv_score reported at each training iteration, one line per trial.
if RUN_OPT:
    dfs = analysis.trial_dataframes

    ax = None  # the first plot creates the axes, later plots reuse them
    for trial_df in dfs.values():
        ax = trial_df["cv_score"].plot(ax=ax, legend=False)


In [None]:

# Choose the hyperparameters for the final models: either a hard-coded
# configuration (presumably the winner of an earlier search - confirm) or the
# best configuration of the search that just ran.
if not RUN_OPT:
    params = {'lr': 0.12, 'max_depth': 5, 'n_estimators': 100, 'bagging_fraction': 0.5846880206153061, 'min_data_in_leaf': 20, 'max_bin': 255, 'initial_sales_clipping': 20, 'num_leaves': 30, 'bagging_freq': 5, 'objective': 'mse'}
else:
    print("Got new params from the optimization")
    best_config = analysis.get_best_config(metric="score", mode="min")
    if best_config is None:
        raise RuntimeError("The search produced no trial with a 'score'; cannot pick parameters.")
    # Copy so the additions below do not modify the config stored in `analysis`.
    params = dict(best_config)

# The search space names the step size "lr", which LightGBM does not recognise;
# without this mapping the final models would silently train with the default
# learning rate.
params["learning_rate"] = params["lr"]
params["num_leaves"] = min(2 ** params["max_depth"] - 2, 4096)
params["bagging_freq"] = 5
params["objective"] = "mse"
# Same metric and seed as the tuning trainable, unless already set explicitly.
params.setdefault("metric", "rmse")
params.setdefault("seed", 2022)
print(params)

In [None]:
# Display the feature frame.
tme

In [None]:
# Echo the settings that the final training loop will use.
print(num_boost_round)
print(params)

In [None]:
from sklearn.metrics import mean_squared_error
# Short alias for sklearn's mean_squared_error (same binding as near the top of the notebook).
mse = mean_squared_error

In [None]:
def _clipped_rmse(y_true, y_pred, upper=20):
    """RMSE after clipping both targets and predictions to [0, upper]."""
    return mean_squared_error(y_true.clip(0, upper), y_pred.clip(0, upper)) ** 0.5


# Chained evaluation: for every cut-off a model is trained on the blocks before
# it and used to predict the blocks from the cut-off up to the next one.
last_block = tme.date_block_num.max()
val = [18, 25, 28, 32, last_block]
scores = {}  # cut-off -> {"train": ..., "val": ..., "test": ...}
tme["est_sales"] = np.nan

# Keys that belong to this notebook rather than to LightGBM are kept out of the
# training parameters; "lr" is forwarded under the name LightGBM understands
# (otherwise the tuned learning rate is ignored).
notebook_only_keys = {"lr", "initial_sales_clipping"}
lgb_params = {key: value for key, value in params.items() if key not in notebook_only_keys}
if "lr" in params:
    lgb_params.setdefault("learning_rate", params["lr"])

for i, validate_to in enumerate(val):
    # Training window: every block from start_train up to (not including) the cut-off.
    window = tme.query(f"date_block_num>={start_train} and date_block_num < {validate_to}")
    X_train = window[use_feats].copy(deep=True)
    y_train = window["target"].clip(0, params["initial_sales_clipping"]) # arbitrary

    X_val = X_train.sample(int(X_train.shape[0]*.15), random_state=2022) # used for early stopping
    y_val = y_train.loc[X_val.index]

    X_train = X_train.drop(X_val.index)
    y_train = y_train.drop(y_val.index)

    lgb_train_dataset = lgb.Dataset(X_train, y_train)
    lgb_eval_dataset = lgb.Dataset(X_val, y_val)

    model = lgb.train(
                      lgb_params,
                      lgb_train_dataset,
                      num_boost_round=num_boost_round,
                      valid_sets=(lgb_train_dataset, lgb_eval_dataset),
                      feature_name = use_feats,
                      categorical_feature = categorical_features,
                      verbose_eval=5,
                      early_stopping_rounds = 10
                     )

    # The prediction range ends right before the next cut-off, or at the last block.
    if validate_to == last_block or (i+1)==len(val):
        next_val = last_block
    else:
        next_val = val[i+1] - 1

    print(f"Generating test predictions from {validate_to} to {next_val}")
    condition = f"date_block_num >= {validate_to} and date_block_num<={next_val}"
    test_rows = tme.query(condition)
    X_test = test_rows[use_feats]
    y_test = test_rows["target"]

    test_pred = model.predict(X_test)
    tme.loc[X_test.index, "est_sales"] = test_pred # not clipping

    train_rmse = _clipped_rmse(y_train, model.predict(X_train))
    val_rmse = _clipped_rmse(y_val, model.predict(X_val))
    # The final block is the one being forecast, so no test score is reported for it.
    has_test_score = validate_to != last_block
    test_rmse = _clipped_rmse(y_test, test_pred) if has_test_score else None

    scores[validate_to] = {"train": train_rmse, "val": val_rmse, "test": test_rmse}

    print(f"Validating to {validate_to}")
    print(f"Train rmse: {train_rmse:.3f}")
    print(f"Val   rmse: {val_rmse:.3f}")
    if has_test_score:
        print(f"Test  rmse: {test_rmse:.3f}")

In [None]:
# Save identifiers, predictions and targets for every row to the working directory.
tme[["item_id","shop_id","date_block_num","est_sales","target"]].to_csv("chained_predictions.csv")

In [None]:
# Scatter of target vs. estimate (both clipped to [0, 20]) on 15000 random rows
# from blocks 21..32. No random_state is set, so the sample differs per run.
# NOTE(review): sample(15000) needs at least 15000 matching rows - confirm.
tme.query("date_block_num<33 and date_block_num>20").sample(15000)[["target","est_sales"]].clip(0,20).plot(x="target",
                                                                                                           y="est_sales",
                                                                                                           kind="scatter", alpha=.1)

In [None]:
# Predictions for block 33, keyed by shop and item.
preds = tme.loc[tme["date_block_num"] == 33, ["shop_id", "item_id", "est_sales"]]

In [None]:
print(preds.mean())

test = pd.read_csv('../input/data-preprocessing/test.csv').set_index("ID")
len_before = len(test)

# Attach the estimate to every (shop_id, item_id) pair of the test file and
# name the column as the submission expects.
sample_submission = (
    test.reset_index()
        .merge(preds, on=["shop_id", "item_id"])
        .set_index("ID")
        .rename(columns={"est_sales": "item_cnt_month"})
)
sample_submission["item_cnt_month"] = sample_submission["item_cnt_month"].clip(0, 20)

# The merge must neither drop nor duplicate test rows.
assert sample_submission.shape[0] == len_before

In [None]:
import seaborn as sns
# Distribution of the clipped predictions that go into the submission.
sns.histplot(sample_submission["item_cnt_month"])

In [None]:
# Write the submission file: the ID index plus the item_cnt_month column.
sample_submission[["item_cnt_month"]].to_csv('../working/submission.csv')