This notebook shows part of the 22nd-place solution (the training stage of a single LightGBM model).

Please see [Part of 22nd solution - single LGBM (Private:0.03038)](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/324152) for details.

Please see [22nd-place-lgbm-model-single-infer](https://www.kaggle.com/code/iwatatakuya/22nd-place-lgbm-model-single-infer) for the inference part.

- Local (9/16~22): 0.03559
- Public: 0.03022
- Private: 0.03038

In [None]:
# Root directory of the competition data. read_data() concatenates the CSV
# file names onto this string, so the trailing slash is required.
path = "../input/h-and-m-personalized-fashion-recommendations/"

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import gc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import datetime
import itertools
import os
from contextlib import redirect_stdout
from tqdm.notebook import tqdm

In [None]:
# Seed reused for LightGBM (bagging_seed / random_state) and for the
# StratifiedKFold splitter in recommend_train().
rand = 64

# Parameter set passed unchanged to lgb.train() in recommend_train().
lgb_params = dict(
    objective="binary",
    boosting="gbdt",
    max_depth=-1,
    num_leaves=40,
    subsample=0.8,
    subsample_freq=1,
    bagging_seed=rand,
    learning_rate=0.05,
    feature_fraction=0.6,
    min_data_in_leaf=100,
    lambda_l1=0,
    lambda_l2=0,
    random_state=rand,
    metric="auc",  # or "binary_logloss"
    verbose=-1,
)


In [None]:
# dtype maps handed to pd.read_csv() in read_data().
tran_dtypes = dict(
    t_dat="str",  # parsed to datetime after loading
    customer_id="str",
    article_id="int",
    product_code="int",
    price="float",
    sales_channel_id="int",
)
# Every listed article column is read as int except index_code, which is read
# as a string and label-encoded in read_data().
art_dtypes = {
    column: ("str" if column == "index_code" else "int")
    for column in (
        "article_id",
        "product_code",
        "product_type_no",
        "graphical_appearance_no",
        "colour_group_code",
        "department_no",
        "index_code",
        "index_group_no",
        "section_no",
        "garment_group_no",
    )
}
cust_dtypes = dict(customer_id="str")

# "class" or "rank" (not read by the code visible in this notebook)
obj = "class"
# number of most-purchased articles kept as the candidate pool
N = 15000
# number of negative-resampling + training iterations (one model file each)
n_iter = 2
# experiment tag (not read by the code visible in this notebook)
idx_file = "exp05"
# length in days of the purchase-history window used for features
len_hist = 366
# num_boost_round upper bound for lgb.train()
n_round = 2000
# 1 -> single train_test_split; otherwise StratifiedKFold with this many folds
n_splits = 1
# each training label window ends this many days before day_start_val
tr_set = [1, 8, 15, 22]
# length in days of each training label window
len_tr = 7
# number of random negative articles drawn per customer
nobuy = 20

In [None]:
def cos_sim(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        dot(v1, v2) / (||v1|| * ||v2||), or 0.0 when either vector has zero
        norm. Without that guard the result would be NaN (0/0), and a NaN in
        a list makes Python's built-in max() order-dependent.
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom

In [None]:
def read_data(day_oldest):
    """Load articles, transactions, customers and article embedding vectors.

    Parameters
    ----------
    day_oldest : datetime-like
        Transactions with ``t_dat`` earlier than this are dropped.

    Returns
    -------
    df_trans : pd.DataFrame
        Transactions on or after ``day_oldest``, de-duplicated per
        (customer_id, article_id, t_dat) and joined with article attributes.
    df_art : pd.DataFrame
        Articles table with ``index_code`` label-encoded to integers.
    df_cust : pd.DataFrame
        Customers table with missing values filled and two categorical
        columns turned into 0/1 flags.
    dict_vec : dict
        Maps article_id to its embedding row from ``articles.npy``.

    Raises
    ------
    ValueError
        If the embedding file has more rows than ``articles.csv``.
    """
    art_feature_cols = ["article_id","product_code","product_type_no","graphical_appearance_no","colour_group_code",
                        "department_no","index_code","index_group_no","section_no","garment_group_no"]

    # --- articles ---
    df_art = pd.read_csv(path+"articles.csv",dtype=art_dtypes)
    df_art["index_code"] = LabelEncoder().fit_transform(df_art["index_code"])

    # --- transactions ---
    df_trans = pd.read_csv(path+"transactions_train.csv",dtype=tran_dtypes)
    df_trans["t_dat"] = pd.to_datetime(df_trans["t_dat"],format="%Y-%m-%d")
    df_trans = df_trans.query(f"t_dat >= '{day_oldest}'").copy()
    # Several units of one article bought on the same day count as one purchase.
    df_trans = df_trans.drop_duplicates(["customer_id","article_id","t_dat"])
    df_trans = df_trans.merge(df_art[art_feature_cols],how="left",on="article_id")

    # --- customers ---
    df_cust = pd.read_csv(path+"customers.csv",dtype=cust_dtypes)
    df_cust["age"] = df_cust["age"].fillna(df_cust["age"].mean())
    df_cust[["FN","Active"]] = df_cust[["FN","Active"]].fillna(0)
    # 1 only for the exact string "ACTIVE"; everything else (including NaN) -> 0.
    df_cust["club_member_status"] = df_cust["club_member_status"].eq("ACTIVE").astype(int)
    # 0 only for the exact string "NONE"; everything else -> 1.
    # NOTE(review): missing values therefore map to 1 as well. Kept as-is because
    # any inference code must apply the same encoding - confirm this is intended.
    df_cust["fashion_news_frequency"] = df_cust["fashion_news_frequency"].ne("NONE").astype(int)

    # --- article embeddings ---
    # Row i of the array is paired with row i of articles.csv.
    # NOTE(review): assumes the .npy file was built in articles.csv order - confirm.
    vec_art = np.load("../input/h-m-rapids-article2vec/articles.npy")
    if len(vec_art) > len(df_art):
        raise ValueError(
            f"articles.npy has {len(vec_art)} rows but articles.csv has only {len(df_art)}"
        )
    dict_vec = dict(zip(df_art["article_id"].to_numpy(), vec_art))
    del vec_art

    return df_trans,df_art,df_cust, dict_vec

In [None]:
def feat_store(df_trans,l_cust,ds,de,dsr,der,dsh,deh):
    """Pre-compute the aggregate feature tables for one training window.

    Parameters
    ----------
    df_trans : pd.DataFrame
        Transactions with ``t_dat``, ``customer_id``, ``article_id``,
        ``sales_channel_id`` and the article attribute columns.
    l_cust : list
        Customer ids for which customer-level features are computed.
    ds : datetime-like
        Reference date; "days_after_buy*" is ``ds`` minus the last purchase date.
    de : datetime-like
        Unused; kept so existing callers keep working.
    dsr, der : datetime-like
        Inclusive bounds of the "recent" window. ``der`` alone is "yesterday".
    dsh, deh : datetime-like
        Inclusive bounds of the "history" window.

    Returns
    -------
    dict
        Feature name -> table. Article-level tables use all customers;
        customer-level tables use only ``l_cust``. All tables are indexed by
        their grouping key(s) except "rebuy_rate", which keeps ``article_id``
        as a column, and "art_id_recent", which is a Series of lists.
    """
    feat ={}

    df_trans_yesterday = df_trans.query("(t_dat == @der)")
    df_trans_recent = df_trans.query("(t_dat >= @dsr) and (t_dat <= @der)")
    df_trans_hist = df_trans.query("(t_dat >= @dsh) and (t_dat <= @deh)")

    # ---- article-level popularity (all customers) ----
    for name, window in (("art_buy_hist", df_trans_hist),
                         ("art_buy_recent", df_trans_recent),
                         ("art_buy_yesterday", df_trans_yesterday)):
        feat[name] = window.groupby(["article_id"])["t_dat"].count().to_frame(name)

    # rebuy_rate = (#customers with a repeat purchase of the article in the
    # history window) / (#distinct customers who bought it).
    df_buy1 = df_trans_hist.groupby("article_id")["customer_id"].nunique().reset_index().rename(columns={"customer_id":"cnt_buy1"})
    df_buy2 = df_trans_hist[df_trans_hist.duplicated(["customer_id","article_id"])].copy()
    df_buy2 = df_buy2.drop_duplicates(["customer_id","article_id"])
    df_buy2 = df_buy2.groupby("article_id")["article_id"].agg(cnt_buy2='count').reset_index()
    df_buy = pd.merge(df_buy1,df_buy2,how="left",on="article_id").fillna(0)
    df_buy["rebuy_rate"] = df_buy["cnt_buy2"]/df_buy["cnt_buy1"]
    feat["rebuy_rate"] = df_buy[["article_id","rebuy_rate"]]

    # ---- customer-level features (restricted to l_cust) ----
    df_trans_recent = df_trans_recent.query("(customer_id in @l_cust)")
    df_trans_hist = df_trans_hist.query("(customer_id in @l_cust)")
    feat["rate_sales_channel_hist"] = df_trans_hist.groupby(["customer_id"])["sales_channel_id"].agg(rate_sales_channel_hist="mean")
    feat["rate_sales_channel_recent"] = df_trans_recent.groupby(["customer_id"])["sales_channel_id"].agg(rate_sales_channel_recent="mean")

    # (feature-name suffix, extra grouping columns besides customer_id)
    groupings = (
        ("", ["article_id"]),
        ("_all", []),
        ("_prod", ["product_code"]),
        ("_ptype", ["product_type_no"]),
        ("_graph", ["graphical_appearance_no"]),
        ("_col", ["colour_group_code"]),
        ("_dep", ["department_no"]),
        ("_idx", ["index_code"]),
        ("_idxg", ["index_group_no"]),
        ("_sec", ["section_no"]),
        ("_garm", ["garment_group_no"]),
    )
    ref_day = pd.Timestamp(ds)
    for suffix, extra_keys in groupings:
        keys = ["customer_id"] + extra_keys
        hist_dates = df_trans_hist.groupby(keys)["t_dat"]
        recent_dates = df_trans_recent.groupby(keys)["t_dat"]

        n_hist, n_recent, days_after = "n_buy_hist"+suffix, "n_buy_recent"+suffix, "days_after_buy"+suffix
        feat[n_hist] = hist_dates.count().to_frame(n_hist)
        feat[n_recent] = recent_dates.count().to_frame(n_recent)
        # Vectorised form of the per-group `(ds - max(x)).days`.
        feat[days_after] = (ref_day - hist_dates.max()).dt.days.to_frame(days_after)

    # Articles each customer bought in the recent window (one list per customer).
    feat["art_id_recent"] = df_trans_recent.groupby("customer_id")["article_id"].apply(list).rename("art_id_recent")

    del df_trans_yesterday, df_trans_recent, df_trans_hist, df_buy1, df_buy2, df_buy
    gc.collect()

    return feat

In [None]:
def add_feat(df,ds,de,dsr,der,dsh,deh,feat,dict_vec):
    """Join the pre-computed feature tables onto candidate rows.

    ``df`` holds one row per (customer_id, article_id) candidate together with
    the article attribute columns. Every table in ``feat`` (as produced by
    feat_store) is left-joined on its grouping key(s), three embedding
    similarity features are appended, and missing values are filled.

    The date arguments (ds, de, dsr, der, dsh, deh) are not read here.
    Returns a new DataFrame; the input frame is not modified.
    """
    # (feature-name suffix, join keys) for the n_buy_hist / n_buy_recent /
    # days_after_buy triples. The order fixes the resulting column order.
    grouped = [
        ("", ["customer_id", "article_id"]),
        ("_all", ["customer_id"]),
        ("_prod", ["customer_id", "product_code"]),
        ("_ptype", ["customer_id", "product_type_no"]),
        ("_graph", ["customer_id", "graphical_appearance_no"]),
        ("_col", ["customer_id", "colour_group_code"]),
        ("_dep", ["customer_id", "department_no"]),
        ("_idx", ["customer_id", "index_code"]),
        ("_idxg", ["customer_id", "index_group_no"]),
        ("_sec", ["customer_id", "section_no"]),
        ("_garm", ["customer_id", "garment_group_no"]),
    ]

    # Index-keyed tables, joined in a fixed order.
    merge_plan = [
        ("rate_sales_channel_hist", ["customer_id"]),
        ("rate_sales_channel_recent", ["customer_id"]),
        ("art_buy_hist", ["article_id"]),
        ("art_buy_recent", ["article_id"]),
        ("art_buy_yesterday", ["article_id"]),
    ]
    for suffix, keys in grouped:
        for stem in ("n_buy_hist", "n_buy_recent", "days_after_buy"):
            merge_plan.append((stem + suffix, keys))

    for feat_name, keys in merge_plan:
        df = df.merge(feat[feat_name], how="left", left_on=keys, right_index=True)

    # rebuy_rate carries article_id as a regular column.
    df = df.merge(feat["rebuy_rate"], how="left", on="article_id")

    # Embedding similarity between the candidate article and every article the
    # customer bought in the recent window; 0 when there is no recent purchase.
    df = df.merge(feat["art_id_recent"], how="left", left_on="customer_id", right_index=True)
    sim_max, sim_sum, sim_mean = [], [], []
    for candidate, recent_articles in df[["article_id", "art_id_recent"]].values:
        if isinstance(recent_articles, list):
            sims = [cos_sim(dict_vec[candidate], dict_vec[bought]) for bought in recent_articles]
            sim_max.append(max(sims))
            sim_sum.append(sum(sims))
            sim_mean.append(np.mean(sims))
        else:
            # A missing join leaves NaN instead of a list.
            sim_max.append(0)
            sim_sum.append(0)
            sim_mean.append(0)
    df["sim_max"] = sim_max
    df["sim_sum"] = sim_sum
    df["sim_mean"] = sim_mean
    df = df.drop(["art_id_recent"], axis=1)

    # Counts, rates and similarities: no match means zero.
    zero_fill_cols = [stem + suffix for suffix, _ in grouped for stem in ("n_buy_hist", "n_buy_recent")]
    zero_fill_cols += ["art_buy_yesterday", "art_buy_recent", "art_buy_hist", "rebuy_rate",
                       "sim_max", "sim_sum", "sim_mean"]
    df[zero_fill_cols] = df[zero_fill_cols].fillna(0)

    # Never bought: use a value beyond the history window length.
    days_cols = ["days_after_buy" + suffix for suffix, _ in grouped]
    df[days_cols] = df[days_cols].fillna(10+len_hist)

    # No transactions: fill with 1.5.
    channel_cols = ["rate_sales_channel_hist", "rate_sales_channel_recent"]
    df[channel_cols] = df[channel_cols].fillna(1.5)

    return df

In [None]:
def _fit_lgbm(X_tr, y_tr, X_va, y_va):
    """Train one LightGBM booster with early stopping on the validation part."""
    d_tr = lgb.Dataset(X_tr, label=y_tr,  free_raw_data=False)
    d_va = lgb.Dataset(X_va, label=y_va,  free_raw_data=False)
    # Callbacks replace the verbose_eval / early_stopping_rounds keyword
    # arguments, which newer LightGBM releases no longer accept.
    return lgb.train(
        lgb_params,
        train_set=d_tr,
        num_boost_round=n_round,
        valid_sets=[d_tr,d_va],
        callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=500)],
    )


def recommend_train(day_start_val):
    """Build training sets from past weeks, train LightGBM models and pickle them.

    For every offset ``i`` in ``tr_set`` a label window of ``len_tr`` days
    ending ``i`` days before ``day_start_val`` is used. Purchases of the ``N``
    most-bought articles inside that window are positives; ``nobuy`` randomly
    drawn articles per customer are negatives. Features for each window come
    from the 7 days ("recent") and ``len_hist`` days ("history") before it.

    The sample/train cycle is repeated ``n_iter`` times with fresh negatives;
    iteration ``k`` writes its list of models to ``models_{k}.pkl``.

    Parameters
    ----------
    day_start_val : datetime.datetime
        First day of the period to be predicted. No transaction on or after
        this day is used as a label.

    Returns
    -------
    int
        Always 0.
    """
    day_start = [day_start_val - datetime.timedelta(days=i-1+len_tr) for i in tr_set]
    day_end = [day_start_val - datetime.timedelta(days=i) for i in tr_set]
    day_start_rec = [x - datetime.timedelta(days=7) for x in day_start]
    day_end_rec = [x - datetime.timedelta(days=1) for x in day_start]
    day_start_hist = [x - datetime.timedelta(days=len_hist) for x in day_start]
    day_end_hist = [x - datetime.timedelta(days=1) for x in day_start]
    n_windows = len(day_start)

    # min() rather than [-1] so the earliest history day is loaded even when
    # tr_set is not sorted in ascending order.
    df_trans, df_art, df_cust, dict_vec = read_data(day_oldest = min(day_start_hist))

    art_cols = ["article_id","product_code","product_type_no","graphical_appearance_no","colour_group_code",
                "department_no","index_code","index_group_no","section_no","garment_group_no"]
    cust_cols = ["customer_id","age","FN","Active","club_member_status","fashion_news_frequency"]

    # Candidate pool: the N most-purchased articles over all label windows.
    q_date = " or ".join(f"((t_dat >= '{s}') and (t_dat <= '{e}'))" for s, e in zip(day_start, day_end))
    top_art_all = df_trans.query(q_date).groupby("article_id")["t_dat"].count().sort_values(ascending = False).index[:N].tolist()

    list_df_buy = []
    list_list_cust =[]
    # make positive samples
    for i in range(n_windows):
        df_buy = df_trans.query(f"(t_dat >= '{day_start[i]}') and (t_dat <= '{day_end[i]}') and (article_id in @top_art_all)")
        df_buy = df_buy.drop_duplicates(["customer_id","article_id"])[["customer_id","article_id"]].copy()
        df_buy["target"] = 1
        list_df_buy.append(df_buy)
        list_list_cust.append(df_buy["customer_id"].unique().tolist())

    # Dedicated, seeded generator: negatives differ between iterations but a
    # re-run of the notebook draws the same ones.
    rng = random.Random(rand)

    # make negative samples(random pick)
    for iter_train in tqdm(range(n_iter)):
        list_train =[]
        for i in range(n_windows):
            df_nobuy = pd.concat([pd.DataFrame({"customer_id":x,"article_id":rng.sample(top_art_all,nobuy)}) for x in list_list_cust[i]])
            df_nobuy["target"] = 0
            # Positives come first, so a negative that collides with a real
            # purchase is the row that gets dropped.
            list_train.append(pd.concat([list_df_buy[i],df_nobuy]).drop_duplicates(["customer_id","article_id"]))
            del df_nobuy
        display(list_train[0]["target"].value_counts())

        # add feature
        list_feat = []
        for i in tqdm(range(n_windows)):
            feat = feat_store(df_trans,list_list_cust[i],day_start[i],day_end[i],day_start_rec[i],day_end_rec[i],day_start_hist[i],day_end_hist[i])
            list_train[i] = list_train[i].merge(df_art[art_cols],how="left",on="article_id")
            list_train[i] = list_train[i].merge(df_cust[cust_cols],how="left",on="customer_id")
            list_feat.append(add_feat(list_train[i],day_start[i],day_end[i],day_start_rec[i],day_end_rec[i],day_start_hist[i],day_end_hist[i],feat,dict_vec))
            del feat
        # One concat instead of DataFrame.append inside the loop.
        df_train = pd.concat(list_feat)
        del list_train, list_feat
        gc.collect()

        # train lgbm
        X_train = df_train.drop(["customer_id","product_code","product_type_no","department_no","target"],axis=1)
        y_train = df_train["target"]
        del df_train

        list_model = []
        if n_splits == 1:
            X_tr, X_va, y_tr, y_va = train_test_split(X_train,y_train,stratify = y_train,random_state = rand)
            list_model.append(_fit_lgbm(X_tr, y_tr, X_va, y_va))
        else:
            folds = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = rand)
            for tr_idx,va_idx in folds.split(X_train,y_train):
                X_tr, X_va, y_tr, y_va = X_train.iloc[tr_idx], X_train.iloc[va_idx], y_train.iloc[tr_idx], y_train.iloc[va_idx]
                list_model.append(_fit_lgbm(X_tr, y_tr, X_va, y_va))
        # save model
        pd.to_pickle(list_model,f"models_{iter_train}.pkl")
        del X_train, y_train, X_tr, X_va, y_tr, y_va
        gc.collect()
    del df_trans, df_art, df_cust
    gc.collect()
    return 0

In [None]:
# Run the training pipeline with day_start_val = 2020-09-23. With tr_set[0] == 1
# the most recent label window then ends on 2020-09-22 (day_start_val - 1 day).
recommend_train(datetime.datetime(2020,9,23))