In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from matplotlib import pyplot as plt
from tqdm.auto import tqdm
from typing import List
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import sys

import gc
print(sys.version)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print at most `show` input file paths; `cnt` counts how many were printed.
cnt = 0; show = 20
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if cnt >= show:
            break
        print(os.path.join(dirname, filename))
        cnt += 1
    if cnt >= show:
        # The inner `break` only leaves the per-directory loop. Stop the walk
        # itself as well, otherwise the rest of the tree is traversed for nothing.
        break

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Data Processing**
1) **Load the entire training data** 

**Thanks to @Rob Mulla for the low-memory parquet dataset:** https://www.kaggle.com/code/robikscube/fast-data-loading-and-low-mem-with-parquet-files

In [None]:
# train_df = pd.read_pickle("/kaggle/input/ubiquantpicklepython37float32/train3-7-float32-reduced.pkl")
train_df = pd.read_parquet("/kaggle/input/ubiquant-parquet/train_low_mem.parquet")
features_300 = [f"f_%d" % i for i in range(300)]
print(train_df.shape)
size = sys.getsizeof(train_df)
print(f"The train_df loaded consumes %.2f MB memory." % (size/(1024*1024)))

# Summary statistics for the target and the first 20 features only.
cols = ['target']
for i in range(20):
    cols.append(f"f_%d" % i)
print(train_df[cols].describe())
train_df.reset_index(drop=True, inplace=True)
train_df.drop(columns=['row_id'], inplace=True)
# Only the temporary column list is deleted. The `os` and `sys` modules stay
# bound so this cell and the first one can be re-run without a NameError.
del cols

2) **Process the entire training data**

**Thanks to yoshi_k for sharing some good plotting techniques:** https://www.kaggle.com/code/yoshikuwano/eda-and-train-by-rapids-and-tabnet/notebook

In [None]:
# Import useful packages and set default plotting params
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)
import random

# Start from the ggplot style, then override font size and grid appearance.
plt.style.use('ggplot')
plt.rcParams['font.size'] = 12
plt.rcParams['axes.grid'] = True
plt.rcParams['grid.color'] = 'gray'
plt.rcParams['grid.linestyle'] = '--'

In [None]:
# Methods to generate plots
class EntireAnalysis:
    """Exploratory plots over the whole training DataFrame.

    The methods read the ``target``, ``investment_id``, ``time_id`` and
    ``f_<n>`` columns of the wrapped frame and draw matplotlib figures; the
    frame itself is never modified.
    """

    def __init__(self, inputDF: 'pd.DataFrame'):
        """ Store the training frame. It is kept by reference, not copied. """
        self.df = inputDF

    def histograms(self, usrColor: str='orange', Range: List[int]=None):
        """ Plot the target histogram and the number of rows per investment id.

        usrColor: color of the target histogram.
        Range: [min_val, max_val] for the target bins (None lets matplotlib choose).
        Only the first 500 investment ids are shown in the second figure.
        """
        fig, ax = plt.subplots()
        ax.hist(self.df['target'], bins=200, color=usrColor, range=Range)
        ax.set_title("Target Histogram")
        ax.set_xlabel("Target")
        ax.set_ylabel("Frequency")
        ax.xaxis.set_major_locator(MultipleLocator(1))
        ax.xaxis.set_minor_locator(MultipleLocator(0.2))
        fig.set_figwidth(15)
        plt.show()

        # Number of rows (time steps) recorded for every investment id.
        timeStepCnt = self.df.groupby(['investment_id'])['investment_id'].count()
        invIDs = timeStepCnt.index.tolist()
        timeCnts = timeStepCnt.values.tolist()
        fig, ax = plt.subplots()
        ax.plot(invIDs[:500], timeCnts[:500], color='green')
        ax.set_title("Time Steps per Investment Histogram")
        ax.set_xlabel("Investment ID")
        ax.set_ylabel("Number of time steps")
        ax.xaxis.set_major_locator(MultipleLocator(50))
        ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
        ax.xaxis.set_minor_locator(MultipleLocator(10))
        # Rotate through tick_params: set_xticklabels() would freeze the label
        # texts while the locator can still move the ticks, and it would also
        # throw away the formatter installed just above.
        ax.tick_params(axis='x', labelrotation=60)
        fig.set_figwidth(15)
        plt.show()

    def timeTrend(self, usrColor: str):
        """ Display # of investments as a function of time_id, and target mean and std vs. time.

        usrColor: color of the investment-count curve (upper panel).
        """
        by_time = self.df.groupby(['time_id'])
        invCnts = by_time['investment_id'].count()
        target_mean = by_time['target'].mean()
        target_std = by_time['target'].std()
        time_ids = invCnts.index.tolist()
        invCnts = invCnts.tolist()
        fig, axes = plt.subplots(2, 1, figsize=[15, 10])
        axes[0].plot(time_ids, invCnts, color=usrColor)
        axes[0].set_xlabel("Time_ids")
        axes[0].set_ylabel("Number of investments")

        ax1 = axes[1]
        ax1.plot(time_ids, target_mean, color='b', label='Mean')
        ax1.set_xlabel("Time_ids")
        ax1.set_ylabel("Target Mean", color='b')

        # Twin axis sharing the x axis, so mean and std each get their own y scale.
        ax2 = ax1.twinx()
        ax2.plot(time_ids, target_std, color='gold', label='Std Dev')
        ax2.set_xlabel("Time_ids")
        ax2.set_ylabel("Target Standard Dev", color='gold')
        # Merge the handles of both axes into a single legend.
        handle1, label1 = ax1.get_legend_handles_labels()
        handle2, label2 = ax2.get_legend_handles_labels()
        ax1.legend([handle2[0], handle1[0]], [label2[0], label1[0]], loc='best')
        for i in range(2):
            axes[i].xaxis.set_major_locator(MultipleLocator(50))      # major tick spacing
            axes[i].xaxis.set_minor_locator(MultipleLocator(10))      # minor tick spacing

        plt.subplots_adjust(hspace=0.25)
        plt.show()

    def investTimeTrend(self, usrColors: List[str]=['darkcyan', 'r', 'gold']):
        """ Randomly pick up to 3 investment ids, make target vs. time plots.

        usrColors: one color per panel; it is only read, never modified.
        Each run of consecutive time_ids is drawn as its own line segment, so
        gaps in an investment's history stay visible.
        """
        invIDs = self.df['investment_id'].unique().tolist()
        random.shuffle(invIDs)
        # Fewer than 3 investments simply leaves the remaining panels empty
        # instead of raising IndexError.
        invIDs = invIDs[:3]
        fig, axes = plt.subplots(3, 1, figsize=[15, 12])

        for i, invID in enumerate(invIDs):
            idx = (self.df['investment_id'] == invID)
            time_ids = self.df.loc[idx, 'time_id'].values.tolist()
            targets = self.df.loc[idx, 'target'].values.tolist()
            intvls = partitionTime(time_ids)
            for start, end in intvls:
                axes[i].plot(time_ids[start:end+1], targets[start:end+1], color=usrColors[i])
            axes[i].set_xlabel("Time_ids")
            axes[i].set_ylabel("Target")
            axes[i].set_title(f"Investment %d" % invID)

        plt.subplots_adjust(hspace=0.5)
        plt.show()

    def featureMeanStd(self, color1: str='b', color2: str='gold'):
        """ Make combined plots for mean and standard deviations for all features.

        color1: color of the mean curve and its (left) y axis.
        color2: color of the standard-deviation curve and its (right) y axis.
        The 300 features f_0..f_299 are drawn in three panels of 100 each.
        """
        features = [f"f_%d" % j for j in range(300)]
        means = self.df[features].mean(axis=0).tolist()
        stds = self.df[features].std(axis=0).tolist()
        print(f"There are %d means computed; there are %d stds computed."
             % (len(means), len(stds)))

        fig, axes = plt.subplots(3, 1, figsize=(20, 15))
        for i in range(3):
            print(f"Making plot %d..." % (i+1))
            start, end = i*100, (i+1)*100
            ax1 = axes[i]
            # color1 / color2 come from the caller; they are no longer
            # overwritten with hard-coded values inside the loop.
            ax1.plot(means[start:end], color=color1, label='Mean')
            ax1.set_xticks(np.arange(100))
            ax1.set_xticklabels(labels=features[start:end], rotation=90)
            ax1.set_xlim(-1, 100)
            ax1.set_ylim(-0.6, 0.6)
            ax1.tick_params(axis='y', color=color1, labelcolor=color1)
            ax1.set_ylabel('Mean', color=color1)
            # Twin axis sharing the x axis for the standard deviation.
            ax2 = ax1.twinx()
            ax2.plot(stds[start:end], color=color2, label='Standard Dev')
            ax2.set_xticks(np.arange(100))
            ax2.set_xticklabels(labels=features[start:end], rotation=90)
            ax2.set_xlim(-1, 100)
            ax2.set_ylim(0, 1.2)
            ax2.tick_params(axis='y', color=color2, labelcolor=color2)
            ax2.set_ylabel('Standard Deviation', color=color2)

            handle1, label1 = ax1.get_legend_handles_labels()
            handle2, label2 = ax2.get_legend_handles_labels()
            ax1.legend([handle2[0], handle1[0]], [label2[0], label1[0]], loc='best')

        plt.subplots_adjust(hspace=0.25)
        plt.show()

    def featureDist(self,
                    feaNames: List[str],       # the list of feature names
                    usrColor: str,             # the color of the histogram
                    th: float,                 # corr threshold to delete feature
                    fea2Corr: dict=None):      # feature->targetCorrelation mapping.
        """ Given a list of feature names, plot the logarithmic-scale histograms.

        Features are drawn three per figure. When fea2Corr is given, each panel
        is annotated with the feature's target correlation, and every feature
        whose absolute correlation is below th is collected.

        Returns the list of collected feature names (always empty when
        fea2Corr is not given).
        """
        n = len(feaNames); i = 0
        means = self.df[feaNames].mean(axis=0).to_list()
        stds = self.df[feaNames].std(axis=0).to_list()
        del_fs = []

        while i < n:
            fig, axes = plt.subplots(1, 3, figsize=[18, 6])
            for j in range(3):
                if i + j >= n:
                    # No feature left for this panel: hide the empty axes.
                    axes[j].set_visible(False)
                    continue
                name = feaNames[i+j]
                axes[j].hist(self.df[name], bins=100,
                             range=[-10, 10], color=usrColor)
                axes[j].set_title(name + " Log Histogram")
                axes[j].set_ylim(1, 10 ** 6)
                axes[j].set_yscale('log')
                axes[j].xaxis.set_major_locator(MultipleLocator(5))
                axes[j].xaxis.set_minor_locator(MultipleLocator(0.1))
                axes[j].set_xlabel(f"Mean = %.3f, Std_dev = %.3f." % (means[i+j], stds[i+j]))
                if fea2Corr:
                    corr = fea2Corr[name]
                    axes[j].text(0.99, 0.99, f"TargetCorr: %.3f" % corr,
                                 va='top', ha='right', transform=axes[j].transAxes)
                    if abs(corr) < th:
                        del_fs.append(name)

            plt.subplots_adjust(hspace=0.5)
            plt.show()
            # One figure is created per three features; release each after display.
            plt.close(fig)
            i += 3

        return del_fs
    
def partitionTime(time_ids: List[int]) -> List[List[int]]:
    """ Partition a sequence of time_ids into runs of consecutive values.

    A run continues while each time_id equals the previous one plus 1.

    Returns a list of [start, end] pairs. Both are inclusive positions into
    time_ids, not time_id values. An empty input yields an empty list.
    """
    n = len(time_ids)
    if n == 0:
        # Nothing to partition (the original indexed time_ids[0] and raised).
        return []

    intervals = []
    start = 0
    for idx in range(1, n):
        if time_ids[idx] != time_ids[idx - 1] + 1:
            # Gap (or repeat / step back): close the current run, start a new one.
            intervals.append([start, idx - 1])
            start = idx

    intervals.append([start, n - 1])
    return intervals

In [None]:
# Wrap the training frame in the plotting helper; it stores a reference to train_df, not a copy.
EA_engine = EntireAnalysis(train_df)

In [None]:
# Plot the target histogram (bins restricted to [-5, 5]) and the number of rows per
# investment id for the first 500 investment ids.
EA_engine.histograms(Range=[-5, 5])

In [None]:
# Plot the number of rows per time_id, then the per-time_id target mean and
# standard deviation on twin y axes.
EA_engine.timeTrend('darkcyan')

In [None]:
# Display mean and standard deviations for all 300 features
# EA_engine.featureMeanStd()

In [None]:
# Randomly pick 3 investment ids, make target vs. time plots.
# NOTE(review): `random` is not seeded in the visible cells, so the picked ids differ between runs.
EA_engine.investTimeTrend()

In [None]:
import argparse
# Run-wide settings, read in other cells as args.<name>.
args = argparse.Namespace(
    seed = 2021,       # NOTE(review): not read in the visible cells; the models use seeds 2015 + j
    n_folds = 4,       # number of folds assigned by the add_fold_* helpers
    W = 1,             # NOTE(review): not read in the visible cells — confirm it is still needed
    n_threads = 2,     # passed to LightGBM as n_jobs
    n_models = 8,      # number of LightGBM models trained and averaged
)

In [None]:
# Drop the plotting helper instance and its class, then force a garbage-collection pass.
del EA_engine, EntireAnalysis
_ = gc.collect()
time.sleep(5)  # NOTE(review): presumably a pause to let memory settle — confirm it is needed

3) **Remove some least important features (by training light gradient boosting regressors with all features)**

In [None]:
# Remove outliers
def getOutliers(input_df: 'pd.DataFrame', k: float, features: List[str]=None) -> 'pd.DataFrame':
    """ Find rows where any feature lies outside [mean - k*std, mean + k*std].

    Nothing is removed from input_df; the function only reports the rows.

    input_df: frame holding the feature columns.
    k: half-width of the accepted band, in standard deviations.
    features: columns to check; defaults to f_0 .. f_299.

    Returns a DataFrame with columns 'idx' (index label of the row in
    input_df) and 'feature' (the first checked feature that flagged that
    row). Each row label appears at most once.
    """
    if features is None:
        features = [f"f_%d" % i for i in range(300)]
    means = input_df[features].mean(axis=0).tolist()
    stds = input_df[features].std(axis=0).tolist()

    outliers = []
    outFeatures = []
    for fea, mu, sigma in zip(features, means, stds):
        column = input_df[fea]
        mask = (column > mu + sigma*k) | (column < mu - sigma*k)
        # Only the index labels are needed, so select them directly rather
        # than materialising a filtered copy of the whole frame per feature.
        flagged = input_df.index[mask].tolist()
        outliers.extend(flagged)
        outFeatures.extend([fea] * len(flagged))

    outlier_df = pd.DataFrame({'idx': outliers, 'feature': outFeatures})
    # A row flagged by several features is kept once, under the first of them.
    outlier_df.drop_duplicates(subset='idx', inplace=True)

    return outlier_df

def prepare_features(input_df: 'pd.DataFrame',
                     combineF: List[str],
                     removeF: List[str]) -> List[str]:
    """ Add combined features to a frame, drop unwanted ones, and list the model inputs.

    input_df is modified IN PLACE: every 'a&b' entry of combineF adds a column
    named 'a&b' holding input_df['a'] + input_df['b'], and every column named
    in removeF is dropped (a missing column raises KeyError).

    Returns ['investment_id', 'time_id'] followed by every remaining column
    whose name starts with 'f_', in column order.
    """
    # Create combined features
    for combine in combineF:
        first, second = combine.split('&')
        input_df[combine] = input_df[first] + input_df[second]
    # Drop specified features
    input_df.drop(columns=removeF, inplace=True)

    use_features = ['investment_id', 'time_id']
    # Match on the prefix: a substring test would also pick up unrelated
    # columns that merely contain 'f_' somewhere in their name.
    use_features.extend(col for col in input_df.columns.tolist()
                        if col.startswith('f_'))

    return use_features

In [None]:
# out_df = getOutliers(train_df, 30)
# feature_outs = out_df.groupby(['feature'])['idx'].count()
# print(feature_outs.sort_values(ascending=False)[:20])

In [None]:
print(train_df.shape)
# Outlier removal is currently switched off:
# train_df.drop(out_df['idx'].tolist(), inplace=True)
# train_df.reset_index(drop=True, inplace=True)

# Candidate combined features (disabled), according to
# https://www.kaggle.com/code/yoshikuwano/eda-and-train-by-rapids-and-tabnet/notebook
# combine_fs = ['f_231&f_250', 'f_118&f_280', 'f_155&f_297', 'f_25&f_237',
#               'f_179&f_265', 'f_119&f_270', 'f_71&f_197', 'f_21&f_65']
combine_fs = list()

# Least important features, picked from feature rank vs. target mean correlation (averaged over time_id):
# https://www.kaggle.com/code/marketneutral/stacking-feature-importance/notebook
remove_fs = ['f_%d' % n for n in (97, 228, 72, 49, 124, 205, 148, 262, 288, 258,
                                  9, 144, 4, 129, 266, 166, 43, 245, 12, 141)]
# remove_fs = []
n_removed = len(remove_fs)
print(f"There are %d features %r will be removed." % (n_removed, remove_fs))
del n_removed

# prepare_features edits train_df in place and returns the model input columns.
features = prepare_features(train_df, combine_fs, remove_fs)
print(train_df.shape, features)

# del out_df
_ = gc.collect()
time.sleep(6)

4) **Split training samples into folds**

In [None]:
def add_fold_by_timeID(df, continuous=False):
    """ Assign every row a fold in [0, args.n_folds) so that a time_id never spans two folds.

    The result is written to df['fold'] (int32) in place; nothing is returned.

    continuous=False: each time_id, in order of first appearance, gets a fold
        drawn with random.randint.
    continuous=True: time_ids are walked in order of first appearance and
        packed into consecutive folds of about len(df) // args.n_folds rows;
        the last fold absorbs whatever remains.

    Per-fold row counts are printed at the end. Reads the module-level `args`
    and `random`.
    """
    time_ids = df['time_id'].unique().tolist()
    # Rows per time_id, computed in one pass instead of one full-frame
    # comparison plus a Python-level sum per time_id.
    rows_per_time = df['time_id'].value_counts().to_dict()
    fold2cnt = {fold: 0 for fold in range(args.n_folds)}
    time2fold = dict()

    if continuous:
        avg_n = df.shape[0] // args.n_folds
        fold = 0; currCnt = 0
        for time_id in time_ids:
            cnt = int(rows_per_time.get(time_id, 0))
            if currCnt + cnt <= avg_n:
                currCnt += cnt
            elif fold < args.n_folds-1:              # If not the last fold, increment fold by 1
                fold += 1; currCnt = cnt
            else:                                    # Assign all remaining samples to the last fold
                currCnt += cnt
            if cnt:
                time2fold[time_id] = fold
            fold2cnt[fold] += cnt
    else:
        for time_id in time_ids:
            # Same order of randint calls as a per-time_id loop over the frame.
            fold = random.randint(0, args.n_folds-1)
            cnt = int(rows_per_time.get(time_id, 0))
            if cnt:
                time2fold[time_id] = fold
            fold2cnt[fold] += cnt

    # Rows whose time_id got no fold (e.g. missing values) keep the sentinel -1.
    df['fold'] = df['time_id'].map(time2fold).fillna(-1).astype('int32')
    print(f"Number of samples for each fold: %r." % fold2cnt)
    print(f"Dataframe has %d samples in total, %d have assigned fold." % (len(df), sum(fold2cnt.values())))
    

def add_fold_by_investID(df):
    """ Assign every row a random fold in [0, args.n_folds) so that an investment_id never spans two folds.

    Each investment_id, in order of first appearance, gets a fold drawn with
    random.randint. The result is written to df['fold'] in place (int32, the
    same dtype add_fold_by_timeID produces); nothing is returned. Per-fold row
    counts are printed at the end. Reads the module-level `args` and `random`.
    """
    invIDs = df['investment_id'].unique().tolist()
    # Rows per investment, computed in one pass instead of one full-frame
    # comparison plus a Python-level sum per investment.
    rows_per_inv = df['investment_id'].value_counts().to_dict()
    fold2cnt = {fold: 0 for fold in range(args.n_folds)}
    inv2fold = dict()

    for invID in invIDs:
        fold = random.randint(0, args.n_folds-1)
        cnt = int(rows_per_inv.get(invID, 0))
        if cnt:
            inv2fold[invID] = fold
        fold2cnt[fold] += cnt

    # Rows whose investment_id got no fold (e.g. missing values) keep the sentinel -1.
    df['fold'] = df['investment_id'].map(inv2fold).fillna(-1).astype('int32')
    print(f"Number of samples for each fold: %r." % fold2cnt)
    print(f"Dataframe has %d samples in total, %d have assigned fold." % (len(df), sum(fold2cnt.values())))

In [None]:
# Discard all rows with time_id < 80 before training.
# NOTE(review): the cut-off 80 is not explained in the visible cells — confirm the rationale.
front_bool = (train_df['time_id'] < 80)
# Take the index LABELS of the matching rows. Positions from enumerate() only
# equal labels while the index is exactly 0..n-1, and cost a Python loop over every row.
front_idx = train_df.index[front_bool]
train_df.drop(front_idx, inplace=True)
# add_fold_by_timeID(train_df, True)
del front_bool, front_idx
print(train_df.shape)

In [None]:
def updateFeatureImp(model, features, feature2Imp) -> None:
    """ Append a fitted model's per-feature importances to a running record.

    model: object exposing feature_importances_, indexed in the same order as features.
    features: feature names.
    feature2Imp: dict mapping feature name -> list of importances; updated in
        place, one appended value per call.
    """
    importances = model.feature_importances_
    for position in range(len(features)):
        name = features[position]
        feature2Imp.setdefault(name, []).append(importances[position])

# **Model Training**
# **LightGBM Regressor. Original**

In [None]:
# Unbind the plotting names, the fold/outlier helpers and `random`; none of them
# is referenced by the visible cells that follow.
del AutoMinorLocator, FormatStrFormatter, MultipleLocator, getOutliers
del add_fold_by_timeID, add_fold_by_investID, plt, random
import lightgbm as lgb

In [None]:
# investment_id is excluded from the model inputs. Filtering (instead of
# list.remove) keeps this cell re-runnable: a second run is a no-op rather than a ValueError.
features = [name for name in features if name != 'investment_id']
print(features[:5])
print(features[-10:], len(features))

_ = gc.collect()
time.sleep(10)

In [None]:
# Train method for LGBMRegressor
# def LGBMR_train(df_train, features, train_params, args, useWts=False):
#     """ Perform cross validation on the light GBMRegressor, defined by train_params, 
#         using df_train and given features. """
#     train_corrs = []; val_corrs = []
#     feature2Imp = dict()
#     for i in tqdm(range(args.n_folds)):
#         start = time.time()
#         train_idx = df_train['fold'] != i
#         timeIDs = df_train.loc[train_idx, 'time_id'].values.tolist()
#         train_X = df_train.loc[train_idx, features].values
#         train_y = df_train.loc[train_idx, 'target'].values
#         _ = gc.collect()
#         print(f"Fold %d, %d training samples, %d validation samples." % (i, len(train_y), len(df_train)-len(train_y)))
#         # Train the model
#         my_LGBMR = lgb.LGBMRegressor(**train_params)
#         if useWts:
#             weights = df_train.loc[train_idx, 'weight'].values.tolist()
#             print(f"%d Training weights used." % len(weights))
#         else:   
#             weights = None
#         my_LGBMR.fit(train_X, train_y, sample_weight=weights)
#         end = time.time()
#         print(f"It took %.2f minutes to train this light GBMRegressor." % ((end-start)/60) )
#         updateFeatureImp(my_LGBMR, features, feature2Imp)
#         pred_tr_y = my_LGBMR.predict(train_X)
#         train_corr = meanCorr(timeIDs, train_y, pred_tr_y)
#         del train_X, train_y, train_idx
#         _ = gc.collect()
#         time.sleep(10)
#         
#         # Test the model on validation set
#         valid_idx = df_train['fold'] == i
#         timeIDs = df_train.loc[valid_idx, 'time_id'].values.tolist()
#         valid_X = df_train.loc[valid_idx, features].values
#         valid_y = df_train.loc[valid_idx, 'target'].values
#         del valid_idx
#         _ = gc.collect()
#         pred_val_y = my_LGBMR.predict(valid_X)
#         valid_corr = meanCorr(timeIDs, valid_y, pred_val_y)
#         print(f"Training correlation %.4f, validation correlation %.4f." % (train_corr, valid_corr))
#         train_corrs.append(train_corr); val_corrs.append(valid_corr)
#         
#         del my_LGBMR, valid_X, valid_y
#         _ = gc.collect()
#         time.sleep(8)
#         
#     Tr_corr = sum(train_corrs) / args.n_folds
#     Val_corr = sum(val_corrs) / args.n_folds
#     print(f"Model hyperparams: %r, average train corr %.4f, average valid corr %.4f."
#          % (train_params, Tr_corr, Val_corr))
#     feature2Imp = {feature: sum(imp)/len(imp) for feature, imp in sorted(feature2Imp.items(),
#                                                                   key=lambda x: sum(x[1])/len(x[1]),
#                                                                   reverse=True)}
#     print(f"Most important 30 features: %r." % (list(feature2Imp.items())[:30]) )
#     print(f"Least important 30 features: %r." % (list(feature2Imp.items())[-30:]) )
    
def meanCorr(time_ids: List[int], true_ys: List[float], pred_ys: List[float]) -> float:
    """ Mean over time_ids of the Pearson correlation between true_ys and pred_ys.

    The three sequences are aligned row by row. Rows are grouped by time_id
    value, so the rows of one time_id do not have to be adjacent. Groups whose
    correlation is undefined (fewer than two rows, or a constant side) are
    left out of the mean.

    Returns NaN when no group yields a defined correlation (including empty input).
    """
    ids = np.asarray(time_ids)
    truth = np.asarray(true_ys, dtype=float)
    preds = np.asarray(pred_ys, dtype=float)
    if ids.size == 0:
        return float('nan')

    # A stable sort brings the rows of each time_id together; `cuts` marks
    # where the time_id changes in the sorted order.
    order = np.argsort(ids, kind='stable')
    sorted_ids = ids[order]
    cuts = np.flatnonzero(sorted_ids[1:] != sorted_ids[:-1]) + 1

    corrs = []
    # A constant group divides by a zero variance inside corrcoef; the NaN it
    # produces is filtered below, so silence the floating-point warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        for rows in np.split(order, cuts):
            if rows.size < 2:
                continue
            curr_corr = np.corrcoef(truth[rows], preds[rows])[0, 1]
            if np.isnan(curr_corr):
                continue
            corrs.append(curr_corr)

    if not corrs:
        return float('nan')
    return np.mean(corrs)

In [None]:
# LGBMR_params = dict(objective='regression',
#                         n_estimators=250, 
#                         learning_rate=0.06,
#                         max_depth=8,
#                         num_leaves=200,
#                         max_bin=127,
#                         min_child_samples=500,
#                         device_type='gpu',
#                         reg_lambda=80,
#                         verbosity=-1,
#                         n_jobs=args.n_threads,
#                         # feature_fraction=0.7,
#                         # bagging_fraction=0.9,
#                         # random_state=seed,
#                         )
# LGBMR_train(train_df, features, LGBMR_params, args)

In [None]:
# Pull out what is still needed once the frame is gone: the per-row time_id array
# (used later by meanCorr), the distinct investment ids seen in training (used at
# inference to count unseen ids) and the target vector.
time_ids = train_df['time_id'].values
invest_ids = train_df['investment_id'].unique().tolist()
train_y = train_df['target'].values
# Drop these two columns before the feature matrix is built.
# NOTE(review): presumably done first to lower peak memory — confirm.
train_df.drop(columns=['investment_id', 'target'], inplace=True)
gc.collect()
time.sleep(8)

# `features` must only name columns still present in train_df at this point.
train_X = train_df[features].values
del train_df
print(train_X.shape)
gc.collect()
time.sleep(8)

In [None]:
# Train args.n_models LightGBM regressors on ALL samples (no hold-out); they differ
# only in random_state. Each model is saved to /kaggle/working/model<j>.txt.
feature2Imp = dict()
# Running sum of the in-sample predictions. Averaging from a sum keeps a single
# vector in memory instead of re-copying a growing (n_models x n_rows) array
# with np.append on every iteration.
pred_sum = None
for j in range(args.n_models):
    seed = 2015 + j
    LGBMR_params = dict(objective='regression',
                        n_estimators=300,
                        learning_rate=0.06,
                        max_depth=8,
                        num_leaves=200,
                        max_bin=127,
                        min_child_samples=500,
                        device_type='gpu',
                        reg_lambda=80,
                        verbosity=-1,
                        n_jobs=args.n_threads,
                        feature_fraction=0.8,
                        # bagging_fraction=0.9,
                        random_state=seed,
                        )
    print(f"Model %d, %d training samples" % (j, len(train_y)) )
    LGBMR = lgb.LGBMRegressor(**LGBMR_params)
    start = time.time()
    LGBMR.fit(train_X, train_y, sample_weight=None)
    print(f"It took %.2f minutes to train this light GBMRegressor." % ((time.time()-start)/60) )
    updateFeatureImp(LGBMR, features, feature2Imp)

    start = time.time()
    pred_y = LGBMR.predict(train_X)
    LGBMR.booster_.save_model(f"/kaggle/working/model%d.txt" % j)
    if pred_sum is None:
        # Own float64 copy, so later in-place additions never touch pred_y.
        pred_sum = np.array(pred_y, dtype=np.float64)
    else:
        pred_sum += pred_y
    print(f"It took %.2f minutes to predict and save the model." % ((time.time()-start)/60) )
    del LGBMR, pred_y
    gc.collect()
    time.sleep(10)

# In-sample score of the averaged ensemble: the data was trained on, so this is
# an optimistic figure.
mean_y = pred_sum / args.n_models
tr_corr = meanCorr(time_ids, train_y, mean_y)
print(f"Training correlation %.4f." % tr_corr)
del pred_sum, mean_y, tr_corr
_ = gc.collect()
time.sleep(10)

In [None]:
# Collapse each feature's list of per-model importances into its mean, then
# rebuild the dict ordered from most to least important.
feature2Imp = dict(sorted(
    ((name, sum(values)/len(values)) for name, values in feature2Imp.items()),
    key=lambda pair: pair[1],
    reverse=True,
))
print(list(feature2Imp.items())[:30])

In [None]:
# The 30 entries at the tail of feature2Imp, i.e. the lowest average importances
# once the dict has been sorted in descending order.
print(list(feature2Imp.items())[-30:])

In [None]:
# Release the training arrays; the visible cells below only use the saved model files.
del train_X, train_y, time_ids
# del train_df
gc.collect()
time.sleep(10)  # NOTE(review): presumably a pause to let memory settle — confirm it is needed

# **Evaluation**

In [None]:
## Make submission (average of the saved LightGBM boosters)
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test()

# Reload every model saved by the training loop. The boosters stay referenced by
# `models`, so a collect-and-sleep after each load could not free anything.
models = [lgb.Booster(model_file=(f'/kaggle/working/model%d.txt' % j))
          for j in range(args.n_models)]
_ = gc.collect()

# Set lookups are O(1); testing membership in the plain list is linear per row.
known_invest_ids = set(invest_ids)
# Distinct investment ids never seen in training. An id that shows up in several
# test batches is counted once, which is what the final message reports.
unseen_invest_ids = set()

for (df_test, df_submission) in iter_test:
    # Extract 'time_id' from 'row_id'
    df_test['time_id'] = df_test.row_id.str.extract(r'(\d+)_.*').astype(np.uint16)
    unseen_invest_ids.update(set(df_test['investment_id'].values.tolist()) - known_invest_ids)
    # Create features same as df_train (prepare_features edits df_test in place)
    _ = prepare_features(df_test, combine_fs, remove_fs)
    test_X = df_test[features].values
    # Unweighted mean of all model predictions, stacked once rather than grown with np.append.
    df_submission['target'] = np.mean([model.predict(test_X) for model in models], axis=0)
    env.predict(df_submission)

newCnt = len(unseen_invest_ids)
print(f"There are %d new investments." % newCnt)