In [None]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, train_test_split, KFold
import matplotlib.pyplot as plt

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from category_encoders.target_encoder import TargetEncoder

In [None]:
# Source of the parquet file (thanks a lot to its author!):
# https://www.kaggle.com/avijitduttta/xgboost-basicml-solution
df_train = pd.read_parquet(path='../input/ubiquant-parquet/train_low_mem.parquet')

In [None]:
# Quick sanity check: frame dimensions, number of distinct investments, first rows.
print(df_train.shape)
print(df_train["investment_id"].nunique())
df_train.head()

In [None]:
# Fit a smoothed target encoder for investment_id on the whole training frame.
# Only the fitted encoder is used later (the submission loop calls
# te.transform), so fit() is enough: the previous fit_transform(...) result
# was discarded, costing a full extra transform pass and a temporary frame.
te = TargetEncoder(smoothing=0.2, cols=["investment_id"])
te.fit(df_train["investment_id"], df_train["target"])

# Placeholder column: stack_model.fit overwrites this feature on its own
# per-fold copies when called with target_encoding=True.
df_train["investment_te"] = 0.

In [None]:
# All anonymised feature columns f_0 .. f_299, followed by the id and its
# target-encoded counterpart.
n_features = 300
features = ['f_{}'.format(idx) for idx in range(n_features)] + ['investment_id', 'investment_te']

# Keep handles on the id columns while they are still present in df_train.
investment_id = df_train['investment_id']
time_id = df_train['time_id']

In [None]:
# Curated feature subset. This REPLACES the `features` list built in the
# previous cell. 'investment_id' and 'investment_te' must stay in positions 0
# and 1: stack_model.fit addresses them by column index when target encoding.
features = ['investment_id','investment_te',
 'f_196', 'f_0', 'f_284', 'f_98', 'f_99', 'f_151', 'f_208', 'f_7', 'f_142', 'f_227', 'f_123', 'f_83', 'f_74', 'f_103', 'f_91', 'f_35', 'f_81', 'f_70', 'f_257', 'f_277', 'f_224', 'f_230', 'f_77', 'f_218', 'f_182', 'f_187', 'f_11', 'f_167', 'f_102', 'f_13', 'f_177', 'f_232', 'f_248', 'f_63', 'f_296', 'f_71', 'f_198', 'f_265', 'f_145', 'f_21', 'f_170', 'f_290', 'f_200', 'f_174', 'f_237', 'f_149', 'f_138', 'f_37', 'f_112', 'f_68', 'f_61', 'f_22', 'f_75', 'f_8', 'f_23', 'f_287', 'f_266', 'f_252', 'f_189', 'f_239', 'f_231', 'f_121', 'f_105', 'f_283', 'f_181', 'f_220', 'f_215', 'f_286', 'f_275', 'f_30', 'f_27', 'f_18', 'f_108', 'f_202', 'f_236', 'f_132', 'f_201', 'f_15', 'f_110', 'f_114', 'f_118', 'f_53', 'f_93', 'f_183', 'f_185', 'f_276', 'f_73', 'f_85', 'f_186', 'f_199', 'f_109', 'f_48', 'f_289', 'f_84', 'f_19', 'f_229', 'f_16', 'f_166', 'f_6', 'f_78', 'f_44', 'f_64', 'f_191', 'f_154', 'f_115', 'f_225', 'f_251', 'f_260', 'f_42', 'f_57', 'f_26', 'f_46', 'f_127', 'f_263', 'f_126', 'f_129', 'f_10', 'f_141', 'f_80', 'f_246', 'f_24', 'f_219', 'f_40', 'f_128', 'f_38', 'f_100', 'f_55', 'f_124', 'f_5', 'f_165', 'f_66', 'f_122', 'f_33', 'f_222', 'f_135', 'f_282', 'f_153', 'f_62', 'f_176', 'f_34', 'f_299', 'f_82', 'f_293', 'f_92', 'f_281', 'f_210', 'f_111', 'f_253', 'f_244', 'f_238', 'f_241', 'f_45', 'f_162', 'f_32', 'f_20', 'f_9', 'f_12', 'f_258', 'f_120', 'f_175', 'f_56', 'f_146'
 ]

# Keep only the target and the selected features; every other column
# (including time_id) is dropped from df_train here.
df_train = df_train.loc[:,['target']+features]

In [None]:
def _smallest_int_dtype(lo, hi, candidates):
    """Return the first dtype in ``candidates`` whose range contains [lo, hi], else None."""
    for candidate in candidates:
        bounds = np.iinfo(candidate)
        if bounds.min <= lo and hi <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Per column:

    * object          -> ``category``
    * signed ints     -> smallest of int8/int16/int32/int64 holding [min, max]
    * unsigned ints   -> smallest of uint8/uint16/uint32/uint64 holding [min, max]
    * floats          -> float16 when every value fits its range, otherwise
      float32; a column exceeding even the float32 range is left unchanged
    * anything else (bool, datetime, category, extension dtypes) is left as is

    float16 keeps only about three significant decimal digits, so the float
    downcast is lossy by design; the range check only prevents out-of-range
    values from silently turning into ``inf``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy dtypes are downcast; pandas extension dtypes have no
        # ``kind`` handled below and are skipped.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            # min()/max() of an empty column are NaN: nothing to decide on.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            # Unsigned columns stay unsigned. Sending them through the float
            # path would corrupt large ids (float16 is exact only up to 2048).
            candidates = signed_types if kind == 'i' else unsigned_types
            target = _smallest_int_dtype(int(c_min), int(c_max), candidates)
            if target is not None:
                df[col] = df[col].astype(target)
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                # NaN bounds (all-NaN or empty column) compare False on both
                # sides, so such columns still go to float16.
                if not (c_min < limits.min or c_max > limits.max):
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# Shrink the training frame. reduce_mem_usage converts columns in place and
# returns the same frame, so from here on float columns (including 'target')
# are stored in reduced precision.
df_train = reduce_mem_usage(df_train)

In [None]:
class stack_model:
    """Two-level stacking regressor trained on time-ordered folds.

    The base models are fitted fold by fold. Their predictions on each fold's
    held-out slice are collected as columns ``class_0``, ``class_1``, ... and
    become the training features of ``meta_model``.

    Parameters
    ----------
    base_models : list
        Regressors exposing ``fit(X, y, eval_set=[(X_val, y_val)],
        verbose=..., early_stopping_rounds=...)`` and ``predict(X)``.
    meta_model : object
        Regressor fitted on the base-model predictions. Its ``fit`` is called
        with ``eval_set=(X, y)``, ``verbose=50`` and
        ``early_stopping_rounds=20``.
    train_X : array-like
        2-D feature matrix supporting integer-array row indexing and
        ``[:, a:b]`` column slicing (e.g. a numpy array).
    train_y : array-like
        Target values aligned with ``train_X``; must support integer-array
        indexing and ``.tolist()``.
    """

    def __init__(self,base_models,meta_model,train_X,train_y):
        self.base_models = base_models
        self.meta_model  = meta_model
        self.train_X = train_X
        self.train_y = train_y

    def fit(self,cvFolds = 5, random_state= 42, categorical=[], target_encoding=False):
        """Fit the base models on time-series folds, then the meta model.

        Parameters
        ----------
        cvFolds : int
            ``TimeSeriesSplit`` is created with ``cvFolds + 5`` splits; folds
            with index below 6 are skipped, so only the later (largest)
            training windows are used.
        random_state : int
            Currently unused; kept for interface compatibility.
        categorical : list
            Currently unused (and never mutated); kept for interface
            compatibility.
        target_encoding : bool
            When True, column 0 of ``train_X`` is treated as ``investment_id``
            and column 1 is overwritten, on per-fold copies, with a target
            encoding fitted on that fold's training rows only.

        Raises
        ------
        ValueError
            If no fold is processed (``cvFolds`` too small).

        Notes
        -----
        Every base model is refitted in each fold, so after ``fit`` the base
        models hold the state from the last fold. ``self.train_X`` is set to
        ``None`` at the end to free memory, so ``fit`` cannot be called twice
        on the same instance.
        """
        first_used_fold = 6
        prediction_cols = [f"class_{m}" for m in range(len(self.base_models))]
        fold_frames = []

        # Create kfolds
        kf = TimeSeriesSplit(n_splits= cvFolds+5)
        for fold, (train_index, test_index) in enumerate(kf.split(self.train_X)):
            print("Fold Number: ", fold)
            if fold < first_used_fold:
                continue
            # Integer-array indexing yields copies, so the in-place target
            # encoding below never touches self.train_X.
            X_train = self.train_X[train_index]
            X_test = self.train_X[test_index]

            y_train = self.train_y[train_index]
            y_test = self.train_y[test_index]

            # Encodings
            ## target encoding
            if target_encoding:
                encoder_df = pd.DataFrame(X_train[:, :2])
                encoder_df.columns = ["investment_id", "investment_te"]
                encoder_df["target"] = y_train
                te = TargetEncoder(smoothing=0.2, cols=["investment_id"])
                X_train[:, 1:2] = te.fit_transform(encoder_df["investment_id"], encoder_df["target"]).astype("float32")
                encoder_test = pd.DataFrame(X_test[:, 0:1])
                encoder_test.columns = ["investment_id"]
                X_test[:, 1:2] = te.transform(encoder_test["investment_id"]).astype("float32")
                del encoder_df
                gc.collect()

            # Carve out the early-stopping validation tail ONCE per fold.
            # Splitting inside the model loop would shrink the training data
            # by a further 20% for every successive base model.
            X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train,
                                                          test_size=0.2,
                                                          shuffle=False)

            # Get predictions from all the base models
            this_fold_df = pd.DataFrame()
            this_fold_df["actual"] = y_test.tolist()
            for m, model in enumerate(self.base_models):
                model.fit(X_fit, y_fit,
                          eval_set=[(X_val, y_val)],
                          verbose=False,
                          early_stopping_rounds=15)
                this_fold_df[f"class_{m}"] = model.predict(X_test)
                print(f"Model {m}, MSE SCORE: ", mean_squared_error(y_test, this_fold_df[f"class_{m}"]))
                print("Simple corr: ", pearsonr(this_fold_df[f"actual"], this_fold_df[f"class_{m}"]))
            fold_frames.append(this_fold_df)

        if not fold_frames:
            raise ValueError(
                f"No fold was processed: cvFolds + 5 must exceed {first_used_fold} "
                f"(got cvFolds={cvFolds})."
            )

        # One concat instead of DataFrame.append per fold (append was removed
        # in pandas 2.0 and copied the accumulated frame on every call).
        meta_model_data = pd.concat(fold_frames, ignore_index=True)
        X_meta_train, X_meta_test, y_meta_train, y_meta_test = train_test_split(
                                                    meta_model_data[prediction_cols],
                                                    meta_model_data["actual"],
                                                    test_size=0.2,
                                                    shuffle=False)
        self.meta_model.fit(X_meta_train, y_meta_train,
                            eval_set=(X_meta_test, y_meta_test),
                            verbose=50,
                            early_stopping_rounds=20)
        #Now release the memory by deleting  train_X since we dont need
        self.train_X = None


    def predict(self,test_X):
        """Predict with the stacked ensemble.

        Each base model predicts on ``test_X``; the predictions, in base-model
        order, form the feature matrix passed to the meta model.

        Parameters
        ----------
        test_X : array-like
            Feature matrix with the same column layout used in ``fit``.

        Returns
        -------
        numpy.ndarray
            Meta-model predictions, one per row of ``test_X``.
        """
        meta_X = pd.DataFrame()
        for m, model in enumerate(self.base_models):
            meta_X[f"class_{m}"] = model.predict(test_X)
        return np.asarray(self.meta_model.predict(meta_X.values))

Let's create a holdout set and, because of memory limitations, keep only a partition of the training data.

In [None]:
#HOLD OUT
# The last 10% of rows form the holdout; training uses rows from 40% up to
# (but excluding) the holdout. The previous training slice ran to the end of
# the frame and therefore contained every holdout row, so the holdout score
# was not out-of-sample. Both right-hand sides are evaluated on the full
# frame before df_train is rebound.
holdout = df_train.iloc[int(len(df_train)*0.9):]
df_train = df_train.iloc[int(len(df_train)*0.4):int(len(df_train)*0.9)]

gc.collect()

Now we will train 3 base models and use their out-of-fold predictions to build another (meta) model.

In [None]:
meta_models = {}
scalers = {}

# Feature matrix / target as numpy arrays. stack_model relies on
# 'investment_id' and 'investment_te' being columns 0 and 1 of X.
X = df_train[features].values
y = df_train['target'].values

# --- Level-0 (base) models -------------------------------------------------
catboost1 = CatBoostRegressor(random_seed=42, task_type='GPU')
xgboost1 = XGBRegressor(random_state=42, tree_method='gpu_hist',  n_estimators=800,
    learning_rate=0.05,
    max_depth=12,
    subsample=0.9,
    colsample_bytree=0.9,
    #colsample_bylevel=0.75,
    missing=-999,)
lgbm1 = LGBMRegressor(random_state=42,
                      n_estimators=800,
                      device_type='gpu',
                      extra_trees=True,
                      lambda_l1=.2,
                      lambda_l2=.2,
                      # was misspelled 'feature_fractio', which LightGBM does
                      # not recognise, so the setting was never applied
                      feature_fraction=.8)

base_models = [
    catboost1,
    xgboost1,
    lgbm1
]

# --- Level-1 (meta) model, trained on the base models' fold predictions ----
meta_model = CatBoostRegressor(iterations=800, random_state=42, task_type='GPU', verbose=0)
meta_models["meta1"] = stack_model(base_models,meta_model,X,y)
meta_models["meta1"].fit(cvFolds = 5, random_state= 42, categorical=[], target_encoding=True)

In [None]:
#Lets test it with holdout:
# 'investment_te' in holdout is still the 0.0 placeholder, because the
# per-fold encoding inside stack_model.fit only touches copies. Fill it the
# same way the submission loop does before predicting.
# NOTE(review): `te` was fitted on the full frame, holdout rows included, so
# this feature is still mildly optimistic on the holdout.
holdout_X = holdout[features].copy()
holdout_X["investment_te"] = te.transform(holdout["investment_id"]).astype("float32")

# assign() returns a new frame, avoiding SettingWithCopyWarning on the slice.
holdout = holdout.assign(preds=meta_models["meta1"].predict(holdout_X.values))
print(f"Correlation: {pearsonr(holdout['target'].values, holdout['preds'].values)[0]}")

In [None]:
import gc
import ubiquant

# Free whatever training left behind before the inference loop starts.
gc.collect()

# Competition API: the environment hands out test batches through iter_test.
env = ubiquant.make_env()
iter_test = env.iter_test()



In [None]:
for test_df, sample_prediction_df in iter_test:
    # Fill the target-encoded id with the encoder fitted on the training frame.
    test_df["investment_te"] = te.transform(test_df["investment_id"]).astype("float32")

    # Same column order as during training, then predict with the stack.
    x_tt = test_df[features].to_numpy()
    preds = meta_models["meta1"].predict(x_tt)

    sample_prediction_df['target'] = preds
    env.predict(sample_prediction_df)