In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import display
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler
#from sklearn.model_selection import train_test_split
#from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
#from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from xgboost import XGBRegressor
import lightgbm as lgb
from catboost import CatBoostClassifier
#from sklearn.linear_model import LinearRegression
import optuna

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Check the installed pandas version.
# I run this from time to time to make sure everything is up to date.
pd.__version__
# pd.show_versions()  # uncomment to also list the versions of pandas' dependencies

In [None]:
# Raw inputs for the level-2 models.
# kfolds-5/train_folds_5.csv is an output from https://www.kaggle.com/code/andrewnuk/kfolds-5
# (the training data with a 'kfolds' column that the fold loops below rely on).
df_train, df_test, df_sampleSubmission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/kfolds-5/train_folds_5.csv',
        '/kaggle/input/tabular-playground-series-may-2022/test.csv',
        '/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv',
    )
)

In [None]:
# Number of folds = largest fold label + 1.
# NOTE(review): presumably the labels are zero-based and contiguous (0..k-1) — confirm
# against the kfolds-5 notebook; the training loops iterate over range(fold_no).
fold_no = 1 + df_train['kfolds'].max()
fold_no

In [None]:
# Level-1 predictions, written by
# https://www.kaggle.com/code/andrewnuk/models-level1-stacking/notebook
# Every file is relabelled to ("id", "pred_N") so it can be joined onto the main frames by "id".

# Predictions for the training rows.
df1 = pd.read_csv(
    "../input/models-level1-stacking/train_pred_1_FE_20220524.csv"
).set_axis(["id", "pred_1"], axis=1)
df2 = pd.read_csv(
    "../input/models-level1-stacking/train_pred_2_FE_20220524.csv"
).set_axis(["id", "pred_2"], axis=1)
df3 = pd.read_csv(
    "../input/models-level1-stacking/train_pred_3_FE_20220524.csv"
).set_axis(["id", "pred_3"], axis=1)

# Predictions for the test rows.
df_test1 = pd.read_csv(
    "../input/models-level1-stacking/test_pred_1_FE_20220524.csv"
).set_axis(["id", "pred_1"], axis=1)
df_test2 = pd.read_csv(
    "../input/models-level1-stacking/test_pred_2_FE_20220524.csv"
).set_axis(["id", "pred_2"], axis=1)
df_test3 = pd.read_csv(
    "../input/models-level1-stacking/test_pred_3_FE_20220524.csv"
).set_axis(["id", "pred_3"], axis=1)

# Left joins keep every row of the base frame, in its original order.
df_train = (
    df_train
    .merge(df1, on="id", how="left")
    .merge(df2, on="id", how="left")
    .merge(df3, on="id", how="left")
)

df_test = (
    df_test
    .merge(df_test1, on="id", how="left")
    .merge(df_test2, on="id", how="left")
    .merge(df_test3, on="id", how="left")
)

In [None]:
# Quick look at the training frame now that the pred_1..pred_3 columns have been merged in.
df_train.head()

In [None]:
# The level-2 models are trained on the three level-1 prediction columns only.
useful_features = ["pred_1", "pred_2", "pred_3"]

# Reduce the test frame to those columns (this drops "id" and the raw features).
df_test = df_test.loc[:, useful_features]

In [None]:
# XGBoost settings for the level-2 model, copied from the hypertuning notebook.
# n_estimators is only an upper bound: the training loop fits with early stopping.
params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    use_label_encoder=False,
    n_estimators=10000,
    learning_rate=0.049543722885399176,
    reg_lambda=1.878873269789419,
    reg_alpha=0.1292588205628619,
    subsample=0.4597818683023742,
    colsample_bytree=0.7949921440875072,
    max_depth=5,
)


In [None]:
# Level-2 XGBoost: fit one model per fold on the level-1 predictions and collect
#   final_predictions        - one array of test predictions per fold
#   final_valid_predictions  - {training id: out-of-fold prediction}
#   scores                   - validation ROC AUC per fold
final_predictions, final_valid_predictions, scores = [], {}, []

for fold in range(fold_no):
    # Rows labelled with the current fold are held out; all other rows are used for training.
    holdout_mask = df_train['kfolds'] == fold
    xtrain = df_train.loc[~holdout_mask].reset_index(drop=True)
    xvalid = df_train.loc[holdout_mask].reset_index(drop=True)
    xtest = df_test.copy()

    # Keep the ids so the out-of-fold predictions can be keyed by training row.
    valid_ids = xvalid['id'].tolist()

    ytrain, yvalid = xtrain['target'], xvalid['target']
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    model = XGBRegressor(**params, random_state=0, n_jobs=-1)

    # NOTE(review): newer xgboost releases deprecate passing early_stopping_rounds to
    # fit() — confirm against the installed version before upgrading.
    model.fit(
        xtrain,
        ytrain,
        eval_set=[(xvalid, yvalid)],
        early_stopping_rounds=1000,
        verbose=1000,
    )

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)

    final_predictions.append(test_preds)
    final_valid_predictions.update(zip(valid_ids, preds_valid))

    roc = roc_auc_score(yvalid, preds_valid)
    print(fold, roc)
    scores.append(roc)

# Mean and spread of the per-fold AUC.
print(np.mean(scores), np.std(scores))

In [None]:
# Persist the XGBoost level-2 predictions for the next stacking stage.

# Out-of-fold predictions: the dict maps training id -> prediction, so orient="index"
# gives one row per id and reset_index() turns the ids back into a regular column.
# NOTE: this rebinds final_valid_predictions from a dict to a DataFrame, so this cell
# can only be run once per run of the training loop.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("level1_train_pred_1_20220524.csv", index=False)

# Test predictions: stack the per-fold arrays as columns and average across folds.
# Item assignment (df["target"] = ...) is used instead of attribute assignment
# (df.target = ...): attribute assignment only writes a column when "target" already
# exists; once the columns have been renamed below it would silently set a plain Python
# attribute and the CSV would be written with stale values. With item assignment a
# second run fails loudly on the column rename instead.
# NOTE(review): values are assigned by position — assumes df_test rows are in the same
# order as the sample submission; confirm.
df_sampleSubmission["target"] = np.mean(np.column_stack(final_predictions), axis=1)
df_sampleSubmission.columns = ["id", "pred_1"]
df_sampleSubmission.to_csv("level1_test_pred_1_20220524.csv", index=False)

In [None]:
# column_names = ["id", "target"]

# # df_sampleSubmission.target = np.mean(np.column_stack(final_predictions), axis=1)
# df_sampleSubmission.to_csv("submission20220524d.csv", header=column_names, index=False)

In [None]:
# Preview the exported test predictions; the previous cell renamed this frame's columns to "id" / "pred_1".
df_sampleSubmission.head()

In [None]:
# Re-read the raw inputs so the CatBoost section starts from untouched frames
# (df_test was reduced to the feature columns and df_sampleSubmission had its columns
# renamed earlier in the notebook).
# kfolds-5/train_folds_5.csv is an output from https://www.kaggle.com/code/andrewnuk/kfolds-5
df_train, df_test, df_sampleSubmission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/kfolds-5/train_folds_5.csv',
        '/kaggle/input/tabular-playground-series-may-2022/test.csv',
        '/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv',
    )
)

In [None]:
# Level-1 predictions, written by
# https://www.kaggle.com/code/andrewnuk/models-level1-stacking/notebook
# Every file is relabelled to ("id", "pred_N") so it can be joined onto the main frames by "id".

# Predictions for the training rows.
df1 = pd.read_csv(
    "../input/models-level1-stacking/train_pred_1_FE_20220524.csv"
).set_axis(["id", "pred_1"], axis=1)
df2 = pd.read_csv(
    "../input/models-level1-stacking/train_pred_2_FE_20220524.csv"
).set_axis(["id", "pred_2"], axis=1)
df3 = pd.read_csv(
    "../input/models-level1-stacking/train_pred_3_FE_20220524.csv"
).set_axis(["id", "pred_3"], axis=1)

# Predictions for the test rows.
df_test1 = pd.read_csv(
    "../input/models-level1-stacking/test_pred_1_FE_20220524.csv"
).set_axis(["id", "pred_1"], axis=1)
df_test2 = pd.read_csv(
    "../input/models-level1-stacking/test_pred_2_FE_20220524.csv"
).set_axis(["id", "pred_2"], axis=1)
df_test3 = pd.read_csv(
    "../input/models-level1-stacking/test_pred_3_FE_20220524.csv"
).set_axis(["id", "pred_3"], axis=1)

# Left joins keep every row of the base frame, in its original order.
df_train = (
    df_train
    .merge(df1, on="id", how="left")
    .merge(df2, on="id", how="left")
    .merge(df3, on="id", how="left")
)

df_test = (
    df_test
    .merge(df_test1, on="id", how="left")
    .merge(df_test2, on="id", how="left")
    .merge(df_test3, on="id", how="left")
)

In [None]:
# Keep only the level-1 prediction columns (useful_features is defined in an earlier cell).
df_test = df_test.loc[:, useful_features]

In [None]:
# CatBoost settings for the level-2 model, copied from the hypertuning notebook.
# n_estimators is only an upper bound: the training loop fits with early stopping.
params_cb = dict(
    loss_function='CrossEntropy',
    eval_metric='AUC',
    bootstrap_type='Bernoulli',
    n_estimators=10000,
    learning_rate=0.04639909669169314,
    l2_leaf_reg=4.764270064283827,
    min_data_in_leaf=47,
    depth=6,
    leaf_estimation_iterations=3,
    subsample=0.8221602391299252,
)

In [None]:
# Level-2 CatBoost: fit one model per fold on the level-1 predictions and collect
#   final_predictions        - one array of test predictions per fold
#   final_valid_predictions  - {training id: out-of-fold prediction}
#   scores                   - validation ROC AUC per fold
final_predictions, final_valid_predictions, scores = [], {}, []

for fold in range(fold_no):
    # Rows labelled with the current fold are held out; all other rows are used for training.
    holdout_mask = df_train['kfolds'] == fold
    xtrain = df_train.loc[~holdout_mask].reset_index(drop=True)
    xvalid = df_train.loc[holdout_mask].reset_index(drop=True)
    xtest = df_test.copy()

    # Keep the ids so the out-of-fold predictions can be keyed by training row.
    valid_ids = xvalid['id'].tolist()

    ytrain, yvalid = xtrain['target'], xvalid['target']
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    model = CatBoostClassifier(**params_cb, random_state=0)

    model.fit(
        xtrain,
        ytrain,
        eval_set=[(xvalid, yvalid)],
        early_stopping_rounds=1000,
        verbose=1000,
    )

    # Last column of predict_proba — presumably the positive-class probability.
    preds_valid = model.predict_proba(xvalid)[:, -1]
    test_preds = model.predict_proba(xtest)[:, -1]

    final_predictions.append(test_preds)
    final_valid_predictions.update(zip(valid_ids, preds_valid))

    roc = roc_auc_score(yvalid, preds_valid)
    print(fold, roc)
    scores.append(roc)

# Mean and spread of the per-fold AUC.
print(np.mean(scores), np.std(scores))

In [None]:
# Persist the CatBoost level-2 predictions for the next stacking stage.

# Out-of-fold predictions: the dict maps training id -> prediction, so orient="index"
# gives one row per id and reset_index() turns the ids back into a regular column.
# NOTE: this rebinds final_valid_predictions from a dict to a DataFrame, so this cell
# can only be run once per run of the training loop.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("level1_train_pred_2_20220524.csv", index=False)

# Test predictions: stack the per-fold arrays as columns and average across folds.
# Item assignment (df["target"] = ...) is used instead of attribute assignment
# (df.target = ...): attribute assignment only writes a column when "target" already
# exists; once the columns have been renamed below it would silently set a plain Python
# attribute and the CSV would be written with stale values. With item assignment a
# second run fails loudly on the column rename instead.
# NOTE(review): values are assigned by position — assumes df_test rows are in the same
# order as the sample submission; confirm.
df_sampleSubmission["target"] = np.mean(np.column_stack(final_predictions), axis=1)
df_sampleSubmission.columns = ["id", "pred_2"]
df_sampleSubmission.to_csv("level1_test_pred_2_20220524.csv", index=False)

In [None]:
# column_names = ["id", "target"]

# # df_sampleSubmission.target = np.mean(np.column_stack(final_predictions), axis=1)
# df_sampleSubmission.to_csv("submission20220524e.csv", header=column_names, index=False)

In [None]:
# Preview the exported test predictions; the previous cell renamed this frame's columns to "id" / "pred_2".
df_sampleSubmission.head()

In [None]:
# Re-read the raw inputs so the LightGBM section starts from untouched frames
# (df_test was reduced to the feature columns and df_sampleSubmission had its columns
# renamed earlier in the notebook).
# kfolds-5/train_folds_5.csv is an output from https://www.kaggle.com/code/andrewnuk/kfolds-5
df_train, df_test, df_sampleSubmission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/kfolds-5/train_folds_5.csv',
        '/kaggle/input/tabular-playground-series-may-2022/test.csv',
        '/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv',
    )
)


In [None]:
# Level-1 predictions, written by
# https://www.kaggle.com/code/andrewnuk/models-level1-stacking/notebook
# Every file is relabelled to ("id", "pred_N") so it can be joined onto the main frames by "id".

# Predictions for the training rows.
df1 = pd.read_csv(
    "../input/models-level1-stacking/train_pred_1_FE_20220524.csv"
).set_axis(["id", "pred_1"], axis=1)
df2 = pd.read_csv(
    "../input/models-level1-stacking/train_pred_2_FE_20220524.csv"
).set_axis(["id", "pred_2"], axis=1)
df3 = pd.read_csv(
    "../input/models-level1-stacking/train_pred_3_FE_20220524.csv"
).set_axis(["id", "pred_3"], axis=1)

# Predictions for the test rows.
df_test1 = pd.read_csv(
    "../input/models-level1-stacking/test_pred_1_FE_20220524.csv"
).set_axis(["id", "pred_1"], axis=1)
df_test2 = pd.read_csv(
    "../input/models-level1-stacking/test_pred_2_FE_20220524.csv"
).set_axis(["id", "pred_2"], axis=1)
df_test3 = pd.read_csv(
    "../input/models-level1-stacking/test_pred_3_FE_20220524.csv"
).set_axis(["id", "pred_3"], axis=1)

# Left joins keep every row of the base frame, in its original order.
df_train = (
    df_train
    .merge(df1, on="id", how="left")
    .merge(df2, on="id", how="left")
    .merge(df3, on="id", how="left")
)

df_test = (
    df_test
    .merge(df_test1, on="id", how="left")
    .merge(df_test2, on="id", how="left")
    .merge(df_test3, on="id", how="left")
)

In [None]:
# Keep only the level-1 prediction columns (useful_features is defined in an earlier cell).
df_test = df_test.loc[:, useful_features]

In [None]:
# LightGBM settings for the level-2 model, copied from the hypertuning notebook.
# n_estimators is only an upper bound: the training loop fits with early stopping.
params_lgb = dict(
    objective='binary',
    metric='auc',
    n_estimators=20000,
    learning_rate=0.040751948678898225,
    reg_lambda=0.0011184733873157485,
    reg_alpha=0.18066237242292785,
    subsample=0.24508506693514687,
    subsample_freq=1,
    colsample_bytree=0.5051094430082244,
    min_child_weight=3,
    min_child_samples=126,
)


In [None]:
# Level-2 LightGBM: fit one model per fold on the level-1 predictions and collect
#   final_predictions        - one array of test predictions per fold
#   final_valid_predictions  - {training id: out-of-fold prediction}
#   scores                   - validation ROC AUC per fold
final_predictions = []
final_valid_predictions = {}
scores = []

# The test features are identical for every fold, so copy them once instead of per fold.
xtest = df_test.copy()

for fold in range(fold_no):
    # Rows labelled with the current fold are held out; all other rows are used for training.
    xtrain = df_train[df_train['kfolds'] != fold].reset_index(drop=True)
    xvalid = df_train[df_train['kfolds'] == fold].reset_index(drop=True)

    # Keep the ids so the out-of-fold predictions can be keyed by training row.
    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = lgb.LGBMClassifier(random_state=0, n_jobs=-1, **params_lgb)

    # Early stopping and periodic logging go through callbacks: the
    # `early_stopping_rounds=` / `verbose=` keyword arguments of fit() are deprecated
    # in LightGBM 3.3 and no longer accepted from 4.0 onwards.
    model.fit(
        xtrain,
        ytrain,
        eval_set=[(xvalid, yvalid)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=1000),
            lgb.log_evaluation(period=1000),
        ],
    )

    # Last column of predict_proba — presumably the positive-class probability.
    preds_valid = model.predict_proba(xvalid)[:, -1]
    test_preds = model.predict_proba(xtest)[:, -1]

    final_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    roc = roc_auc_score(yvalid, preds_valid)
    print(fold, roc)
    scores.append(roc)

# Mean and spread of the per-fold AUC.
print(np.mean(scores), np.std(scores))

In [None]:
# Persist the LightGBM level-2 predictions for the next stacking stage.

# Out-of-fold predictions: the dict maps training id -> prediction, so orient="index"
# gives one row per id and reset_index() turns the ids back into a regular column.
# NOTE: this rebinds final_valid_predictions from a dict to a DataFrame, so this cell
# can only be run once per run of the training loop.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_3"]
final_valid_predictions.to_csv("level1_train_pred_3_20220524.csv", index=False)

# Test predictions: stack the per-fold arrays as columns and average across folds.
# Item assignment (df["target"] = ...) is used instead of attribute assignment
# (df.target = ...): attribute assignment only writes a column when "target" already
# exists; once the columns have been renamed below it would silently set a plain Python
# attribute and the CSV would be written with stale values. With item assignment a
# second run fails loudly on the column rename instead.
# NOTE(review): values are assigned by position — assumes df_test rows are in the same
# order as the sample submission; confirm.
df_sampleSubmission["target"] = np.mean(np.column_stack(final_predictions), axis=1)
df_sampleSubmission.columns = ["id", "pred_3"]
df_sampleSubmission.to_csv("level1_test_pred_3_20220524.csv", index=False)

In [None]:
# column_names = ["id", "target"]

# # df_sampleSubmission.target = np.mean(np.column_stack(final_predictions), axis=1)
# df_sampleSubmission.to_csv("submission20220524f.csv", header=column_names, index=False)

In [None]:
# Preview the exported test predictions; the previous cell renamed this frame's columns to "id" / "pred_3".
df_sampleSubmission.head()