In [None]:
import pandas as pd
from pathlib import Path

import optuna
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


data_dir = Path('../input/tabular-playground-series-nov-2021/')

df_train = pd.read_csv(
    data_dir / "train.csv",
    index_col='id',
    #nrows=540000,# comment this row to use the full dataset
)

df_train.head(20)

In [None]:
df_train.shape

In [None]:
df_train.info()

In [None]:
# Shrink the training frame's memory footprint: 64-bit float and integer columns
# are handed to pd.to_numeric with the matching `downcast` option.
for col in df_train.select_dtypes(include="float64").columns:
    df_train[col] = pd.to_numeric(df_train[col], downcast="float")

for col in df_train.select_dtypes(include="int64").columns:
    df_train[col] = pd.to_numeric(df_train[col], downcast="integer")

In [None]:
# Heatmap of missing values by variable: the boolean mask df_train.isnull() is
# drawn without row tick labels and without a colour bar.
plt.figure(figsize = (14,6))
p = sns.heatmap(df_train.isnull(), yticklabels = False, cbar = False, cmap = 'viridis')
# The title text is Portuguese for "Missing Values".
p.axes.set_title("Valores Ausentes", fontsize = 20)

In [None]:
# Number of distinct (non-null) values per column.
# The mapping is keyed by column name: keying by the count would silently drop
# every column that shares its count with another one.
{col: n_unique for col, n_unique in df_train.nunique().items() if n_unique > 0}

# Exploratory data analysis

In [None]:
df_train.describe().T.style.background_gradient(cmap='Blues')

In [None]:
corr = df_train.corr()
corr[['target']].sort_values(by = 'target',ascending = False).style.background_gradient()

In [None]:
# View Dataset Class Distribution

sns.set(style="whitegrid")

# Using a bar chart to show the distribution of classes
bp = sns.countplot(x=df_train['target'])
plt.title("Dataset Class Distribution")
bp.set_xticklabels(["0","1"])
plt.show()

In [None]:
# Repeatedly push data through an already-fitted scaler.
def recursive_scaler(x, n_scaler, scaler):
    """Apply ``scaler.transform`` to ``x`` several times in a row.

    Parameters
    ----------
    x : object accepted by ``scaler.transform``
        The data to transform.
    n_scaler : int
        Pass counter. The transform runs ``n_scaler - 1`` times, so a value
        of 1 or less returns ``x`` unchanged.
    scaler : object with a ``transform`` method
        The transformer to apply; it is not fitted here.

    Returns
    -------
    The output of the last ``transform`` call, or ``x`` itself when no call
    was made.
    """
    for _ in range(n_scaler - 1):
        x = scaler.transform(x)
    return x

In [None]:
# Drop Cols
dropcols = ['f2','f35','target']

In [None]:
# Features and target
FEATURES = df_train.drop(dropcols, axis = 1)
TARGET = df_train['target'].astype(int).astype(str)

In [None]:
features = FEATURES.columns

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(FEATURES, TARGET, 
                                                      train_size=0.67, test_size=0.33, random_state=42, shuffle=True)

In [None]:
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import gc

In [None]:
import os
import random
import numpy as np
from sklearn.model_selection import RepeatedKFold
from optuna import create_study
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback
from catboost import Pool
from sklearn.utils import resample
import multiprocessing
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, MaxAbsScaler

In [None]:
#%%time
#n_trials = int(9)
#SEED = 143

In [None]:
#%%time
#Function to seed everything
#def seed_everything(seed):
#    random.seed(seed)
#    np.random.seed(seed)
#    os.environ['PYTHONHASHSEED'] = str(seed)
#seed_everything(SEED)

In [None]:
#n_scaler = 4

#X = np.array(X_train)
#y = np.array(y_train)

#scaler = MinMaxScaler()
#scaler.fit(X)
#X = scaler.transform(X)

#X  = recursive_scaler(X, n_scaler, scaler)

#X = pd.DataFrame(X, columns = features)

#X['sum']  = X[features].sum(axis=1)
#X['mean'] = X[features].mean(axis=1)
#X['std']  = X[features].std(axis=1)
#X['max']  = X[features].max(axis=1)
#X['min']  = X[features].min(axis=1)
#X['kurt'] = X[features].kurtosis(axis=1)

#X = np.array(X)


In [None]:
#def objective(trial):
    # Parameters
#    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.33, random_state=int(SEED), shuffle=True)
#    train_pool = Pool(train_x, train_y)
#    test_pool = Pool(test_x, test_y)    
#    params = {'iterations' : trial.suggest_int('iterations', 50, 1600),                         
#            'depth' : trial.suggest_int('depth', 2, 10),                                       
#            'random_strength' :trial.suggest_int('random_strength', 0, 100),                       
#            'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
#            'learning_rate' :trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
#            'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter'])
#        }
    # Learning
#    model = CatBoostClassifier(
#            loss_function="Logloss",
#            eval_metric="AUC",
#            task_type="GPU",
#            l2_leaf_reg=0.2,
#            random_seed=SEED,
#            border_count=64,
#            **params
#    )        
#    model.fit(train_pool)
    # Predict
#    preds = model.predict_proba(test_x)
   # Evaluation
#    ROC_AUC_Score = roc_auc_score(test_y, preds[:, 1])
#    print('ROC AUC Score of CatBoost =', ROC_AUC_Score)
#    return ROC_AUC_Score

In [None]:
#%%time
#study = optuna.create_study(direction = "maximize", sampler = TPESampler(seed=int(SEED)))
#study.optimize(objective, n_trials = n_trials)

In [None]:
# This is nice handy constant to turn on and off the GPU. When `False`
# the notebook will ignore the GPU even when present.
#GPU_ENABLED = True

In [None]:
#X = np.array(X_train)
#y = np.array(y_train)

#scaler = QuantileTransformer()
#scaler.fit(X)
#X  = recursive_scaler(X, 2, scaler)
#X = scaler.transform(X)

#X = pd.DataFrame(X, columns = features)

#X['sum']  = X[features].sum(axis=1)
#X['mean'] = X[features].mean(axis=1)
#X['std']  = X[features].std(axis=1)
#X['max']  = X[features].max(axis=1)
#X['min']  = X[features].min(axis=1)
#X['kurt'] = X[features].kurtosis(axis=1)

#X = np.array(X)

In [None]:
#def train_model_for_study(X, y, model):
#    X_train, X_valid, y_train, y_valid = train_test_split(
#        X, 
#        y, 
#        test_size=0.33, 
#        random_state=42,
#        shuffle=True
#    )


#    model.fit(
#       X_train, 
#       y_train,
     #  sample_weight=classes_weights,
#        early_stopping_rounds=15,
#        eval_set=[(X_train, y_train), (X_valid, y_valid)],
#        eval_metric="auc",
#        verbose=True
#    )

#    yhat = model.predict_proba(X_valid)
#    score = roc_auc_score(y_valid, yhat[:, 1])
#    return score 

In [None]:
#def objective_xgb(trial):
#   """
#    Objective function to tune an `XGBRegressor` model.
#    """

#    params = {
#        'n_estimators': trial.suggest_int("n_estimators", 1000, 10000),
#        'reg_alpha': trial.suggest_loguniform("reg_alpha", 1e-8, 100.0),
#        'reg_lambda': trial.suggest_loguniform("reg_lambda", 1e-8, 100.0),
#        "subsample": trial.suggest_float("subsample", 0.5, 1.0, step=0.1),
#        "learning_rate": trial.suggest_float("learning_rate", 0.01, 2.0, log=True),
#        'max_depth': trial.suggest_int("max_depth", 2, 9),
#        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
#    }

#    if GPU_ENABLED:
#        params["tree_method"] = "gpu_hist"
#        params["predictor"] = "gpu_predictor"

#    model = XGBClassifier(
#        booster="gbtree",
#        objective="binary:logistic",
#        n_jobs=-1, 
#        random_state=42,
#        **params
#    )

#    return train_model_for_study(X, y, model)

In [None]:
#study_xgb = optuna.create_study(direction="maximize")
#study_xgb.optimize(objective_xgb, n_trials=9)
#study_xgb.best_params

In [None]:
#%%time
#study = optuna.create_study()
#study.optimize(objective, n_trials = 100)

In [None]:
import pickle

In [None]:
#%%time
# Save
#pickle.dump(study.best_trial.params, open('CatBoost_Hyperparameter.pickle', 'wb'))
#print('CatBoost Hyperparameter:', study.best_trial.params)

In [None]:
# CatBoost model configuration.
# NOTE(review): the numeric values look like the result of the commented-out
# Optuna search earlier in the notebook (same parameter names) — confirm before
# changing them.
ctb = CatBoostClassifier(iterations=1462, 
                        learning_rate = 0.08775145655977466, 
                        random_strength = 15, 
                        l2_leaf_reg = 0.2, 
                        depth = 3,
                        bagging_temperature=1.0190751501900164,
                        od_type='IncToDec',
                        loss_function="Logloss",
                        eval_metric='AUC:type=Ranking',
                        random_state=42)

In [None]:
# XGBoost model configuration.
xgb = XGBClassifier(
        learning_rate= 0.02003527792413422,
        reg_alpha = 0.004448966694735556,
        reg_lambda = 2.610959038520974,
        max_depth=3,
        subsample=0.8,
        colsample_bytree=0.6183087610995104,
        objective='binary:logistic',
        n_estimators=4884,
        eval_metric='auc',
        n_jobs=-1,
        # GPU training is switched ON by the next two arguments;
        # remove both of them to train on the CPU instead.
        tree_method='gpu_hist',
        predictor = "gpu_predictor",
        random_state=42,
        )

# Cross-validating two models side by side and, for each split, storing whichever model scores higher.

In [None]:
def _with_row_stats(scaled, feature_names):
    """Append per-row summary statistics to a scaled feature matrix.

    Parameters
    ----------
    scaled : array-like
        Scaled feature values, one column per entry of ``feature_names``.
    feature_names : sequence of str
        Column labels for ``scaled``.

    Returns
    -------
    numpy.ndarray
        The input columns followed by ``sum``, ``mean``, ``std``, ``max``,
        ``min`` and ``kurt``, each computed across the original feature
        columns of a row (never across the added statistics).
    """
    frame = pd.DataFrame(scaled, columns=feature_names)
    base = frame[feature_names]  # snapshot of the original features only
    frame['sum'] = base.sum(axis=1)
    frame['mean'] = base.mean(axis=1)
    frame['std'] = base.std(axis=1)
    frame['max'] = base.max(axis=1)
    frame['min'] = base.min(axis=1)
    frame['kurt'] = base.kurtosis(axis=1)
    return np.array(frame)


kf = StratifiedShuffleSplit(n_splits=9, test_size=0.33, random_state=42)
# Maps '<model tag><split number>' -> hold-out ROC AUC of the model kept for that split.
cv_score = {}
# Pass counters handed to recursive_scaler for the CatBoost / XGBoost inputs.
n_scaler1 = 4
n_scaler2 = 2

X_1 = np.array(X_train)
X_2 = np.array(X_train)
y = np.array(y_train)

# Both scalers are fitted on the training part only and reused for the
# hold-out set here and for the test set further down.
scaler1 = MinMaxScaler()
scaler1.fit(X_1)

# Seeded so that the transformer's fit is reproducible between runs.
scaler2 = QuantileTransformer(random_state=42)
scaler2.fit(X_2)

# Matrix 1 (MinMax-scaled) feeds CatBoost, matrix 2 (quantile-scaled) feeds XGBoost.
X_1 = _with_row_stats(recursive_scaler(X_1, n_scaler1, scaler1), features)
X_2 = _with_row_stats(recursive_scaler(X_2, n_scaler2, scaler2), features)

# The scalers were fitted on plain arrays, so the hold-out frame is converted
# to an array first to avoid sklearn's feature-name mismatch warning.
X_valid_arr = np.array(X_valid)
X_valid_1 = _with_row_stats(recursive_scaler(X_valid_arr, n_scaler1, scaler1), features)
X_valid_2 = _with_row_stats(recursive_scaler(X_valid_arr, n_scaler2, scaler2), features)

for fold, (train_index, test_index) in enumerate(kf.split(X_1, y), start=1):
    print('{} of KFold {}'.format(fold, kf.n_splits))
    xtr_1, xvl_1 = X_1[train_index], X_1[test_index]
    xtr_2, xvl_2 = X_2[train_index], X_2[test_index]
    ytr, yvl = y[train_index], y[test_index]

    # The in-split validation part is used for early stopping only.
    eval_set_1 = [(xtr_1, ytr), (xvl_1, yvl)]
    eval_set_2 = [(xtr_2, ytr), (xvl_2, yvl)]

    ctb.fit(xtr_1, ytr, early_stopping_rounds=15, eval_set=eval_set_1, verbose=False)
    xgb.fit(xtr_2, ytr, early_stopping_rounds=15, eval_set=eval_set_2, verbose=False)

    # NOTE(review): the winner is picked by its score on the hold-out set
    # (X_valid), which is also the set the ensemble is evaluated on later, so
    # that later evaluation is optimistic.
    score1 = roc_auc_score(np.array(y_valid), ctb.predict_proba(X_valid_1)[:, 1])
    score2 = roc_auc_score(np.array(y_valid), xgb.predict_proba(X_valid_2)[:, 1])
    print('ROC AUC score1:', score1)
    print('ROC AUC score2:', score2)

    print(train_index.shape)
    print(test_index.shape)

    if score1 > score2:
        winner_tag, winner_score, winner_model = 'ctb', score1, ctb
    else:
        winner_tag, winner_score, winner_model = 'xgb', score2, xgb

    cv_score[winner_tag + str(fold)] = winner_score
    # Context manager so the file is flushed and closed before it is re-read.
    with open("model" + str(fold) + ".pickle.dat", "wb") as model_file:
        pickle.dump(winner_model, model_file)

In [None]:
# Check Scores
cv_score

In [None]:
# Reload the model that was saved for each of the nine splits.
# NOTE: pickle.load can execute arbitrary code — only load files that this
# notebook wrote itself.
def _load_pickled_model(path):
    """Unpickle and return the object stored at ``path``; the file is closed on exit."""
    with open(path, "rb") as model_file:
        return pickle.load(model_file)


(mdl1, mdl2, mdl3, mdl4, mdl5,
 mdl6, mdl7, mdl8, mdl9) = [
    _load_pickled_model("model{}.pickle.dat".format(fold)) for fold in range(1, 10)
]

In [None]:
#y_pred1 = ctb1.predict_proba(X_valid_1)
#y_pred2 = ctb2.predict_proba(X_valid_1)
#y_pred3 = ctb3.predict_proba(X_valid_1)
#y_pred4 = ctb4.predict_proba(X_valid_1)
#y_pred5 = ctb5.predict_proba(X_valid_1)
#y_pred6 = ctb6.predict_proba(X_valid_1)
#y_pred7 = ctb7.predict_proba(X_valid_1)
#y_pred8 = ctb8.predict_proba(X_valid_1)
#y_pred9 = ctb9.predict_proba(X_valid_1)

In [None]:
#y_pred_1 = np.mean([y_pred1, y_pred2, y_pred3, y_pred4, y_pred5, y_pred6, y_pred7, y_pred8, y_pred9], axis=0)

## Applying the models with the best scores to the validation data, then averaging their predictions.

In [None]:
modelapply = [ mdl1, mdl2, mdl3, mdl4, mdl5, mdl6, mdl7, mdl8, mdl9 ]

In [None]:
# Pair every reloaded model with the tag of the feature matrix it must be fed:
# score keys containing 'ctb' get 'X_valid_1', all other keys get 'X_valid_2'.
# Models are matched to score keys by position.
input_tags = []
paired_models = []
for position, score_key in enumerate(cv_score.keys()):
    input_tags.append('X_valid_1' if 'ctb' in score_key else 'X_valid_2')
    paired_models.append(modelapply[position])
modelapply2 = [input_tags, paired_models]

In [None]:
X_valid_1.shape

In [None]:
#def predict_subset(X_1, X_2, X1_name, ModelApply):
#    y_pred = [[],[],[],[],[],[],[],[],[]]
#    start = 0
#    for i in range(0,9):
#        print(ModelApply[0][i])
#        model = ModelApply[1][i]

#        if ModelApply[0][i] == 'X_valid_1':
#            chunk_size = int(X_1.shape[0] / 9)
#            subset = X_1[start:start + chunk_size]
#            y_pred[i] = model.predict_proba(subset)
#        else:
#            chunk_size = int(X_2.shape[0] / 9)
#            subset = X_2[start:start + chunk_size]
#            y_pred[i] = model.predict_proba(subset)
        #start = start + chunk_size
#    return y_pred

In [None]:
def predict_model(X_1, X_2, X1_name, ModelApply):
    """Run every selected model on the feature matrix it is tagged for.

    Parameters
    ----------
    X_1 : array-like
        Features passed to models tagged ``'X_valid_1'``.
    X_2 : array-like
        Features passed to every other model.
    X1_name : str
        Unused; kept so existing calls keep working.
    ModelApply : sequence of two sequences
        ``ModelApply[0]`` holds one tag per model and ``ModelApply[1]`` the
        models themselves (objects exposing ``predict_proba``), in matching
        order. Any number of models is accepted.

    Returns
    -------
    list
        One ``predict_proba`` result per model, in the order given.

    Raises
    ------
    ValueError
        If the tag list and the model list differ in length.
    """
    tags, models = ModelApply[0], ModelApply[1]
    if len(tags) != len(models):
        raise ValueError(
            "ModelApply needs one tag per model, got {} tags and {} models".format(
                len(tags), len(models)
            )
        )

    y_pred = []
    for tag, model in zip(tags, models):
        print(tag)
        # The tag is the literal 'X_valid_1' regardless of which data set is
        # being scored; it only says "use the first matrix".
        y_pred.append(model.predict_proba(X_1 if tag == 'X_valid_1' else X_2))
    return y_pred

In [None]:
y_pred = predict_model(X_valid_1, X_valid_2, 'X_valid_1', modelapply2)

In [None]:
# Element-wise mean of the predict_proba outputs of all applied models
# (no hard-coded model count).
y_pred_1 = np.mean(y_pred, axis=0)

In [None]:
y_pred_1

In [None]:
#flatlist=[element.tolist() for sublist in y_pred for element in sublist]

In [None]:
#y_pred_1 = np.array(flatlist)

In [None]:
# ROC curve of the averaged ensemble on the hold-out set.
# Column 1 of the averaged predict_proba output is the positive-class probability.
pos_probs = y_pred_1[:, 1]
# Diagonal reference line of a classifier with no skill.
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
# y_valid holds the labels as strings, so they are cast back to integers here.
fpr, tpr, _ = roc_curve(y_valid.astype(str).astype(int), pos_probs)
# The curve belongs to the averaged CatBoost/XGBoost ensemble, not to a
# logistic regression, so the legend entry names it accordingly.
plt.plot(fpr, tpr, marker='.', label='Ensemble')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve on the hold-out set')
plt.legend()
plt.show()

In [None]:
precisions, recalls, thresholds = precision_recall_curve(y_valid.astype(str).astype(int), y_pred_1[:,1])

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision and recall against the decision threshold on the current axes.

    The last element of ``precisions`` and of ``recalls`` is dropped before
    plotting, so each curve is drawn from all but its final value.
    The function only draws; it neither shows nor returns the figure.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_format, legend_name in curves:
        plt.plot(thresholds, values[:-1], line_format, label=legend_name)
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
    
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [None]:
import scikitplot as skplt
skplt.metrics.plot_roc(y_valid.astype(str).astype(int), y_pred_1, figsize=(10, 8))

In [None]:
# reading test data
X_test = pd.read_csv(data_dir / "test.csv", index_col='id')

## Applying the models with the best scores to the test data, then averaging their predictions.

In [None]:
# Shrink the test frame's memory footprint: 64-bit float and integer columns
# are handed to pd.to_numeric with the matching `downcast` option.
for col in X_test.select_dtypes(include="float64").columns:
    X_test[col] = pd.to_numeric(X_test[col], downcast="float")

for col in X_test.select_dtypes(include="int64").columns:
    X_test[col] = pd.to_numeric(X_test[col], downcast="integer")

In [None]:
X_test.shape

In [None]:
X_test.describe().T.style.background_gradient(cmap='Blues')

In [None]:
# Build the two test feature matrices the same way the training matrices were built.
# The training feature list is selected directly instead of re-binding `dropcols`:
# that keeps the training-time `dropcols` (which includes 'target') intact when
# cells are re-run, and guarantees the column order the scalers were fitted on.
Xt = X_test[features]
columns = Xt.columns

# The scalers were fitted on plain arrays, so a plain array is passed here too
# (avoids sklearn's feature-name mismatch warning).
Xt_arr = np.array(Xt)

test_matrices = []
for n_passes, fitted_scaler in ((n_scaler1, scaler1), (n_scaler2, scaler2)):
    frame = pd.DataFrame(recursive_scaler(Xt_arr, n_passes, fitted_scaler), columns=features)
    base = frame[features]  # row statistics only ever see the original features
    frame['sum'] = base.sum(axis=1)
    frame['mean'] = base.mean(axis=1)
    frame['std'] = base.std(axis=1)
    frame['max'] = base.max(axis=1)
    frame['min'] = base.min(axis=1)
    frame['kurt'] = base.kurtosis(axis=1)
    test_matrices.append(np.array(frame))

# Xt1: scaler1 features; Xt2: scaler2 features.
Xt1, Xt2 = test_matrices

In [None]:
y_test = predict_model(Xt1, Xt2, 'Xt1', modelapply2)

In [None]:
# Element-wise mean of the predict_proba outputs of all applied models
# (no hard-coded model count).
y_test_1 = np.mean(y_test, axis=0)

In [None]:
#y_test = predict_subset(Xt1, Xt2, 'Xt1', modelapply2)

In [None]:
#flatlist=[element.tolist() for sublist in y_test for element in sublist]

In [None]:
#y_test_1 = np.array(flatlist)

In [None]:
#y_pred1 = ctb1.predict_proba(Xt1)
#y_pred2 = ctb2.predict_proba(Xt1)
#y_pred3 = ctb3.predict_proba(Xt1)
#y_pred4 = ctb4.predict_proba(Xt1)
#y_pred5 = ctb5.predict_proba(Xt1)
#y_pred6 = ctb6.predict_proba(Xt1)
#y_pred7 = ctb7.predict_proba(Xt1)
#y_pred8 = ctb8.predict_proba(Xt1)
#y_pred9 = ctb9.predict_proba(Xt1)

In [None]:
#y_test_1 = np.mean([y_pred1, y_pred2, y_pred3, y_pred4, y_pred5, y_pred6, y_pred7, y_pred8, y_pred9], axis=0)

In [None]:
y_pred_test = pd.Series(
    y_test_1[:, 1],
    index=X_test.index,
    name='target',
)

In [None]:
y_pred_test

In [None]:
# Create submission file
y_pred_test.to_csv("submission.csv")