# 1. Imports and helper functions

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

# import tabpfn

# from autosklearn.classification import AutoSklearnClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, f1_score, classification_report, roc_curve, confusion_matrix, plot_roc_curve, precision_score, recall_score
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler, SMOTENC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV


import xgboost
import lightgbm
import catboost

# Cap DataFrame display at 30 rows.
pd.set_option('display.max_rows', 30)

import warnings
# Silence every warning for the rest of the session; this also hides
# deprecation and convergence notices raised by the libraries used below.
warnings.filterwarnings('ignore')

def convert_feature_to_label(df, col):
    """Overwrite column ``col`` of ``df`` with ``LabelEncoder`` integer codes.

    The frame is modified in place and the same object is returned.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(df[col])
    df[col] = codes
    return df

def convert_feature_to_one_hot(df, col):
    """Return ``df`` with ``col`` replaced by one-hot indicator columns.

    The column is label-encoded and then one-hot encoded. Each indicator
    column is named ``"<col>_<category>"`` and appended after the remaining
    columns of ``df``.

    Args:
        df: Input frame. It is left untouched; a new frame is returned.
        col: Name of the column to expand.

    Returns:
        A new DataFrame without ``col`` and with one float indicator column
        per category found in ``col``.
    """
    le = LabelEncoder()
    codes = le.fit_transform(df[col])

    oh = OneHotEncoder(handle_unknown='ignore')
    indicators = oh.fit_transform(codes.reshape(-1, 1)).toarray()

    # Name the columns and reuse df's index up front: joining a frame with a
    # fresh RangeIndex would misalign rows whenever df is not indexed 0..n-1,
    # and temporary integer column names could clash with existing columns.
    names = [str(col) + '_' + str(name) for name in le.classes_]
    df_oh = pd.DataFrame(indicators, columns=names, index=df.index)

    return df.drop(columns=col).join(df_oh)

def scale_feature(col, method='std'):
    """Rescale a numeric array or Series.

    Args:
        col: Values to rescale.
        method: ``'std'`` subtracts ``np.mean`` and divides by ``np.std``;
            ``'minmax'`` subtracts the minimum and divides by the range.

    Returns:
        The rescaled values. A constant input maps to all zeros instead of
        the NaN a division by zero would produce.

    Raises:
        ValueError: If ``method`` is neither ``'std'`` nor ``'minmax'``.
    """
    if method == 'std':
        offset = np.mean(col)
        spread = np.std(col)
    elif method == 'minmax':
        offset = np.min(col)
        spread = np.max(col) - offset
    else:
        raise ValueError(f"Unknown scaling method {method!r}; expected 'std' or 'minmax'")

    # Zero spread means every value equals the offset; divide by 1 so the
    # result is 0 rather than NaN. Only applies to scalar spreads.
    if np.ndim(spread) == 0 and spread == 0:
        spread = 1

    return (col - offset) / spread


def calc_metrics(y_true, y_pred, metrics=()):
    """Evaluate several sklearn-style metrics on class-probability predictions.

    Args:
        y_true: True class labels.
        y_pred: 2-D array of per-class probabilities, one column per class
            (e.g. the output of ``predict_proba``). Hard labels are taken as
            the column index of the largest probability, so this assumes the
            classes are encoded 0..k-1 in column order.
        metrics: Iterable of callables with signature ``metric(y_true, y_hat)``.

    Returns:
        A list with one score per entry of ``metrics``, in the same order.
    """
    y_pred = np.asarray(y_pred)
    hard_labels = y_pred.argmax(1)

    out = []
    for met in metrics:
        if met is roc_auc_score:
            if y_pred.shape[1] == 2:
                # ROC AUC needs a score that ranks samples by how likely they
                # are to be positive. The row-wise maximum is only the
                # confidence of whichever class won, so use column 1.
                out.append(met(y_true, y_pred[:, 1]))
            else:
                out.append(met(y_true, y_pred, multi_class='ovr'))
        else:
            out.append(met(y_true, hard_labels))

    return out


def calc_class_weights(df, target='sga', type='log'):
    """Compute per-class weights that favour the rarer classes.

    Args:
        df: Frame containing the target column.
        target: Name of the label column.
        type: ``'log'`` weights each class by ``1 / ln(1 + count)``;
            ``'normal'`` weights each class by ``1 - count / len(df)``.
            Any other value disables weighting.

    Returns:
        An array of weights ordered by sorted class label and normalised to
        sum to the number of classes, or ``None`` for an unrecognised ``type``.
    """
    if type not in ('log', 'normal'):
        return None

    counts = df[target].value_counts().sort_index().values

    if type == 'log':
        # 1 / ln(1 + x)
        raw = 1. / np.log1p(counts)
    else:
        raw = 1 - counts / df.shape[0]

    # Normalise and scale by the actual number of classes. This used to be a
    # hard-coded 2, which is the same thing for a binary target.
    return raw / raw.sum() * len(counts)

# def ohe(df):
#     # OHE for categorical data
#     if set(('presentation', 'placenta_site')).issubset(df.columns):
#         for col in ['presentation', 'placenta_site']:
#             df = convert_feature_to_one_hot(df, col)
#     for col in ['cord', 'hypertension', 'diabetes']:
#         df = convert_feature_to_one_hot(df, col)
#     for col in ['gender', 'smoking']:
#         df = convert_feature_to_label(df, col)

#     # OHE for ordinal data
#     df = df.replace({'oligohydramnios' : 0, 'normal' : 1, 'polyhydramnios' : 2})
#     return df
def ohe(df):
    """One-hot encode the known categorical columns and integer-code level names.

    Columns from the fixed list below are expanded with
    ``convert_feature_to_one_hot`` when they exist in ``df``. Afterwards the
    level names in ``level_codes`` are replaced by integers wherever they occur
    in the frame (the original comment attributes them to the 'af' column).
    """
    nominal_cols = ('presentation', 'placenta_site', 'cord', 'hypertension', 'diabetes', 'gender', 'smoking')
    for name in nominal_cols:
        if name not in df.columns:
            continue
        df = convert_feature_to_one_hot(df, name)

    level_codes = {
        'oligohydramnios': 0,
        'normal': 1,
        'polyhydramnios': 2,
        'increased': 3,
        'reduced': 4,
        'anhydramnios': 5,
    }
    return df.replace(level_codes)


def data_impute(df):
    """Fill missing values with ``IterativeImputer`` (``random_state=123``).

    Returns a new DataFrame that carries the original column names and a
    fresh default index.
    """
    filled = IterativeImputer(random_state = 123).fit_transform(df)
    return pd.DataFrame(data = filled, columns = df.columns)


def remove_multicollinearity(df, thresh = 10):
    """Drop columns one at a time until no variance inflation factor reaches ``thresh``.

    Columns whose initial VIF is not finite are excluded from the analysis and
    always kept. Among the remaining columns, the one with the largest VIF is
    dropped and the VIFs are recomputed, until the largest VIF is below
    ``thresh`` or only one candidate is left.

    Args:
        df: Numeric frame. Columns are dropped from it in place.
        thresh: A column is dropped while its VIF is ``>= thresh``.

    Returns:
        The same ``df`` object with the offending columns removed.
    """
    def _vifs(frame):
        # One VIF per column of `frame`, in column order.
        values = frame.values
        return np.array(
            [variance_inflation_factor(values, i) for i in range(values.shape[1])],
            dtype=float,
        )

    candidates = [name for name, vif in zip(df.columns, _vifs(df)) if np.isfinite(vif)]

    while len(candidates) > 1:
        vifs = _vifs(df[candidates])
        worst = int(np.argmax(vifs))
        # Written as "not >=" so that a NaN maximum ends the loop instead of
        # triggering an endless series of drops.
        if not vifs[worst] >= thresh:
            break
        df.drop(columns=candidates.pop(worst), inplace=True)

    return df


def ignore_low_variance(df, thresh = 0.1, label = 'sga'):
    """Drop columns whose variance does not exceed ``thresh``, always keeping the label.

    Args:
        df: Numeric frame including the label column.
        thresh: Threshold handed to ``VarianceThreshold``.
        label: Name of the target column. If the selector would drop it, it is
            re-attached as the last column.

    Returns:
        A new DataFrame with the selected columns plus ``label``.
    """
    selector = VarianceThreshold(threshold = thresh)
    # Only the support mask is needed, so fit without building a transformed copy.
    selector.fit(df)

    # Explicit copy so that re-attaching the label does not write into a slice of df.
    kept = df.loc[:, selector.get_support()].copy()
    kept[label] = df[label]
    return kept


def _scale_transform_project(df, continuous_col, num_comp):
    """Standardise, Yeo-Johnson transform and PCA-project the continuous columns.

    Shared implementation behind ``pca`` and ``pca_bmi``.
    """
    # Work only with the columns that actually exist, so a missing measurement
    # no longer raises KeyError after the loop has already skipped it.
    present = [col for col in continuous_col if col in df.columns]

    for col in present:
        try:
            df[col] = scale_feature(df[col], method='std')
            transformer = PowerTransformer(method="yeo-johnson", standardize=False, copy=True)
            # scikit-learn transformers need 2-D input. Passing the 1-D Series
            # raised, and the error was swallowed by a bare except, so the
            # power transform was silently never applied.
            df[col] = transformer.fit_transform(df[[col]]).ravel()
        except (TypeError, ValueError):
            # Best effort, as before: leave a column that cannot be processed as it is.
            continue

    other_cols = df.drop(columns=present)

    reducer = PCA(n_components = num_comp)
    # Keep df's index so the concat below lines rows up for any index.
    components = pd.DataFrame(reducer.fit_transform(df[present]), index=df.index)
    # Component i is labelled with the name of the i-th continuous column.
    # This is a naming convention only; component i is not that feature.
    components = components.rename(columns=dict(enumerate(present)))

    return pd.concat([other_cols, components], axis = 1)


def pca(df, num_comp):
    """Preprocess the continuous measurements and replace them with PCA components.

    Uses the column set that includes ``mother_height`` and ``mother_weight``.

    Args:
        df: Input frame. Its continuous columns are rescaled in place.
        num_comp: Passed to ``PCA(n_components=...)``.

    Returns:
        A new DataFrame: the non-continuous columns followed by the components.
    """
    continuous_col = ['ac', 'bpd', 'cm', 'efw_centile', 'efw', 'fl', 'ga', 'hc', 'hl', 'tcd',
                      'mother_age_at_start_date', 'mother_height', 'mother_weight']
    return _scale_transform_project(df, continuous_col, num_comp)


def pca_bmi(df, num_comp):
    """Same as ``pca`` but with ``bmi`` in place of ``mother_height``/``mother_weight``.

    Args:
        df: Input frame. Its continuous columns are rescaled in place.
        num_comp: Passed to ``PCA(n_components=...)``.

    Returns:
        A new DataFrame: the non-continuous columns followed by the components.
    """
    continuous_col = ['ac', 'bpd', 'cm', 'efw_centile', 'efw', 'fl', 'ga', 'hc', 'hl', 'tcd',
                      'mother_age_at_start_date', 'bmi']
    return _scale_transform_project(df, continuous_col, num_comp)

## method 1
def ToNormal(df, type = 'MinMax'):
    """Rescale the continuous columns and move them to the front of the frame.

    The column set depends on whether ``mother_height`` is present; otherwise
    ``bmi`` is expected instead of ``mother_height``/``mother_weight``.

    Args:
        df: Input frame. The continuous columns are overwritten in place.
        type: ``'MinMax'`` for ``MinMaxScaler`` or ``'ZScore'`` for ``StandardScaler``.

    Returns:
        A new DataFrame with the scaled continuous columns first, followed by
        all remaining columns.

    Raises:
        ValueError: If ``type`` is not one of the two supported names.
    """
    if type == 'MinMax':
        scaler = MinMaxScaler()
    elif type == 'ZScore':
        scaler = StandardScaler()
    else:
        # Previously this fell through and failed later on an unbound `scaler`.
        raise ValueError(f"Unknown normalisation type {type!r}; expected 'MinMax' or 'ZScore'")

    if 'mother_height' in df.columns:
        continuous_col = ['ac', 'bpd', 'efw_centile', 'efw', 'fl', 'ga', 'hc', 'mother_age_at_start_date', 'mother_height', 'mother_weight']
    else:
        continuous_col = ['ac', 'bpd', 'efw_centile', 'efw', 'fl', 'ga', 'hc', 'mother_age_at_start_date', 'bmi']

    df[continuous_col] = scaler.fit_transform(df[continuous_col])
    other_cols = df.drop(columns=continuous_col)
    return df[continuous_col].join(other_cols)

## method 2
# def ToNormal(df):
    # MINMAX normalization for numerical data

    # continuous_col = ['ac', 'bpd', 'efw_centile', 'efw', 'fl', 'ga', 'hc', 'mother_age_at_start_date', 'mother_height', 'mother_weight']
    # for col in continuous_col:
    #     try:
    #         df[col] = scale_feature(df[col], method='minmax')
    #         # df[col] = PowerTransformer(method="yeo-johnson", standardize=False, copy=True).fit_transform(df[col])
    #     except:
    #         continue
    # df_cat_label = df.drop(continuous_col, axis = 1)
    # return df_cat_label.join(df[continuous_col])


def sgkf(df, label = 'sga'):
    """Assign every row to one of five stratified, group-aware folds.

    Rows sharing an ``id`` always land in the same fold
    (``StratifiedGroupKFold`` with shuffling and ``random_state=123``).

    Args:
        df: Frame with the ``label`` column and an ``id`` grouping column.
            A ``fold`` column is added to it in place.
        label: Name of the target column used for stratification.

    Returns:
        A copy of ``df`` with the new ``fold`` column (values 0-4) and without ``id``.
    """
    splitter = StratifiedGroupKFold(shuffle=True, random_state=123, n_splits = 5)

    # split() yields positional indices. Collect them in a plain array rather
    # than writing through label-based .loc, which is only correct when df
    # happens to have a default 0..n-1 index.
    folds = np.full(len(df), -1)
    for fold_num, (_, val_idx) in enumerate(splitter.split(df, df[label], groups=df['id'])):
        folds[val_idx] = fold_num
    df['fold'] = folds

    return df.drop('id', axis=1)


# 2. Models
- LR
- LightLGM
- SVCrbf
- SVClr
- Catboost
- Gaussian
- lgbm
- xgboost
- random forest

## LR

In [None]:
def LR(df, ctrl, label = 'sga'):
    """Cross-validate a class-balanced logistic regression and print the fold statistics.

    For each of the five values in the ``fold`` column the remaining rows are
    randomly oversampled, a one-candidate grid search is run, the model is
    refitted with the selected parameters and scored on the held-out fold
    through ``calc_metrics``. Mean and standard deviation of each metric are
    printed; nothing is returned.

    Args:
        df: Feature frame with the ``label`` and ``fold`` columns.
        ctrl: Not used by this function.
        label: Name of the target column.
    """
    train_cols = df.columns.drop([label, 'fold'])
    param_grid = {
        'class_weight' : ['balanced']
        }
    scorers = [balanced_accuracy_score, roc_auc_score, f1_score, precision_score, recall_score]

    oof_acc, oof_roc_auc, oof_f1, oof_prec, oof_rec = [], [], [], [], []
    lst = pd.DataFrame()

    for fold in range(5):
        train_df = df.loc[df['fold'] != fold].reset_index(drop=True)
        val_df = df.loc[df['fold'] == fold].reset_index(drop=True)

        sampler = RandomOverSampler(random_state = 123)
        train_X, train_Y = sampler.fit_resample(train_df[train_cols], train_df[label])

        base = LogisticRegression(class_weight='balanced')
        search = GridSearchCV(base, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
        search.fit(train_X, train_Y)

        lr = LogisticRegression(**search.best_params_)
        lr.fit(train_X, train_Y)

        val_X = val_df[train_cols]
        out = lr.predict_proba(val_X)
        out2 = lr.predict(val_X)

        fold_scores = calc_metrics(val_df[label], out, metrics=scorers)
        for bucket, score in zip((oof_acc, oof_roc_auc, oof_f1, oof_prec, oof_rec), fold_scores):
            bucket.append(score)

        # Held-out rows with their class probabilities and predicted label.
        fold_frame = pd.concat([val_df, pd.DataFrame(out), pd.DataFrame(out2)], axis = 1)
        lst = pd.concat([lst, fold_frame], axis = 0)

    print()
    print(lr, lr.get_params())
    print(f'OOF Prec Score: {np.mean(oof_prec):.4f} (±{np.std(oof_prec):.4f})')
    print(f'OOF Recall Score: {np.mean(oof_rec):.4f} (±{np.std(oof_rec):.4f})')
    print(f'OOF F1 Score: {np.mean(oof_f1):.4f} (±{np.std(oof_f1):.4f})')
    print(f'OOF Balanced Accuracy: {np.mean(oof_acc):.4f} (±{np.std(oof_acc):.4f})')
    print(f'OOF ROC AUC Score: {np.mean(oof_roc_auc):.4f} (±{np.std(oof_roc_auc):.4f})')

    return




## LightLGM

In [None]:
def LightLGM(df, ctrl, label = 'sga'):
    """Cross-validate a LightGBM classifier and print the fold statistics.

    For each of the five values in the ``fold`` column the remaining rows are
    randomly oversampled, a grid search is run with per-sample class weights,
    the model is refitted with the selected parameters and scored on the
    held-out fold through ``calc_metrics``. Mean and standard deviation of
    each metric are printed; nothing is returned.

    Args:
        df: Feature frame with the ``label`` and ``fold`` columns.
        ctrl: Not used by this function.
        label: Name of the target column.
    """
    lst = pd.DataFrame()
    train_cols = df.columns.drop([label, 'fold'])

    oof_acc = []
    oof_roc_auc = []
    oof_f1 = []
    oof_prec = []
    oof_rec = []

    class_weights = calc_class_weights(df, target= label, type='normal')

    param_grid = {
        'boosting_type': ['gbdt'],
        'class_weight': ['balanced'],
        'learning_rate': [0.4],
        'max_depth': [-1],
        'min_child_samples': [13],
        'min_child_weight': [0.001],
        'min_split_gain': [0.0],
        'n_estimators': [100],
        'n_jobs': [-1],
        'num_leaves': [27],
        'random_state': [123],
        'reg_alpha': [0.0],
        'reg_lambda': [0.0],
        'silent': [True],
        'subsample': [1.],
    }

    for fold in range(5):
        train_df = df[df['fold'] != fold].reset_index(drop=True)
        val_df = df[df['fold'] == fold].reset_index(drop=True)

        ros = RandomOverSampler(random_state = 123)
        train_X, train_Y = ros.fit_resample(train_df[train_cols], train_df[label])
        sample_weights = [class_weights[int(x)] for x in train_Y]

        # No fit before the search: GridSearchCV clones the estimator, so
        # fitting it beforehand only cost time.
        grid_search = GridSearchCV(lightgbm.LGBMClassifier(), param_grid=param_grid, cv=5)
        grid_search.fit(train_X, train_Y, sample_weight=sample_weights)
        best_params = grid_search.best_params_

        # Refit with the parameters the search selected. A hard-coded
        # estimator used to overwrite this one, discarding the search result;
        # its settings matched the single grid candidate above.
        lr = lightgbm.LGBMClassifier(**best_params)
        lr.fit(train_X, train_Y, sample_weight=sample_weights, verbose=100)

        out = lr.predict_proba(val_df[train_cols])
        out2 = lr.predict(val_df[train_cols])
        acc, roc_auc, f1, prec, rec = calc_metrics(val_df[label], out, metrics=[balanced_accuracy_score, roc_auc_score, f1_score, precision_score, recall_score])

        oof_acc.append(acc)
        oof_roc_auc.append(roc_auc)
        oof_f1.append(f1)
        oof_prec.append(prec)
        oof_rec.append(rec)

        # Held-out rows with their class probabilities and predicted label.
        lst = pd.concat([lst, pd.concat([val_df, pd.DataFrame(out), pd.DataFrame(out2)], axis = 1)], axis = 0)

    print(lr, lr.get_params())
    print(f'OOF Prec Score: {np.mean(oof_prec):.4f} (±{np.std(oof_prec):.4f})')
    print(f'OOF Recall Score: {np.mean(oof_rec):.4f} (±{np.std(oof_rec):.4f})')
    print(f'OOF F1 Score: {np.mean(oof_f1):.4f} (±{np.std(oof_f1):.4f})')
    print(f'OOF Balanced Accuracy: {np.mean(oof_acc):.4f} (±{np.std(oof_acc):.4f})')
    print(f'OOF ROC AUC Score: {np.mean(oof_roc_auc):.4f} (±{np.std(oof_roc_auc):.4f})')
    # lst.to_csv('../data/tri3_predict_proba.csv', index_label=None, index=None)
    return




## SVC rbf

In [None]:

def SVCrbf(df, ctrl, label = 'sga'):
    """Cross-validate an RBF-kernel SVC and print the fold statistics.

    For each of the five values in the ``fold`` column the remaining rows are
    randomly oversampled, a one-candidate grid search is run, the model is
    refitted with the selected parameters and scored on the held-out fold
    through ``calc_metrics``. Mean and standard deviation of each metric are
    printed; nothing is returned.

    Args:
        df: Feature frame with the ``label`` and ``fold`` columns.
        ctrl: Not used by this function.
        label: Name of the target column.
    """
    train_cols = df.columns.drop([label, 'fold'])
    lst = pd.DataFrame()
    oof_acc, oof_roc_auc, oof_f1, oof_prec, oof_rec = [], [], [], [], []

    # Not referenced below: the grid pins ``degree`` to 1.
    degrees = np.arange(1, 11, 3)

    param_grid = {
        'kernel': ['rbf'],
        'C' : [1],
        'gamma' : ['scale'],
        'probability' : [True],
        'random_state': [123],
        'degree' : [1]
        }
    scorers = [balanced_accuracy_score, roc_auc_score, f1_score, precision_score, recall_score]

    for fold in range(5):
        train_df = df.loc[df['fold'] != fold].reset_index(drop=True)
        val_df = df.loc[df['fold'] == fold].reset_index(drop=True)

        sampler = RandomOverSampler(random_state = 123)
        train_X, train_Y = sampler.fit_resample(train_df[train_cols], train_df[label])

        base = SVC(probability = True, kernel='rbf', random_state = 123)
        search = GridSearchCV(base, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
        search.fit(train_X, train_Y)

        lr = SVC(**search.best_params_)
        lr.fit(train_X, train_Y)

        val_X = val_df[train_cols]
        out = lr.predict_proba(val_X)
        out2 = lr.predict(val_X)

        fold_scores = calc_metrics(val_df[label], out, metrics=scorers)
        for bucket, score in zip((oof_acc, oof_roc_auc, oof_f1, oof_prec, oof_rec), fold_scores):
            bucket.append(score)

        # Held-out rows with their class probabilities and predicted label.
        fold_frame = pd.concat([val_df, pd.DataFrame(out), pd.DataFrame(out2)], axis = 1)
        lst = pd.concat([lst, fold_frame], axis = 0)

    print(lr, lr.get_params())
    print(f'OOF Prec Score: {np.mean(oof_prec):.4f} (±{np.std(oof_prec):.4f})')
    print(f'OOF Recall Score: {np.mean(oof_rec):.4f} (±{np.std(oof_rec):.4f})')
    print(f'OOF F1 Score: {np.mean(oof_f1):.4f} (±{np.std(oof_f1):.4f})')
    print(f'OOF Balanced Accuracy: {np.mean(oof_acc):.4f} (±{np.std(oof_acc):.4f})')
    print(f'OOF ROC AUC Score: {np.mean(oof_roc_auc):.4f} (±{np.std(oof_roc_auc):.4f})')

    return



## SVC lr

In [None]:
def SVClr(df, ctrl, label = 'sga'):
    """Cross-validate a linear-kernel SVC and print the fold statistics.

    For each of the five values in the ``fold`` column the remaining rows are
    randomly oversampled, a one-candidate grid search is run, the model is
    refitted with the selected parameters and scored on the held-out fold
    through ``calc_metrics``. Mean and standard deviation of each metric are
    printed; nothing is returned.

    Args:
        df: Feature frame with the ``label`` and ``fold`` columns.
        ctrl: Not used by this function.
        label: Name of the target column.
    """
    train_cols = df.columns.drop([label, 'fold'])
    lst = pd.DataFrame()
    oof_acc, oof_roc_auc, oof_f1, oof_prec, oof_rec = [], [], [], [], []

    param_grid = {
        'kernel': ['linear'],
        'C' : [1],
        'gamma' : ['scale'],
        'probability' : [True],
        'random_state': [123],
        'degree' : [3]
        }
    scorers = [balanced_accuracy_score, roc_auc_score, f1_score, precision_score, recall_score]

    for fold in range(5):
        train_df = df.loc[df['fold'] != fold].reset_index(drop=True)
        val_df = df.loc[df['fold'] == fold].reset_index(drop=True)

        sampler = RandomOverSampler(random_state = 123)
        train_X, train_Y = sampler.fit_resample(train_df[train_cols], train_df[label])

        base = SVC(probability = True, kernel='linear', random_state = 123, C = 1)
        search = GridSearchCV(base, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
        search.fit(train_X, train_Y)

        lr = SVC(**search.best_params_)
        lr.fit(train_X, train_Y)

        val_X = val_df[train_cols]
        out = lr.predict_proba(val_X)
        out2 = lr.predict(val_X)

        fold_scores = calc_metrics(val_df[label], out, metrics=scorers)
        for bucket, score in zip((oof_acc, oof_roc_auc, oof_f1, oof_prec, oof_rec), fold_scores):
            bucket.append(score)

        # Held-out rows with their class probabilities and predicted label.
        fold_frame = pd.concat([val_df, pd.DataFrame(out), pd.DataFrame(out2)], axis = 1)
        lst = pd.concat([lst, fold_frame], axis = 0)

    print(lr, lr.get_params())
    print(f'OOF Prec Score: {np.mean(oof_prec):.4f} (±{np.std(oof_prec):.4f})')
    print(f'OOF Recall Score: {np.mean(oof_rec):.4f} (±{np.std(oof_rec):.4f})')
    print(f'OOF F1 Score: {np.mean(oof_f1):.4f} (±{np.std(oof_f1):.4f})')
    print(f'OOF Balanced Accuracy: {np.mean(oof_acc):.4f} (±{np.std(oof_acc):.4f})')
    print(f'OOF ROC AUC Score: {np.mean(oof_roc_auc):.4f} (±{np.std(oof_roc_auc):.4f})')

    return

## Catboost

In [None]:
def Catboost(df, ctrl, label='sga'):
    """Cross-validate a CatBoost classifier and print the fold statistics.

    For each of the five values in the ``fold`` column the remaining rows are
    randomly oversampled, a grid search is run with per-sample class weights,
    the model is refitted with the selected parameters and scored on the
    held-out fold through ``calc_metrics``. The probability threshold that
    maximises balanced accuracy on the (oversampled) training rows is also
    recorded per fold. The last fold's parameters and the mean and standard
    deviation of each quantity are printed; nothing is returned.

    Args:
        df: Feature frame with the ``label`` and ``fold`` columns.
        ctrl: Not used by this function.
        label: Name of the target column.
    """
    train_cols = df.columns.drop([label, 'fold'])

    oof_acc = []
    oof_roc_auc = []
    oof_f1 = []
    oof_prec = []
    oof_rec = []
    oof_thresh = []

    class_weights = calc_class_weights(df, target=label, type='normal')

    # One value per key, so the "search" evaluates a single configuration.
    param_grid = {
        'iterations': [300],
        'learning_rate': [0.01],
        'depth': [6],
        'l2_leaf_reg': [1],
        'random_strength': [0.2],
        'border_count': [32],
        'subsample': [0.1],

        'bootstrap_type': ['Bernoulli'],
        'boosting_type': ['Ordered'],          # alternative: 'Plain'
        'feature_border_type': ['GreedyLogSum'],
        'fold_permutation_block': [1],
        'grow_policy': ['SymmetricTree'],      # alternatives: 'Lossguide', 'Depthwise'
        'leaf_estimation_backtracking': ['AnyImprovement'],
        'leaf_estimation_iterations': [1],
        'leaf_estimation_method': ['Newton'],  # alternative: 'Gradient'
        'nan_mode': ['Min'],                   # alternatives: 'Max', 'Forbidden'
        'sampling_frequency': ['PerTreeLevel'],  # alternative: 'PerTree'
        'use_best_model': [True],

        # Nested list: GridSearchCV treats each element of the outer list as a
        # separate candidate. The flat list was searched as three candidates
        # that differ only in which extra metric is reported.
        'custom_metric': [['AUC', 'BalancedAccuracy', 'F1']],

        'loss_function': ['CrossEntropy'],
    }
    scorers = [
        balanced_accuracy_score,
        roc_auc_score,
        f1_score,
        precision_score,
        recall_score,
    ]

    for fold in range(5):
        train_df = df[df['fold'] != fold].reset_index(drop=True)
        val_df = df[df['fold'] == fold].reset_index(drop=True)

        ros = RandomOverSampler(random_state=123)
        train_X, train_Y = ros.fit_resample(train_df[train_cols], train_df[label])
        sample_weights = [class_weights[int(x)] for x in train_Y]

        # NOTE(review): eval_set is the training data itself, so
        # use_best_model selects the iteration by training loss. Confirm this
        # is intended.
        grid_search = GridSearchCV(CatBoostClassifier(), param_grid=param_grid, cv=5)
        grid_search.fit(train_X, train_Y, eval_set=(train_X, train_Y), sample_weight=sample_weights, use_best_model=True, verbose=0)
        best_params = grid_search.best_params_

        lr = CatBoostClassifier(**best_params)
        lr.fit(train_X, train_Y, eval_set=(train_X, train_Y), sample_weight=sample_weights, verbose=0)

        out = lr.predict_proba(val_df[train_cols])
        acc, roc_auc, f1, prec, rec = calc_metrics(val_df[label], out, metrics=scorers)

        # Predict once and reuse the scores for every candidate threshold;
        # calling predict_proba inside the loop repeated the same work for
        # each unique probability.
        train_proba = lr.predict_proba(train_X)[:, 1]
        candidates = np.unique(train_proba)
        accuracy = [
            balanced_accuracy_score(train_Y, (train_proba >= p).astype(int))
            for p in candidates
        ]
        thresh = candidates[int(np.argmax(accuracy))]

        oof_acc.append(acc)
        oof_roc_auc.append(roc_auc)
        oof_f1.append(f1)
        oof_prec.append(prec)
        oof_rec.append(rec)
        oof_thresh.append(thresh)

    print(best_params)
    print(f'OOF Threshold: {np.mean(oof_thresh):.4f} (±{np.std(oof_thresh):.4f})')
    print(f'OOF Prec Score: {np.mean(oof_prec):.4f} (±{np.std(oof_prec):.4f})')
    print(f'OOF Recall Score: {np.mean(oof_rec):.4f} (±{np.std(oof_rec):.4f})')
    print(f'OOF F1 Score: {np.mean(oof_f1):.4f} (±{np.std(oof_f1):.4f})')
    print(f'OOF Balanced Accuracy: {np.mean(oof_acc):.4f} (±{np.std(oof_acc):.4f})')
    print(f'OOF ROC AUC Score: {np.mean(oof_roc_auc):.4f} (±{np.std(oof_roc_auc):.4f})')
    return

## Gaussian

In [None]:
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic
def Gaussian(df, ctrl, label = 'sga'):
    """Cross-validate a Gaussian-process classifier and print the fold statistics.

    For each of the five values in the ``fold`` column the remaining rows are
    randomly oversampled, a one-candidate grid search is run, the model is
    refitted with the selected parameters and scored on the held-out fold
    through ``calc_metrics``. The feature names, the final model and the mean
    and standard deviation of each metric are printed; nothing is returned.

    Args:
        df: Feature frame with the ``label`` and ``fold`` columns.
        ctrl: Not used by this function.
        label: Name of the target column.
    """
    lst = pd.DataFrame()
    train_cols = df.columns.drop([label, 'fold'])

    oof_acc = []
    oof_roc_auc = []
    oof_f1 = []
    oof_prec = []
    oof_rec = []

    # Class/sample weights and tuning ranges used to be computed here but were
    # never passed to the estimator, so they are gone; balancing relies on the
    # oversampler alone.
    param_grid = {
        'kernel': [RBF()],
        'optimizer': ['fmin_l_bfgs_b'],
        'n_restarts_optimizer': [1],
        'random_state': [123]
        }

    for fold in range(5):
        train_df = df[df['fold'] != fold].reset_index(drop=True)
        val_df = df[df['fold'] == fold].reset_index(drop=True)

        ros = RandomOverSampler(random_state = 123)
        train_X, train_Y = ros.fit_resample(train_df[train_cols], train_df[label])

        grid_search = GridSearchCV(GaussianProcessClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
        grid_search.fit(train_X, train_Y)
        best_params = grid_search.best_params_

        lr = GaussianProcessClassifier(**best_params)
        lr.fit(train_X, train_Y)

        out = lr.predict_proba(val_df[train_cols])
        out2 = lr.predict(val_df[train_cols])
        acc, roc_auc, f1, prec, rec = calc_metrics(val_df[label], out, metrics=[balanced_accuracy_score, roc_auc_score, f1_score, precision_score, recall_score])

        oof_acc.append(acc)
        oof_roc_auc.append(roc_auc)
        oof_f1.append(f1)
        oof_prec.append(prec)
        oof_rec.append(rec)

        # Held-out rows with their class probabilities and predicted label.
        lst = pd.concat([lst, pd.concat([val_df, pd.DataFrame(out), pd.DataFrame(out2)], axis = 1)], axis = 0)

    print(train_cols)
    print(lr, lr.get_params())
    print(f'OOF Prec Score: {np.mean(oof_prec):.4f} (±{np.std(oof_prec):.4f})')
    print(f'OOF Recall Score: {np.mean(oof_rec):.4f} (±{np.std(oof_rec):.4f})')
    print(f'OOF F1 Score: {np.mean(oof_f1):.4f} (±{np.std(oof_f1):.4f})')
    print(f'OOF Balanced Accuracy: {np.mean(oof_acc):.4f} (±{np.std(oof_acc):.4f})')
    print(f'OOF ROC AUC Score: {np.mean(oof_roc_auc):.4f} (±{np.std(oof_roc_auc):.4f})')

    return

## lgbm

In [None]:
def lgbm(df, ctrl, label = 'sga'):
    """Cross-validate a LightGBM classifier and print the fold statistics.

    For each of the five values in the ``fold`` column the remaining rows are
    randomly oversampled, a one-candidate grid search is run with per-sample
    class weights, the model is refitted with the selected parameters and
    scored on the held-out fold through ``calc_metrics``. The feature names,
    the final model and the mean and standard deviation of each metric are
    printed; nothing is returned.

    Args:
        df: Feature frame with the ``label`` and ``fold`` columns.
        ctrl: Not used by this function.
        label: Name of the target column.
    """
    train_cols = df.columns.drop([label, 'fold'])
    lst = pd.DataFrame()
    oof_acc, oof_roc_auc, oof_f1, oof_prec, oof_rec = [], [], [], [], []

    class_weights = calc_class_weights(df, target= label, type='normal')
    # Neither range is referenced below.
    reg_range = np.arange(0.1, 1.1, 0.1)
    rate = np.arange(0, 1, 0.01)

    param_grid = {
        'boosting_type': ['gbdt'],
        'class_weight': ['balanced'],
        'learning_rate': [0.4],
        'max_depth': [-1],
        'min_child_samples': [13],
        'min_child_weight': [0.001],
        'min_split_gain': [0.0],
        'n_estimators': [100],
        'n_jobs': [-1],
        'num_leaves': [27],
        'random_state': [123],
        'reg_alpha': [0.0],
        'reg_lambda': [0.0],
        'silent': [True],
        'subsample': [1.],
    }
    scorers = [balanced_accuracy_score, roc_auc_score, f1_score, precision_score, recall_score]

    for fold in range(5):
        train_df = df.loc[df['fold'] != fold].reset_index(drop=True)
        val_df = df.loc[df['fold'] == fold].reset_index(drop=True)

        sampler = RandomOverSampler(random_state = 123)
        train_X, train_Y = sampler.fit_resample(train_df[train_cols], train_df[label])
        sample_weights = [class_weights[int(y)] for y in train_Y]

        search = GridSearchCV(lightgbm.LGBMClassifier(), param_grid=param_grid, cv=5)
        search.fit(train_X, train_Y, sample_weight=sample_weights)

        lr = lightgbm.LGBMClassifier(**search.best_params_)
        lr.fit(train_X, train_Y, sample_weight=sample_weights, verbose=100)

        val_X = val_df[train_cols]
        out = lr.predict_proba(val_X)
        out2 = lr.predict(val_X)

        fold_scores = calc_metrics(val_df[label], out, metrics=scorers)
        for bucket, score in zip((oof_acc, oof_roc_auc, oof_f1, oof_prec, oof_rec), fold_scores):
            bucket.append(score)

        # Held-out rows with their class probabilities and predicted label.
        fold_frame = pd.concat([val_df, pd.DataFrame(out), pd.DataFrame(out2)], axis = 1)
        lst = pd.concat([lst, fold_frame], axis = 0)

    print(train_cols)
    print(lr, lr.get_params())
    print(f'OOF Prec Score: {np.mean(oof_prec):.4f} (±{np.std(oof_prec):.4f})')
    print(f'OOF Recall Score: {np.mean(oof_rec):.4f} (±{np.std(oof_rec):.4f})')
    print(f'OOF F1 Score: {np.mean(oof_f1):.4f} (±{np.std(oof_f1):.4f})')
    print(f'OOF Balanced Accuracy: {np.mean(oof_acc):.4f} (±{np.std(oof_acc):.4f})')
    print(f'OOF ROC AUC Score: {np.mean(oof_roc_auc):.4f} (±{np.std(oof_roc_auc):.4f})')
    # lst.to_csv('../data/tri3_predict_proba.csv', index_label=None, index=None)
    # return np.mean(oof_roc_auc)
    return

## xgboost

In [None]:
def xgboost(df, ctrl, label = 'sga'):
    """Cross-validate a default ``CatBoostClassifier`` over folds 0-4 and print the scores.

    NOTE: despite its name, this function fits CatBoost, not XGBoost. Defining it
    also rebinds the module-level name ``xgboost`` (imported at the top of the
    notebook) to this function.

    For each fold id in ``range(5)`` the rows whose ``fold`` value differs are
    rebalanced with ``RandomOverSampler``, the model is fitted on them, and the
    held-out rows are scored through ``calc_metrics``. The mean and standard
    deviation of every metric across the folds are printed, preceded by the last
    fold's training frame and the last fitted model with its parameters.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``label`` column and a ``fold`` column; every other
        column is used as a feature.
    ctrl
        Not used by this function; kept so existing calls keep working.
    label : str
        Name of the target column.

    Returns
    -------
    None
    """
    train_cols = df.columns.drop([label, 'fold'])

    oof_acc = []
    oof_roc_auc = []
    oof_f1 = []
    oof_prec = []
    oof_rec = []

    # Only the disabled sample-weight line inside the loop needs this value; the
    # call is kept so the function does exactly what it did before.
    class_weights = calc_class_weights(df, target= label, type='normal')

    # calc_metrics is unpacked below in this same order.
    metrics = [balanced_accuracy_score, roc_auc_score, f1_score, precision_score, recall_score]

    # Explicit CatBoost settings tried earlier (not applied right now); pass them
    # as keyword arguments to CatBoostClassifier to restore that configuration:
    #   boosting_type='Ordered', bootstrap_type='Bernoulli', border_count=32, custom_metric='AUC',
    #   depth=6, feature_border_type='GreedyLogSum', fold_permutation_block=1,
    #   grow_policy='SymmetricTree', iterations=100, l2_leaf_reg=1,
    #   leaf_estimation_backtracking='AnyImprovement', leaf_estimation_iterations=1,
    #   leaf_estimation_method='Newton', learning_rate=0.01, loss_function='CrossEntropy',
    #   nan_mode='Min', random_strength=0.1, sampling_frequency='PerTreeLevel', subsample=0.1,
    #   use_best_model=True
    #
    # Other estimators tried in this slot:
    # lr = RandomForestClassifier(max_features = None, bootstrap = False, class_weight = 'balanced_subsample',
    # n_estimators = 1, max_depth = 2, min_impurity_decrease = 0.017, random_state = 123)
    # lr = RandomForestClassifier(random_state = 123)
    # lr = SVC(probability = True, kernel='rbf', random_state = 123) #, gamma='scale' C = 0.1, degree = 8,kernel = 'rbf' / 'poly', gamma = 'auto',
    # lr  = catboost.CatBoostClassifier(verbose=0,  boosting_type='Ordered', learning_rate=0.03,
    # loss_function='CrossEntropy', custom_metric=['BalancedAccuracy', 'AUC', 'F1'], od_pval=0, l2_leaf_reg=5, one_hot_max_size=10,
    # bootstrap_type='Bayesian', random_strength=0.3)#, train_dir='../src/catboost_train'
    # lr = lightgbm.LGBMClassifier(class_weight='balanced', learning_rate=0.4, min_child_samples=13, num_leaves=27, random_state=123)
    # lr = GaussianProcessClassifier(kernel='RBF', optimizer= 'fmin_l_bfgs_b', n_restarts_optimizer= 1)

    for fold in range(5):
        lr = catboost.CatBoostClassifier()
        ros = RandomOverSampler(random_state = 123)

        # Train on every other fold (rebalanced), validate on the current one.
        train_df = df[df['fold'] != fold].reset_index(drop=True)
        train_X, train_Y = ros.fit_resample(train_df[train_cols], train_df[label])
        val_df = df[df['fold'] == fold].reset_index(drop=True)
        # sample_weights = [class_weights[int(x)] for x in train_Y]

        lr.fit(train_X, train_Y)
        out = lr.predict_proba(val_df[train_cols])
        acc, roc_auc, f1, prec, rec = calc_metrics(val_df[label], out, metrics=metrics)

        oof_acc.append(acc)
        oof_roc_auc.append(roc_auc)
        oof_f1.append(f1)
        oof_prec.append(prec)
        oof_rec.append(rec)

    # train_df and lr are whatever the last fold left behind.
    print(train_df)
    print(lr, lr.get_params())
    print(f'OOF Prec Score: {np.mean(oof_prec):.4f} (±{np.std(oof_prec):.4f})')
    print(f'OOF Recall Score: {np.mean(oof_rec):.4f} (±{np.std(oof_rec):.4f})')
    print(f'OOF F1 Score: {np.mean(oof_f1):.4f} (±{np.std(oof_f1):.4f})')
    print(f'OOF Balanced Accuracy: {np.mean(oof_acc):.4f} (±{np.std(oof_acc):.4f})')
    print(f'OOF ROC AUC Score: {np.mean(oof_roc_auc):.4f} (±{np.std(oof_roc_auc):.4f})')
    # lst.to_csv('../data/tri3_predict_proba.csv', index_label=None, index=None)
    # return np.mean(oof_roc_auc)
    return



## random forest

In [None]:

## for tri2
lr = RandomForestClassifier(
    n_estimators=56,
    max_depth=3,
    criterion='entropy',
    min_samples_split=0.27,
    min_samples_leaf=8,
    bootstrap=False,
    ccp_alpha=0.05,
    max_features=None,
    min_impurity_decrease=0.0332,
    random_state=123,
)


## for tri3
# This second assignment replaces the tri2 model above, so after the cell runs
# `lr` holds the tri3 configuration.
lr = RandomForestClassifier(
    max_features=None,
    bootstrap=False,
    class_weight='balanced_subsample',
    n_estimators=1,
    max_depth=2,
    min_impurity_decrease=0.017,
    random_state=123,
)  # criterion = 'log_loss', balanced_subsample

In [None]:
def rf(df, ctrl, label = 'sga'):
    """Cross-validate a fixed ``RandomForestClassifier`` over folds 0-4 and print the scores.

    For each fold id in ``range(5)`` the rows whose ``fold`` value differs are
    rebalanced with ``RandomOverSampler``, the forest is fitted on them, and the
    held-out rows are scored through ``calc_metrics``. The mean and standard
    deviation of every metric across the folds are printed, preceded by the last
    fitted model and its parameters.

    A per-fold probability threshold that maximises balanced accuracy on the
    training data is also computed and collected in ``oof_thresh``; it is not
    reported or applied to the validation predictions yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``label`` column and a ``fold`` column; every other
        column is used as a feature.
    ctrl
        Not used by this function; kept so existing calls keep working.
    label : str
        Name of the target column.

    Returns
    -------
    None
    """
    train_cols = df.columns.drop([label, 'fold'])

    oof_acc = []
    oof_roc_auc = []
    oof_f1 = []
    oof_prec = []
    oof_rec = []
    oof_thresh = []

    # Only the disabled sample-weight line inside the loop needs this value; the
    # call is kept so the function does exactly what it did before.
    class_weights = calc_class_weights(df, target= label, type='normal')

    # calc_metrics is unpacked below in this same order.
    metrics = [balanced_accuracy_score, roc_auc_score, f1_score, precision_score, recall_score]

    for fold in range(5):

        ## zw sga + lbw better
        ## this is status_change↓
        # - tri2 a_acc=0.7771, auc=0.7753 **
        # - tri3 a_acc=0.7775, auc=0.4329
        lr = RandomForestClassifier(n_estimators = 56,  max_depth = 3, criterion = 'entropy', min_samples_split = 0.27, min_samples_leaf = 8,
                                    bootstrap = False, ccp_alpha = 0.05, max_features = None, min_impurity_decrease = 0.0332, random_state = 123)
        # Another variant tried here:
        # lr = RandomForestClassifier(max_features = None, bootstrap = False, n_estimators = 1, max_depth = 1, criterion = "entropy",
        #                            min_impurity_decrease = 0.1, verbose = 0, random_state = 123, min_samples_leaf = 1, ccp_alpha = 0.2)

        ## yifong status_change↓ better
        # - tri2 a_acc=0.7735, auc=0.7673
        # - tri3 a_acc=0.7775, auc=0.7024  **
        # lr = RandomForestClassifier(max_features = None, bootstrap = False, class_weight = 'balanced_subsample',
        #               n_estimators = 1, max_depth = 2, min_impurity_decrease = 0.017, random_state = 123)  # criterion = 'log_loss', balanced_subsample

        ros = RandomOverSampler(random_state = 123)

        # Train on every other fold (rebalanced), validate on the current one.
        train_df = df[df['fold'] != fold].reset_index(drop=True)
        train_X, train_Y = ros.fit_resample(train_df[train_cols], train_df[label])
        val_df = df[df['fold'] == fold].reset_index(drop=True)
        # sample_weights = [class_weights[int(x)] for x in train_Y]

        lr.fit(train_X, train_Y)
        out = lr.predict_proba(val_df[train_cols])
        acc, roc_auc, f1, prec, rec = calc_metrics(val_df[label], out, metrics=metrics)

        # Search the resampled training data for the probability cut-off with
        # the highest balanced accuracy. The column-1 probabilities do not depend
        # on the candidate threshold, so they are predicted once per fold rather
        # than once per candidate.
        train_proba = lr.predict_proba(train_X)[:, 1]
        candidates = np.unique(train_proba)
        train_bal_acc = [
            balanced_accuracy_score(train_Y, (train_proba >= p).astype(int))
            for p in candidates
        ]
        # np.argmax returns the first maximum, i.e. the lowest best threshold.
        thresh = candidates[np.argmax(train_bal_acc)]

        oof_acc.append(acc)
        oof_roc_auc.append(roc_auc)
        oof_f1.append(f1)
        oof_prec.append(prec)
        oof_rec.append(rec)
        oof_thresh.append(thresh)

    # print(train_df)
    # lr is the model fitted on the last fold.
    print(lr, lr.get_params())
    print(f'OOF Prec Score: {np.mean(oof_prec):.4f} (±{np.std(oof_prec):.4f})')
    print(f'OOF Recall Score: {np.mean(oof_rec):.4f} (±{np.std(oof_rec):.4f})')
    print(f'OOF F1 Score: {np.mean(oof_f1):.4f} (±{np.std(oof_f1):.4f})')
    print(f'OOF Balanced Accuracy: {np.mean(oof_acc):.4f} (±{np.std(oof_acc):.4f})')
    print(f'OOF ROC AUC Score: {np.mean(oof_roc_auc):.4f} (±{np.std(oof_roc_auc):.4f})')
    # lst.to_csv('../data/tri3_predict_proba.csv', index_label=None, index=None)
    # return np.mean(oof_roc_auc)
    return



# 3.Start run

## 3.1 tri2

In [None]:
# Load the cleaned tri2 table. rf() trains on every column except the label and
# 'fold', so dropping these three columns keeps them out of the feature set.
tri2 = pd.read_csv('./cyf_clean_tri2.csv').drop(columns=['lbw', 'sga', 'cur_sga'])

# tri2 = ToNormal(tri2)
# Presumably one-hot encoding followed by imputation, judging by the helper
# names (both are defined earlier in the notebook) - confirm there.
tri2 = data_impute(ohe(tri2))

# Optional preprocessing steps, currently switched off:
# tri2 = remove_multicollinearity(tri2, thresh = 90)
# tri2 = ignore_low_variance(tri2, thresh = 0.01, label = 'status_change')
# tri2 = pca_bmi(tri2, 11)  # 11 13

# rf() reads a 'fold' column; presumably sgkf() is what adds it - confirm.
tri2 = sgkf(tri2, label='status_change')
# mlp(tri2)
tri2['af'] = tri2['af'].round()

# rf() prints its scores and ends with a bare `return`, so `temp` is None.
temp = rf(tri2, 0.025, label='status_change')

RandomForestClassifier(bootstrap=False, ccp_alpha=0.05, criterion='entropy',
                       max_depth=3, max_features=None,
                       min_impurity_decrease=0.0332, min_samples_leaf=8,
                       min_samples_split=0.27, n_estimators=56,
                       random_state=123) {'bootstrap': False, 'ccp_alpha': 0.05, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0332, 'min_samples_leaf': 8, 'min_samples_split': 0.27, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 56, 'n_jobs': None, 'oob_score': False, 'random_state': 123, 'verbose': 0, 'warm_start': False}
OOF Prec Score: 0.6459 (±0.2655)
OOF Recall Score: 0.6194 (±0.1583)
OOF F1 Score: 0.6179 (±0.2097)
OOF Balanced Accuracy: 0.7771 (±0.0999)
OOF ROC AUC Score: 0.7753 (±0.1044)
