# Import Library

In [None]:
import numpy as np
import pandas as pd
import json
import copy
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings("ignore")

#plt
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
import seaborn as sns
import os, random, sys, time, re

# scaling
from sklearn.preprocessing import RobustScaler, StandardScaler,MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator
from sklearn.impute import SimpleImputer


#Missing value
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

#feature
from sklearn.cluster import KMeans


#classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from catboost import Pool, CatBoostClassifier
import xgboost
from lightgbm import LGBMClassifier

#loss
from sklearn.metrics import log_loss
from sklearn.metrics import precision_score, recall_score,accuracy_score

#times
from datetime import date, datetime

#OverSampler
from imblearn.over_sampling import RandomOverSampler
# SMOTE (Synthetic Minority Over-sampling Technique)
from imblearn.over_sampling import SMOTE

#skf
from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import KBinsDiscretizer
import category_encoders as encoders

# seed

In [None]:
def seed_everything(seed=42):
    """Seed every random source this notebook relies on.

    Args:
        seed: Integer applied to ``PYTHONHASHSEED``, Python's ``random``
            module and NumPy's global generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    # PyTorch seeding is kept here, commented out, for reference:
    #     torch.manual_seed(seed)
    #     torch.backends.cudnn.benchmark=False
    #     torch.cuda.manual_seed(seed)
    #     torch.backends.cudnn.deterministic = True
    #     torch.use_deterministic_algorithms = True


seed_everything(seed=40)

# Config

In [None]:
class Config:
    """Notebook-wide settings; alternative values are listed next to each field."""

    # --- environment ---
    BASE_DIR = '/kaggle/input/icr-identify-age-related-conditions'
    run_para = 'local'             # 'local' or 'kaggle'
    random_state = 42

    # --- pre-processing switches ---
    EJ_encoded = 'CATBoostENCODE'  # 'CATBoostENCODE', 'intENCODE' or 'dropEJ'
    fillna = 'muti-fill'           # 'muti-fill' or 'knn'
    scaling = 'RobustScaler'       # 'RobustScaler', 'StandardScaler', 'MinMaxScaler' or 'none'
    Sampler = 'none'               # 'none' or 'OverSampler'
    binner_features = 'DU'         # column name passed to binning_feature()

    # --- modelling ---
    target = 'Class'


cfg = Config()

# 1. load the data

In [None]:
# Read the three competition tables from the configured input directory.
# Leading/trailing whitespace is stripped from the train and test column
# names; the greeks table is loaded unchanged.
maindf = pd.read_csv(f'{cfg.BASE_DIR}/train.csv').rename(columns=str.strip)
greeksdf = pd.read_csv(f'{cfg.BASE_DIR}/greeks.csv')
testdf = pd.read_csv(f'{cfg.BASE_DIR}/test.csv').rename(columns=str.strip)

# 2. EDA 

In [None]:
# df_train_numerical = maindf.drop(['Id', 'EJ', 'Class'], axis=1)
# df_train_numerical.describe(include='all').transpose()

In [None]:
# # Histogram for numerical features
# fig, ax = plt.subplots(11, 5, figsize=(16,30))

# for i in range(0, (len(ax.flatten()))):
#     sns.histplot(data=df_train_numerical, x =df_train_numerical.iloc[:,i], bins=20, ax=ax[int(i/5),i % 5]) 
# plt.subplots_adjust(hspace=0.5)  

# plt.show()

Outliers: use RobustScaler to mitigate the impact of outliers and achieve a more reliable and accurate modeling outcome.

In [None]:
# fig, ax = plt.subplots(11, 5, figsize=(16,30))
# for i in range(0, (len(ax.flatten()))):
#     sns.boxplot(x="Class",y=df_train_numerical.columns[i],data=maindf, ax=ax[int(i/5),i % 5])
# plt.subplots_adjust(wspace=0.3)  
# plt.show()

# 3.Pre-processing

In [None]:
# Training frame without the categorical EJ column, the identifier and the
# target; the cells shown below only read its column labels.
df_numeric_columns = maindf.drop(columns=['EJ', 'Id', 'Class'])

* 3.1.feature enhancement

In [None]:
# Bin the BN feature: cluster its values with k-means and use the midpoints
# between neighbouring cluster centres (plus 0 at the bottom and 5x the
# observed maximum at the top) as bin edges.
k = 5
if cfg.run_para == 'kaggle':
    # On Kaggle the clustering also sees the test values.
    BNpd = pd.concat([maindf['BN'], testdf['BN']], axis=0, ignore_index=True)
elif cfg.run_para == 'local':
    BNpd = maindf['BN']
else:
    raise ValueError(
        f"Unsupported cfg.run_para {cfg.run_para!r}; expected 'kaggle' or 'local'"
    )

data = BNpd.values.reshape(-1, 1)
# No random_state is passed, so the clustering relies on the global NumPy seed
# set by seed_everything().
kmodel = KMeans(n_clusters=k)
kmodel.fit(data)

# Sorted cluster centres with an extra 0.0 anchor, then the midpoint of every
# neighbouring pair. This is computed in one vectorised step instead of the
# former chained assignment `c.iloc[i]['cc'] = ...`, which only modifies the
# frame when pandas happens to return a view.
centers = np.sort(np.append(kmodel.cluster_centers_.ravel(), 0.0))
midpoints = (centers[:-1] + centers[1:]) / 2

upper_edge = max(maindf['BN'].max(), testdf['BN'].max()) * 5
edges = np.concatenate(([0.0], midpoints, [upper_edge]))

# Integer, strictly increasing edges (duplicates created by rounding removed).
c = np.unique(np.round(edges).astype(int)).tolist()
range_num = len(c) - 1

train_BN = maindf['BN'].values
train_binning = pd.cut(train_BN, c, labels=range(range_num), include_lowest=True).astype(int)
maindf['BN_binning'] = train_binning

test_BN = testdf['BN'].values
test_binning = pd.cut(test_BN, c, labels=range(range_num), include_lowest=True).astype(int)
testdf['BN_binning'] = test_binning

In [None]:
def binning_feature(df, feature_name, n_bins=8, strategy="quantile"):
    """Add an ordinal ``<feature_name>_binning`` column to ``df`` in place.

    Args:
        df: DataFrame holding ``feature_name``; it is modified, nothing is returned.
        feature_name: Name of the column to discretise.
        n_bins: Number of bins handed to ``KBinsDiscretizer``.
        strategy: Binning strategy handed to ``KBinsDiscretizer``.
    """
    column_2d = df[feature_name].values.reshape(-1, 1)
    discretizer = KBinsDiscretizer(
        n_bins=n_bins,
        encode="ordinal",
        strategy=strategy,
        subsample=None,
        random_state=40,
    )
    # The discretiser is fitted on the very frame it transforms.
    df[f'{feature_name}_binning'] = discretizer.fit_transform(column_2d)
# binning_feature(maindf, cfg.binner_features, n_bins=8, strategy="quantile")
# binning_feature(testdf, cfg.binner_features, n_bins=8, strategy="quantile")

* 3.2.EJ encoding

In [None]:
# Encode (or drop) the categorical EJ column according to cfg.EJ_encoded.
if cfg.EJ_encoded == 'dropEJ':
    maindf = maindf.drop(['EJ'], axis=1)
    testdf = testdf.drop(['EJ'], axis=1)
elif cfg.EJ_encoded == 'intENCODE':
    # 1 where EJ equals the first category seen in the training data, else 0.
    first_cat = maindf.EJ.unique()[0]
    maindf['EJ'] = (maindf['EJ'] == first_cat).astype('int')
    testdf['EJ'] = (testdf['EJ'] == first_cat).astype('int')
elif cfg.EJ_encoded == 'CATBoostENCODE':
    # Target encoding fitted on the training rows, then applied to the test rows.
    categorical_cols = ['EJ']
    CATBoostENCODE = encoders.CatBoostEncoder()
    encoder_train = CATBoostENCODE.fit_transform(maindf[categorical_cols], maindf['Class'])
    maindf['EJ'] = pd.DataFrame(encoder_train)
    encoder_test = CATBoostENCODE.transform(testdf[categorical_cols])
    testdf['EJ'] = pd.DataFrame(encoder_test)

* 3.3.Fillna 

In [None]:
# Impute missing values according to cfg.fillna.
# Every statistic / imputer is learned on the training frame and then applied
# to the test frame, so both frames are filled consistently and a test column
# that is entirely NaN can still be filled.
if cfg.fillna == 'knn':
    numeric_cols = df_numeric_columns.columns
    imputer = KNNImputer(n_neighbors=5)
    maindf_none = pd.DataFrame(
        imputer.fit_transform(maindf[numeric_cols]),
        columns=numeric_cols,
        index=maindf.index,
    )
    # transform (not fit_transform): reuse the neighbours learned on train.
    testdf_none = pd.DataFrame(
        imputer.transform(testdf[numeric_cols]),
        columns=numeric_cols,
        index=testdf.index,
    )
    # Replace the imputed columns in the train data set
    df_train_2 = maindf.drop(numeric_cols, axis=1)
    maindf = pd.concat([df_train_2, maindf_none], axis=1)
    # Replace the imputed columns in the test data set
    df_test_2 = testdf.drop(numeric_cols, axis=1)
    testdf = pd.concat([df_test_2, testdf_none], axis=1)
elif cfg.fillna == 'muti-fill':
    # Per-column fill statistic.
    fill_strategy = {
        'BQ': 'min',
        'EL': 'mode',
        'CB': 'median',
        'CC': 'mean',
        'DU': 'min',
        'FC': 'mean',
        'FL': 'median',
        'FS': 'mode',
        'GL': 'mode',
    }
    for col, how in fill_strategy.items():
        if how == 'mode':
            fill_value = maindf[col].mode()[0]
        else:
            fill_value = getattr(maindf[col], how)()
        maindf[col] = maindf[col].fillna(fill_value)
        testdf[col] = testdf[col].fillna(fill_value)

    # Safety net: any value still missing in the test frame is filled by a
    # KNN imputer fitted on the training frame. errors='ignore' keeps this
    # working when EJ was already removed (cfg.EJ_encoded == 'dropEJ').
    imputer = KNNImputer(n_neighbors=5)
    df_test_numerical = testdf.drop(['EJ', 'Id'], axis=1, errors='ignore')
    df_test_2 = testdf.drop(df_test_numerical.columns, axis=1)
    imputer.fit(maindf[df_test_numerical.columns])
    testdf_none = pd.DataFrame(
        imputer.transform(testdf[df_test_numerical.columns]),
        columns=df_test_numerical.columns,
        index=testdf.index,
    )
    testdf = pd.concat([df_test_2, testdf_none], axis=1)

* 3.4.Standardization

In [None]:
# Scale the numeric feature columns according to cfg.scaling.
# The scaler is fitted on the training frame and applied to the test frame.
# Any value that is not a key below (e.g. 'none') leaves both frames untouched.
scaler_classes = {
    'RobustScaler': RobustScaler,
    'StandardScaler': StandardScaler,
    'MinMaxScaler': MinMaxScaler,
}

if cfg.scaling in scaler_classes:
    scaler = scaler_classes[cfg.scaling]()
    numeric_cols = df_numeric_columns.columns

    scaler_train = scaler.fit_transform(maindf[numeric_cols])
    scaler_df_train = pd.DataFrame(scaler_train, columns=numeric_cols, index=maindf.index)

    scaler_test = scaler.transform(testdf[numeric_cols])
    scaler_df_test = pd.DataFrame(scaler_test, columns=numeric_cols, index=testdf.index)

    # Swap the raw numeric columns for their scaled versions.
    df_train_2 = maindf.drop(numeric_cols, axis=1)
    maindf = pd.concat([df_train_2, scaler_df_train], axis=1)

    # Always start from testdf here: the StandardScaler / MinMaxScaler branches
    # used to drop from a stale `df_test_2` left over from the imputation cell.
    df_test_2 = testdf.drop(numeric_cols, axis=1)
    testdf = pd.concat([df_test_2, scaler_df_test], axis=1)

# 4.Prepare Input

In [None]:
# Turn the Epsilon dates from the greeks table into ordinal day numbers;
# rows marked 'Unknown' become NaN.
known_date = greeksdf.Epsilon != 'Unknown'
times = greeksdf.Epsilon.copy()
times[known_date] = greeksdf.Epsilon[known_date].map(
    lambda stamp: datetime.strptime(stamp, '%m/%d/%Y').toordinal()
)
times[~known_date] = np.nan

# Attach the time feature to the training frame; unknown dates are set to 0.
maindf_times = pd.concat([maindf, times], axis=1)
maindf_times['Epsilon'] = maindf_times['Epsilon'].astype(float).fillna(0)

# Every column except the target and the identifier is a predictor.
target = 'Class'
predictors = [n for n in maindf_times.columns if n not in (target, 'Id')]
pred_and_time = maindf_times[predictors]
y_data = maindf_times['Class']

# Test rows get a time stamp one day after the latest training date.
testdf['Epsilon'] = pred_and_time['Epsilon'].max() + 1
test_pred_and_time = testdf[predictors]

In [None]:
# smote = SMOTE(sampling_strategy={0: 509, 1: 509})
# X_smote, y_smote = smote.fit_resample(pred_and_time, y_train)
# print("length of original data is ",len(pred_and_time))
# print("length of oversampled data is ",len(X_smote))

# 5.balance_logloss

In [None]:
def balance_logloss(y_true, y_pred):
    """Class-balanced log loss computed through ``sklearn.metrics.log_loss``.

    Every sample is weighted by the inverse of its class frequency, so each
    class contributes equally to the weighted average.

    Args:
        y_true: Non-negative integer class labels (array-like).
        y_pred: Predicted probabilities in any layout ``log_loss`` accepts.

    Returns:
        The weighted log loss as a float.
    """
    y_true = np.asarray(y_true)
    nc = np.bincount(y_true)
    # Clip here instead of passing ``eps=1e-15``: that keyword is deprecated in
    # newer scikit-learn releases and no longer accepted by the latest ones,
    # while explicit clipping gives the same bounds on every version.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 1e-15, 1 - 1e-15)
    return log_loss(y_true, y_pred, sample_weight=1 / nc[y_true])

In [None]:
def balanced_log_loss(y_true, y_pred):
    """Balanced binary log loss: the mean of the per-class average log losses.

    Args:
        y_true: Binary labels (0/1) as a list, NumPy array or pandas Series.
        y_pred: Predicted probability of class 1 for each sample, same length
            as ``y_true``.

    Returns:
        The balanced log loss. A class with no samples is skipped and the mean
        is taken over the classes that are present; ``nan`` is returned when
        there are no samples at all.
    """
    # Work on plain arrays so lists are accepted and pandas index alignment
    # cannot silently reorder or NaN-out values.
    y_true = np.asarray(y_true, dtype=float)
    p_1 = np.clip(np.asarray(y_pred, dtype=float), 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1

    n_0 = np.sum(1 - y_true)
    n_1 = np.sum(y_true)

    class_losses = []
    if n_0 > 0:
        class_losses.append((1 / n_0) * -np.sum((1 - y_true) * np.log(p_0)))
    if n_1 > 0:
        class_losses.append((1 / n_1) * -np.sum(y_true * np.log(p_1)))

    if not class_losses:
        return float('nan')
    return sum(class_losses) / len(class_losses)

def lgb_metric(y_true, y_pred):
    """Custom evaluation metric passed to LightGBM as ``eval_metric``.

    Returns:
        A ``(name, value, flag)`` tuple: the metric name, the balanced log
        loss of the predictions, and ``False`` (LightGBM reads the third item
        as "is higher better").
    """
    metric_name = 'balanced_log_loss'
    higher_is_better = False
    return metric_name, balanced_log_loss(y_true, y_pred), higher_is_better

# 6.Model

6.1 LGBM

In [None]:
class WeightedEns(BaseEstimator):
    """Averaging ensemble (currently a single LightGBM model) whose predicted
    probabilities are re-weighted by the estimated class totals."""

    def __init__(self):
        self.classifiers = [
            LGBMClassifier(
                boosting_type='goss',
                learning_rate=0.06733232950390658,
                n_estimators=50000,
                early_stopping_round=300,
                random_state=42,
                subsample=0.6970532011679706,
                colsample_bytree=0.6055755840633003,
                class_weight='balanced',
                metric='binary',
                is_unbalance=True,
                max_depth=8,
                objective="binary",
            )
        ]

    def fit_eval(self, X, y, Xv, yv):
        """Fit every classifier on ``(X, y)`` using ``(Xv, yv)`` as evaluation set.

        Args:
            X: Training features.
            y: Training labels; encoded to 0..n_classes-1 internally.
            Xv: Validation features.
            yv: Validation labels; encoded with the training classes.

        Returns:
            ``self``.

        Raises:
            ValueError: If ``yv`` contains a label that does not occur in ``y``.
        """
        cls, y = np.unique(y, return_inverse=True)
        # Encode the validation labels with the TRAINING classes. Re-deriving
        # the classes from yv would shift the codes (and overwrite classes_)
        # whenever the validation fold lacks one of the classes.
        yv = np.asarray(yv)
        unseen = np.setdiff1d(yv, cls)
        if unseen.size:
            raise ValueError(f"Validation labels not present in training labels: {unseen.tolist()}")
        yv = np.searchsorted(cls, yv)
        self.classes_ = cls
        for cl in self.classifiers:
            # NOTE(review): `verbose=` in fit() is only accepted by older
            # LightGBM releases - confirm against the installed version.
            cl.fit(X, y, eval_set=[(Xv, yv)], eval_metric=lgb_metric, verbose=0)
        return self

    def predict_proba(self, X):
        """Return re-weighted class probabilities for ``X``.

        The classifiers' probabilities are averaged, column 0 is divided by
        the summed class-0 probability mass and every other column by the
        summed mass of all remaining classes, then each row is renormalised
        to sum to one.
        """
        ps = np.stack([cl.predict_proba(X) for cl in self.classifiers])
        p = np.mean(ps, axis=0)
        class_0_est_instances = p[:, 0].sum()
        others_est_instances = p[:, 1:].sum()
        inverse_totals = np.array([[
            1 / (class_0_est_instances if i == 0 else others_est_instances)
            for i in range(p.shape[1])
        ]])
        new_p = p * inverse_totals
        return new_p / np.sum(new_p, axis=1, keepdims=True)

In [None]:
def _positive_class_proba(proba):
    """Collapse an (n_samples, n_classes) probability matrix to P(class != 0)."""
    return proba[:, 1:].sum(axis=1)


# Stratified 5-fold cross-validation: one model per fold, out-of-fold
# predictions are collected for later inspection and the test predictions of
# every fold are stored for averaging.
sk = StratifiedKFold(n_splits=5, shuffle=True, random_state=40)
scores_train = []
result_preds = []
recalls = []
precisions = []
val_list = []
off = []
id_list = []
scores = []
accuracies = []
for fold_num, (train_index, val_index) in enumerate(sk.split(pred_and_time, y_data)):
    m = WeightedEns()
    # split() yields positional indices, hence .iloc rather than .loc.
    X_train, X_val = pred_and_time.iloc[train_index], pred_and_time.iloc[val_index]
    y_train, y_val = y_data.iloc[train_index], y_data.iloc[val_index]

#     smote = SMOTE(sampling_strategy={0: len(y_train[y_train==0]), 1:len(y_train[y_train==0])})
#     X_train, y_train = smote.fit_resample(X_train, y_train)
    m.fit_eval(X_train, y_train, X_val, y_val)

    y_pred_train = m.predict_proba(X_train)
    preds_train = _positive_class_proba(y_pred_train)
    # Record the in-fold score; this list used to stay empty, so the summary
    # below printed [] and nan.
    scores_train.append(balanced_log_loss(y_train, preds_train))

    y_pred_val = m.predict_proba(X_val)
    preds_val = _positive_class_proba(y_pred_val)

    # Keep the out-of-fold ids, labels and predictions.
    id_list.append(maindf['Id'].iloc[val_index])
    val_list.append(y_val)
    off.append(preds_val)

    y_val_pred_binary = (preds_val > 0.5).astype(int)
    recall = recall_score(y_val, y_val_pred_binary)
    recalls.append(recall)
    precision = precision_score(y_val, y_val_pred_binary)
    precisions.append(precision)

    accuracy = accuracy_score(y_val, y_val_pred_binary)
    accuracies.append(accuracy)

    y_pred_test = m.predict_proba(test_pred_and_time)
    preds_test = _positive_class_proba(y_pred_test)
    result_preds.append(preds_test)

    score = balanced_log_loss(y_val, preds_val)
    scores.append(score)
print('train_local_cv:',scores_train)
print(f'train_Mean local_cv: {np.mean(scores_train)}')

print('test_local_cv:',scores)
print(f'test_Mean local_cv: {np.mean(scores)}')

print('recall:',recalls)
print(f'test_Mean recall: {np.mean(recalls)}')

print('precision:',precisions)
print(f'test_Mean precision: {np.mean(precisions)}')

print('accuracy:',accuracies)
print(f'test_Mean accuracy: {np.mean(accuracies)}')

In [None]:
# Flatten the per-fold collections into plain lists (fold order preserved).
off_list = [pred for fold_preds in off for pred in fold_preds]
y_val_list = [label for fold_labels in val_list for label in fold_labels]
id_lists = [sample_id for fold_ids in id_list for sample_id in fold_ids]

In [None]:
# Split the out-of-fold predictions by their true class (0 or 1).
class_0_pred, class_1_pred = [], []
for true_label, oof_pred in zip(y_val_list, off_list):
    if true_label == 0:
        class_0_pred.append(oof_pred)
    elif true_label == 1:
        class_1_pred.append(oof_pred)

def jitter(data, sigma=0.3):
    """Return a new list with uniform noise from [-sigma, sigma] added to each item of ``data``."""
    shaken = []
    for value in data:
        shaken.append(value + random.uniform(-sigma, sigma))
    return shaken
# Strip plot of the out-of-fold predictions: one jittered horizontal band per
# true class, x position = predicted value.
plt.figure(figsize=(10, 4))

bands = [
    (0, class_0_pred, 'Class 0', 'blue'),
    (1, class_1_pred, 'Class 1', 'red'),
]
for band_y, band_preds, band_label, band_color in bands:
    plt.scatter(
        band_preds,
        jitter([band_y] * len(band_preds)),
        label=band_label,
        edgecolors=band_color,
        marker='o',
        linewidth=1,
        facecolors='none',
    )

plt.xlabel('Prediction (pred)')
plt.ylabel('Data Points')
plt.yticks([0, 1], ['Class 0', 'Class 1'])
plt.legend()

plt.show()

In [None]:
# Write the out-of-fold table (id, true class, prediction) to an Excel file.
file_name = 'lgbm.xlsx'
table = list(zip(id_lists, y_val_list, off_list))
df = pd.DataFrame(
    table,
    columns=['id', 'class', 'pred'],
)
df.to_excel(
    file_name,
    index=False,
    engine='openpyxl',
)

In [None]:
# Average the test predictions of all folds, element by element.
averages = np.asarray(result_preds).mean(axis=0)

In [None]:
# Build the submission: one row per test Id with the averaged probability of
# class 1 and its complement for class 0.
submission = testdf[["Id"]].copy()
submission = submission.assign(class_0=1 - averages, class_1=averages)
submission.to_csv('submission.csv', index=False)

# Read the file back so the displayed table is exactly what was written.
submission_df = pd.read_csv('submission.csv')
submission_df

In [None]:
# from scipy.stats import uniform, randint
# params = {
#     'num_leaves': randint(5, 50),
#     'max_depth': randint(3, 10)
# }

# lgbm = LGBMClassifier( boosting_type='goss', 
#                       learning_rate=0.06733232950390658, 
#                       n_estimators = 50000, 
#                       random_state=42,
#                       subsample=0.6970532011679706,
#                       colsample_bytree=0.6055755840633003,
#                       class_weight='balanced',
#                       metric='binary', 
#                       is_unbalance=True, 
#                       max_depth=8,
#                       objective="binary")

In [None]:
# from sklearn.model_selection import train_test_split, RandomizedSearchCV
# folds = 5
# skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)
# lgbm_model = RandomizedSearchCV(lgbm, param_distributions=params, n_iter=2000, scoring='roc_auc', n_jobs=-1, cv=skf.split(X_smote,y_smote), verbose=-1, random_state=1001 )
# lgbm_model.fit(X_smote, y_smote)