In [26]:
import pandas as pd
import numpy as np
from pathlib import Path
from dataclasses import dataclass
import random
import os
import xgboost as xgb
import lightgbm as lgb
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder,StandardScaler, QuantileTransformer, RobustScaler, PowerTransformer
from sklearn.feature_selection import VarianceThreshold


import tensorflow as tf
from tensorflow.keras import Sequential, layers, Input, Model
from tensorflow.experimental import numpy as tfnp
from tensorflow.keras.losses import Loss
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from keras.optimizers import Adam

from imblearn.over_sampling import SMOTE

# Helper Functions

In [2]:

@dataclass
class DataHolder:
    """Container for one cross-validation scheme, stored as a list of fold DataFrames."""

    # One DataFrame per fold. The annotation is informational only;
    # dataclasses do not validate field types.
    folds: [pd.DataFrame]

In [3]:
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed handed to both generators and written (as a string)
            to the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def balanced_log_loss(y_true, y_pred,return_all=False):
    """Class-balanced binary log loss (mean of the per-class average log losses).

    Both inputs are converted to flat float arrays before any arithmetic, so
    lists, pandas objects and column vectors are all compared positionally.
    Without this, two pandas Series with different indexes would be aligned by
    label and an (n, 1) prediction array would broadcast against an (n,) target.

    Args:
        y_true: Binary labels (0/1), array-like of length n.
        y_pred: Predicted probability of class 1, array-like with n elements.
        return_all: When True, also return the two per-class losses.

    Returns:
        The balanced loss, or the tuple ``(balanced, loss_class_0, loss_class_1)``
        when ``return_all`` is True. If one class is absent from ``y_true`` its
        count is zero and the corresponding term (and the total) is NaN.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)

    # Clip so that log() never sees exactly 0 or 1.
    p_1 = np.clip(y_pred, 1e-15, 1 - (1e-15))
    p_0 = 1 - p_1

    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0)) / N_0
    log_loss_1 = -np.sum(y_true * np.log(p_1)) / N_1

    balanced = (log_loss_0 + log_loss_1) / 2
    if return_all:
        return balanced, log_loss_0, log_loss_1
    return balanced

def preprocess_df(input_df: pd.DataFrame)->pd.DataFrame:
    """Return a cleaned copy of a raw competition table.

    Strips the trailing space from the four column names 'BD ', 'CD ', 'CW '
    and 'FD ', and replaces the categorical 'EJ' column with numeric codes
    (A -> 1.0, B -> 0.0). The input frame is left untouched.

    NOTE(review): the training cell in this notebook encodes EJ the other way
    round (A -> 0, B -> 1). Confirm which encoding is intended before feeding
    the output of this function to a model trained on that data.
    """
    column_fixes = {'BD ': 'BD', 'CD ': 'CD', 'CW ': 'CW', 'FD ': 'FD'}
    category_codes = {'EJ': {'A': 1.0, 'B': 0.0}}

    output_df = input_df.rename(columns=column_fixes).copy()
    for col, codes in category_codes.items():
        output_df[col] = output_df[col].map(codes)
    return output_df


# Configuration

In [12]:
class CFG:
    """Notebook-wide settings, read as class attributes (never instantiated here)."""

    # Reproducibility / cross-validation
    seed = 42  # alternative value noted by the author: 52
    n_folds = 10  # replaced 20

    # Target column and the name of the evaluation metric
    target_col = 'Class'
    metric = 'balanced_log_loss'

    # Directory containing train.csv, greeks.csv, test.csv and sample_submission.csv
    DATA_PATH = Path('/kaggle/input/icr-identify-age-related-conditions')

In [8]:
# Seed Python/NumPy (via seed_everything), NumPy again, and TensorFlow with the configured seed.
for seed_fn in (seed_everything, np.random.seed, tf.random.set_seed):
    seed_fn(CFG.seed)



# Code 

### Load 

In [13]:
# Training table; the target column is named by CFG.target_col.
train_df = pd.read_csv(CFG.DATA_PATH / 'train.csv')
# NOTE: this is an alias, not a copy -- in-place edits made through `df_train`
# also change `train_df` until one of the names is rebound.
df_train = train_df
# Per-sample metadata table; later cells join it to the training rows on 'Id'.
greeks_df = pd.read_csv(CFG.DATA_PATH / 'greeks.csv')
# Test table, passed through preprocess_df (column rename + EJ encoding).
official_test_df = preprocess_df(pd.read_csv(CFG.DATA_PATH / 'test.csv'))
# Sample submission file as provided with the data.
sub = pd.read_csv(CFG.DATA_PATH / 'sample_submission.csv')

In [14]:
# Explicit list of 55 feature column names, grouped by leading letter.
# NOTE(review): later cells visible here use `feature_list` instead -- confirm this is still needed.
features = (
    "AB AF AH AM AR AX AY AZ BC "
    "BD BN BP BQ BR BZ "
    "CB CC CD CF CH CL "
    "CR CS CU CW "
    "DA DE DF DH DI DL DN DU DV DY "
    "EB EE EG EH EL EP EU "
    "FC FD FE FI FL FR FS "
    "GB GE GF GH GI GL"
).split()


In [15]:
# Encode the categorical EJ column numerically (A -> 0, B -> 1).
# NOTE: this rewrites the column in place, so the cell must run exactly once per
# fresh load -- a second run would map the already-numeric values to NaN.
df_train['EJ'] = df_train.EJ.map({'A':0,'B':1}).astype(float)

# Derive the feature lists in DataFrame column order. Going through set() made the
# order depend on string hashing, i.e. it could differ between kernel sessions and
# silently change the column order fed to the models.
feature_list = [col for col in df_train.columns if col not in ('Id', 'Class')]
# Same features without EJ: these are the columns that get quantile-transformed.
feature_list_KNN = [col for col in feature_list if col != 'EJ']

# Map every non-EJ feature to an approximately normal distribution.
# NOTE(review): the transformer is fitted on the full training table before any
# fold split, so held-out folds influence the fitted quantiles.
scaler = QuantileTransformer(n_quantiles=100,random_state=CFG.seed, output_distribution='normal')
df_train[feature_list_KNN] = scaler.fit_transform(df_train[feature_list_KNN])

# Fill remaining gaps with per-column medians. numeric_only=True skips the string
# 'Id' column explicitly instead of relying on the deprecated silent drop
# (which becomes an error in pandas >= 2.0).
df_train = df_train.fillna(df_train.median(numeric_only=True))

  df_train = df_train.fillna(df_train.median())


In [16]:
# One-hot encode the Beta/Gamma/Delta columns of the metadata table (its other
# columns are kept as they are), then attach the result to the training rows by Id.
encoded_df = pd.get_dummies(greeks_df, columns=['Beta', 'Gamma', 'Delta'])
merged_df = pd.merge(df_train, encoded_df, on='Id')


In [17]:
# From here on `train_df` refers to the scaled training table joined with the one-hot metadata.
train_df = merged_df

# Feature Engineering


In [20]:
# Join the raw (un-encoded) metadata table onto the training table as well.
# Columns present on both sides receive pandas' default '_x' / '_y' suffixes.
# NOTE: not idempotent -- re-running this cell merges the metadata a second time.
train_df = pd.merge(greeks_df, train_df, on='Id')


In [21]:
def split_on_column(column):
    """Build CFG.n_folds folds that each contain a share of every value of `column`.

    For each distinct value of ``greeks_df[column]`` the matching rows of the
    global ``train_df`` (matched through 'Id') are cut into CFG.n_folds
    consecutive chunks; fold i is the concatenation of the i-th chunk of every
    value. Rows are not shuffled.

    Args:
        column: Name of a column of the global ``greeks_df``.

    Returns:
        DataHolder whose ``folds`` list has CFG.n_folds DataFrames.
    """
    chunks_per_value = []
    for letter in greeks_df[column].unique():
        matching_ids = greeks_df.loc[greeks_df[column] == letter, 'Id']
        value_rows = train_df[train_df['Id'].isin(matching_ids)]
        chunks_per_value.append(np.array_split(value_rows, CFG.n_folds))

    folds = []
    for fold_no in range(CFG.n_folds):
        folds.append(pd.concat([chunks[fold_no] for chunks in chunks_per_value]))
    return DataHolder(folds)

In [22]:
# Fold scheme 1: split by target class so every fold gets a share of both classes.
target_values = train_df[CFG.target_col]
positives = train_df[target_values == 1]
negatives = train_df[target_values == 0]

positives_splits = np.array_split(positives, CFG.n_folds)
negatives_splits = np.array_split(negatives, CFG.n_folds)

data_holder_split_class = DataHolder(
    [pd.concat([pos_chunk, neg_chunk]) for pos_chunk, neg_chunk in zip(positives_splits, negatives_splits)]
)

# Fold schemes 2 and 3: split by the values of the metadata columns Alpha and Gamma.
data_holder_split_alpha = split_on_column("Alpha")
data_holder_split_gamma = split_on_column("Gamma")

cv_holders = [data_holder_split_class, data_holder_split_alpha, data_holder_split_gamma]

# Display the fold shapes of the third scheme (Gamma) as a sanity check.
[fold.shape for fold in cv_holders[2].folds]

[(65, 80),
 (64, 80),
 (64, 80),
 (63, 80),
 (62, 80),
 (61, 80),
 (61, 80),
 (61, 80),
 (58, 80),
 (58, 80)]

## NN adapting

In [22]:
def build_neutralizer(df,features,proportion):
    """Fit a linear map that removes part of the Class/bias signal from the features.

    Step 1: every feature is regressed (least squares) on the columns
    ['Class', 'bias'] and ``proportion`` times the fitted values is subtracted,
    giving the "neutralized" features.
    Step 2: a second least-squares model predicts those neutralized features
    from the original features plus the 'bias' column. Its coefficient matrix
    is returned, so the correction can be applied to rows without a 'Class' value.

    Args:
        df: DataFrame containing the columns in ``features`` plus 'Class' and 'bias'.
        features: Sequence of feature column names.
        proportion: Fraction of the Class/bias projection to remove
            (0 leaves the features unchanged).

    Returns:
        ndarray of shape (len(features) + 1, len(features)); multiply an array
        laid out as [features..., bias] by it (see ``neutralize_array``).
    """
    feature_cols = list(features)
    target = df[['Class', 'bias']].to_numpy(dtype=float)
    feature_matrix = df[feature_cols].to_numpy(dtype=float)

    # One multi-column solve replaces the per-feature loop (identical solution).
    # rcond=None selects NumPy's documented default explicitly, which avoids the
    # FutureWarning and the version-dependent cutoff of the implicit form.
    coeffs = np.linalg.lstsq(target, feature_matrix, rcond=None)[0]
    neutralized_features = feature_matrix - proportion * target.dot(coeffs)

    # Train the model that predicts the corrected features from the originals.
    design = df[feature_cols + ['bias']].to_numpy(dtype=float)
    return np.linalg.lstsq(design, neutralized_features, rcond=None)[0]

def neutralize_array(array, neutralizer):
    """Apply a neutralizer matrix to `array` and return ``array.dot(neutralizer)``."""
    return array.dot(neutralizer)

In [23]:
def create_model():
    """Build an (uncompiled) two-headed MLP on the global `feature_list` inputs.

    A shared trunk (Dense 32 -> Dropout 0.2 -> Dense 16 -> Dense 8, swish
    activations, L2 weight penalty) feeds two sigmoid heads:
    'main_output' (1 unit) and 'additional_output' (one unit per entry of the
    global `aux_columns`).

    Fixes relative to the previous version:
    * Dense/Dropout are taken from the imported `layers` module; the bare names
      were never imported, so the function raised NameError on a fresh kernel.
    * The 8-unit layer is now fed by the 16-unit layer. It used to be connected
      to the first Dense layer, which left the Dropout and 16-unit layers
      disconnected from the model.
      NOTE(review): this changes the effective architecture -- confirm it
      matches the intended design.

    Returns:
        tf.keras Model with outputs [main_output, additional_output].
    """
    def _dense(units):
        # Shared layer recipe: swish activation with a small L2 weight penalty.
        return layers.Dense(units, activation='swish',
                            kernel_regularizer=tf.keras.regularizers.l2(0.0001))

    input_layer = Input(shape=(len(feature_list),))

    # Shared layers
    hidden = _dense(32)(input_layer)
    hidden = layers.Dropout(.2)(hidden)
    hidden = _dense(16)(hidden)
    shared_layer_output = _dense(8)(hidden)

    # Main branch - 'Class'
    main_output = layers.Dense(1, activation='sigmoid', name='main_output')(shared_layer_output)

    # Additional features branch
    additional_output = layers.Dense(len(aux_columns), activation='sigmoid',
                                     name='additional_output')(shared_layer_output)

    # Combine the outputs
    return Model(inputs=input_layer, outputs=[main_output, additional_output])
def create_model_1_output():
    """Build an (uncompiled) single-output MLP on the global `feature_list` inputs.

    Trunk: Dense 32 -> Dropout 0.2 -> Dense 16 -> Dense 8 (swish, L2 penalty),
    followed by a 1-unit sigmoid head named 'main_output' whose bias starts at
    log(108/509).

    Fixes relative to the previous version:
    * Dense/Dropout are taken from the imported `layers` module; the bare names
      were never imported, so the function raised NameError on a fresh kernel.
    * The 8-unit layer is now fed by the 16-unit layer. It used to be connected
      to the first Dense layer, which left the Dropout and 16-unit layers
      disconnected from the model.
      NOTE(review): this changes the effective architecture -- confirm it
      matches the intended design.

    Returns:
        tf.keras Model with the single output 'main_output'.
    """
    def _dense(units):
        # Shared layer recipe: swish activation with a small L2 weight penalty.
        return layers.Dense(units, activation='swish',
                            kernel_regularizer=tf.keras.regularizers.l2(0.0001))

    input_layer = Input(shape=(len(feature_list),))

    # Shared layers
    hidden = _dense(32)(input_layer)
    hidden = layers.Dropout(.2)(hidden)
    hidden = _dense(16)(hidden)
    shared_layer_output = _dense(8)(hidden)

    # Main branch - 'Class'
    # 108 / 509 presumably are the positive / negative class counts of the
    # training data -- TODO confirm.
    initial_bias = tf.keras.initializers.Constant(np.log(108/509))
    main_output = layers.Dense(1, activation='sigmoid', name='main_output',
                               bias_initializer=initial_bias)(shared_layer_output)

    return Model(inputs=input_layer, outputs=[main_output])

def custom_loss(y_true, y_pred):
    """Binary cross-entropy of element 0 plus 0.6 times the binary cross-entropy of element 1.

    NOTE(review): `y_true[0]` / `y_pred[0]` index the first axis of whatever is
    passed in. This presumably expects a pair (main, auxiliary) of tensors; if
    Keras hands over a single batch tensor, the indices select the first two
    rows instead -- verify against the caller.
    """
    aux_weight = .6  # weight of the second term; adjust as desired
    main_term = K.binary_crossentropy(K.cast(y_true[0], dtype='float32'), y_pred[0])
    aux_term = K.binary_crossentropy(K.cast(y_true[1], dtype='float32'), y_pred[1])
    return main_term + aux_weight * aux_term

def pp_prob(p):
    """Rebalance a probability matrix by its column mass and return the non-first-column share.

    Column 0 is divided by the total of column 0, every other column by the
    combined total of columns 1..end; rows are then renormalised to sum to 1.

    Args:
        p: 2-D array, one row per sample.

    Returns:
        1-D array with, for each row, the sum of the rebalanced columns 1..end.
    """
    mass_first = p[:,0].sum()
    mass_rest = p[:,1:].sum()

    # Per-column weights: 1/mass_first for column 0, 1/mass_rest for the others.
    col_weights = np.full(p.shape[1], 1/mass_rest)
    col_weights[0] = 1/mass_first

    reweighted = p * col_weights
    reweighted = reweighted / reweighted.sum(axis=1, keepdims=True)
    return reweighted[:,1:].sum(axis=1)

In [24]:
# Names of the one-hot metadata columns used as auxiliary training targets:
# three Beta, eight Gamma and four Delta indicators.
aux_columns = (
    [f"Beta_{letter}" for letter in "ABC"]
    + [f"Gamma_{letter}" for letter in "ABEFGHMN"]
    + [f"Delta_{letter}" for letter in "ABCD"]
)

In [81]:
def create_ae_mlp(num_columns, num_labels, hidden_units, dropout_rates, ls = 1e-2, lr = 1e-3):
    """Build and compile a denoising-autoencoder + MLP network with four outputs.

    Structure:
    * encoder: BatchNorm -> GaussianNoise(0.4) -> Dense(32) -> BatchNorm -> swish
    * 'decoder': Dropout(0.1) -> Dense(num_columns), reconstructing the inputs
    * 'ae_action': Dense/BN/swish/Dropout block on the decoder output, then a
      sigmoid head with `num_labels` units
    * main tower: [normalised input, encoder] -> BatchNorm -> Dropout, four
      Dense/BN/swish/Dropout blocks (128, 64, 64, 32 units), then
      - 'action': sigmoid head with `num_labels` units, bias initialised to log(108/509)
      - 'additional_output': sigmoid head with one unit per entry of the
        global `aux_columns`

    Args:
        num_columns: Number of input features.
        num_labels: Number of units of the 'ae_action' and 'action' heads.
        hidden_units: Accepted for interface compatibility but currently
            ignored -- the layer widths are hard-coded.
        dropout_rates: Accepted for interface compatibility but currently
            ignored -- the dropout rates are hard-coded.
        ls: Label smoothing used by the cross-entropy losses.
        lr: Adam learning rate.

    Returns:
        Compiled tf.keras Model with outputs
        [decoder, ae_action, action, additional_output].
    """
    keras_layers = tf.keras.layers

    def _dense_block(tensor, units, drop_rate):
        # Dense -> BatchNorm -> swish -> Dropout
        tensor = keras_layers.Dense(units)(tensor)
        tensor = keras_layers.BatchNormalization()(tensor)
        tensor = keras_layers.Activation('swish')(tensor)
        return keras_layers.Dropout(drop_rate)(tensor)

    inp = keras_layers.Input(shape = (num_columns, ))
    x0 = keras_layers.BatchNormalization()(inp)

    # Encoder on a noise-corrupted copy of the normalised input.
    encoder = keras_layers.GaussianNoise(.4)(x0)
    encoder = keras_layers.Dense(32)(encoder)
    encoder = keras_layers.BatchNormalization()(encoder)
    encoder = keras_layers.Activation('swish')(encoder)

    # Decoder reconstructs the input features from the encoding.
    decoder = keras_layers.Dropout(0.1)(encoder)
    decoder = keras_layers.Dense(num_columns, name = 'decoder')(decoder)

    # Auxiliary classifier working on the reconstruction.
    x_ae = _dense_block(decoder, 32, .4)
    out_ae = keras_layers.Dense(num_labels, activation = 'sigmoid', name = 'ae_action')(x_ae)

    # Main tower sees both the normalised input and the encoding.
    x = keras_layers.Concatenate()([x0, encoder])
    x = keras_layers.BatchNormalization()(x)
    x = keras_layers.Dropout(.1)(x)
    for units, drop_rate in ((128, .4), (64, .3), (64, .2), (32, .3)):
        x = _dense_block(x, units, drop_rate)

    # 108 / 509 presumably are the positive / negative class counts -- TODO confirm.
    out = keras_layers.Dense(num_labels, activation = 'sigmoid', name = 'action',
                             bias_initializer=tf.keras.initializers.Constant(np.log(108/509)))(x)

    # `Dense` is not imported as a bare name in this notebook; use the qualified
    # layer class so the function also works on a fresh kernel.
    additional_output = keras_layers.Dense(len(aux_columns), activation='sigmoid',
                                           name='additional_output')(x)

    model = tf.keras.models.Model(inputs = inp, outputs = [decoder, out_ae, out, additional_output])

    # NOTE(review): 'additional_output' pairs sigmoid units with a categorical
    # cross-entropy over all auxiliary columns at once -- confirm this is intended.
    # NOTE: the metrics named 'AUC' are binary cross-entropies, not ROC AUC;
    # the names are kept so existing log keys do not change.
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = lr),
                  loss = {'decoder': tf.keras.losses.MeanSquaredError(),
                          'ae_action': tf.keras.losses.BinaryCrossentropy(label_smoothing = ls),
                          'action': tf.keras.losses.BinaryCrossentropy(label_smoothing = ls),
                          'additional_output': tf.keras.losses.CategoricalCrossentropy(label_smoothing = ls)
                         },
                  metrics = {'decoder': tf.keras.metrics.MeanAbsoluteError(name = 'MAE'),
                             'ae_action': tf.keras.metrics.BinaryCrossentropy(name = 'AUC'),
                             'action': tf.keras.metrics.BinaryCrossentropy(name = 'AUC'),
                             'additional_output': tf.keras.losses.CategoricalCrossentropy(label_smoothing = ls)
                            },
                 )

    return model

# Keyword arguments for create_ae_mlp.
# Tuple left by the author next to this configuration (presumably the scores of
# an earlier run): (0.29453164684677857, 0.3353345524846144, 0.2537287412089428)
params = dict(
    num_columns=len(feature_list),
    num_labels=1,
    hidden_units=[32, 32, 16,32,64],
    dropout_rates=[0.25,0.01,0.1,0.01,.01],
    ls=0.0,
    lr=1e-3,
)




In [82]:
# Cross-validated training of the AE-MLP for the first two fold schemes in `cv_holders`.
#
# Per fold index `idx`: fold `idx` is the validation set (early stopping), the
# next fold (wrapping around) is the held-out test set that gets scored, and all
# remaining folds form the training set.
#
# Earlier experiments that lived here as commented-out code (feature
# neutralization via build_neutralizer, several SMOTE over-sampling variants)
# were removed; only the simple positive up-sampling below is active.

cv_strat_scores = []     # per scheme: list of balanced log losses, one per fold
feat_importances = []    # kept for compatibility; not filled by this cell
models = []              # kept for compatibility; not filled by this cell
fold_log_losses = []     # per scheme: list of [log_loss_0, log_loss_1] per fold
outer_y_preds = []       # per scheme: list of {row index: prediction} dicts

BATCH_SIZE = 128

for cv_strat in tqdm(cv_holders[:2]):
    fold_scores = []
    temp_fold_log_losses = []
    inner_y_preds = []

    for idx in range(CFG.n_folds):
        test_idx = (idx + 1) % CFG.n_folds

        valid = cv_strat.folds[idx].copy()
        test = cv_strat.folds[test_idx].copy()
        # pd.concat builds a new frame, so the stored folds are never modified.
        train = pd.concat(
            [fold for fold_no, fold in enumerate(cv_strat.folds) if fold_no not in (idx, test_idx)]
        )

        # --- Class rebalancing ---------------------------------------------
        positive_samples = train[train['Class'] == 1]
        negative_samples = train[train['Class'] == 0]

        # Resample the positives (with replacement) to HALF the number of
        # negatives, then shuffle. Both draws use NumPy's global RNG, seeded
        # earlier in the notebook.
        upsampled_positive_samples = positive_samples.sample(n=len(negative_samples) // 2, replace=True)
        balanced_data = pd.concat([negative_samples, upsampled_positive_samples])
        train = balanced_data.sample(frac=1).reset_index(drop=True)

        # --- Datasets -------------------------------------------------------
        # Targets are supplied only for the 'action' and 'additional_output'
        # heads. NOTE(review): 'decoder' and 'ae_action' receive no targets
        # here, so presumably those heads are not trained -- confirm.
        train_ds = tf.data.Dataset.from_tensor_slices((
            train[feature_list].to_numpy(),
            {"action": train['Class'].to_numpy(), "additional_output": train[aux_columns].to_numpy()},
        )).batch(BATCH_SIZE)
        valid_ds = tf.data.Dataset.from_tensor_slices((
            valid[feature_list].to_numpy(),
            {"action": valid['Class'].to_numpy(), "additional_output": valid[aux_columns].to_numpy()},
        )).batch(BATCH_SIZE)
        test_x_ds = tf.data.Dataset.from_tensor_slices(test[feature_list].to_numpy()).batch(BATCH_SIZE)

        # --- Fit ------------------------------------------------------------
        model = create_ae_mlp(**params)
        # ModelCheckpoint is referenced through tf.keras.callbacks because only
        # EarlyStopping is imported by name in this notebook.
        ckp = tf.keras.callbacks.ModelCheckpoint(".", monitor = 'val_action_loss', verbose = 0,
                                                 save_best_only = True, save_weights_only = True, mode = 'min')
        es = EarlyStopping(monitor = 'val_action_loss',
                           patience = 7, mode = 'min',
                           baseline = None, restore_best_weights = True, verbose = 0)
        # The datasets are already batched, so `batch_size` must not be passed to fit().
        history = model.fit(train_ds, validation_data = valid_ds,
                            epochs = 100, callbacks = [ckp, es], verbose = 0)

        # --- Predict and post-process ----------------------------------------
        output = model.predict(test_x_ds)
        y_pred_pre = output[2]  # third model output = 'action' head
        y_pred = pp_prob(np.append(1 - y_pred_pre, y_pred_pre, 1))

        # --- Score ------------------------------------------------------------
        score, log_loss_0, log_loss_1 = balanced_log_loss(test['Class'], np.ravel(y_pred), return_all=True)
        print(f"{idx}:", (score, log_loss_0, log_loss_1))

        fold_scores.append(score)
        temp_fold_log_losses.append([log_loss_0, log_loss_1])
        inner_y_preds.append({test.index[i] : y_pred[i] for i in range(len(test))})

    cv_strat_scores.append(fold_scores)
    outer_y_preds.append(inner_y_preds)
    fold_log_losses.append(temp_fold_log_losses)



    


  0%|          | 0/2 [00:00<?, ?it/s]

0: (0.20253755663644937, 0.25443758969608693, 0.15063752357681182)
1: (0.2909369095180721, 0.4221985642459008, 0.1596752547902434)
2: (0.2727189820480216, 0.4135403323812609, 0.13189763171478228)
3: (0.35972167416840495, 0.36587702574449427, 0.3535663225923157)
4: (0.38157698125817807, 0.42517861506171123, 0.3379753474546449)
5: (0.3932764762179827, 0.47243367260159413, 0.31411927983437116)
6: (0.39706086743420754, 0.46415406952204347, 0.3299676653463716)
7: (0.499148847857291, 0.48789253588641257, 0.5104051598281695)
8: (0.3033196955679868, 0.392536044460256, 0.2141033466757177)


 50%|█████     | 1/2 [03:08<03:08, 188.32s/it]

9: (0.28622975038645615, 0.1867120330034812, 0.38574746776943114)
0: (0.21918006250529137, 0.26256786906319846, 0.1757922559473843)
1: (0.23634310319090407, 0.3089418955618254, 0.16374431081998278)
2: (0.32994654581171323, 0.2551186842426635, 0.404774407380763)
3: (0.4481030987425062, 0.38493383154697675, 0.5112723659380356)
4: (0.38838818662372543, 0.5628314961277487, 0.21394487711970206)
5: (0.44996172155694864, 0.5013288769410739, 0.3985945661728234)
6: (0.4673892635835877, 0.5624067345671457, 0.3723717926000296)
7: (0.5054713455124515, 0.45357643532504377, 0.5573662556998592)
8: (0.30350464436539676, 0.4075962215880679, 0.19941306714272558)


100%|██████████| 2/2 [05:34<00:00, 167.44s/it]

9: (0.3655296495244337, 0.22085123590233297, 0.5102080631465344)





In [83]:
from collections import ChainMap

import plotly.express as px

# Collapse the per-fold {row index: prediction} dicts of the two evaluated
# fold schemes into one dict each (a third scheme was not evaluated).
cv_0_preds, cv_1_preds = (dict(ChainMap(*fold_preds)) for fold_preds in outer_y_preds[:2])

# One column per scheme, rows ordered by the original row index.
prediction_frames = [
    pd.DataFrame(preds, index=[label]).T.sort_index()
    for label, preds in (('prediction_0', cv_0_preds), ('prediction_1', cv_1_preds))
]
base = prediction_frames[0].join(prediction_frames[1])

# Unweighted mean of the two schemes' predictions.
base['weighted_prediction'] = base[['prediction_0','prediction_1']].mean(axis=1)

# Ground truth, aligned on the row index.
base['GT'] = train_df['Class']

px.scatter(base, "weighted_prediction", color="GT", color_continuous_scale="picnic")

In [84]:
# Per-fold balanced log loss for the two evaluated fold schemes (rows) ...
frame_scores = pd.DataFrame(cv_strat_scores,columns=[f"fold_{i}" for i in range(CFG.n_folds)],index=['Class','Alpha'])
# ... and the mean / standard deviation of the per-class log losses across folds.
# NOTE: np.std uses ddof=0 here, whereas the 'std' aggregation below is pandas'
# sample standard deviation (ddof=1).
log_loss_mean_details = pd.DataFrame(np.mean(fold_log_losses,axis=1),columns=["mean_log_loss0","mean_log_loss1" ],index=['Class','Alpha'])
log_loss_std_details = pd.DataFrame(np.std(fold_log_losses,axis=1),columns=["std_log_loss0","std_log_loss1" ],index=['Class','Alpha'])
display(frame_scores)
# Aggregations are requested by name: passing np.mean / np.std callables to
# .agg is deprecated, while the string names keep the current results.
display(frame_scores.agg(['mean','std'],axis=1).join(log_loss_mean_details).join(log_loss_std_details)[['mean','std','mean_log_loss0','std_log_loss0','mean_log_loss1','std_log_loss1']])
# Overall mean score across folds and schemes.
frame_scores.mean().mean()



Unnamed: 0,fold_0,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9
Class,0.202538,0.290937,0.272719,0.359722,0.381577,0.393276,0.397061,0.499149,0.30332,0.28623
Alpha,0.21918,0.236343,0.329947,0.448103,0.388388,0.449962,0.467389,0.505471,0.303505,0.36553


Unnamed: 0,mean,std,mean_log_loss0,std_log_loss0,mean_log_loss1,std_log_loss1
Class,0.338653,0.084056,0.388496,0.092182,0.288809,0.115506
Alpha,0.371382,0.098492,0.392015,0.120805,0.350748,0.143791


0.35501726812550044