### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import math

from sklearn.metrics import log_loss, make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold

from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")



### Data Prep

In [2]:
COMP_PATH = "/kaggle/input/icr-identify-age-related-conditions"
train = pd.read_csv(f"{COMP_PATH}/train.csv")
test = pd.read_csv(f"{COMP_PATH}/test.csv")
sample_submission = pd.read_csv(f"{COMP_PATH}/sample_submission.csv")
greeks = pd.read_csv(f"{COMP_PATH}/greeks.csv")

In [3]:
train.head()

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [4]:
# Find columns with null or blank values
null_or_blank_cols = train.columns[train.isnull().any() | (train == '').any()]
# Print the columns
print("Columns with null or blank values:")
for col in null_or_blank_cols:
    print(col)

Columns with null or blank values:
BQ
CB
CC
DU
EL
FC
FL
FS
GL


In [5]:
#same for test data frame
null_or_blank_cols = test.columns[test.isnull().any() | (test == '').any()]
print("Columns with null or blank values:")
for col in null_or_blank_cols:
    print(col)

Columns with null or blank values:


In [6]:
# Calculate the median for each column
medians = train.median()

# Replace null or blank values with median
train_filled = train.fillna(medians).replace('', medians)


In [7]:
null_or_blank_cols = train_filled.columns[train_filled.isnull().any() | (train_filled == '').any()]
# Print the columns
print("Columns with null or blank values:")
for col in null_or_blank_cols:
    print(col)

Columns with null or blank values:


In [8]:
#check for class imbalance
train['Class'].value_counts() 

0    509
1    108
Name: Class, dtype: int64

In [9]:
cols = train.columns
feat_cols = cols[1:]
num_cols = train.select_dtypes(include=['float64']).columns
print("No of Columns:", len(cols))

No of Columns: 58


### Metric

In [10]:
def competition_log_loss(y_true, y_pred):
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0)) / N_0
    log_loss_1 = -np.sum(y_true * np.log(p_1)) / N_1
    return (log_loss_0 + log_loss_1)/2

def balanced_log_loss(y_true, y_pred):
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))
    log_loss_1 = -np.sum(y_true * np.log(p_1))
    w_0 = 1 / N_0
    w_1 = 1 / N_1
    balanced_log_loss = 2*(w_0 * log_loss_0 + w_1 * log_loss_1) / (w_0 + w_1)
    return balanced_log_loss/(N_0+N_1)

In [11]:
def lgb_metric(y_true, y_pred):
    return 'balanced_log_loss', balanced_log_loss(y_true, y_pred), False

### Feature Engineering

In [12]:
# Label encoding
train['EJ'] = train['EJ'].map({'A': 0, 'B': 1})
test['EJ']  = test['EJ'].map({'A': 0, 'B': 1})

In [13]:
scaler = StandardScaler()
df, test_df = train.copy(), test.copy()
new_num_cols = train.select_dtypes(include=['float64']).columns
df[new_num_cols] = scaler.fit_transform(train[new_num_cols])
test_df[new_num_cols] = scaler.transform(test[new_num_cols])

In [14]:
df.head()

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,-0.572153,-0.170975,-0.261669,-0.237889,-0.189295,-1.900558,-0.083417,-0.173502,-0.038354,...,0.162355,-0.035806,-0.250462,-0.940094,-0.41026,-0.655511,-0.948991,0.531241,-0.815091,1
1,007255e47698,-0.709105,-1.097801,-0.261669,-0.028701,-0.189295,-0.750457,-0.083417,0.678919,-0.104787,...,-0.457909,-0.060566,0.113034,-1.14507,-0.41026,0.687893,-0.238862,-0.509218,1.303181,0
2,013f2bd269f5,-0.015212,-0.377169,-0.261669,-0.094845,-0.189295,0.465662,-0.083417,0.519453,-0.104787,...,0.19817,-0.051023,0.595966,1.637944,-0.29921,-0.05185,-0.351743,-0.424754,-0.807668,0
3,043ac50845d5,-0.480851,0.138196,0.012347,0.547477,-0.189295,-0.72961,-0.083417,0.112088,-0.104787,...,0.059978,-0.060566,-0.105064,-0.219883,-0.342195,-0.650833,0.858232,1.101332,-0.811652,0
4,044fb8a146ec,-0.206946,0.100517,-0.261669,-0.356885,-0.189295,-0.628845,-0.013229,-1.649292,1.445139,...,0.236779,0.896815,-0.229691,-0.432313,0.09992,-0.318309,1.409422,-0.395228,-0.817391,1


### CV

In [15]:
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
df['fold'] = -1

for fold, (train_idx, test_idx) in enumerate(kf.split(df, greeks['Alpha'])):
    df.loc[test_idx, 'fold'] = fold

df.groupby('fold')["Class"].value_counts()

fold  Class
0     0        101
      1         23
1     0        102
      1         22
2     0        102
      1         21
3     0        102
      1         21
4     0        102
      1         21
Name: Class, dtype: int64

### Training

In [16]:
final_valid_predictions = {}
final_test_predictions = []
scores = []
log_losses = []
balanced_log_losses = []
weights = []

for fold in range(5):
    train_df = df[df['fold'] != fold]
    valid_df = df[df['fold'] == fold]
    valid_ids = valid_df.Id.values.tolist()

    X_train, y_train = train_df.drop(['Id', 'Class', 'fold'], axis=1), train_df['Class']
    X_valid, y_valid = valid_df.drop(['Id', 'Class', 'fold'], axis=1), valid_df['Class']
    
    lgb = LGBMClassifier(boosting_type='goss', learning_rate=0.06733232950390658, n_estimators = 50000, 
                         early_stopping_round = 300, random_state=42,
                        subsample=0.6970532011679706,
                        colsample_bytree=0.6055755840633003,
                         class_weight='balanced',
                         metric='none', is_unbalance=True, max_depth=8)
    
    lgb.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=1000,
            eval_metric=lgb_metric)
    
    y_pred = lgb.predict_proba(X_valid)
    preds_test  = lgb.predict_proba(test_df.drop(['Id'], axis=1).values)
    
    final_test_predictions.append(preds_test)
    final_valid_predictions.update(dict(zip(valid_ids, y_pred)))

    logloss = log_loss(y_valid, y_pred)
    balanced_logloss = balanced_log_loss(y_valid, y_pred[:, 1])
    log_losses.append(logloss)
    balanced_log_losses.append(balanced_logloss)
    weights.append(1/balanced_logloss)
    
    print(f"Fold: {fold}, log loss: {round(logloss, 3)}, balanced los loss: {round(balanced_logloss, 3)}")

print()
print("Log Loss")
print(log_losses)
print(np.mean(log_losses), np.std(log_losses))
print()
print("Balanced Log Loss")
print(balanced_log_losses)
print(np.mean(balanced_log_losses), np.std(balanced_log_losses))
print()
print("Weights")
print(weights)

Fold: 0, log loss: 0.245, balanced los loss: 0.173
Fold: 1, log loss: 0.197, balanced los loss: 0.142
[1000]	valid_0's balanced_log_loss: 0.0226595
Fold: 2, log loss: 0.03, balanced los loss: 0.022
Fold: 3, log loss: 0.232, balanced los loss: 0.113
Fold: 4, log loss: 0.222, balanced los loss: 0.14

Log Loss
[0.24456075168960273, 0.19748941802920092, 0.029779936681547672, 0.23195822376492026, 0.2223370900054482]
0.18522508403414395 0.07924091937407356

Balanced Log Loss
[0.1728458319379688, 0.14168483203609303, 0.021574539415932473, 0.11277963933845107, 0.14002666540566525]
0.11778230162682213 0.051728319496803246

Weights
[5.785502541703648, 7.057918519783814, 46.350931564338055, 8.866848713702707, 7.141496922053688]


In [17]:
test_preds = np.zeros((test_df.shape[0],2))
for i in range(5):
    test_preds[:, 0] += weights[i] * final_test_predictions[i][:, 0]
    test_preds[:, 1] += weights[i] * final_test_predictions[i][:, 1]
test_preds /= sum(weights)
test_preds

array([[0.78349616, 0.21650384],
       [0.78349616, 0.21650384],
       [0.78349616, 0.21650384],
       [0.78349616, 0.21650384],
       [0.78349616, 0.21650384]])

In [18]:
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ['Id', 'class_0', 'class_1']
final_valid_predictions.to_csv(r"oof.csv", index=False)

test_dict = {}
test_dict.update(dict(zip(test.Id.values.tolist(), test_preds)))
submission = pd.DataFrame.from_dict(test_dict, orient="index").reset_index()
submission.columns = ['Id', 'class_0', 'class_1']                       

submission.to_csv(r"submission.csv", index=False)
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.783496,0.216504
1,010ebe33f668,0.783496,0.216504
2,02fa521e1838,0.783496,0.216504
3,040e15f562a2,0.783496,0.216504
4,046e85c7cc7f,0.783496,0.216504
