In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

from sklearn.metrics import f1_score, accuracy_score, \
roc_auc_score, classification_report, precision_score, recall_score, \
log_loss, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
import shap

from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.inspection import permutation_importance

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices coming from pandas, seaborn and the boosting libraries.
warnings.filterwarnings("ignore")

# Seed passed as `random_state` to the train/test split, the CV splitters,
# CatBoost and the permutation-importance helper.
RAND = 42
# Number of folds for the StratifiedKFold splitters used by the grid searches.
N_FOLDS = 3

# Load data

In [None]:
# Both competition files live in the same read-only input directory.
DATA_DIR = "/kaggle/input/icr-identify-age-related-conditions"

df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")

# Preview the first rows of the training frame.
df_train.head()

# EDA

In [None]:
# Drop the `Id` column before exploring / modelling.
# `columns=` already selects the axis, so the extra `axis=1` is not needed, and
# `errors='ignore'` lets the cell be re-run after `Id` is already gone.
df_train = df_train.drop(columns=['Id'], errors='ignore')

In [None]:
# Overview of the training frame: column dtypes and non-null counts.
df_train.info()

In [None]:
# Number of missing values in every column.
df_train.isna().sum()

In [None]:
# Summary statistics of all numeric columns.
# `Id` has already been dropped above, so slicing off the first column with
# `iloc[:, 1:]` would silently hide the first real feature from the summary.
df_train.describe()

In [None]:
# Summary of the object-dtype (non-numeric) columns only.
df_train.describe(include=["object"])

In [None]:
# Encode the "EJ" column as integer labels with a LabelEncoder.
# `le` keeps the fitted mapping for later use.
le = LabelEncoder()
df_train["EJ"] = le.fit_transform(df_train["EJ"])

# How many rows fall into each encoded "EJ" value.
df_train.EJ.value_counts()

In [None]:
# Fill the gaps in every numeric column with that column's mean.
# The result is assigned back to the frame: calling `fillna(..., inplace=True)`
# on a column pulled out of the frame is not guaranteed to write through to the
# frame (it has no effect under pandas Copy-on-Write) and returns None.
df_train = df_train.fillna(df_train.mean(numeric_only=True))

df_train.info()

In [None]:
# Share of each class as a percentage of all rows.
# The index and value columns are named explicitly so the frame has the same
# columns ('Class', 'percent') on every pandas version; the auto-generated
# 'index' column name is not produced by pandas >= 2.0.
norm_target = (df_train
               .Class
               .value_counts(normalize=True)
               .mul(100)
               .rename_axis('Class')
               .reset_index(name='percent'))

# checking the balance of classes
plt.figure(figsize=(15, 7))
sns.barplot(x='Class', y='percent', data=norm_target, palette="crest")

plt.title('Class', fontsize=20)
plt.xlabel('class', fontsize=14)
plt.ylabel('Percentages', fontsize=14)

plt.xticks(fontsize=14)
plt.yticks(fontsize=14);

In [None]:
# checking the distribution of features and their outliers
cols = df_train.columns

# Size the grid from the number of columns instead of hard-coding 19 x 3, and
# create all panels up front so no extra full-figure Axes is left underneath.
n_grid_cols = 3
n_grid_rows = -(-len(cols) // n_grid_cols)  # ceiling division

fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 60))
axes = np.atleast_1d(axes).ravel()

for ax, col_name in zip(axes, cols):
    sns.histplot(df_train, x=col_name, hue="Class", bins=40, kde=True,
                 palette="mako", ax=ax)
    ax.set_title(col_name)
    ax.grid(True)

# Hide the panels of the last row that have no column to show.
for ax in axes[len(cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# checking the correlation between features and target

# Compute the Spearman matrix once and use it for both the mask and the plot
# (previously the mask came from a separate Pearson matrix).
corr = df_train.corr(method='spearman')

# Boolean mask that hides the upper triangle including the diagonal. It is
# built from the matrix shape, so it does not depend on the correlation values.
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(30, 15))
sns.heatmap(corr,
            mask=mask,
            annot=True,
            fmt='.3f',
            cmap='coolwarm',
            linewidths=0.00,
            cbar=True)


plt.suptitle('Features with Highest Correlations', weight='bold')
plt.tight_layout()

# Training Models

In [None]:
def balanced_log_loss(y_true, y_pred):
    """Balanced logarithmic loss for binary labels (class weights w_0 = w_1 = 1).

    The log loss is averaged separately over the class-0 rows and the class-1
    rows, and the two averages are then averaged with equal weight.

    Parameters
    ----------
    y_true : array-like of 0/1
        Correct labels. Lists, pandas Series and NumPy arrays are accepted.
    y_pred : array-like of float
        Predicted probability of class 1, one value per label.

    Returns
    -------
    float
        The balanced log loss. If one of the two classes does not occur in
        `y_true`, its average is 0/0 and the result is NaN.
    """
    # Work on flat float arrays so that plain lists are accepted, pandas
    # indexes cannot trigger label alignment, and an (n, 1) column of
    # probabilities cannot broadcast against the (n,) labels.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    # Number of observations in each class
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)

    # Clip so that log() never receives exactly 0 or 1
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1

    # Average log loss within each class
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0)) / N_0
    log_loss_1 = -np.sum(y_true * np.log(p_1)) / N_1

    # Unweighted mean of the two per-class averages
    return (log_loss_0 + log_loss_1) / 2

In [None]:
def get_metrics(y_test, y_pred, y_score, name):
    """Collect the evaluation metrics of one model in a single-row DataFrame.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted class labels.
    y_score : 2-D array of predicted probabilities; column 1 is used as the
        score of the positive class.
    name : label stored in the 'model' column.

    Returns
    -------
    pandas.DataFrame with one row and the columns
    'model', 'ROC_AUC', 'f1', 'Precision', 'Recall', 'bal_log_loss'.
    """
    positive_proba = y_score[:, 1]

    row = {
        'model': name,
        'ROC_AUC': roc_auc_score(y_test, positive_proba),
        'f1': f1_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'bal_log_loss': balanced_log_loss(y_test, positive_proba),
    }

    return pd.DataFrame([row])

In [None]:
def check_overfitting(model, X_train, y_train, X_test, y_test):
    """Print ROC-AUC and F1 on the train and test data for a fitted model.

    A large gap between the train and test figures indicates overfitting.
    `model` must provide `predict` and `predict_proba`; column 1 of the
    probabilities is used as the positive-class score. Nothing is returned.
    """
    labels_train = model.predict(X_train)
    proba_train = model.predict_proba(X_train)

    labels_test = model.predict(X_test)
    proba_test = model.predict_proba(X_test)

    auc_train = roc_auc_score(y_train, proba_train[:, 1])
    auc_test = roc_auc_score(y_test, proba_test[:, 1])
    print(f'ROC_AUC train: {auc_train:.3f}')
    print(f'ROC_AUC test: {auc_test:.3f}')

    f1_train = f1_score(y_train, labels_train)
    f1_test = f1_score(y_test, labels_test)
    print(f'F1 train: {f1_train:.3f}')
    print(f'F1 test: {f1_test:.3f}')

In [None]:
# splitting into training and test data

X = df_train.drop(['Class'], axis=1)
y = df_train['Class'].values

# `stratify=y` keeps the class proportions equal in both parts, in line with
# the StratifiedKFold splitters used for the grid searches further down.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=RAND)

## LGBM GridSearchCV

In [None]:
# Hyper-parameter grid for LightGBM: learning rate and tree depth are tuned,
# the remaining entries are fixed single values.
parameters_grid = {
    'num_iterations': [100],
    'learning_rate': [0.001, 0.05, 0.1],
    'max_depth': [6, 10, 15],
    'num_leaves': [10],
    'is_unbalance': [True]
}

# Logging is silenced on the estimator itself: `fit()` no longer accepts a
# `verbose` argument in LightGBM >= 4.0. The seed makes the run repeatable.
lgm = LGBMClassifier(random_state=RAND, verbose=-1)
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

lgb_grid = GridSearchCV(lgm,
                        parameters_grid,
                        scoring='roc_auc',
                        cv=cv,
                        verbose=1)

lgb_grid.fit(X_train, y_train)

In [None]:
# Class predictions and class probabilities of the tuned LGBM on the test split.
y_pred = lgb_grid.predict(X_test)
y_score = lgb_grid.predict_proba(X_test)

# Start the model-comparison table with the LGBM row.
metrics = get_metrics(y_test, y_pred, y_score, 'LGBM_grid')
metrics

In [None]:
# Train vs. test ROC-AUC and F1 for the tuned LGBM.
check_overfitting(lgb_grid, X_train, y_train, X_test, y_test)

In [None]:
# Per-class report for the predictions currently stored in `y_pred`.
print(classification_report(y_test, y_pred))

In [None]:
# Flatten the confusion matrix into its four cells and print them one per line.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'TN = {tn}\nFP = {fp}\nFN = {fn}\nTP = {tp}')

## CatBoost

In [None]:
# Number of class-0 rows per class-1 row in the training split; used below as
# `scale_pos_weight` for the boosting models.
n_negative = np.sum(y_train == 0)
n_positive = np.sum(y_train == 1)
ratio = float(n_negative) / n_positive
ratio

In [None]:
# CatBoost without tuning; the positive class is re-weighted by `ratio`
# (class-0 count divided by class-1 count in the training split).
cb = CatBoostClassifier(random_state=RAND,
                       scale_pos_weight=ratio)

# Fit on the training split with the training log switched off.
cb.fit(X_train,
       y_train,
       verbose=False)

In [None]:
# Class predictions and class probabilities of the untuned CatBoost model.
y_pred = cb.predict(X_test)
y_score = cb.predict_proba(X_test)

# Add the CatBoost row to the comparison table. `DataFrame.append` was removed
# in pandas 2.0, so the row is attached with `pd.concat`.
metrics = pd.concat([metrics, get_metrics(y_test, y_pred, y_score, 'CatBoost')],
                    ignore_index=True)
metrics

In [None]:
# Train vs. test ROC-AUC and F1 for the untuned CatBoost model.
check_overfitting(cb, X_train, y_train, X_test, y_test)

## CatBoost GridSearchCV

In [None]:
# Hyper-parameter grid for CatBoost: only the learning rate is tuned, the
# other entries are fixed single values.
parameters_grid = {
    "iterations": [500],
    "learning_rate": [0.05, 0.03],
    #"depth": [8],
     "bootstrap_type" : ['No'],
    "scale_pos_weight": [ratio]
}

# Seeded like the untuned CatBoost model so the two runs are comparable.
# A separate name is used so the fitted `cb` model is not overwritten by an
# unfitted estimator.
cb_base = CatBoostClassifier(random_state=RAND)
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

cb_grid = GridSearchCV(cb_base,
                       parameters_grid,
                       scoring='roc_auc',
                       cv=cv,
                       verbose=0)

# `verbose=0` is forwarded to CatBoost's own fit() to switch off its log.
cb_grid.fit(X_train, y_train, verbose=0)

In [None]:
# Class predictions and class probabilities of the tuned CatBoost model.
y_pred = cb_grid.predict(X_test)
y_score = cb_grid.predict_proba(X_test)

# Add the tuned-CatBoost row to the comparison table. `DataFrame.append` was
# removed in pandas 2.0, so the row is attached with `pd.concat`.
metrics = pd.concat([metrics, get_metrics(y_test, y_pred, y_score, 'Catboost_grid')],
                    ignore_index=True)
metrics

In [None]:
# Train vs. test ROC-AUC and F1 for the tuned CatBoost model.
check_overfitting(cb_grid, X_train, y_train, X_test, y_test)

In [None]:
# Per-class report for the predictions currently stored in `y_pred`.
print(classification_report(y_test, y_pred))

In [None]:
# Flatten the confusion matrix into its four cells and print them one per line.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'TN = {tn}\nFP = {fp}\nFN = {fn}\nTP = {tp}')

## XGBoost GridSearchCV

In [None]:
# Hyper-parameter grid for XGBoost: depth, learning rate and row subsampling
# are tuned, the remaining entries are fixed single values.
xgb_params = {
    'n_estimators': [100],
    'max_depth': [6, 8],
    'scale_pos_weight': [ratio],
    'learning_rate': [0.1, 0.05, 0.01],
    'subsample': [0.5, 0.7, 0.9],
    'tree_method': ['exact']
}

# Seeded because the grid uses row subsampling (`subsample` < 1).
xgb_clf = XGBClassifier(random_state=RAND)
# Same fold count constant as the other grid searches (was a literal 3).
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

grid_xgb = GridSearchCV(xgb_clf,
                        param_grid=xgb_params,
                        scoring='roc_auc',
                        cv=cv,
                        n_jobs=-1,
                        verbose=0)

grid_xgb.fit(X_train, y_train)

In [None]:
# Class predictions and class probabilities of the tuned XGBoost model.
y_pred = grid_xgb.predict(X_test)
y_score = grid_xgb.predict_proba(X_test)

# Add the XGBoost row to the comparison table. `DataFrame.append` was removed
# in pandas 2.0, so the row is attached with `pd.concat`.
metrics = pd.concat(
    [metrics,
     get_metrics(y_test, y_pred, y_score,
                 name='XGBoost_GridSearchCV')],
    ignore_index=True)

metrics

In [None]:
# Train vs. test ROC-AUC and F1 for the tuned XGBoost model.
check_overfitting(grid_xgb, X_train, y_train, X_test, y_test)

In [None]:
# Per-class report for the predictions currently stored in `y_pred`.
print(classification_report(y_test, y_pred))

In [None]:
# Flatten the confusion matrix into its four cells and print them one per line.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'TN = {tn}\nFP = {fp}\nFN = {fn}\nTP = {tp}')

# Voting Soft

In [None]:
# Soft-voting ensemble of the tuned CatBoost and LGBM models.
# The tuned models themselves (`best_estimator_`) are passed instead of the
# GridSearchCV wrappers: VotingClassifier refits every estimator it receives,
# so passing the wrappers would run both hyper-parameter searches a second time.
vt_clf = VotingClassifier(estimators=[('ct', cb_grid.best_estimator_),
                                      #('xgb', grid_xgb.best_estimator_),
                                      ('lg', lgb_grid.best_estimator_)],
                          voting='soft')

vt_clf.fit(X_train, y_train)

In [None]:
# Class predictions and class probabilities of the soft-voting ensemble.
y_pred = vt_clf.predict(X_test)
y_score = vt_clf.predict_proba(X_test)

# Add the ensemble row to the comparison table. `DataFrame.append` was removed
# in pandas 2.0, so the row is attached with `pd.concat`.
metrics = pd.concat(
    [metrics, get_metrics(y_test, y_pred, y_score, name='Voting_soft')],
    ignore_index=True)
metrics

In [None]:
ct = cb_grid
lg = lgb_grid
plt.figure(figsize=(10, 6))

# (legend name, fitted model, line colour) for every ROC curve to draw.
roc_models = [
    ('CatBoostClassifier', ct, 'red'),
    ('LGBMClassifier', lg, 'green'),
    ('VotingClassifier', vt_clf, 'darkorange'),
]

for roc_name, roc_model, roc_color in roc_models:
    # Predict once per model and reuse the positive-class column (a 1-D
    # `[:, 1]` slice for every model) for both the curve and the AUC value.
    positive_proba = roc_model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, positive_proba)
    plt.plot(fpr,
             tpr,
             color=roc_color,
             label='%s ROC-AUC = %0.3f' %
             (roc_name, roc_auc_score(y_test, positive_proba)))

# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-AUC')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Confusion matrix of the predictions currently stored in `y_pred`.
# fmt='d' prints the counts as plain integers; seaborn's default annotation
# format switches to scientific notation for counts of 100 or more.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d');
plt.xlabel('Predicted')
plt.ylabel('Actual');

In [None]:
def features_importance(model, n_repeats=20, top_n=15):
    """Plot the permutation importance of the most important features.

    The importance is computed on the notebook-level `X_test` / `y_test` data
    with `RAND` as the random seed.

    Parameters
    ----------
    model : fitted estimator to inspect.
    n_repeats : int, default 20
        Number of times each feature is permuted.
    top_n : int, default 15
        Number of features (highest mean importance first) shown in the plot.

    Returns
    -------
    None. The bar chart is displayed with `plt.show()`.
    """
    r = permutation_importance(model,
                               X_test,
                               y_test,
                               n_repeats=n_repeats,
                               random_state=RAND)

    # One row per feature, sorted by mean importance (largest first).
    feature_imp = pd.DataFrame({
        'column': X_test.columns,
        'value': r['importances_mean'],
        '2_std': 2 * r['importances_std'],
    }).sort_values(by='value', ascending=False)

    plt.figure(figsize=(10, 7))
    sns.barplot(data=feature_imp.head(top_n), x='value', y='column', palette='crest')

    plt.show()

In [None]:
# let's check the importance of features on LGBM
# Permutation importance of the features for the tuned LGBM model.
features_importance(lgb_grid)

In [None]:
# let's check the importance of features on CatBoost
# Permutation importance of the features for the tuned CatBoost model.
features_importance(cb_grid)

# Submission

In [None]:
# Overview of the test frame: column dtypes and non-null counts.
df_test.info()

In [None]:
# Keep the identifiers for the submission file, then remove them from the
# feature frame that is passed to the model.
test_ids = df_test['Id']
df_test = df_test.drop(['Id'], axis=1)

In [None]:
# Encode "EJ" with the encoder that was fitted on the training data (`le`).
# Fitting a fresh LabelEncoder on the test frame derives the codes from the
# test values alone, so the same category could receive a different integer
# than the one the models were trained on.
df_test["EJ"] = le.transform(df_test["EJ"])

In [None]:
# Class probabilities of the soft-voting ensemble for the test frame.
# NOTE(review): the training frame was mean-imputed earlier in the notebook,
# while df_test is passed through without imputation — confirm this is intended.
y_score_test = vt_clf.predict_proba(df_test)

In [None]:
# Assemble the submission table: one row per test Id with the two predicted
# class probabilities taken from columns 0 and 1 of `y_score_test`.
submission_df = pd.DataFrame({
    'Id': test_ids,
    'class_0': y_score_test[:, 0],  # Probability of Class 0 (No age-related condition)
    'class_1': y_score_test[:, 1],  # Probability of Class 1 (Age-related condition)
})

In [None]:
# Write the submission file to the working directory (without the index
# column) and display the table.
submission_df.to_csv('submission.csv', index = False)
submission_df