In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from catboost import CatBoostClassifier
import lightgbm as lgb
import xgboost as xgb

def balanced_log_loss(y_true, y_pred, sample_weight=None):
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    w_0 = 1 / N_0
    w_1 = 1 / N_1
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))
    log_loss_1 = -np.sum(y_true * np.log(p_1))
    balanced_log_loss = 2 * (w_0 * log_loss_0 + w_1 * log_loss_1) / (w_0 + w_1)
    return balanced_log_loss / (N_0 + N_1)

# Read the data
train_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')

# Preprocess the data
train_df["EJ"] = train_df["EJ"].replace({"A": 0, "B": 1})
train_df.fillna(train_df.mean(numeric_only=True), inplace=True)

test_df["EJ"] = test_df["EJ"].replace({"A": 0, "B": 1})
test_df.fillna(test_df.mean(numeric_only=True), inplace=True)

# Apply KNN imputation to handle missing values in training data
knn_imputer = KNNImputer(n_neighbors=20)
train_df_imputed = knn_imputer.fit_transform(train_df.drop(["Class", "Id"], axis=1))
train_df_imputed = pd.DataFrame(train_df_imputed, columns=train_df.drop(["Class", "Id"], axis=1).columns)

# Apply KNN imputation to handle missing values in test data
test_df_imputed = knn_imputer.transform(test_df.drop("Id", axis=1))
test_df_imputed = pd.DataFrame(test_df_imputed, columns=test_df.drop("Id", axis=1).columns)

# Feature selection: Select k-best features
k_best_features = 30  # Choose the number of best features you want to select
selector = SelectKBest(f_classif, k=k_best_features)
X_train_kbest = selector.fit_transform(train_df_imputed, train_df["Class"])
X_test_kbest = selector.transform(test_df_imputed)

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_kbest, train_df["Class"],
                                                  test_size=0.1, random_state=42)

# Apply SMOTE to the training set to oversample the minority class
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_kbest, train_df["Class"])

# Apply Random Under-sampling to balance the class distribution
under_sampler = RandomUnderSampler(random_state=42)
X_train, y_train = under_sampler.fit_resample(X_train, y_train)

# Data preprocessing: Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test_kbest_scaled = scaler.transform(X_test_kbest)

# CatBoost Model with extended early stopping and L1/L2 regularization
num_positive_samples = len(y_train[y_train == 1])
num_negative_samples = len(y_train[y_train == 0])
scale_pos_weight = num_negative_samples / num_positive_samples
catboost_model = CatBoostClassifier(iterations=1000, learning_rate=0.0005, depth=10, reg_lambda=1,
                                    random_strength=1, loss_function='Logloss',
                                    scale_pos_weight=scale_pos_weight, verbose=False)
catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=20, verbose_eval=50)

# LightGBM Model with L1 and L2 regularization
lgb_train = lgb.Dataset(X_train, y_train)
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.1,  # L1 regularization term
    'lambda_l2': 0.1,  # L2 regularization term
    'verbose': -1
}
lightgbm_model = lgb.train(lgb_params, lgb_train, num_boost_round=2000)

# XGBoost Model with L1 and L2 regularization
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 8,
    'learning_rate': 0.01,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 0.1,  # L1 regularization term 
    'alpha': 0.1,   # L2 regularization term 
    'seed': 42
}
xgb_train = xgb.DMatrix(X_train, label = y_train)
xgb_model = xgb.train(xgb_params, xgb_train, num_boost_round = 2000)

# Predictions on the validation set
val_preds_catboost = catboost_model.predict_proba(X_val)[ :, 1]
val_preds_lightgbm = lightgbm_model.predict(X_val)
val_preds_xgboost = xgb_model.predict(xgb.DMatrix(X_val))

# Weights for weighted average ensemble
weight_catboost = 0.1
weight_lightgbm = 0.8
weight_xgboost = 0.1

# Weighted ensemble predictions on the validation set
val_preds_ensemble_weighted = (weight_catboost * val_preds_catboost +
                              weight_lightgbm * val_preds_lightgbm +
                              weight_xgboost * val_preds_xgboost)

# Evaluate the ensemble model with balanced_log_loss on the validation set
ensemble_loss_weighted = balanced_log_loss(y_val, val_preds_ensemble_weighted)
print(f"Validation Balanced Log Loss (Ensemble): {ensemble_loss_weighted}")

# Predictions on the test data for each model
test_preds_xgboost = xgb_model.predict(xgb.DMatrix(X_test_kbest_scaled))
test_preds_catboost = catboost_model.predict_proba(X_test_kbest_scaled)[ :, 1]
test_preds_lightgbm = lightgbm_model.predict(X_test_kbest_scaled)

# Weighted ensemble predictions on the test data
test_preds_ensemble_weighted = (weight_catboost * test_preds_catboost +
                                weight_lightgbm * test_preds_lightgbm +
                                weight_xgboost * test_preds_xgboost)

# Prepare the submission using the ensemble model predictions
sample_submission_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
sample_submission_df['Id'] = test_df.reset_index()['Id']
sample_submission_df["class_0"] = 1 - test_preds_ensemble_weighted
sample_submission_df["class_1"] = test_preds_ensemble_weighted
sample_submission_df.set_index('Id').to_csv('submission.csv')




0:	learn: 0.6923979	test: 0.6924427	best: 0.6924427 (0)	total: 108ms	remaining: 1m 48s
50:	learn: 0.6582626	test: 0.6587053	best: 0.6587053 (50)	total: 2.19s	remaining: 40.8s
100:	learn: 0.6257883	test: 0.6265728	best: 0.6265728 (100)	total: 4.32s	remaining: 38.5s
150:	learn: 0.5956427	test: 0.5969268	best: 0.5969268 (150)	total: 6.4s	remaining: 36s
200:	learn: 0.5670357	test: 0.5687569	best: 0.5687569 (200)	total: 8.48s	remaining: 33.7s
250:	learn: 0.5400053	test: 0.5421171	best: 0.5421171 (250)	total: 10.6s	remaining: 31.5s
300:	learn: 0.5141058	test: 0.5163251	best: 0.5163251 (300)	total: 12.6s	remaining: 29.4s
350:	learn: 0.4902612	test: 0.4928024	best: 0.4928024 (350)	total: 14.9s	remaining: 27.5s
400:	learn: 0.4678021	test: 0.4704835	best: 0.4704835 (400)	total: 16.9s	remaining: 25.3s
450:	learn: 0.4463697	test: 0.4492138	best: 0.4492138 (450)	total: 19s	remaining: 23.2s
500:	learn: 0.4260241	test: 0.4289115	best: 0.4289115 (500)	total: 21.1s	remaining: 21s
550:	learn: 0.4070668	