In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
import os



In [2]:
# Directory holding the competition CSV files; file names are appended to it directly,
# so the trailing slash is required.
imput_file_path = '/kaggle/input/icr-identify-age-related-conditions/'
# Pairs of numeric columns whose correlation exceeds this value lose their second column.
correlation_threshold = 0.7

# Percentile-based outlier capping: toggle and bounds (this toggle is checked before use_std).
use_perc = False
lower_percentile = 5
upper_percentile = 95

# Standard-deviation-based outlier capping: toggle and half-width, in standard deviations.
use_std = True
num_std = 3

# Scaler toggles; in the visible source they are referenced only by commented-out code.
min_max_scaler = True
standard_scaler = False

In [3]:
# Load the four competition tables from the configured input directory,
# in the same order as the names on the left-hand side.
sample_submission, greeks, train, test = (
    pd.read_csv(imput_file_path + file_name)
    for file_name in ('sample_submission.csv', 'greeks.csv', 'train.csv', 'test.csv')
)

In [4]:
# Target class balance: number of training rows per label
class_counts = train['Class'].value_counts()
class_counts

0    509
1    108
Name: Class, dtype: int64

In [5]:
# Encode categorical column
# `replace` on an object column only yields integers through implicit downcasting,
# which newer pandas versions deprecate. pd.to_numeric makes the numeric dtype
# explicit so the later `dtype != 'object'` filters keep treating EJ as a feature.
# Re-running this cell is safe: already-encoded values pass through unchanged.
ej_codes = {'A': 0, 'B': 1}
train['EJ'] = pd.to_numeric(train['EJ'].replace(ej_codes))
test['EJ'] = pd.to_numeric(test['EJ'].replace(ej_codes))

In [6]:
# Remove correlated features
def get_correlation(df):
    """Return every unordered column pair of `df` with its correlation.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are compared pairwise via `Series.corr`.

    Returns
    -------
    pd.DataFrame
        Columns 'Column 1', 'Column 2', 'Correlation'; one row per pair
        (first column always precedes the second in `df.columns`), sorted by
        correlation in descending order with a fresh 0..n-1 index.
    """
    names = df.columns

    # Walk the upper triangle of the pair matrix: each column against the ones after it.
    pair_rows = [
        (left, right, df[left].corr(df[right]))
        for position, left in enumerate(names)
        for right in names[position + 1:]
    ]

    pair_table = pd.DataFrame(pair_rows, columns=['Column 1', 'Column 2', 'Correlation'])
    ranked = pair_table.sort_values(by='Correlation', ascending=False)

    return ranked.reset_index(drop=True)


# Candidate columns for pruning: numeric features only. The target 'Class' is
# excluded so that a feature strongly correlated with it can never cause the
# target itself (which would appear as 'Column 2') to be dropped.
numeric_features = [col for col in train.columns if train[col].dtype != 'object' and col != 'Class']
correlation_result = get_correlation(train[numeric_features])
print(train.shape)

# Repeatedly drop the second column of the most correlated pair until no pair
# exceeds the threshold. The length guard covers frames with fewer than two
# numeric features, where the correlation table is empty and row 0 does not exist.
while len(correlation_result) > 0 and correlation_result['Correlation'][0] > correlation_threshold:
    redundant_column = correlation_result['Column 2'][0]
    train = train.drop(columns=[redundant_column])
    numeric_features.remove(redundant_column)
    correlation_result = get_correlation(train[numeric_features])

print(train.shape)

(617, 58)
(617, 50)


In [7]:
# Outlier removal - based on percentile
def cap_data_by_percentile(train_data, lower_percentile=5, upper_percentile=95):
    """Clip each column of `train_data` to its own percentile range.

    Parameters
    ----------
    train_data : array-like of shape (n_rows, n_columns)
        Numeric data; thresholds are computed per column (axis 0).
    lower_percentile, upper_percentile : float
        Percentiles (0-100) used as the lower and upper clipping bounds.

    Returns
    -------
    Same container type that `np.clip` returns for `train_data`, with values
    outside the per-column bounds replaced by the bounds. Missing values stay missing.
    """
    # NaN-aware percentiles: np.percentile yields NaN for any column that contains
    # a missing value, which would leave that column without usable bounds.
    lower_thresholds = np.nanpercentile(train_data, lower_percentile, axis=0)
    upper_thresholds = np.nanpercentile(train_data, upper_percentile, axis=0)

    capped_data = np.clip(train_data, lower_thresholds, upper_thresholds)

    return capped_data

# Columns eligible for outlier capping: numeric features only. The target 'Class'
# is excluded so the labels are never altered by clipping.
cap_columns = [col for col in train.columns if train[col].dtype != 'object' and col != 'Class']

# Outlier removal - based on percentile
if use_perc:
    # Write the capped values back into the selected columns instead of replacing
    # `train`, so non-numeric columns such as 'Id' (dropped by name later) survive.
    train[cap_columns] = cap_data_by_percentile(train[cap_columns],
                                                lower_percentile=lower_percentile,
                                                upper_percentile=upper_percentile)

# Outlier removal - based on standard deviation
elif use_std:
    for col in cap_columns:
        mean = np.mean(train[col], axis=0)
        std = np.std(train[col], axis=0)
        thresholds = num_std * std

        # Clip to mean +/- num_std standard deviations
        train[col] = np.clip(train[col], mean - thresholds, mean + thresholds)

In [8]:
# # Feature Scaling

# # MinMaxScaler
# if min_max_scaler:
#     scaler = MinMaxScaler()
#     scaled_train = scaler.fit_transform(train)

# # StandardScaler
# if standard_scaler:
#     scaler = StandardScaler()
#     scaled_train = scaler.fit_transform(train)

In [9]:
# Impute missing values in numeric columns with the column mean.
# The filled column is assigned back to the frame: calling fillna(inplace=True) on
# `train[col]` is a chained in-place update that does not reach `train` when
# pandas Copy-on-Write is enabled. Non-numeric columns are skipped because a mean
# is not defined for them.
for col in train.select_dtypes(include='number').columns:
    if train[col].isna().any():
        train[col] = train[col].fillna(train[col].mean())

In [10]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import log_loss
import optuna

# Step 1: Prepare the data - feature matrix (every column except 'Id' and 'Class')
# as a NumPy array, and the target column as a Series
X = train.drop(columns=['Id', 'Class']).to_numpy()
y = train['Class']

# Step 2: Define the model
def objective(trial):
    """Optuna objective: mean log loss of a decision tree over 5 stratified folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies `max_depth` and `min_samples_split`, each sampled from 2..10.

    Returns
    -------
    float
        Average validation log loss across the folds (lower is better).

    Reads the module-level feature matrix `X` and target `y`.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
    }
    # Fixed seed so the same hyperparameters always produce the same score.
    model = DecisionTreeClassifier(random_state=42, **params)

    # The fold indices are positional; a plain array avoids label-based lookup on
    # a pandas Series, which would break if its index were not 0..n-1.
    targets = np.asarray(y)

    # Step 4: Perform K-fold Stratification
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    log_losses = []
    for train_idx, val_idx in kf.split(X, targets):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = targets[train_idx], targets[val_idx]

        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_val)
        # Pass the label set explicitly so the probability columns are matched to
        # classes even if a validation fold happens to contain a single class.
        log_loss_val = log_loss(y_val, y_pred_proba, labels=model.classes_)
        log_losses.append(log_loss_val)

    return np.mean(log_losses)

# Step 3: Hyperparameter Tuning with Optuna
# A seeded sampler makes the sequence of suggested trials repeatable across runs.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

# Step 5: Train the final model on all training rows with the best hyperparameters
# (seeded so a re-run fits the same tree).
best_params = study.best_params
best_model = DecisionTreeClassifier(random_state=42, **best_params)
best_model.fit(X, y)

print("Best Hyperparameters:", best_params)


[I 2023-08-01 15:12:57,625] A new study created in memory with name: no-name-1bff3ee4-61b0-46d6-a67f-fd0f5c59b6f4
[I 2023-08-01 15:12:57,693] Trial 0 finished with value: 0.47896147887723595 and parameters: {'max_depth': 3, 'min_samples_split': 6}. Best is trial 0 with value: 0.47896147887723595.
[I 2023-08-01 15:12:57,761] Trial 1 finished with value: 1.4655523383287612 and parameters: {'max_depth': 4, 'min_samples_split': 8}. Best is trial 0 with value: 0.47896147887723595.
[I 2023-08-01 15:12:57,847] Trial 2 finished with value: 2.780413712321952 and parameters: {'max_depth': 6, 'min_samples_split': 10}. Best is trial 0 with value: 0.47896147887723595.
[I 2023-08-01 15:12:57,896] Trial 3 finished with value: 0.3504586008670302 and parameters: {'max_depth': 2, 'min_samples_split': 2}. Best is trial 3 with value: 0.3504586008670302.
[I 2023-08-01 15:12:58,005] Trial 4 finished with value: 4.192057327843315 and parameters: {'max_depth': 10, 'min_samples_split': 6}. Best is trial 3 with

Best Hyperparameters: {'max_depth': 2, 'min_samples_split': 2}


In [11]:
# Fill gaps in the test set with the training column means, matching how the
# training set was imputed. Columns without a training mean (e.g. non-numeric
# ones or features dropped from train) fall back to 0.
test = test.fillna(train.mean(numeric_only=True)).fillna(0)

In [12]:
# Class probabilities for the test rows, restricted to the feature columns kept in train
best_model.predict_proba(test[train.columns.drop(['Id', 'Class'])])



array([[0.93110236, 0.06889764],
       [0.93110236, 0.06889764],
       [0.93110236, 0.06889764],
       [0.93110236, 0.06889764],
       [0.93110236, 0.06889764]])

In [13]:
# Select the test columns that match the features kept in train, then store the
# predicted probabilities in the two submission columns.
test_features = test[train.columns.drop(['Id', 'Class'])]
sample_submission[['class_0', 'class_1']] = best_model.predict_proba(test_features)



In [14]:
# Write the submission table to 'submission.csv' in the working directory, without the row index.
sample_submission.to_csv('submission.csv', index=False)