# Explore Mental Health Data
**Goal:** Use data from a mental health survey to explore factors that may cause individuals to experience depression.

Evaluation: Submissions are evaluated using **Accuracy Score**.

# Preparation

## Load libraries

In [None]:
!pip install -U lightautoml[all]

In [None]:
import pandas as pd              # For data manipulation and analysis
import numpy as np               # For numerical computing
from datetime import datetime
import scipy.stats as stats      # For statistical analysis
import math
import matplotlib                # For plotting and visualization
import matplotlib.pyplot as plt  
from pandas.plotting import parallel_coordinates
import seaborn as sns            # For statistical data visualization
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
import torch
import os

In [None]:
# For machine learning
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool

from lightgbm import early_stopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, roc_auc_score,
                             f1_score, confusion_matrix, classification_report)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
import optuna

## Load datasets

In [None]:
# Read the competition's train and test CSV files into separate frames.
df_train = pd.read_csv('/kaggle/input/playground-series-s4e11/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e11/test.csv')

# Exploratory Data Analysis

In [None]:
# Print a banner followed by the column/dtype/non-null summary for each frame.
# `DataFrame.info()` writes its report itself and returns None, which is printed too.
for label, frame in (('Train', df_train), ('Test', df_test)):
    print(f'#####{label} dataset structure#####')
    print(frame.info())

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Split the test-set columns into numeric and non-numeric (categorical) groups.
# `select_dtypes(include='number')` matches every integer/float width, whereas comparing
# a dtype against the strings 'int'/'float' only matches whatever width those aliases
# resolve to on the current platform.
num_cols = df_test.select_dtypes(include='number').columns.tolist()
cat_cols = [col for col in df_test.columns if col not in num_cols]
print(num_cols, cat_cols)

## Categorical Data

In [None]:
# One histogram per categorical feature on a 5x2 grid, coloured by the target.
fig, axes = plt.subplots(5, 2, figsize=(20, 30))

for i, col in enumerate(cat_cols):
    grid_row, grid_col = divmod(i, 2)  # fill the grid row by row
    plt.subplots_adjust(top=0.85)
    ax = sns.histplot(
        data=df_train,
        x=col,
        hue='Depression',
        ax=axes[grid_row, grid_col],
    )

# Tighten the layout, then reserve room at the top for the figure title.
fig.tight_layout(h_pad=2)
plt.subplots_adjust(top=0.92)
plt.suptitle('Categorical Feature Distributions by Depression Status', fontsize=16)
plt.show()

**Observations**
- There are many data types in the features - int64, float64, and object
- Both the train and test datasets contain null values:
    - Features with a large proportion of missing values: `Profession`, `Academic Pressure`, `Work Pressure`, `CGPA`, `Study Satisfaction`, `Job Satisfaction`
    - Features with a small proportion of missing values: `Dietary Habits`, `Degree`
- Categorical features contain many low-frequency categories
- The work-related and academic columns largely overlap
- `Sleep Duration` contains too many irrelevant answers

**Actions**
- Binary-encode `Gender`, `Have you ever had suicidal thoughts ?`, `Family History of Mental Illness`
- Engineer features that combine overlapping columns: `Profession` with `Working Professional or Student`, `Academic Pressure` with `Work Pressure`, and `Study Satisfaction` with `Job Satisfaction`

## Numerical Data

In [None]:
# One 20-bin histogram per numerical feature on a 3x3 grid, with the count on each bar.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20, 15))

for i, col in enumerate(num_cols):
    ax = sns.histplot(data=df_train,
                      x=col,
                      bins=20,
                      ax=axes[i // 3, i % 3])
    # Label every bar container that was drawn. Without `hue` there is only
    # `ax.containers[0]`, so indexing `[1]` raises IndexError.
    for container in ax.containers:
        ax.bar_label(container)

fig.tight_layout(h_pad=2)
plt.subplots_adjust(top=0.92)  # leave room for the figure title
plt.suptitle('Numerical Feature Distributions', fontsize=16)
plt.show()

# Feature Engineering

In [None]:
# Stack the train rows on top of the test rows so both get the same feature engineering.
# The original row indices are kept, so index labels are not unique in `df`.
df = pd.concat([df_train, df_test])
df.head()

In [None]:
# Lookup tables used by `feature_engineering` to turn categories into integers.
# Categories missing from a map are filled with 0 by the caller.

# Diet quality: higher is healthier.
diet_map = {
        'Healthy': 2,
        'Moderate': 1
    }
gender_map = {
        'Female': 1,
        'Male': 0
    }
profession_map = {
        'Working Professional': 1,
        'Student': 0
    }
# Sleep-length bucket -> score. Keys must match the bucket labels exactly;
# a stray trailing space on 'Very Short' would silently encode that bucket as 0.
sleep_map = {
        'Very Short': 1,
        'Short': 2,
        'Recommended': 4,
        'Long': 3,
        'Irregular': 0
    }

In [None]:
def _target_encode(df, column, target='Depression'):
    """Replace `column` in place with the mean of `target` for each of its categories."""
    df[column] = df[column].map(df.groupby(column)[target].mean())


def feature_engineering(df):
    """Encode, impute and derive model features on `df`, modifying it in place.

    Steps, in order:
      * binary flags for suicidal thoughts and family history (1 for 'Yes', else 0);
      * mean-of-`Depression` encoding for `Name`, `City`, `Profession` and `Degree`;
      * integer encoding of gender, working status, sleep bucket and diet through the
        module-level `gender_map`, `profession_map`, `sleep_map` and `diet_map`
        (unmapped values become 0);
      * imputation: pressure/satisfaction columns with 0, `CGPA` with 10,
        `Profession` with 'Student', `Financial Stress` with its column mean;
      * sum and product interaction features;
      * removal of the raw text columns that were encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw survey columns and a `Depression` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Binary flags.
    df['Suicidal Thoughts'] = df['Have you ever had suicidal thoughts ?'].eq('Yes').astype(int)
    df['Family History'] = df['Family History of Mental Illness'].eq('Yes').astype(int)

    # Name
    _target_encode(df, 'Name')
    # Gender
    df['Gender'] = df['Gender'].map(gender_map).fillna(0).astype(int)
    # City
    _target_encode(df, 'City')

    # Work or Student. Columns are reassigned rather than filled with a chained
    # `df[col].fillna(..., inplace=True)`, which does not update `df` under
    # pandas Copy-on-Write.
    df['Working Professional or Student'] = (
        df['Working Professional or Student'].map(profession_map).fillna(0).astype(int)
    )
    df['Profession'] = df['Profession'].fillna('Student')
    _target_encode(df, 'Profession')

    # Pressure and satisfaction: a missing value is treated as 0.
    for column in ('Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction'):
        df[column] = df[column].fillna(0)
    df['Life Pressure'] = df['Academic Pressure'] + df['Work Pressure']
    df['Life Satisfaction'] = df['Study Satisfaction'] + df['Job Satisfaction']

    # CGPA
    df['CGPA'] = df['CGPA'].fillna(10)

    # Sleep: the first bucket whose pattern occurs in the text wins.
    # NOTE(review): answers that contain none of these patterns (for example
    # 'Less than 5 hours' or '6-7 hours', if present) fall through to "Irregular";
    # confirm that this is intended.
    sleep_buckets = (
        (('1-', '2-', '3-'), "Very Short"),
        (('4-', '5-'), "Short"),
        (('7-', '8-'), "Recommended"),
        (('9-', '10-', 'More'), "Long"),
    )

    def bucketize_sleep_duration(val):
        # Non-string values (e.g. NaN) cannot be pattern-matched.
        if not isinstance(val, str):
            return "Irregular"
        for patterns, bucket in sleep_buckets:
            if any(pattern in val for pattern in patterns):
                return bucket
        return "Irregular"

    # Strip the map keys so stray whitespace in a key cannot make a bucket miss the lookup.
    sleep_scores = {key.strip(): value for key, value in sleep_map.items()}
    sleep_length = df['Sleep Duration'].apply(bucketize_sleep_duration)
    df['Healthy Sleep'] = sleep_length.map(sleep_scores).fillna(0).astype(int)

    # Diet
    df['Dietary Habits'] = df['Dietary Habits'].map(diet_map).fillna(0).astype(int)

    # Degree
    _target_encode(df, 'Degree')

    # Finance
    df['Financial Stress'] = df['Financial Stress'].fillna(df['Financial Stress'].mean())

    # Interaction features
    df['academic_pressure__age'] = df['Age'] * df['Academic Pressure']
    df['work_pressure__age'] = df['Age'] * df['Work Pressure']
    df['life_pressure__age'] = df['Age'] * df['Life Pressure']
    # NOTE(review): the name says "per" but this is a product; confirm which was intended.
    df['academic_pressure__age_per_work_pressure__age'] = df['academic_pressure__age'] * df['work_pressure__age']
    df['pressure_satisfaction_prof'] = df['Work Pressure'] * df['Job Satisfaction']
    df['pressure_saticfaction_stud'] = df['Academic Pressure'] * df['Study Satisfaction']
    df['pressure_satisfaction_prof_per_work'] = df['Work Pressure'] * df['Job Satisfaction'] * (df['Work/Study Hours'] + 1e-6)
    df['pressure_saticfaction_stud_per_study'] = df['Academic Pressure'] * df['Study Satisfaction'] * df['Work/Study Hours']
    df['work_financial_pressure'] = df['Work Pressure'] * df['Financial Stress']
    # The job- and study-satisfaction variants get separate columns; writing both to
    # the same name would overwrite the first with the second.
    df['work_financial_pressure_satisfaction_work'] = df['Job Satisfaction'] * (df['work_financial_pressure'] + 1e-6)
    df['work_financial_pressure_satisfaction_study'] = df['Study Satisfaction'] * (df['work_financial_pressure'] + 1e-6)

    # Drop the raw columns that have been replaced by encoded versions.
    df.drop(
        columns=[
            'Have you ever had suicidal thoughts ?',
            'Family History of Mental Illness',
            'Sleep Duration',
        ],
        inplace=True,
    )
    return df

In [None]:
# Run the feature pipeline on the combined frame. It modifies `df` in place;
# the returned frame is shown as this cell's output.
feature_engineering(df)

In [None]:
# Remove the row identifier from the combined frame. `errors='ignore'` keeps the
# cell re-runnable after the column is already gone.
df.drop(columns=['id'], inplace=True, errors='ignore')
# The training rows come first in the concatenated frame, so take the first
# `len(df_train)` rows instead of a hard-coded count. `.copy()` makes the result
# an independent frame that later cells can modify safely.
df_train_processed = df.iloc[:len(df_train)].copy()
df_train_processed.head()

In [None]:
# The test rows follow the training rows in the concatenated frame. Dropping the
# target with a non-inplace `drop` returns a new frame instead of mutating a slice of `df`.
df_test_processed = df.iloc[len(df_train):].drop(columns=['Depression'])
df_test_processed.head()

# Machine Learning

## Train Test Split

In [None]:
# Target vector and feature matrix (every column except the target).
y = df_train_processed['Depression']
X = df_train_processed.drop(columns='Depression')

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the numeric columns of `df` in place to the narrowest dtype that fits.

    Integer columns become int8/int16/int32 (int64 if nothing narrower fits) and
    other numeric columns become float16/float32 (float64 if nothing narrower fits),
    chosen from the column's minimum and maximum. Boolean columns are left as they
    are, and non-numeric columns are cast to `object`. Memory use before and after
    is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    allow_float16 : bool, default True
        If False, float columns are never narrowed below float32. The float
        check only compares the value range, so float16 can reduce precision
        even when the range fits.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        column = df[col]

        # Booleans already take one byte per value; casting them to a float dtype
        # would only make them larger.
        if pd.api.types.is_bool_dtype(column):
            continue

        if not pd.api.types.is_numeric_dtype(column):
            df[col] = column.astype('object')
            continue

        c_min, c_max = column.min(), column.max()
        if pd.api.types.is_integer_dtype(column):
            candidates, limits, fallback = (np.int8, np.int16, np.int32), np.iinfo, np.int64
        else:
            candidates, limits, fallback = float_candidates, np.finfo, np.float64

        # Bounds are inclusive: a value equal to a dtype's min or max still fits it.
        # If min/max are NaN (no valid values) every comparison is False and the
        # widest dtype is used.
        target = next(
            (dtype for dtype in candidates
             if limits(dtype).min <= c_min and c_max <= limits(dtype).max),
            fallback,
        )
        df[col] = column.astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # The percentage is undefined when the starting size is zero.
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Downcast the numeric columns of both processed frames to reduce memory use.
df_train_processed = reduce_mem_usage(df_train_processed)
df_test_processed = reduce_mem_usage(df_test_processed)

In [None]:
# Split the processed training frame (target included) 80/20 with a fixed seed.
df_train_split, df_valid_split = train_test_split(
    df_train_processed,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Training target and features from the 80% split.
y_train = df_train_split['Depression']
X_train = df_train_split.drop(columns='Depression')

In [None]:
# Validation target and features from the 20% split.
y_valid = df_valid_split['Depression']
X_valid = df_valid_split.drop(columns='Depression')

## Light AutoML

In [None]:
def map_class(x, task, reader):
    """Look `x` up in `reader` for multiclass tasks; return it unchanged otherwise."""
    return reader[x] if task.name == 'multiclass' else x


# Element-wise version of `map_class` for use on arrays.
mapped = np.vectorize(map_class)

def score(task, y_true, y_pred):
    """Score predictions with the metric that matches `task.name`.

    * 'binary'             -> ROC AUC
    * 'multiclass'         -> accuracy of the arg-max class along axis 1
    * 'reg' / 'multi:reg'  -> median absolute error

    Raises
    ------
    ValueError
        If `task.name` is none of the names above.
    """
    if task.name == 'binary':
        return roc_auc_score(y_true, y_pred)
    if task.name == 'multiclass':
        return accuracy_score(y_true, np.argmax(y_pred, 1))
    if task.name in ('reg', 'multi:reg'):
        # Imported here because the notebook's import cell does not bring this name in.
        from sklearn.metrics import median_absolute_error
        return median_absolute_error(y_true, y_pred)
    # A plain string cannot be raised; use a real exception type.
    raise ValueError('Task is not correct.')
        
def take_pred_from_task(pred, task):
    """Return the part of a 2-D prediction array that is relevant for `task.name`.

    * 'binary' / 'reg'             -> the first column, `pred[:, 0]`
    * 'multiclass' / 'multi:reg'   -> `pred` unchanged

    Raises
    ------
    ValueError
        If `task.name` is none of the names above.
    """
    if task.name in ('binary', 'reg'):
        return pred[:, 0]
    if task.name in ('multiclass', 'multi:reg'):
        return pred
    # A plain string cannot be raised; use a real exception type.
    raise ValueError('Task is not correct.')
        
def use_plr(USE_PLR):
    """Return the embedder name "plr" when `USE_PLR` is truthy, otherwise "cont"."""
    return "plr" if USE_PLR else "cont"

In [None]:
# Global seed and thread count for the run.
RANDOM_STATE = 42
# `os.cpu_count()` can return None when the count cannot be determined;
# fall back to a single thread so `torch.set_num_threads` always gets an int.
N_THREADS = os.cpu_count() or 1
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [None]:
# task = Task('binary') 
# autodl = TabularAutoML(
#     task = task, 
#     timeout = 9 * 3600,
#     cpu_limit = os.cpu_count(),
#     general_params = {"use_algos": [['resnet']]}, # ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint', 'fttransformer'] or custom torch model
#     nn_params = {
#         "n_epochs": 10, 
#         "bs": 512, 
#         "num_workers": 0, 
#         "path_to_save": None, 
#         "freeze_defaults": True,
#         "cont_embedder": 'plr',
#         'cat_embedder': 'weighted',
#         "hidden_size": 64,
#         'hid_factor': [4, 6],
#         'block_config': [4, 4],
#         'embedding_size': 64, 
#         'stop_by_metric': True,
#         'verbose_bar': True,
#         "snap_params": { 'k': 2, 'early_stopping': True, 'patience': 2, 'swa': True }
#     },
#     nn_pipeline_params = {"use_qnt": False, "use_te": True},
#     reader_params = {'n_jobs': os.cpu_count(), 'cv': 5, 'random_state': 42, 'advanced_roles': True}
# )

In [None]:
# out_of_fold_predictions = autodl.fit_predict(
#     df_train_split,
#     roles = {
#         'target': 'Depression',
#     }, 
#     verbose = 3
# )

In [None]:
# Binary-classification AutoML preset with a 9-hour budget, using every CPU core
# and 10-fold cross-validation in the reader.
task = Task('binary')
automl = TabularAutoML(
    task=task,
    timeout=9 * 3600,
    cpu_limit=os.cpu_count(),
    nn_params=dict(stop_by_metric=True, verbose_bar=True),
    nn_pipeline_params=dict(use_qnt=False, use_te=False),
    reader_params=dict(
        n_jobs=os.cpu_count(),
        cv=10,
        random_state=42,
        advanced_roles=True,
    ),
)

In [None]:
# Fit on the full processed training frame, with `Depression` as the target role,
# and keep the predictions that `fit_predict` returns.
out_of_fold_predictions = automl.fit_predict(
    df_train_processed,
    roles={'target': 'Depression'},
    verbose=3,
)

In [None]:
# Compare the thresholded `fit_predict` output with the training labels.
true_labels = df_train_processed['Depression'].values.astype(int)
oof_probabilities = out_of_fold_predictions.data.flatten()
predicted_classes = (oof_probabilities > 0.5).astype(int)  # class 1 above 0.5

accuracy = accuracy_score(true_labels, predicted_classes)
print("Accuracy Score on Validation Data:", accuracy)

In [None]:
# Predict on the hold-out rows with the fitted AutoML model and keep the first prediction column.
# NOTE(review): `automl` was fitted on `df_train_processed`, and `df_valid_split` is split
# from that same frame, so these look like in-sample predictions — confirm before using
# them for validation or ensemble weighting.
y_pred_proba_automl_1 = automl.predict(df_valid_split).data[:, 0]

## XGBoost

In [None]:
# # Define objective function for Optuna
# def objective(trial):
#     # Define hyperparameters to search
#     params = {
#         'booster': 'gbtree',
#         'objective': 'binary:logistic',
#         'eval_metric': 'auc',
#         'max_depth': trial.suggest_int('max_depth', 3, 20),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
#         'min_child_weight': trial.suggest_float('min_child_weight', 1, 15),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'n_estimators': trial.suggest_int('n_estimators', 100, 12000),
#         'device': 'cuda',
#         'random_state': 0
#     }

#     # Split the training data into training and validation sets
#     X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

#     # Train XGBoost model with current hyperparameters
#     clf = XGBClassifier(**params)
#     clf.fit(X_train_split, y_train_split)

#     # Predict probabilities on validation set
#     y_pred_proba = clf.predict_proba(X_valid_split)[:, 1]

#     # Calculate ROC AUC on validation set
#     roc_auc = roc_auc_score(y_valid_split, y_pred_proba)
#     return roc_auc

# # Optimize hyperparameters using Optuna
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50)

# # Get best hyperparameters
# xgb_best_params = study.best_params
# print("Best Hyperparameters:", xgb_best_params)

Best Hyperparameters: {'max_depth': 15, 'learning_rate': 0.01801657172689155, 'min_child_weight': 4.8772240757900045, 'colsample_bytree': 0.6938100830879348, 'reg_alpha': 8.974794625892391, 'reg_lambda': 9.794363082142656, 'subsample': 0.628026726594587, 'n_estimators': 703}

Best Hyperparameters: {'max_depth': 3, 'learning_rate': 0.011670653230923514, 'min_child_weight': 14.127992761618438, 'colsample_bytree': 0.5038048654086844, 'reg_alpha': 5.760848133682294, 'reg_lambda': 2.6948173441150227, 'subsample': 0.5852808297867893, 'n_estimators': 3489}

Best Hyperparameters: {'max_depth': 4, 'learning_rate': 0.028949993238818562, 'min_child_weight': 7.472178342152908, 'colsample_bytree': 0.5048225534122298, 'reg_alpha': 8.993064644792046, 'reg_lambda': 3.4598638747943316, 'subsample': 0.5687531348526244, 'n_estimators': 750}

Best Hyperparameters: {'max_depth': 3, 'learning_rate': 0.05783817198027412, 'min_child_weight': 10.124475828112256, 'colsample_bytree': 0.6192098284876566, 'reg_alpha': 9.615639451753312, 'reg_lambda': 3.5778496101852357, 'subsample': 0.9446520676127865, 'n_estimators': 484}

In [None]:
# Carve an early-stopping hold-out out of the training split; `X_valid` is kept for scoring.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Hyperparameters from an earlier Optuna search, plus fixed booster settings.
xgb_best_params_1 = dict(
    max_depth=4,
    learning_rate=0.028949993238818562,
    min_child_weight=7.472178342152908,
    colsample_bytree=0.5048225534122298,
    reg_alpha=8.993064644792046,
    reg_lambda=3.4598638747943316,
    subsample=0.5687531348526244,
    n_estimators=750,
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='auc',
    device='cuda',
    random_state=0,
)
xgb_1 = XGBClassifier(**xgb_best_params_1)

# Train, monitoring the hold-out for early stopping.
xgb_1.fit(
    X_train_split, y_train_split,
    eval_set=[(X_valid_split, y_valid_split)],
    early_stopping_rounds=50,
    verbose=500,
)

# Positive-class probabilities on the validation set, rounded to hard labels.
y_pred_proba_xgb_1 = xgb_1.predict_proba(X_valid)[:, 1]
y_pred_xgb_1 = np.round(y_pred_proba_xgb_1).astype(int)

# ROC AUC is computed but only the accuracy is reported.
roc_auc = roc_auc_score(y_valid, y_pred_proba_xgb_1)
accuracy = accuracy_score(y_valid, y_pred_xgb_1)
print("Accuracy Score on Validation Data:", accuracy)

In [None]:
# Carve an early-stopping hold-out out of the training split; `X_valid` is kept for scoring.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Second XGBoost configuration from an earlier Optuna search (library defaults otherwise).
xgb_best_params_2 = dict(
    max_depth=3,
    learning_rate=0.05783817198027412,
    min_child_weight=10.124475828112256,
    colsample_bytree=0.6192098284876566,
    reg_alpha=9.615639451753312,
    reg_lambda=3.5778496101852357,
    subsample=0.9446520676127865,
    n_estimators=484,
)
xgb_2 = XGBClassifier(**xgb_best_params_2)

# Train, monitoring the hold-out for early stopping.
xgb_2.fit(
    X_train_split, y_train_split,
    eval_set=[(X_valid_split, y_valid_split)],
    early_stopping_rounds=50,
    verbose=500,
)

# Positive-class probabilities on the validation set, rounded to hard labels.
y_pred_proba_xgb_2 = xgb_2.predict_proba(X_valid)[:, 1]
y_pred_xgb_2 = np.round(y_pred_proba_xgb_2).astype(int)

# ROC AUC is computed but only the accuracy is reported.
roc_auc = roc_auc_score(y_valid, y_pred_proba_xgb_2)
accuracy = accuracy_score(y_valid, y_pred_xgb_2)
print("Accuracy Score on Validation Data:", accuracy)

## CatBoost

In [None]:
# # Define objective function for Optuna
# def objective(trial):
#     # Define hyperparameters to search
#     params = {
#         'iterations': trial.suggest_int('iterations', 100, 1000),
#         'depth': trial.suggest_int('depth', 4, 10),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'random_strength': trial.suggest_float('random_strength', 0.1, 10.0),
#         'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
#         'border_count': trial.suggest_int('border_count', 1, 255),
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 100),
#         'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.01, 1.0),
#         'eval_metric': 'AUC',
#         'random_state': 0
#     }

#     # Split the training data into training and validation sets
#     X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

#     # Train CatBoost model with current hyperparameters
#     clf = CatBoostClassifier(**params)
#     clf.fit(X_train_split, y_train_split, eval_set=(X_valid_split, y_valid_split), verbose=0, early_stopping_rounds=50)

#     # Predict probabilities on validation set
#     y_pred_proba = clf.predict_proba(X_valid_split)[:, 1]

#     # Calculate ROC AUC on validation set
#     roc_auc = roc_auc_score(y_valid_split, y_pred_proba)
#     return roc_auc

# # Optimize hyperparameters using Optuna
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50)

# # Get best hyperparameters
# cat_best_params = study.best_params
# print("Best Hyperparameters:", cat_best_params)

Best Hyperparameters: {'iterations': 809, 'depth': 8, 'learning_rate': 0.16835084169271636, 'random_strength': 3.2449625766961097, 'bagging_temperature': 0.30055204865985985, 'border_count': 72, 'l2_leaf_reg': 34.289254984798575, 'scale_pos_weight': 0.841012500039096}

Best Hyperparameters: {'iterations': 707, 'depth': 6, 'learning_rate': 0.12929345186104244, 'random_strength': 3.5255746549441564, 'bagging_temperature': 0.39315065600806615, 'border_count': 231, 'l2_leaf_reg': 7.450462475101695, 'scale_pos_weight': 0.9507699352909662}

Best Hyperparameters: {'iterations': 750, 'depth': 5, 'learning_rate': 0.1498452500271287, 'random_strength': 2.3916541237913607, 'bagging_temperature': 0.7940046664760633, 'border_count': 255, 'l2_leaf_reg': 16.84665620855418, 'scale_pos_weight': 0.8657868473704984}

In [None]:
# Carve an early-stopping hold-out out of the training split; `X_valid` is kept for scoring.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

# CatBoost hyperparameters from an earlier Optuna search.
cat_best_params_1 = dict(
    iterations=750,
    depth=5,
    learning_rate=0.1498452500271287,
    random_strength=2.3916541237913607,
    bagging_temperature=0.7940046664760633,
    border_count=255,
    l2_leaf_reg=16.84665620855418,
    scale_pos_weight=0.8657868473704984,
)
cat_1 = CatBoostClassifier(**cat_best_params_1)

# Train, monitoring the hold-out for early stopping.
cat_1.fit(
    X_train_split, y_train_split,
    eval_set=[(X_valid_split, y_valid_split)],
    early_stopping_rounds=100,
    verbose=500,
)

# Positive-class probabilities on the validation set, rounded to hard labels.
y_pred_proba_cat_1 = cat_1.predict_proba(X_valid)[:, 1]
y_pred_cat_1 = np.round(y_pred_proba_cat_1).astype(int)

# ROC AUC is computed but only the accuracy is reported.
roc_auc = roc_auc_score(y_valid, y_pred_proba_cat_1)
accuracy = accuracy_score(y_valid, y_pred_cat_1)
print("Accuracy Score on Validation Data:", accuracy)

In [None]:
# Carve an early-stopping hold-out out of the training split; `X_valid` is kept for scoring.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

# Second CatBoost hyperparameter set.
cat_best_params_2 = dict(
    iterations=179,
    depth=7,
    learning_rate=0.15525529979928837,
    random_strength=8.251760779808803,
    bagging_temperature=0.5813066735174205,
    border_count=222,
    l2_leaf_reg=14.7531039514028,
    scale_pos_weight=0.9981351356564279,
)
cat_2 = CatBoostClassifier(**cat_best_params_2)

# Train, monitoring the hold-out for early stopping.
cat_2.fit(
    X_train_split, y_train_split,
    eval_set=[(X_valid_split, y_valid_split)],
    early_stopping_rounds=100,
    verbose=500,
)

# Positive-class probabilities on the validation set, rounded to hard labels.
y_pred_proba_cat_2 = cat_2.predict_proba(X_valid)[:, 1]
y_pred_cat_2 = np.round(y_pred_proba_cat_2).astype(int)

# ROC AUC is computed but only the accuracy is reported.
roc_auc = roc_auc_score(y_valid, y_pred_proba_cat_2)
accuracy = accuracy_score(y_valid, y_pred_cat_2)
print("Accuracy Score on Validation Data:", accuracy)

## LightGBM

In [None]:
# # Define objective function for Optuna
# def objective(trial):
#     # Define hyperparameters to search
#     params = {
#         'boosting_type': 'gbdt',
#         'objective': 'binary',
#         'metric': 'auc',
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'num_leaves': trial.suggest_int('num_leaves', 20, 150),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'random_state': 0,
#         'verbose': -1
#     }

#     # Split the training data into training and validation sets
#     X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

#     # Train LightGBM model with current hyperparameters
#     clf = LGBMClassifier(**params)
#     clf.fit(X_train_split, y_train_split)

#     # Predict probabilities on validation set
#     y_pred_proba = clf.predict_proba(X_valid_split)[:, 1]

#     # Calculate ROC AUC on validation set
#     roc_auc = roc_auc_score(y_valid_split, y_pred_proba)
#     return roc_auc

# # Optimize hyperparameters using Optuna
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50)

# # Get best hyperparameters
# lgb_best_params = study.best_params
# print("Best Hyperparameters:", lgb_best_params)

Best Hyperparameters: {'max_depth': 3, 'num_leaves': 118, 'learning_rate': 0.10351217553076539, 'feature_fraction': 0.74365495415815, 'bagging_fraction': 0.6603938917764212, 'bagging_freq': 4, 'reg_alpha': 4.748837605697682, 'reg_lambda': 4.769270760614292, 'n_estimators': 199}

Best Hyperparameters: {'max_depth': 6, 'num_leaves': 99, 'learning_rate': 0.013072254029319857, 'feature_fraction': 0.6395233614280195, 'bagging_fraction': 0.796714605811389, 'bagging_freq': 3, 'reg_alpha': 5.261195256088222, 'reg_lambda': 2.9272237697785592, 'n_estimators': 886}

Best Hyperparameters: {'max_depth': 6, 'num_leaves': 20, 'learning_rate': 0.09192866255765324, 'feature_fraction': 0.5094500857265226, 'bagging_fraction': 0.6634973533057996, 'bagging_freq': 4, 'reg_alpha': 7.196198571360236, 'reg_lambda': 1.9539428468670361, 'n_estimators': 268}

In [None]:
# Carve an early-stopping hold-out out of the training split; `X_valid` is kept for scoring.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

# LightGBM hyperparameters from an earlier Optuna search, plus fixed booster settings.
lgb_best_params_1 = dict(
    max_depth=6,
    num_leaves=99,
    learning_rate=0.013072254029319857,
    feature_fraction=0.6395233614280195,
    bagging_fraction=0.796714605811389,
    bagging_freq=3,
    reg_alpha=5.261195256088222,
    reg_lambda=2.9272237697785592,
    n_estimators=886,
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    random_state=0,
    verbose=-1,
)

lgb_1 = LGBMClassifier(**lgb_best_params_1)

# Train, monitoring the hold-out through the early-stopping callback.
lgb_1.fit(
    X_train_split,
    y_train_split,
    eval_set=[(X_valid_split, y_valid_split)],
    callbacks=[early_stopping(100)],
)

# Positive-class probabilities on the validation set, rounded to hard labels.
y_pred_proba_lgb_1 = lgb_1.predict_proba(X_valid)[:, 1]
y_pred_lgb_1 = np.round(y_pred_proba_lgb_1).astype(int)

accuracy = accuracy_score(y_valid, y_pred_lgb_1)
print("Accuracy Score on Validation Data:", accuracy)

In [None]:
# Carve an early-stopping hold-out out of the training split; `X_valid` is kept for scoring.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

# Second LightGBM configuration from an earlier Optuna search, plus fixed booster settings.
lgb_best_params_2 = dict(
    max_depth=6,
    num_leaves=20,
    learning_rate=0.09192866255765324,
    feature_fraction=0.5094500857265226,
    bagging_fraction=0.6634973533057996,
    bagging_freq=4,
    reg_alpha=7.196198571360236,
    reg_lambda=1.9539428468670361,
    n_estimators=268,
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    random_state=0,
    verbose=-1,
)

lgb_2 = LGBMClassifier(**lgb_best_params_2)

# Train, monitoring the hold-out through the early-stopping callback.
lgb_2.fit(
    X_train_split,
    y_train_split,
    eval_set=[(X_valid_split, y_valid_split)],
    callbacks=[early_stopping(100)],
)

# Positive-class probabilities on the validation set, rounded to hard labels.
y_pred_proba_lgb_2 = lgb_2.predict_proba(X_valid)[:, 1]
y_pred_lgb_2 = np.round(y_pred_proba_lgb_2).astype(int)

accuracy = accuracy_score(y_valid, y_pred_lgb_2)
print("Accuracy Score on Validation Data:", accuracy)

# Ensemble

## Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

# The six tuned models, keyed by the names used inside the voting ensemble.
final_models = {
    "XGBClassifier_1": xgb_1,
    "XGBClassifier_2": xgb_2,
    "LGBMClassifier_1": lgb_1,
    "LGBMClassifier_2": lgb_2,
    "CatBoostClassifier_1": cat_1,
    "CatBoostClassifier_2": cat_2,
}

# Soft voting over (name, model) pairs taken in the dictionary's insertion order.
model_voting = VotingClassifier(
    estimators=list(final_models.items()),
    voting="soft",
)

model_voting.fit(X_train, y_train)

# Positive-class probabilities on the validation set, rounded to hard labels.
y_pred_proba_voting = model_voting.predict_proba(X_valid)[:, 1]
y_pred_voting = np.round(y_pred_proba_voting).astype(int)

In [None]:
# Accuracy of the voting ensemble's hard labels on the validation set.
accuracy = accuracy_score(y_valid, y_pred_voting)
print("Accuracy Score on Validation Data:", accuracy)

## Optuna Voting

In [None]:
# Objective for tuning the blend weights of the six fitted models.
def objective(trial):
    """Return the validation ROC AUC of a weighted blend of the six models.

    One weight in [0, 1] is sampled per model and the weights are normalised to
    sum to 1. The blend combines the validation probabilities already stored in
    the module-level `y_pred_proba_*` arrays and is scored against `y_valid`.
    """
    weight_names = (
        "xgb_weight_1",
        "xgb_weight_2",
        "lgb_weight_1",
        "lgb_weight_2",
        "cat_weight_1",
        "cat_weight_2",
    )
    # Same model order as `weight_names`.
    model_probas = (
        y_pred_proba_xgb_1,
        y_pred_proba_xgb_2,
        y_pred_proba_lgb_1,
        y_pred_proba_lgb_2,
        y_pred_proba_cat_1,
        y_pred_proba_cat_2,
    )

    raw_weights = [trial.suggest_float(name, 0.0, 1.0) for name in weight_names]
    total_weight = sum(raw_weights)

    # Weighted average of the per-model probabilities.
    ensemble_pred_proba = sum(
        (raw / total_weight) * proba
        for raw, proba in zip(raw_weights, model_probas)
    )

    return roc_auc_score(y_valid, ensemble_pred_proba)

# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Best weights and score
print("Best weights:", study.best_params)
print("Best auc:", study.best_value)

In [None]:
# Raw weights chosen by the finished Optuna study, one per base model
best_weights = study.best_params
w1 = best_weights["xgb_weight_1"]
w2 = best_weights["xgb_weight_2"]
w3 = best_weights["lgb_weight_1"]
w4 = best_weights["lgb_weight_2"]
w5 = best_weights["cat_weight_1"]
w6 = best_weights["cat_weight_2"]

# Rescale the six weights so that together they sum to 1
total_weight = w1 + w2 + w3 + w4 + w5 + w6
w1, w2, w3, w4, w5, w6 = (
    raw_weight / total_weight for raw_weight in (w1, w2, w3, w4, w5, w6)
)

# Accumulate the weighted validation probabilities one model at a time
weighted_members = [
    (w1, y_pred_proba_xgb_1),
    (w2, y_pred_proba_xgb_2),
    (w3, y_pred_proba_lgb_1),
    (w4, y_pred_proba_lgb_2),
    (w5, y_pred_proba_cat_1),
    (w6, y_pred_proba_cat_2),
]
ensemble_pred_proba = 0
for member_weight, member_proba in weighted_members:
    ensemble_pred_proba = ensemble_pred_proba + member_weight * member_proba

# Label a row 1 only when its blended probability is strictly above 0.5
ensemble_pred = (ensemble_pred_proba > 0.5).astype(int)

# Fraction of validation rows whose label matches y_valid
accuracy = accuracy_score(y_true=y_valid, y_pred=ensemble_pred)
print("Accuracy Score on Validation Data:", accuracy)

## Stacking Classifier

In [None]:
# final_models = {
#     "XGBClassifier" : xgb_1,
#     "LGBMClassifier": lgb_1,
#     "CatBoostClassifier": cat_1
# }

# from sklearn.ensemble import StackingClassifier
# model_stacking =  StackingClassifier(estimators=[
#     ('XGBClassifier', final_models['XGBClassifier']),
#     ('LGBMClassifier', final_models['LGBMClassifier']),
#     ('CatBoostClassifier', final_models['CatBoostClassifier'])],
#                            final_estimator=xgb_1)

# model_stacking.fit(X_train, y_train)

# # Predict probabilities on validation data
# y_pred_proba_stacking = model_stacking.predict_proba(X_valid)[:, 1]
# y_pred_stacking = np.round(y_pred_proba_stacking).astype(int)

In [None]:
# Hand-picked blending weight for each base model's validation probabilities.
# The six weights add up to 3, so dividing the sum by 3 yields a weighted mean.
manual_blend = [
    (0.1, y_pred_proba_xgb_1),
    (0.9, y_pred_proba_xgb_2),
    (0.1, y_pred_proba_lgb_1),
    (0.9, y_pred_proba_lgb_2),
    (0.8, y_pred_proba_cat_1),
    (0.2, y_pred_proba_cat_2),
]
blend_total = 0
for blend_weight, model_proba in manual_blend:
    blend_total = blend_total + blend_weight * model_proba
y_pred_proba_weighted = blend_total / 3

# Round the blended probabilities to hard 0/1 labels
y_pred_weighted = np.round(y_pred_proba_weighted).astype(int)

# ROC AUC is computed on the blended probabilities, not on the rounded labels
roc_auc_weighted = roc_auc_score(y_valid, y_pred_proba_weighted)
print("ROC AUC on Validation Data:", roc_auc_weighted)

# Accuracy is computed on the rounded labels
accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred_weighted)
print("Accuracy Score on Validation Data:", accuracy)

# Ensemble with AutoML

## Weighted Average and Optuna Weights

In [None]:
# Hand-picked weights for the six boosted models; their blend gets a 0.01
# share and the AutoML probabilities get 0.99.
# NOTE(review): the six inner weights add up to 3 and are not divided by 3, so
# the overall weight is 0.01 * 3 + 0.99 = 1.02 rather than 1 and the blended
# value can exceed 1 — confirm whether the inner sum was meant to be averaged.
boosted_blend = [
    (0.1, y_pred_proba_xgb_1),
    (0.9, y_pred_proba_xgb_2),
    (0.1, y_pred_proba_lgb_1),
    (0.9, y_pred_proba_lgb_2),
    (0.8, y_pred_proba_cat_1),
    (0.2, y_pred_proba_cat_2),
]
boosted_total = 0
for blend_weight, model_proba in boosted_blend:
    boosted_total = boosted_total + blend_weight * model_proba
y_pred_proba_weighted = 0.01 * boosted_total + 0.99 * y_pred_proba_automl_1

# Round the blended values to hard 0/1 labels
y_pred_weighted = np.round(y_pred_proba_weighted).astype(int)

# ROC AUC is computed on the blended values, not on the rounded labels
roc_auc_weighted = roc_auc_score(y_valid, y_pred_proba_weighted)
print("ROC AUC on Validation Data:", roc_auc_weighted)

# Accuracy is computed on the rounded labels
accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred_weighted)
print("Accuracy Score on Validation Data:", accuracy)

## Weighted Average and Voting Classifier

In [None]:
# # Combine predictions using soft voting
# y_pred_proba_voting_weighted = (0.05 * y_pred_proba_voting + 
#                                 0.95 * y_pred_proba_automl_1)

# # Perform soft voting
# y_pred_voting_weighted = np.round(y_pred_proba_voting_weighted).astype(int)

# # Calculate ROC AUC on validation data
# roc_auc_voting_weighted = roc_auc_score(y_valid, y_pred_proba_voting_weighted)
# print("ROC AUC on Validation Data:", roc_auc_voting_weighted)

# # Calculate Accuracy Score on validation data
# accuracy = accuracy_score(y_valid, y_pred_voting_weighted)
# print("Accuracy Score on Validation Data:", accuracy)

## Weighted Average and Stacking Classifier

In [None]:
# # Combine predictions using soft voting
# y_pred_proba_stacking_weighted = (0.05 * y_pred_proba_stacking + 
#                                   0.95 * y_pred_proba_automl_1)

# # Perform soft voting
# y_pred_stacking_weighted = np.round(y_pred_proba_stacking_weighted).astype(int)

# # Calculate ROC AUC on validation data
# roc_auc_stacking_weighted = roc_auc_score(y_valid, y_pred_proba_stacking_weighted)
# print("ROC AUC on Validation Data:", roc_auc_stacking_weighted)

# # Calculate Accuracy Score on validation data
# accuracy = accuracy_score(y_valid, y_pred_stacking_weighted)
# print("Accuracy Score on Validation Data:", accuracy)

# Final Prediction

In [None]:
# y_pred_proba_test_xgb_1 = xgb_1.predict_proba(df_test_processed)[:, 1]
# y_pred_proba_test_cat_1 = cat_1.predict_proba(df_test_processed)[:, 1]
# y_pred_proba_test_lgb_1 = lgb_1.predict_proba(df_test_processed)[:, 1]

# # Voting Ensemble
# y_pred_proba_test_voting_1 = model_voting.predict_proba(df_test_processed)[:, 1]
# y_pred_proba_test_automl_1 = automl.predict(df_test_processed).data[:, 0]

# y_pred_proba_test_ensemble = (0.05 * y_pred_proba_test_voting_1 +
#                               0.95 * y_pred_proba_test_automl_1)

# y_pred_proba_test_ensemble = np.round(y_pred_proba_test_ensemble).astype(int)

In [None]:
# Column 1 of predict_proba on the processed test set, for each boosted model
(
    y_pred_proba_test_xgb_1,
    y_pred_proba_test_xgb_2,
    y_pred_proba_test_cat_1,
    y_pred_proba_test_cat_2,
    y_pred_proba_test_lgb_1,
    y_pred_proba_test_lgb_2,
) = (
    fitted_model.predict_proba(df_test_processed)[:, 1]
    for fitted_model in (xgb_1, xgb_2, cat_1, cat_2, lgb_1, lgb_2)
)

# First column of the AutoML prediction's .data array
y_pred_proba_test_automl_1 = automl.predict(df_test_processed).data[:, 0]

# Same hand-picked weights as used on the validation data: 0.01 on the
# boosted-model blend and 0.99 on AutoML.
# NOTE(review): the six inner weights add up to 3 and are not divided by 3, so
# the overall weight is 0.01 * 3 + 0.99 = 1.02 rather than 1 — confirm intent.
boosted_test_blend = [
    (0.1, y_pred_proba_test_xgb_1),
    (0.9, y_pred_proba_test_xgb_2),
    (0.1, y_pred_proba_test_lgb_1),
    (0.9, y_pred_proba_test_lgb_2),
    (0.8, y_pred_proba_test_cat_1),
    (0.2, y_pred_proba_test_cat_2),
]
boosted_test_total = 0
for blend_weight, model_proba in boosted_test_blend:
    boosted_test_total = boosted_test_total + blend_weight * model_proba
y_pred_proba_test_ensemble = 0.01 * boosted_test_total + 0.99 * y_pred_proba_test_automl_1

# Round the blended values to the hard 0/1 labels that get submitted
y_pred_test_ensemble = np.round(y_pred_proba_test_ensemble).astype(int)

# Submission

In [None]:
# Load the sample submission and set its Depression column to the ensemble's labels.
# NOTE(review): this assumes the sample file's rows are in the same order as
# df_test_processed — confirm.
submission = pd.read_csv(
    '/kaggle/input/playground-series-s4e11/sample_submission.csv'
).assign(Depression=y_pred_test_ensemble.astype(int))

# Write the file without the DataFrame index, then preview the first rows
submission.to_csv('submission_ensemble.csv', index=False)
submission.head()