In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

import warnings
import logging

# Silence every Python warning for the rest of the session. A single blanket
# 'ignore' filter already covers UserWarning and DeprecationWarning, so no
# per-category filters are needed on top of it.
warnings.filterwarnings('ignore')

# Only let records at ERROR level or above through the 'lightgbm' logger.
# (The level was previously set to INFO and then immediately overridden to
# ERROR; only the final value ever took effect.)
logging.getLogger('lightgbm').setLevel(logging.ERROR)

In [18]:
# Load the competition files and the original obesity dataset from the local
# `csv` directory. Forward slashes are used because they are accepted on
# Windows as well as on POSIX systems, whereas a backslash is treated as part
# of the file name outside Windows.
train_data = pd.read_csv("csv/train.csv")
test_data = pd.read_csv("csv/test.csv")
original_data = pd.read_csv("csv/ObesityDataSet.csv")
sample_submission_data = pd.read_csv("csv/sample_submission.csv")

In [19]:
# --- Assemble the training frame --------------------------------------------
# Drop `id` from the competition train set, stack the original dataset
# underneath it, and remove exact duplicate rows.
train_data = train_data.drop("id", axis=1)
train_data = pd.concat([train_data, original_data], ignore_index=True)
train_data = train_data.drop_duplicates()

# Column groups by dtype (object columns are treated as categorical).
num_cols = list(train_data.select_dtypes(exclude=['object']).columns)
cat_cols = list(train_data.select_dtypes(include=['object']).columns)

num_cols_test = list(test_data.select_dtypes(exclude=['object']).columns)
cat_cols_test = list(test_data.select_dtypes(include=['object']).columns)

num_cols_test = [col for col in num_cols_test if col not in ['id']]

# --- Scale numeric features -------------------------------------------------
# The scaler is fitted on the training columns; the test frame is transformed
# with exactly the same column list (and order) the scaler was fitted on.
scaler = StandardScaler()
train_data[num_cols] = scaler.fit_transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])

# --- Encode categorical features --------------------------------------------
# `labelencoder` is kept because later cells use this instance to encode the
# target and to decode predictions.
labelencoder = LabelEncoder()
object_columns = train_data.select_dtypes(include='object').columns.difference(['NObeyesdad'])

# One encoder per feature column, fitted once on the union of the train and
# test values, so a given category maps to the same integer in both frames.
# Fitting a fresh mapping on the test column alone would silently shift the
# codes whenever the two frames do not contain the same set of categories.
feature_encoders = {}
for col_name in object_columns:
    encoder = LabelEncoder()
    if col_name in test_data.columns:
        encoder.fit(pd.concat([train_data[col_name], test_data[col_name]], ignore_index=True))
        test_data[col_name] = encoder.transform(test_data[col_name])
    else:
        encoder.fit(train_data[col_name])
    train_data[col_name] = encoder.transform(train_data[col_name])
    feature_encoders[col_name] = encoder

# Any object column still left in the test frame has no training counterpart
# to align with; encode it on its own so the frame ends up fully numeric.
for col_name in test_data.select_dtypes(include='object').columns:
    test_data[col_name] = LabelEncoder().fit_transform(test_data[col_name])

In [20]:
# Separate features from the target and integer-encode the target. After this
# point `labelencoder` holds the target classes and is used later to decode
# predictions back to their original labels.
X = train_data.drop(['NObeyesdad'], axis=1)
y = train_data['NObeyesdad']
y = labelencoder.fit_transform(y)

# Drop `id` and align the test features to the training column order so the
# model receives the features in the order it was trained on.
X_test = test_data.drop(["id"], axis=1)[X.columns]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM hyper-parameters.
# NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction) only has
# an effect when a non-zero bagging frequency is set as well - confirm whether
# bagging was intended here.
param = {"objective": "multiclass",
    "metric": "multi_logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "random_state": 42,
    "num_class": len(labelencoder.classes_),  # number of distinct target labels
    'learning_rate': 0.030962211546832760,
    'n_estimators': 500,
    'lambda_l1': 0.009667446568254372,
    'lambda_l2': 0.04018641437301800,
    'max_depth': 10,
    'colsample_bytree': 0.40977129346872643,
    'subsample': 0.9535797422450176,
    'min_child_samples': 26}

# `verbose` is an alias of `verbosity` in LightGBM; passing verbose=100 next to
# "verbosity": -1 is contradictory, so only the value in `param` is kept.
model_lgb = lgb.LGBMClassifier(**param)
model_lgb.fit(X_train, y_train)

# Hard labels and class probabilities on the hold-out split; the probabilities
# feed the threshold search in the following cells.
pred_lgb = model_lgb.predict(X_val)
pred_proba = model_lgb.predict_proba(X_val)

In [21]:
def objective(trial):
    """Optuna objective: validation accuracy for one set of per-class thresholds.

    Reads the notebook-level ``pred_proba`` (validation class probabilities)
    and ``y_val`` (encoded validation labels).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one threshold in [0.0, 1.0] per class, under the
        parameter names ``threshold_0``, ``threshold_1``, ...

    Returns
    -------
    float
        Accuracy of the thresholded predictions against ``y_val``.
    """
    # Take the class count from the probability matrix itself so it always
    # matches the columns that `apply_thresholds` iterates over.
    n_classes = pred_proba.shape[1]

    # `suggest_float` replaces the deprecated `suggest_uniform`.
    thresholds = {
        f'threshold_{i}': trial.suggest_float(f'threshold_{i}', 0.0, 1.0)
        for i in range(n_classes)
    }

    # Apply the thresholds to convert probabilities to predictions
    y_pred = apply_thresholds(pred_proba, thresholds)

    return accuracy_score(y_val, y_pred)

def apply_thresholds(y_proba, thresholds):
    """Turn a class-probability matrix into labels using per-class cut-offs.

    Every row starts with its arg-max class. The classes are then visited in
    ascending index order, and any row whose probability for the visited class
    is strictly greater than that class's cut-off is relabelled to it. Because
    of that ordering, a higher-index class overrides a lower-index one when
    both exceed their cut-offs.

    Parameters
    ----------
    y_proba : numpy.ndarray
        2-D array indexed as ``[row, class]``.
    thresholds : dict
        Maps ``'threshold_<class index>'`` to the cut-off for that class.

    Returns
    -------
    numpy.ndarray
        1-D array with one class index per row of ``y_proba``.
    """
    labels = np.argmax(y_proba, axis=1)
    # Walking the transposed matrix yields one probability column per class.
    for class_idx, class_proba in enumerate(y_proba.T):
        cutoff = thresholds[f'threshold_{class_idx}']
        above_cutoff = class_proba > cutoff
        labels[above_cutoff] = class_idx
    return labels

In [23]:
import optuna

# Number of classes, taken from the validation probability matrix produced by
# the model cell (one column per class).
num_classes = pred_proba.shape[1]

# Maximise validation accuracy over the per-class thresholds. The sampler is
# seeded so that re-running the notebook reproduces the same search and the
# same best thresholds.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Get the best thresholds
best_thresholds = study.best_params
print("Best Thresholds:", best_thresholds)

[I 2024-02-27 21:01:05,537] A new study created in memory with name: no-name-54d294f9-a8c0-48c3-b3ad-cd7a3ba9f042
[I 2024-02-27 21:01:05,541] Trial 0 finished with value: 0.9015101772816809 and parameters: {'threshold_0': 0.4583686425682275, 'threshold_1': 0.443427785885007, 'threshold_2': 0.8761806395820562, 'threshold_3': 0.398558735917913, 'threshold_4': 0.3481531897076239, 'threshold_5': 0.1857552729094164, 'threshold_6': 0.22018491052984746}. Best is trial 0 with value: 0.9015101772816809.
[I 2024-02-27 21:01:05,545] Trial 1 finished with value: 0.8780914861019917 and parameters: {'threshold_0': 0.826687881799545, 'threshold_1': 0.5509948085998111, 'threshold_2': 0.9782860240481421, 'threshold_3': 0.07526970351667195, 'threshold_4': 0.5201416494325606, 'threshold_5': 0.43029451212812275, 'threshold_6': 0.07169020837359241}. Best is trial 0 with value: 0.9015101772816809.
[I 2024-02-27 21:01:05,551] Trial 2 finished with value: 0.8583935215583278 and parameters: {'threshold_0': 0.8

Best Thresholds: {'threshold_0': 0.3445424346963367, 'threshold_1': 0.9321711766676472, 'threshold_2': 0.5031887545974678, 'threshold_3': 0.8895267200373516, 'threshold_4': 0.16343218215132826, 'threshold_5': 0.6636434518600034, 'threshold_6': 0.8425915928866956}


In [24]:
# Per-class probability cut-offs passed to `apply_thresholds` when labelling
# the test set; keys follow the `threshold_<class index>` naming used there.
# NOTE(review): these hard-coded values differ from the "Best Thresholds"
# printed by the study output in this notebook - presumably they come from a
# different run; confirm which set is intended.
threshold = {
    f'threshold_{class_idx}': cutoff
    for class_idx, cutoff in enumerate([
        0.724201213234911,
        0.6161299800571379,
        0.29138887902587174,
        0.3145837593497076,
        0.8469398340837189,
        0.6800824438387787,
        0.35886959729223455,
    ])
}

In [26]:
# Predict class probabilities for the test set, then convert them to class
# indices with the per-class thresholds. The probabilities and the labels are
# kept under separate names instead of overwriting one variable.
test_proba = model_lgb.predict_proba(X_test)
test_label = apply_thresholds(test_proba, threshold)

# Map the class indices back to the original target labels and write the
# submission file. A forward slash keeps the output path valid on Windows and
# POSIX alike.
pred = labelencoder.inverse_transform(test_label)
submission = pd.DataFrame({'id': test_data['id'], 'NObeyesdad': pred})
submission.to_csv('files/model2_submission.csv', index=False)