In [59]:
# Importing required libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score

import logging
import warnings

# Silence every Python warning for the rest of the notebook. The blanket
# 'ignore' filter already matches UserWarning and DeprecationWarning, so no
# per-category filters are needed.
warnings.filterwarnings('ignore')

# Only let ERROR (and above) records through the 'lightgbm' logger.
logging.getLogger('lightgbm').setLevel(logging.ERROR)

#### 1、读入数据
加入了额外数据original_data作为训练数据

In [60]:
# Read the four CSV inputs in one pass. ObesityDataSet.csv is the extra
# dataset that a later cell appends to the training data.
train_data, test_data, original_data, sample_submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/s4e2/train.csv",
        "data/s4e2/test.csv",
        "data/s4e2/ObesityDataSet.csv",
        "data/s4e2/sample_submission.csv",
    )
)

#### 2、合并train数据
train_data中的id列先删除，再和original_data数据合并，并去除重复数据

In [61]:
# Drop the `id` column, append the extra dataset underneath, then remove
# exact duplicate rows. The index is not reset after de-duplication.
train_data = (
    pd.concat([train_data.drop(columns="id"), original_data], ignore_index=True)
    .drop_duplicates()
)
train_data.shape

(22845, 17)

#### 3、区分train和test中的数值列和分类列
target的这一列还在train的cat_cols中

In [62]:
# Partition the columns of each frame by dtype: object columns are treated as
# categorical, everything else as numeric. train_data still contains the
# target column at this point, so an object-typed target ends up in cat_cols.
cat_cols = train_data.select_dtypes(include=['object']).columns.tolist()
num_cols = train_data.select_dtypes(exclude=['object']).columns.tolist()

cat_cols_test = test_data.select_dtypes(include=['object']).columns.tolist()
# `id` is an identifier, not a feature, so keep it out of the numeric list.
num_cols_test = [
    col
    for col in test_data.select_dtypes(exclude=['object']).columns
    if col != 'id'
]

#### 4、数字列进行标准化处理

对train和test中的数值列都进行标准化：scaler只在train上拟合，再用同一个scaler转换test。

In [63]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training numeric columns only, then apply the same
# fitted transform to both frames.
# NOTE(review): this assumes num_cols_test lists the same columns, in the same
# order, as num_cols — confirm against the data.
scaler = StandardScaler().fit(train_data[num_cols])
train_data[num_cols] = scaler.transform(train_data[num_cols])
test_data[num_cols_test] = scaler.transform(test_data[num_cols_test])

#### 5、分类列进行encode
train和test都进行处理，**不包括train中的target**

In [64]:
# Integer-encode the object-typed feature columns (the target is excluded here).
from sklearn.preprocessing import LabelEncoder

# Shared instance kept for the later cells, which fit it on the target and use
# it to map predictions back to class names.
labelencoder = LabelEncoder()
object_columns = train_data.select_dtypes(include='object').columns.difference(['NObeyesdad'])

for col_name in object_columns:
    # One encoder per column, fitted ONCE and applied to both frames. Fitting a
    # separate encoder on test (as before) silently assigns different integers
    # to the same category whenever the two frames do not contain exactly the
    # same set of values.
    col_encoder = LabelEncoder()
    if col_name in test_data.columns:
        # Fit on the union so that a category that appears only in test still
        # gets a code instead of raising at transform time.
        col_encoder.fit(
            pd.concat([train_data[col_name], test_data[col_name]], ignore_index=True)
        )
        test_data[col_name] = col_encoder.transform(test_data[col_name])
    else:
        col_encoder.fit(train_data[col_name])
    train_data[col_name] = col_encoder.transform(train_data[col_name])

# Any object column that exists only in test has no train counterpart to align
# with; encode it on its own so that no string column is left behind.
for col_name in test_data.select_dtypes(include='object').columns:
    test_data[col_name] = LabelEncoder().fit_transform(test_data[col_name])

#### 6、train_data分为x和y
test_data去除id，并对训练数据中的y进行encode

In [65]:
# Features: every training column except the target.
X = train_data.drop(columns=['NObeyesdad'])
# Target: class names encoded as integers. This leaves `labelencoder` fitted on
# the target, which is what the submission cell uses to invert predictions.
y = labelencoder.fit_transform(train_data['NObeyesdad'])
# Test features: same frame without the identifier column.
X_test = test_data.drop(columns=["id"])

In [66]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### 7.model:LGBMClassifier

In [67]:
# LightGBM hyper-parameters for the multiclass classifier.
# NOTE(review): the long decimal values look like the output of an earlier
# hyper-parameter search; their provenance is not recorded in this notebook.
param = {
    "objective": "multiclass",
    "metric": "multi_logloss",
    "verbosity": -1,  # the only verbosity setting; see the constructor call below
    "boosting_type": "gbdt",
    "random_state": 42,
    "num_class": 7,
    'learning_rate': 0.030962211546832760,
    'n_estimators': 500,
    'lambda_l1': 0.009667446568254372,
    'lambda_l2': 0.04018641437301800,
    'max_depth': 10,
    'colsample_bytree': 0.40977129346872643,
    # NOTE(review): LightGBM documents row subsampling as active only when the
    # bagging frequency (subsample_freq / bagging_freq) is > 0, which is not
    # set here — confirm whether this value has any effect.
    'subsample': 0.9535797422450176,
    'min_child_samples': 26,
}

# `verbose` is an alias of `verbosity`; passing verbose=100 next to
# "verbosity": -1 gave the model two conflicting values for one setting, so
# only the value from `param` is kept.
model_lgb = lgb.LGBMClassifier(**param)
model_lgb.fit(X_train, y_train)
pred_lgb = model_lgb.predict(X_val)  # predicted class labels
# Class probabilities: one row per validation sample, one column per class
# (seven values per sample here).
pred_proba = model_lgb.predict_proba(X_val)

#### 8.使用阈值来调优

In [68]:
import optuna

def objective(trial):
    """Optuna objective: validation accuracy for one set of per-class thresholds.

    Samples one threshold in [0.0, 1.0] for every class column of the global
    ``pred_proba``, turns the probabilities into labels with
    ``apply_thresholds`` and scores them against the global ``y_val``.

    Args:
        trial: the Optuna trial used to sample ``threshold_<i>`` parameters.

    Returns:
        The accuracy of the thresholded predictions on the validation set.
    """
    # Take the class count from the probabilities themselves so the sampled
    # keys always match what apply_thresholds looks up, without relying on a
    # global that is only defined in a later cell.
    n_classes = pred_proba.shape[1]

    # suggest_float replaces the deprecated suggest_uniform (same uniform range).
    thresholds = {
        f'threshold_{i}': trial.suggest_float(f'threshold_{i}', 0.0, 1.0)
        for i in range(n_classes)
    }

    y_pred = apply_thresholds(pred_proba, thresholds)
    return accuracy_score(y_val, y_pred)

def apply_thresholds(y_proba, thresholds):
    """Turn class probabilities into labels using per-class thresholds.

    Every sample starts with its argmax class. The classes are then visited in
    index order, and any sample whose probability for class ``i`` is strictly
    greater than ``thresholds['threshold_<i>']`` is relabelled to ``i``. Later
    classes overwrite earlier ones, so when several classes exceed their
    thresholds the one with the highest index wins.

    Args:
        y_proba: 2-D array of probabilities, one row per sample and one
            column per class.
        thresholds: dict with a ``'threshold_<i>'`` entry for every column.

    Returns:
        1-D integer array with one predicted class index per sample.
    """
    labels = np.argmax(y_proba, axis=1)
    # Iterating over the transpose yields one probability column per class.
    for class_idx, class_proba in enumerate(y_proba.T):
        cutoff = thresholds[f'threshold_{class_idx}']
        labels[class_proba > cutoff] = class_idx
    return labels

用一个例子说明apply_thresholds如何用阈值字典修正预测：先对每个样本取argmax，再按类别顺序依次检查，概率超过该类阈值的样本会被改成该类；如果同时超过多个类别的阈值，则以编号最大的类别为准。

In [69]:
# Toy probability matrix: 5 samples (rows) x 3 classes (columns).
my_y = np.array(
    [
        [0.3, 0.6, 0.1],  # sample 1
        [0.7, 0.2, 0.1],  # sample 2
        [0.2, 0.2, 0.6],  # sample 3
        [0.5, 0.3, 0.2],  # sample 4: class 0 and class 2 both exceed their thresholds
        [0.1, 0.8, 0.1],  # sample 5
    ]
)

# Example per-class thresholds. When several classes exceed their thresholds
# the LAST one (highest class index) wins, not the first: sample 4 ends up as
# class 2 even though class 0 has the largest probability.
my_thre = dict(threshold_0=0.4, threshold_1=0.5, threshold_2=0.1)
my_y_pred = apply_thresholds(my_y, my_thre)
my_y_pred


array([1, 0, 2, 2, 1], dtype=int64)

开始优化过程：会运行100次试验，即调用objective函数来尝试不同的阈值组合，并找到最大化准确率的组合。

In [70]:
num_classes = 7  # number of target classes; read as a global by other cells

# Seed the sampler so that Restart & Run All explores the same trials and
# reports the same best thresholds. TPESampler is Optuna's default sampler for
# single-objective studies, so only the seed changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Threshold per class from the best-scoring trial.
best_thresholds = study.best_params
print("Best Thresholds:", best_thresholds)

[I 2024-02-29 14:48:35,452] A new study created in memory with name: no-name-866d5a29-6665-4dd9-880b-ec416ffb7717
[I 2024-02-29 14:48:35,468] Trial 0 finished with value: 0.8936309914642153 and parameters: {'threshold_0': 0.7622045623596965, 'threshold_1': 0.9355079897965807, 'threshold_2': 0.5088469144278214, 'threshold_3': 0.6801121496038931, 'threshold_4': 0.10313430806503077, 'threshold_5': 0.0902209715608897, 'threshold_6': 0.23520825922240574}. Best is trial 0 with value: 0.8936309914642153.
[I 2024-02-29 14:48:35,475] Trial 1 finished with value: 0.9072007003720727 and parameters: {'threshold_0': 0.9621276674093883, 'threshold_1': 0.25405380334152394, 'threshold_2': 0.7800769108842507, 'threshold_3': 0.6044347492470464, 'threshold_4': 0.8546372326055873, 'threshold_5': 0.3150388333766899, 'threshold_6': 0.8882418810573716}. Best is trial 1 with value: 0.9072007003720727.
[I 2024-02-29 14:48:35,483] Trial 2 finished with value: 0.9087327642810243 and parameters: {'threshold_0': 0

Best Thresholds: {'threshold_0': 0.9619249021644121, 'threshold_1': 0.6948706091661514, 'threshold_2': 0.9706612534574464, 'threshold_3': 0.4207464572728179, 'threshold_4': 0.25084908594336014, 'threshold_5': 0.7489466254602275, 'threshold_6': 0.6359498429335904}


In [72]:
# Thresholds pinned from the "Best Thresholds" output of the recorded tuning
# run, so the submission does not change when the study is re-run. Replace
# them with `best_thresholds` to use freshly tuned values.
threshold = {
    'threshold_0': 0.9619249021644121,
    'threshold_1': 0.6948706091661514,
    'threshold_2': 0.9706612534574464,
    'threshold_3': 0.4207464572728179,
    'threshold_4': 0.25084908594336014,
    'threshold_5': 0.7489466254602275,
    'threshold_6': 0.6359498429335904,
}
pred_res = apply_thresholds(pred_proba, threshold)
# accuracy_score expects (y_true, y_pred); the arguments were swapped before.
# Accuracy is symmetric, so the printed value is unchanged.
print(accuracy_score(y_val, pred_res))

0.9137666885532939


In [73]:
# Predict class probabilities for the test features and convert them to class
# indices with the same per-class thresholds used on the validation set.
test_label = apply_thresholds(model_lgb.predict_proba(X_test), threshold)

In [74]:
# Map the integer predictions back to the original class names. This relies on
# `labelencoder` having been fitted most recently on the target column.
pred = labelencoder.inverse_transform(test_label)
submission = pd.DataFrame(
    {
        'id': test_data['id'],
        'NObeyesdad': pred,
    }
)
submission.to_csv('data/s4e2/submission_v2.csv', index=False)