### **SETTING**

In [354]:
import re
import os.path as osp
from tqdm import tqdm
import json

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import librosa
import optuna
import xgboost as xgb
from xgboost import XGBClassifier, DMatrix, train
import opensmile
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from IPython.display import Audio

### **데이터 정제 및 탐색**

<h4>[ 데이터 설명 ]</h4>
데이터 출처("Audio MNIST") : https://www.kaggle.com/datasets/sripaadsrinivasan/audio-mnist

- file_dir_nm : 대상 파일 저장 폴더
- file_id : 파일 이름
- class_id : 클래스 ID > 성별로 지정(male, female)
- class_name : 클래스 이름 > male(1), female(0)
- detail_class_name : 클래스에 관한 상세 설명 > {accent, age, origin}

In [297]:
# 데이터 불러오기
with open("./data/audioMNIST_meta.txt", "r") as file:
    json_data = file.read()

json_file = json.loads(json_data)

nums = [f"{n:02}" for n in range(1, 61)]  # 숫자 filtering
file_ids = []
class_ids = []
class_names = []
dir_nms = []
class_details = []

for num in nums:
    json_at_num = json_file[f"{num}"]
    path = f'data\{num}'

    # 대상 파일 저장 폴더
    dir_nm = [path] * 500
    dir_nms.append(dir_nm)

    # 파일 이름(.wav)
    ids_sub = [f'{i}_{num}_{j}.wav' for j in range(50) for i in range(10)]
    file_ids.append(ids_sub)

    # 클래스 ID
    gender = json_at_num['gender']
    class_name = [gender] * 500
    class_names.append(class_name)

    # class 이름
    if gender == 'male':
        class_id = 1
    elif gender == 'female':
        class_id = 0
    else:
        class_id = -1

    gender_result = [class_id] * 500
    class_ids.append(gender_result)

    # class detail : accent, age, origin
    accent, age, origin = json_at_num['accent'], json_at_num['age'], json_at_num['origin']
    class_detail = [f'{accent}, {age}, {origin}' for _ in range(500)]
    class_details.append(class_detail)

In [298]:
# 리스트 단순화
def flatten_list(nested_list):
    return [item for sublist in nested_list for item in sublist]

# 결과 확인
flattened_dir_nms = flatten_list(dir_nms)
flattened_file_ids = flatten_list(file_ids)
flattened_class_ids = flatten_list(class_ids)
flattened_class_names = flatten_list(class_names)
flattened_class_details = flatten_list(class_details)

print(len(flattened_dir_nms))
print(len(flattened_file_ids))
print(len(flattened_class_ids))
print(len(flattened_class_names))
print(len(flattened_class_details))

30000
30000
30000
30000
30000


In [299]:
# 데이터프레임 생성
df = pd.DataFrame({
    'file_dir_nm': flattened_dir_nms,
    'file_id': flattened_file_ids,
    'class_id': flattened_class_ids,
    'class_name': flattened_class_names,
    'detail_class_name': flattened_class_details
})

df

Unnamed: 0,file_dir_nm,file_id,class_id,class_name,detail_class_name
0,data\01,0_01_0.wav,1,male,"german, 30, Europe, Germany, Wuerzburg"
1,data\01,1_01_0.wav,1,male,"german, 30, Europe, Germany, Wuerzburg"
2,data\01,2_01_0.wav,1,male,"german, 30, Europe, Germany, Wuerzburg"
3,data\01,3_01_0.wav,1,male,"german, 30, Europe, Germany, Wuerzburg"
4,data\01,4_01_0.wav,1,male,"german, 30, Europe, Germany, Wuerzburg"
...,...,...,...,...,...
29995,data\60,5_60_49.wav,0,female,"Tamil, 27, Asia, India, Chennai"
29996,data\60,6_60_49.wav,0,female,"Tamil, 27, Asia, India, Chennai"
29997,data\60,7_60_49.wav,0,female,"Tamil, 27, Asia, India, Chennai"
29998,data\60,8_60_49.wav,0,female,"Tamil, 27, Asia, India, Chennai"


In [320]:
def train_test_data_split(df, train_size: float = 0.9):
    # 전체 데이터의 길이
    total_length = len(df)
    
    # 500개씩 나누기 위한 범위 계산
    num_chunks = total_length // 500

    # 결과를 저장할 리스트 초기화
    train_indices = []
    test_indices = []

    for term in range(num_chunks):
        # 각 청크의 데이터프레임 슬라이스
        sub_df = df.iloc[term*500 : (term+1)*500]

        # 숫자 배열 생성 (0부터 499까지)
        numbers = np.arange(len(sub_df))

        # 전체 숫자의 train_size에 해당하는 개수 계산
        sample_size = int(round(len(numbers) * train_size))

        # 0부터 1까지의 난수를 전체 숫자 개수만큼 생성
        random_values = np.random.random(len(numbers))

        # 난수를 기준으로 정렬한 후 상위 sample_size만큼 선택 > train number
        train_sub_indices = np.argsort(random_values)[:sample_size]

        # 나머지 > test number
        test_sub_indices = np.argsort(random_values)[sample_size:]

        # 원본 데이터프레임의 인덱스를 저장
        train_indices.extend(sub_df.iloc[train_sub_indices].index.tolist())
        test_indices.extend(sub_df.iloc[test_sub_indices].index.tolist())

    return train_indices, test_indices

In [321]:
# 함수 호출하여 train/test sample number 분리
train_sample, test_sample = train_test_data_split(df, train_size=0.7)

# train/test 데이터프레임 분리
train_df = df.iloc[train_sample]
test_df = df.iloc[test_sample]

print("Train DataFrame:")
print(train_df.head())
print("\nTest DataFrame:")
print(test_df.head())

Train DataFrame:
    file_dir_nm      file_id  class_id class_name  \
413     data\01  3_01_41.wav         1       male   
408     data\01  8_01_40.wav         1       male   
78      data\01   8_01_7.wav         1       male   
344     data\01  4_01_34.wav         1       male   
231     data\01  1_01_23.wav         1       male   

                          detail_class_name  
413  german, 30, Europe, Germany, Wuerzburg  
408  german, 30, Europe, Germany, Wuerzburg  
78   german, 30, Europe, Germany, Wuerzburg  
344  german, 30, Europe, Germany, Wuerzburg  
231  german, 30, Europe, Germany, Wuerzburg  

Test DataFrame:
    file_dir_nm      file_id  class_id class_name  \
265     data\01  5_01_26.wav         1       male   
197     data\01  7_01_19.wav         1       male   
392     data\01  2_01_39.wav         1       male   
447     data\01  7_01_44.wav         1       male   
468     data\01  8_01_46.wav         1       male   

                          detail_class_name  
265  g

In [322]:
# 데이터 확인
print("--< TRAIN DATA >--")
print(train_df[:3])
print(len(train_df))
print()
print("--< TEST DATA >--")
print(test_df[:3])
print(len(test_df))

--< TRAIN DATA >--
    file_dir_nm      file_id  class_id class_name  \
413     data\01  3_01_41.wav         1       male   
408     data\01  8_01_40.wav         1       male   
78      data\01   8_01_7.wav         1       male   

                          detail_class_name  
413  german, 30, Europe, Germany, Wuerzburg  
408  german, 30, Europe, Germany, Wuerzburg  
78   german, 30, Europe, Germany, Wuerzburg  
21000

--< TEST DATA >--
    file_dir_nm      file_id  class_id class_name  \
265     data\01  5_01_26.wav         1       male   
197     data\01  7_01_19.wav         1       male   
392     data\01  2_01_39.wav         1       male   

                          detail_class_name  
265  german, 30, Europe, Germany, Wuerzburg  
197  german, 30, Europe, Germany, Wuerzburg  
392  german, 30, Europe, Germany, Wuerzburg  
9000


In [323]:
print("--< COLUMN FEATURE >--")
print(train_df.columns)
print()
print("--< LENGTH OF DATA >--")
print("train data : ", len(train_df))
print("test_data : ", len(test_df))
print(f"ratio of train, test : {round(len(train_df)/(len(train_df) + len(test_df))*100)}%, {round(len(test_df)/(len(train_df) + len(test_df))*100)}%")

--< COLUMN FEATURE >--
Index(['file_dir_nm', 'file_id', 'class_id', 'class_name',
       'detail_class_name'],
      dtype='object')

--< LENGTH OF DATA >--
train data :  21000
test_data :  9000
ratio of train, test : 70%, 30%


In [324]:
# 파일 경로 생성 in rows
def get_file_path_from_row(row):
    return osp.join(row.file_dir_nm, row.file_id)

# 샘플 불러오기
sample_row = train_df.iloc[0]
file_path = get_file_path_from_row(sample_row)

# audio 확인
audio, sr = librosa.load(file_path)
print(audio.shape, sr)
Audio(audio, rate=sr)

(15688,) 22050


### **Feature Extract**

In [325]:
## opensmaile 사용

# 추출할 Feature 정의
feature_keys = ["eGeMAPSv02", "ComParE_2016"]

# 특징 추출 객체 생성
# opensmile의 `ComParE 2016` Feature 추출기 생성
smile = opensmile.Smile(
    feature_set = opensmile.FeatureSet.ComParE_2016,
    feature_level = opensmile.FeatureLevel.Functionals,
    num_workers = -1,
    multiprocessing = True
)

# `ComParE 2016`이 포함하는 Feature 목록 저장
feature_names = smile.feature_names

# 출력
print("[ComParE 2016 Feature List]")
print("# of the features :", len(feature_names))
print("----------------")
print("\n".join(feature_names))

[ComParE 2016 Feature List]
# of the features : 6373
----------------
audspec_lengthL1norm_sma_range
audspec_lengthL1norm_sma_maxPos
audspec_lengthL1norm_sma_minPos
audspec_lengthL1norm_sma_quartile1
audspec_lengthL1norm_sma_quartile2
audspec_lengthL1norm_sma_quartile3
audspec_lengthL1norm_sma_iqr1-2
audspec_lengthL1norm_sma_iqr2-3
audspec_lengthL1norm_sma_iqr1-3
audspec_lengthL1norm_sma_percentile1.0
audspec_lengthL1norm_sma_percentile99.0
audspec_lengthL1norm_sma_pctlrange0-1
audspec_lengthL1norm_sma_stddev
audspec_lengthL1norm_sma_skewness
audspec_lengthL1norm_sma_kurtosis
audspec_lengthL1norm_sma_meanSegLen
audspec_lengthL1norm_sma_maxSegLen
audspec_lengthL1norm_sma_minSegLen
audspec_lengthL1norm_sma_segLenStddev
audspec_lengthL1norm_sma_upleveltime25
audspec_lengthL1norm_sma_upleveltime50
audspec_lengthL1norm_sma_upleveltime75
audspec_lengthL1norm_sma_upleveltime90
audspec_lengthL1norm_sma_risetime
audspec_lengthL1norm_sma_leftctime
audspec_lengthL1norm_sma_lpgain
audspec_lengthL1

In [326]:
# 파일 경로로부터 Feature 추출
feature = smile.process_file(file_path)

# 추출된 Feature의 type 확인
print(type(feature))
print(feature.head(3))

<class 'pandas.core.frame.DataFrame'>
                                                      audspec_lengthL1norm_sma_range  \
file                start  end                                                         
data\01\3_01_41.wav 0 days 0 days 00:00:00.711458333                        0.147758   

                                                      audspec_lengthL1norm_sma_maxPos  \
file                start  end                                                          
data\01\3_01_41.wav 0 days 0 days 00:00:00.711458333                         0.609375   

                                                      audspec_lengthL1norm_sma_minPos  \
file                start  end                                                          
data\01\3_01_41.wav 0 days 0 days 00:00:00.711458333                              0.0   

                                                      audspec_lengthL1norm_sma_quartile1  \
file                start  end                                       

In [327]:
# 추출된 Feature의 shape 확인
print(feature.shape)

# 추출된 Feature 출력
display(feature)

# print(smile.feature_level) # FeatureLevel.Functionals

(1, 6373)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,audspec_lengthL1norm_sma_range,audspec_lengthL1norm_sma_maxPos,audspec_lengthL1norm_sma_minPos,audspec_lengthL1norm_sma_quartile1,audspec_lengthL1norm_sma_quartile2,audspec_lengthL1norm_sma_quartile3,audspec_lengthL1norm_sma_iqr1-2,audspec_lengthL1norm_sma_iqr2-3,audspec_lengthL1norm_sma_iqr1-3,audspec_lengthL1norm_sma_percentile1.0,...,mfcc_sma_de[14]_peakRangeAbs,mfcc_sma_de[14]_peakRangeRel,mfcc_sma_de[14]_peakMeanAbs,mfcc_sma_de[14]_peakMeanMeanDist,mfcc_sma_de[14]_peakMeanRel,mfcc_sma_de[14]_minRangeRel,mfcc_sma_de[14]_meanRisingSlope,mfcc_sma_de[14]_stddevRisingSlope,mfcc_sma_de[14]_meanFallingSlope,mfcc_sma_de[14]_stddevFallingSlope
file,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
data\01\3_01_41.wav,0 days,0 days 00:00:00.711458333,0.147758,0.609375,0.0,0.047935,0.08261,0.109086,0.034675,0.026475,0.06115,0.01795,...,5.160448,0.753518,1.257544,1.237329,18.50758,0.396857,77.349648,38.13628,83.870255,35.62965


In [486]:
# row에서 오디오 파일 경로를 추출하는 함수
def get_file_path_from_row(row):
    return row['file_dir_nm'] + '\\' + row['file_id']

def extract_features_from_df(df: pd.DataFrame, extractors: dict):
    """
        input
            df (DataFrame)    : Feature를 추출할 정보가 담긴 데이터프레임
            extractors (dict) : Opensmile Feature 추출기 (key: 추출기명, value: 추출기)
        
        output
            ret_df (DataFrame): 입력 데이터프레임 `df`에 추출된 Feature가 추가된 데이터프레임
    """
    
    ret_df = df.copy()
    
    for i, row in tqdm(df.iterrows(), total=len(df)):
        # 각 행에 해당되는 오디오 파일 불러오기
        y, sr = librosa.load(get_file_path_from_row(row))
        # 각 추출기를 사용하여 `ret_df`에 Feature set 추가
        for ext_name, extractor in extractors.items():
            # Feature set 추출
            feature = extractor.process_signal(y, sr)
            # Reset index
            feature.reset_index(inplace=True)
            # Feature 선택
            feature = feature.iloc[0, 3:]
            # `ret_df`의 해당 행에 새로운 column으로 값 추가
            ret_df.loc[i, feature.index] = feature.values

    return ret_df

In [487]:
# 추출할 Feature의 목록 정의
feature_keys = ["eGeMAPSv02", "ComParE_2016"]

# 생성된 추출기를 저장할 딕셔너리 정의
extractor_dict = {}

# 각 Feature마다 생성된 추출기를 딕셔너리에 저장
for key in feature_keys:
    if key == "eGeMAPSv02":
        extractor_dict[key] = opensmile.Smile(
            feature_set=opensmile.FeatureSet.eGeMAPSv02,
            feature_level=opensmile.FeatureLevel.Functionals
        )
    elif key == "ComParE_2016":
        extractor_dict[key] = opensmile.Smile(
            feature_set=opensmile.FeatureSet.ComParE_2016,
            feature_level=opensmile.FeatureLevel.Functionals
        )

# 저장된 추출기 목록 확인
print(extractor_dict.keys())

# Feature 추출
test_features = extract_features_from_df(test_df, extractor_dict) # test data set
train_features = extract_features_from_df(train_df, extractor_dict) # train data set

  0%|          | 0/9000 [00:00<?, ?it/s]

dict_keys(['eGeMAPSv02', 'ComParE_2016'])


100%|██████████| 9000/9000 [2:10:17<00:00,  1.15it/s]  
100%|██████████| 21000/21000 [18:53:17<00:00,  3.24s/it]       


  file_dir_nm     file_id  class_id class_name  \
0     data\01  0_01_0.wav         1       male   
1     data\01  1_01_0.wav         1       male   
2     data\01  2_01_0.wav         1       male   
3     data\01  3_01_0.wav         1       male   
4     data\01  4_01_0.wav         1       male   

                        detail_class_name  \
0  german, 30, Europe, Germany, Wuerzburg   
1  german, 30, Europe, Germany, Wuerzburg   
2  german, 30, Europe, Germany, Wuerzburg   
3  german, 30, Europe, Germany, Wuerzburg   
4  german, 30, Europe, Germany, Wuerzburg   

   F0semitoneFrom27.5Hz_sma3nz_stddevNorm  \
0                                0.037471   
1                                0.175938   
2                                0.202385   
3                                0.046895   
4                                0.045627   

   F0semitoneFrom27.5Hz_sma3nz_percentile20.0  \
0                                   27.206049   
1                                   26.505709   
2         

### 데이터 전처리

In [498]:
try:
    test_features = pd.merge(test_df[['file_dir_nm', 'file_id', 'class_id', 'class_name']], test_features)
    train_features = pd.merge(train_df[['file_dir_nm', 'file_id', 'class_id', 'class_name']], train_features)
except Exception as e:
    print(f"데이터프레임을 합치는 중 오류가 발생했습니다: {e}")

In [None]:
train_features.head()

In [None]:
# label : class_id(int)
train_labels = train_features["class_id"].astype(int)
test_labels = test_features["class_id"].astype(int)

# ('file_dir_nm', 'file_id', 'class_name', 'detail_class_name') 제외
train_features = train_features.iloc[:, 5:]
test_features = test_features.iloc[:, 5:]

In [None]:
train_features.columns = [re.sub("\\[(\d+)\\]", "\g<1>", column) for column in train_features.columns]
test_features.columns = [re.sub("\\[(\d+)\\]", "\g<1>", column) for column in test_features.columns]

In [None]:
train_x, val_x, train_y, val_y = train_test_split(train_df, train_labels, test_size=0.3, shuffle=True, random_state=0)
test_x, val_x, test_y, val_y = train_test_split(val_x, val_y, test_size=0.5, shuffle=True, random_state=0)

# 특정 열 삭제
train_x = train_x.drop(columns=['class_id'])
val_x = val_x.drop(columns=['class_id'])
test_x = test_x.drop(columns=['class_id'])

# # 데이터 확인(x)
# print(f"------------------------------------train x data df:{len(train_x)}")
# print(train_x[:2])
# print()
# print(f"------------------------------------validation x data df:{len(val_x)}")
# print(val_x[:2])
# print()
# print(f"------------------------------------test x data df:{len(test_x)}")
# print(test_x[:2])

# 데이터 확인(y)
print(f'train y length : {train_y.unique()}')
print(f'validation y length : {val_y.unique()}')
print(f'test y length : {test_y.unique()}')

train y length : [1 0]
validation y length : [1 0]
test y length : [1 0]


In [None]:
# Column별 결측값의 수 확인
print("Column별 결측값의 수")
print(train_features.isnull().sum())
print()
print(test_features.isnull().sum())
print()

# 전체 결측값 수 확인
print("총 결측값 수")
print(train_features.isnull().sum().sum())
print(test_features.isnull().sum().sum())

### 모델 학습

In [None]:
# 데이터 분리
train_x, dev_x, train_y, dev_y = train_test_split(train_features, train_labels, test_size=0.1, shuffle=True, random_state=0)
test_x, test_y = test_features, test_labels

In [None]:
# 스케일링 : MinMax
scaler = MinMaxScaler()

train_x.loc[:,:] = scaler.fit_transform(train_x)
dev_x.loc[:,:] = scaler.transform(dev_x)
test_x.loc[:,:] = scaler.transform(test_x)

In [468]:
SEED = 42
NUM_CLASSES = 2  # 다중 클래스의 수
N_JOBS = -1  # 사용할 CPU 코어 수, -1 : 전체 사용

In [437]:
# XGBoost Model 정의
xgb_model = XGBClassifier(
    random_state=SEED,
    num_class=NUM_CLASSES,
    tree_method='hist',
    n_jobs=N_JOBS,
    objective='multi:softmax',  # 다중 클래스 분류
    n_estimators=30,  # 부스팅을 위해 생성할 결정 트리의 수
    learning_rate=0.1,
    max_depth=7,
    eval_metric='mlogloss',  # 평가 지표 설정
    enable_categorical=True  # 카테고리 데이터를 사용할 수 있도록 설정
)

# 학습 진행
xgb_model.fit(
    train_x, train_y,
    eval_set=[(val_x, val_y)],
    verbose=True
)

# 검증 데이터로 성능 평가
xgb_pred_prob = xgb_model.predict_proba(val_x)
xgb_logloss = log_loss(val_y, xgb_pred_prob)  # log_loss : 모델의 예측 확률과 실제 레이블 간의 차이를 측정하는 지표
print(xgb_logloss)

[0]	validation_0-mlogloss:0.59815
[1]	validation_0-mlogloss:0.52035
[2]	validation_0-mlogloss:0.45547
[3]	validation_0-mlogloss:0.40063
[4]	validation_0-mlogloss:0.35378
[5]	validation_0-mlogloss:0.31343
[6]	validation_0-mlogloss:0.27842
[7]	validation_0-mlogloss:0.24789
[8]	validation_0-mlogloss:0.22114
[9]	validation_0-mlogloss:0.19760
[10]	validation_0-mlogloss:0.17682
[11]	validation_0-mlogloss:0.15843
[12]	validation_0-mlogloss:0.14210
[13]	validation_0-mlogloss:0.12758
[14]	validation_0-mlogloss:0.11464
[15]	validation_0-mlogloss:0.10308
[16]	validation_0-mlogloss:0.09276
[17]	validation_0-mlogloss:0.08351
[18]	validation_0-mlogloss:0.07523
[19]	validation_0-mlogloss:0.06780
[20]	validation_0-mlogloss:0.06113
[21]	validation_0-mlogloss:0.05513
[22]	validation_0-mlogloss:0.04974
[23]	validation_0-mlogloss:0.04489
[24]	validation_0-mlogloss:0.04053
[25]	validation_0-mlogloss:0.03660
[26]	validation_0-mlogloss:0.03305
[27]	validation_0-mlogloss:0.02986
[28]	validation_0-mlogloss:0.0

In [438]:
# XGBoost 모델로부터 feature 중요도 불러오기
feature_importance = pd.DataFrame({
    'name' : xgb_model.feature_names_in_,
    'imp' : xgb_model.feature_importances_,
})
# 중요도 순으로 Feature를 정렬
feature_importance.sort_values('imp', ascending=False, inplace=True)
# feature_importance.reset_index(drop=True, inplace=True)

# 중요도 상위 5개 Feature 확인
display(feature_importance.head())

Unnamed: 0,name,imp
0,file_dir_nm,1.0
4305,pcm_zcr_sma_linregc1,0.0
4315,audSpec_Rfilt_sma0_rqmean,0.0
4314,audSpec_Rfilt_sma0_flatness,0.0
4313,audSpec_Rfilt_sma0_amean,0.0


In [None]:
# 전체 Feature의 중요도 시각화
plt.figure(figsize=(7,3))
plt.plot(feature_importance['imp'])
plt.xlim((-50, 3000))
plt.title("Feature importance check")
plt.show()

In [None]:
# 중요도 상위 20% Feature 확인
imp_feats = feature_importance["name"].head(int(0.2 * len(feature_importance)))

# 나머지 Feature 제거
train_x = train_x[imp_feats]
dev_x = dev_x[imp_feats]
train_features = train_features[imp_feats]
test_features = test_features[imp_feats]

# 확인
print(train_x.shape, dev_x.shape, train_features.shape, test_features.shape)

### **모델 학습 by Hyperparameter 튜닝**

In [440]:
# 목적 함수 정의
def objective(trial):
    
    # 하이퍼 파라미터 후보 설정
    model_params = {
        "objective" : 'multi:softprob',
        "num_class" : NUM_CLASSES,
        "n_estimators" : 30,
        
        'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth' : trial.suggest_int('max_depth', 10, 30),
        'min_child_weight' : trial.suggest_float('min_child_weight', 10, 50),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0, step=0.01),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma' : trial.suggest_float('gamma', 0, 1, step=0.5),
        
        "tree_method" : 'hist',
        "n_jobs" : N_JOBS,
        "random_state" : SEED,
    }
    
    fit_params = {
        "eval_set" : [(val_x, val_y)],
        "verbose" : 5,
    }
    
    xgbc = XGBClassifier(**model_params)
    xgbc.fit(train_x, train_y, **fit_params)
    pred_probs = xgbc.predict_proba(val_x)
    logloss = log_loss(val_y, pred_probs)
    return logloss

In [441]:
# Optuna 스터디 생성 및 최적화 진행
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, timeout=600)


[I 2024-07-13 15:31:33,931] A new study created in memory with name: no-name-6c6c62eb-8cb9-44ea-afcd-a173fc0fee4a


[0]	validation_0-mlogloss:0.60650
[5]	validation_0-mlogloss:0.33577
[10]	validation_0-mlogloss:0.19848
[15]	validation_0-mlogloss:0.12099
[20]	validation_0-mlogloss:0.07498
[25]	validation_0-mlogloss:0.04692
[29]	validation_0-mlogloss:0.03239


[I 2024-07-13 15:35:03,306] Trial 0 finished with value: 0.032392377818624175 and parameters: {'learning_rate': 0.09077106087422517, 'max_depth': 27, 'min_child_weight': 36.23255448773862, 'subsample': 0.99, 'colsample_bytree': 0.9948320969246158, 'gamma': 0.0}. Best is trial 0 with value: 0.032392377818624175.


[0]	validation_0-mlogloss:0.64436
[5]	validation_0-mlogloss:0.45903
[10]	validation_0-mlogloss:0.33695
[15]	validation_0-mlogloss:0.25136
[20]	validation_0-mlogloss:0.18970
[25]	validation_0-mlogloss:0.14433
[29]	validation_0-mlogloss:0.11647


[I 2024-07-13 15:38:53,305] Trial 1 finished with value: 0.11646954119950532 and parameters: {'learning_rate': 0.050053493621183026, 'max_depth': 16, 'min_child_weight': 43.908087915034535, 'subsample': 0.71, 'colsample_bytree': 0.9595882569059477, 'gamma': 0.0}. Best is trial 0 with value: 0.032392377818624175.


[0]	validation_0-mlogloss:0.68250
[5]	validation_0-mlogloss:0.63266
[10]	validation_0-mlogloss:0.58809
[15]	validation_0-mlogloss:0.54749
[20]	validation_0-mlogloss:0.51022
[25]	validation_0-mlogloss:0.47602
[29]	validation_0-mlogloss:0.45082


[I 2024-07-13 15:42:52,998] Trial 2 finished with value: 0.45082367887099584 and parameters: {'learning_rate': 0.010704302133142416, 'max_depth': 13, 'min_child_weight': 17.60937194101015, 'subsample': 0.94, 'colsample_bytree': 0.8229828344067576, 'gamma': 0.5}. Best is trial 0 with value: 0.032392377818624175.


In [442]:
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Best trial:
  Value: 0.032392377818624175
  Params: 
    learning_rate: 0.09077106087422517
    max_depth: 27
    min_child_weight: 36.23255448773862
    subsample: 0.99
    colsample_bytree: 0.9948320969246158
    gamma: 0.0


In [454]:
# 최적의 하이퍼파라미터 불러오기
best_params = study.best_params

# 기타 필요한 파라미터 업데이트
best_params.update(
    {
        "objective" : 'multi:softmax',
        "num_class" : NUM_CLASSES,
        "n_estimators" : 100,
        "tree_method" : 'hist',
        "n_jobs" : N_JOBS,
        "random_state" : SEED,
    }
)

In [456]:
# 전체 데이터 사용
model = XGBClassifier(**best_params)

try:
    # Fitting the model
    model.fit(train_x, train_y,
              eval_set=[(val_x, val_y)],
              verbose=True)
    
except xgb.core.XGBoostError as e:
    print("XGBoostError:", e)
except Exception as e:
    print("General error:", e)

[0]	validation_0-mlogloss:0.60650
[1]	validation_0-mlogloss:0.53422
[2]	validation_0-mlogloss:0.47302
[3]	validation_0-mlogloss:0.42059
[4]	validation_0-mlogloss:0.37525
[5]	validation_0-mlogloss:0.33577
[6]	validation_0-mlogloss:0.30117
[7]	validation_0-mlogloss:0.27070
[8]	validation_0-mlogloss:0.24374
[9]	validation_0-mlogloss:0.21980
[10]	validation_0-mlogloss:0.19848
[11]	validation_0-mlogloss:0.17944
[12]	validation_0-mlogloss:0.16239
[13]	validation_0-mlogloss:0.14710
[14]	validation_0-mlogloss:0.13336
[15]	validation_0-mlogloss:0.12099
[16]	validation_0-mlogloss:0.10983
[17]	validation_0-mlogloss:0.09976
[18]	validation_0-mlogloss:0.09067
[19]	validation_0-mlogloss:0.08243
[20]	validation_0-mlogloss:0.07498
[21]	validation_0-mlogloss:0.06823
[22]	validation_0-mlogloss:0.06211
[23]	validation_0-mlogloss:0.05655
[24]	validation_0-mlogloss:0.05151
[25]	validation_0-mlogloss:0.04692
[26]	validation_0-mlogloss:0.04276
[27]	validation_0-mlogloss:0.03897
[28]	validation_0-mlogloss:0.0

In [479]:
test_x.columns

Index(['file_dir_nm', 'file_id', 'class_name', 'detail_class_name',
       'F0semitoneFrom27.5Hz_sma3nz_stddevNorm',
       'F0semitoneFrom27.5Hz_sma3nz_percentile20.0',
       'F0semitoneFrom27.5Hz_sma3nz_percentile50.0',
       'F0semitoneFrom27.5Hz_sma3nz_percentile80.0',
       'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2',
       'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope',
       ...
       'mfcc_sma_de14_peakRangeAbs', 'mfcc_sma_de14_peakRangeRel',
       'mfcc_sma_de14_peakMeanAbs', 'mfcc_sma_de14_peakMeanMeanDist',
       'mfcc_sma_de14_peakMeanRel', 'mfcc_sma_de14_minRangeRel',
       'mfcc_sma_de14_meanRisingSlope', 'mfcc_sma_de14_stddevRisingSlope',
       'mfcc_sma_de14_meanFallingSlope', 'mfcc_sma_de14_stddevFallingSlope'],
      dtype='object', length=6463)

In [485]:
try : 
    # 예측
    pred_labels = model.predict(test_x)
    # Los loss를 계산하기 위한 proba 계산
    pred_probs = model.predict_proba(test_y)

    # 내용 확인
    print(pred_labels.shape)
    print(pred_labels)
except Exception as e: 
    e

{<class 'pandas.core.series.Series'>}
Not supported type for data.<class 'xgboost.core.DMatrix'>




In [None]:
# 정확도 계산
test_accuracy = sum(pred_labels == test_labels.values) / len(pred_labels)
# Log Loss 계산
test_logloss = log_loss(test_labels, pred_probs)

# 출력
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Log Loss: {test_logloss}")