In [1]:
import os
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  # 분류분석
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import adjusted_rand_score, homogeneity_score, completeness_score, v_measure_score, mutual_info_score
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# data load

In [2]:
VGG_vectors = np.load('../../data/VGG_vectors.npy')
VGG_vectors.shape

(80158, 512)

In [3]:
painting = pd.read_csv('painting.csv')
painting.head()

Unnamed: 0,file,title,artist,genre,style
0,Realism/vincent-van-gogh_pine-trees-in-the-fen...,pine-trees-in-the-fen-1884,vincent-van-gogh,landscape,Realism
1,Baroque/rembrandt_the-angel-appearing-to-the-s...,the-angel-appearing-to-the-shepherds-1634,rembrandt,religious painting,Baroque
2,Post_Impressionism/paul-cezanne_portrait-of-th...,portrait-of-the-artist-s-son,paul-cezanne,portrait,Post_Impressionism
3,Impressionism/pierre-auguste-renoir_young-girl...,young-girl-seated-in-a-meadow-1916,pierre-auguste-renoir,genre painting,Impressionism
4,Romanticism/ivan-aivazovsky_morning-1851.jpg,morning-1851,ivan-aivazovsky,marina,Romanticism


In [4]:
X = VGG_vectors
y = painting['style']

In [5]:
# 독립변수 스케일 조정
smote = SMOTE(sampling_strategy='auto', random_state=42)
X, y = smote.fit_resample(X, y)

In [6]:
undersample = NearMiss(version=1)
X_sampled, y_sampled = undersample.fit_resample(X, y)

In [7]:
pd.Series(y_sampled).value_counts()

Abstract_Expressionism        12683
Minimalism                    12683
Synthetic_Cubism              12683
Symbolism                     12683
Romanticism                   12683
Rococo                        12683
Realism                       12683
Post_Impressionism            12683
Pop_Art                       12683
Pointillism                   12683
Northern_Renaissance          12683
New_Realism                   12683
Naive_Art_Primitivism         12683
Mannerism_Late_Renaissance    12683
Action_painting               12683
Impressionism                 12683
High_Renaissance              12683
Fauvism                       12683
Expressionism                 12683
Early_Renaissance             12683
Cubism                        12683
Contemporary_Realism          12683
Color_Field_Painting          12683
Baroque                       12683
Art_Nouveau_Modern            12683
Analytical_Cubism             12683
Ukiyo_e                       12683
Name: style, dtype: int64

In [8]:
train_X, test_X, train_y, test_y = train_test_split(X_sampled, y_sampled, 
                                                    test_size=0.3, 
                                                    shuffle=True,  # 분할 전 데이터 섞기
                                                    stratify=y_sampled     # 층화추출
                                                   )
train_X.shape, test_X.shape, train_y.shape, test_y.shape

((239708, 512), (102733, 512), (239708,), (102733,))

# 분류분석

## DecisionTreeClassifier

In [9]:
# 의사결정나무를 구현한 DecisionTreeClassifier를 통한 모형
dt_model = DecisionTreeClassifier(random_state=1)
dt_model = dt_model.fit(train_X, train_y)  # 머신러닝에서는 fit시 독립변수에 array
dt_model

In [10]:
dt_model.score(test_X, test_y)

0.624015652224699

In [11]:
dt_pred_y = dt_model.predict(test_X)
pd.crosstab(test_y, dt_pred_y)  # 실제값, 예측값

col_0,Abstract_Expressionism,Action_painting,Analytical_Cubism,Art_Nouveau_Modern,Baroque,Color_Field_Painting,Contemporary_Realism,Cubism,Early_Renaissance,Expressionism,...,Northern_Renaissance,Pointillism,Pop_Art,Post_Impressionism,Realism,Rococo,Romanticism,Symbolism,Synthetic_Cubism,Ukiyo_e
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abstract_Expressionism,2062,23,7,89,30,171,39,105,30,137,...,41,31,157,98,60,20,67,100,16,52
Action_painting,24,3712,1,5,0,2,4,1,2,6,...,4,1,4,7,1,1,1,6,0,2
Analytical_Cubism,3,0,3737,2,1,0,2,9,2,11,...,2,0,0,8,2,0,4,6,5,4
Art_Nouveau_Modern,124,13,5,1599,83,39,44,112,84,198,...,103,55,106,140,132,62,110,169,10,113
Baroque,19,4,0,63,1819,6,27,21,63,110,...,123,23,28,61,220,291,222,102,2,15
Color_Field_Painting,174,3,1,25,5,3057,21,27,6,24,...,12,9,74,16,22,1,10,29,5,11
Contemporary_Realism,40,3,2,35,28,16,3146,33,15,26,...,27,15,34,51,41,15,37,44,3,15
Cubism,117,14,25,90,21,50,27,2253,43,209,...,44,18,116,100,37,13,34,70,71,64
Early_Renaissance,25,7,8,58,76,4,23,46,2614,78,...,137,22,14,68,54,60,62,73,3,24
Expressionism,155,10,18,182,101,42,69,222,77,1015,...,134,49,116,184,158,64,151,184,21,98


In [12]:
# y_test는 실제 정답 (원-핫 인코딩된 경우 argmax로 변환)
y_test_labels = test_y
y_pred_labels = dt_pred_y  # 모델 예측값도 동일하게 변환

# ✅ 상세한 성능 지표 출력
print("🔹 Classification Report:")
print(classification_report(y_test_labels, y_pred_labels, digits=4))

# ✅ 개별 지표 계산 (매크로 평균 사용)
precision = precision_score(y_test_labels, y_pred_labels, average='macro')
recall = recall_score(y_test_labels, y_pred_labels, average='macro')
f1 = f1_score(y_test_labels, y_pred_labels, average='macro')

print(f"🎯 Precision: {precision:.4f}")
print(f"🔄 Recall: {recall:.4f}")
print(f"🔥 F1-score: {f1:.4f}")

🔹 Classification Report:
                            precision    recall  f1-score   support

    Abstract_Expressionism     0.5375    0.5419    0.5397      3805
           Action_painting     0.9597    0.9756    0.9675      3805
         Analytical_Cubism     0.9659    0.9821    0.9739      3805
        Art_Nouveau_Modern     0.4398    0.4202    0.4298      3805
                   Baroque     0.4895    0.4781    0.4837      3805
      Color_Field_Painting     0.7928    0.8034    0.7981      3805
      Contemporary_Realism     0.7781    0.8268    0.8017      3805
                    Cubism     0.5943    0.5921    0.5932      3805
         Early_Renaissance     0.6797    0.6870    0.6833      3805
             Expressionism     0.2819    0.2668    0.2741      3805
                   Fauvism     0.6853    0.7057    0.6953      3805
          High_Renaissance     0.6618    0.6857    0.6736      3805
             Impressionism     0.2487    0.2229    0.2351      3805
Mannerism_Late_Renaiss

In [13]:
# fbeta score의 beta=1 : f1 score
# fbeta score의 2>=beta>1 : recall의 가중치가 높게 조정된 f1 score
# fbeta score의 0<=beta<1 : precision의 가중치가 높게 조정된 f1 score
print(fbeta_score(test_y, dt_pred_y, beta=0.5, average='macro'))
print(fbeta_score(test_y, dt_pred_y, beta=1, average='macro'))
print(fbeta_score(test_y, dt_pred_y, beta=2, average='macro'))

0.6175044067161766
0.6197924170995668
0.6222636703382227


## MLPClassifier

In [14]:
# 다중신경망 모형
mlp_model = MLPClassifier(hidden_layer_sizes=(38, 64, 32), max_iter=500, random_state=1)
mlp_model = mlp_model.fit(train_X, train_y)
mlp_model

In [15]:
mlp_model.score(test_X, test_y)

0.7234189598279034

In [16]:
mlp_pred_y = mlp_model.predict(test_X)
mlp_result = pd.crosstab(test_y, mlp_pred_y)
mlp_result

col_0,Abstract_Expressionism,Action_painting,Analytical_Cubism,Art_Nouveau_Modern,Baroque,Color_Field_Painting,Contemporary_Realism,Cubism,Early_Renaissance,Expressionism,...,Northern_Renaissance,Pointillism,Pop_Art,Post_Impressionism,Realism,Rococo,Romanticism,Symbolism,Synthetic_Cubism,Ukiyo_e
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abstract_Expressionism,2656,17,1,55,6,118,10,135,13,150,...,1,2,185,29,15,2,1,126,5,24
Action_painting,10,3791,0,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
Analytical_Cubism,3,0,3783,0,0,0,0,10,8,1,...,0,0,0,0,0,0,0,0,0,0
Art_Nouveau_Modern,84,2,0,1635,39,4,24,102,61,295,...,85,20,117,221,86,48,52,371,0,119
Baroque,11,0,0,32,2073,3,4,6,60,46,...,111,2,8,24,152,661,190,72,0,5
Color_Field_Painting,217,5,0,3,3,3363,8,6,2,5,...,0,3,36,2,0,0,0,13,0,1
Contemporary_Realism,17,0,0,8,2,4,3612,5,1,14,...,1,0,27,7,8,0,8,41,0,6
Cubism,109,0,6,52,12,14,1,2835,3,269,...,6,8,108,101,3,2,1,25,42,7
Early_Renaissance,1,0,0,61,43,0,0,6,3321,20,...,92,1,3,17,7,13,13,35,0,3
Expressionism,190,5,3,238,71,7,25,363,48,1034,...,60,27,114,358,137,29,34,340,5,35


In [17]:
# y_test는 실제 정답 (원-핫 인코딩된 경우 argmax로 변환)
y_test_labels = test_y
y_pred_labels = mlp_pred_y  # 모델 예측값도 동일하게 변환

# ✅ 상세한 성능 지표 출력
print("🔹 Classification Report:")
print(classification_report(y_test_labels, y_pred_labels, digits=4))

# ✅ 개별 지표 계산 (매크로 평균 사용)
precision = precision_score(y_test_labels, y_pred_labels, average='macro')
recall = recall_score(y_test_labels, y_pred_labels, average='macro')
f1 = f1_score(y_test_labels, y_pred_labels, average='macro')

print(f"🎯 Precision: {precision:.4f}")
print(f"🔄 Recall: {recall:.4f}")
print(f"🔥 F1-score: {f1:.4f}")

🔹 Classification Report:
                            precision    recall  f1-score   support

    Abstract_Expressionism     0.6690    0.6980    0.6832      3805
           Action_painting     0.9880    0.9963    0.9921      3805
         Analytical_Cubism     0.9961    0.9942    0.9951      3805
        Art_Nouveau_Modern     0.5048    0.4297    0.4642      3805
                   Baroque     0.5135    0.5448    0.5287      3805
      Color_Field_Painting     0.9092    0.8838    0.8963      3805
      Contemporary_Realism     0.9350    0.9493    0.9421      3805
                    Cubism     0.7073    0.7451    0.7257      3805
         Early_Renaissance     0.7920    0.8728    0.8305      3805
             Expressionism     0.3102    0.2717    0.2897      3805
                   Fauvism     0.8202    0.8778    0.8480      3805
          High_Renaissance     0.7903    0.7982    0.7942      3805
             Impressionism     0.4596    0.3782    0.4149      3805
Mannerism_Late_Renaiss

In [18]:
# fbeta score의 beta=1 : f1 score
# fbeta score의 2>=beta>1 : recall의 가중치가 높게 조정된 f1 score
# fbeta score의 0<=beta<1 : precision의 가중치가 높게 조정된 f1 score
print(fbeta_score(test_y, mlp_pred_y, beta=0.5, average='macro'))
print(fbeta_score(test_y, mlp_pred_y, beta=1, average='macro'))
print(fbeta_score(test_y, mlp_pred_y, beta=2, average='macro'))

0.7148040872596299
0.716573342839279
0.7201190376098922


# 군집분석

In [19]:
os.environ['OMP_NUM_THREADS'] = '1'

In [20]:
# 그래프 저장 시 해상도 높게
%config InlineBackend.figure_format = 'retina'

# 한글설정
# plt.rc('font', family='Malgun Gothic')  # 윈도우즈
# plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rc('font', family='AppleGothic') # Mac 운영체제
plt.rc('axes', unicode_minus=False)  # 축의 -(마이너스) 깨짐 방지

## KMeans 클러스터링

In [21]:
len(set(y))

27

In [22]:
kmeans_model = KMeans(n_clusters=len(set(y)),  # 클러스터 갯수:2개의 그룹으로 나눔
               init='random', # random(중심초기점이 random), k-means++(멀리 떨어진 초기점)
               n_init=5,
               max_iter=300)  # 300번 중심점 이동
kmeans_model.fit(X)

In [23]:
kmeans_model.cluster_centers_  # 최종 두 클러스터의 중심점

array([[ 1.6422336 ,  0.40981466,  4.805969  , ...,  0.39349532,
         1.0919907 ,  1.1235657 ],
       [ 1.5721786 ,  0.5540645 ,  0.73407924, ...,  1.1769555 ,
         3.2021556 ,  2.0652304 ],
       [ 2.6720524 ,  0.8716313 ,  3.103767  , ...,  1.3163974 ,
         2.0373752 ,  3.6216328 ],
       ...,
       [ 0.891065  ,  0.31821203,  0.55722046, ...,  0.35978925,
         0.76995975,  0.46872425],
       [ 5.6154795 ,  1.6334686 ,  1.978476  , ...,  2.08456   ,
         3.448163  ,  6.4242907 ],
       [ 5.0331397 ,  2.4982724 ,  6.4244356 , ..., 11.2992935 ,
         1.981703  , 17.796188  ]], dtype=float32)

In [24]:
len(kmeans_model.cluster_centers_)

27

In [25]:
pred = kmeans_model.predict(X)
print(' 예 측 값 : ', pred)
print('modelLabel: ', kmeans_model.labels_)
print(' 실제 y값 : ', y)

 예 측 값 :  [19  0 20 ...  5 25  6]
modelLabel:  [19  0 20 ...  5 25  6]
 실제 y값 :  0                    Realism
1                    Baroque
2         Post_Impressionism
3              Impressionism
4                Romanticism
                 ...        
342436               Ukiyo_e
342437               Ukiyo_e
342438               Ukiyo_e
342439               Ukiyo_e
342440               Ukiyo_e
Name: style, Length: 342441, dtype: object


In [26]:
pd.crosstab(y, kmeans_model.labels_, rownames=['실제값'], colnames=['k-means값'])

k-means값,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
실제값,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abstract_Expressionism,1432,289,1532,227,1,1076,125,366,167,334,...,568,766,141,0,14,46,1192,1960,244,35
Action_painting,5629,3,320,26,0,524,0,0,139,29,...,445,86,0,0,0,0,254,517,0,0
Analytical_Cubism,23,1259,560,315,0,76,2250,0,14,0,...,16,0,0,0,0,217,0,0,321,118
Art_Nouveau_Modern,465,57,290,213,246,2118,226,362,673,456,...,2057,92,506,272,165,394,1359,664,165,275
Baroque,480,5,8,102,768,196,421,53,625,46,...,375,4,650,2012,1948,619,752,297,13,934
Color_Field_Painting,167,44,728,43,0,140,0,12,8,44,...,45,4121,9,0,0,0,2396,4570,19,4
Contemporary_Realism,381,85,308,209,42,151,196,136,697,643,...,277,19,2756,1023,585,175,757,2253,58,524
Cubism,229,1892,1067,375,67,979,833,529,511,106,...,703,672,104,66,101,94,370,734,2128,145
Early_Renaissance,706,20,29,154,696,244,723,335,1874,162,...,364,0,13,429,1371,207,158,193,113,737
Expressionism,544,144,808,459,1082,1489,559,464,555,381,...,733,60,271,541,218,219,460,427,858,847


## 군집모형 성능평가

In [27]:
VGG_vectors = np.load('../data/VGG_vectors.npy')
painting = pd.read_csv('painting.csv')

X = VGG_vectors
y = painting['style']

le = LabelEncoder()
y_encoded = le.fit_transform(y)  # 문자열 라벨을 숫자로 변환

In [28]:
len(le.classes_)

27

In [29]:
model = KMeans(n_clusters=len(le.classes_), random_state=1, n_init=10)
model.fit(X)

In [30]:
pred = model.predict(X)
all(pred == model.labels_)

True

In [31]:
# 예측된 클러스터 라벨
pred = model.labels_

# 각 클러스터에서 가장 많이 등장하는 실제 라벨 찾기
mapping = {}
for cluster in range(len(le.classes_)):
    mask = (pred == cluster)
    if np.any(mask):  # 해당 클러스터에 속한 샘플이 존재하는 경우만 실행
        most_common_label = np.bincount(y_encoded[mask]).argmax()
        mapping[cluster] = most_common_label

# 클러스터 인덱스를 실제 라벨과 매핑
adjusted_pred = np.vectorize(mapping.get)(pred)

# 교차표 생성
cross_tab = pd.crosstab(y_encoded, adjusted_pred, rownames=['실제값'], colnames=['예측값'])

# 결과 출력
cross_tab

예측값,3,4,5,8,9,12,20,21,23,24
실제값,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,119,27,432,6,490,482,104,611,287,215
1,4,0,5,0,9,30,0,42,3,5
2,1,21,1,0,10,55,1,8,10,3
3,622,202,71,118,793,947,125,735,539,102
4,102,1380,3,46,84,1218,18,918,363,4
5,12,1,873,1,68,32,3,239,312,74
6,13,48,11,1,19,206,7,133,25,18
7,107,105,226,10,819,468,100,211,68,82
8,49,289,1,263,92,356,38,232,51,6
9,329,505,95,39,1616,1963,243,1342,272,242


In [32]:
pred_str = le.inverse_transform(adjusted_pred)
pd.crosstab(y, pred_str)

col_0,Art_Nouveau_Modern,Baroque,Color_Field_Painting,Early_Renaissance,Expressionism,Impressionism,Post_Impressionism,Realism,Romanticism,Symbolism
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Abstract_Expressionism,119,27,432,6,490,482,104,611,287,215
Action_painting,4,0,5,0,9,30,0,42,3,5
Analytical_Cubism,1,21,1,0,10,55,1,8,10,3
Art_Nouveau_Modern,622,202,71,118,793,947,125,735,539,102
Baroque,102,1380,3,46,84,1218,18,918,363,4
Color_Field_Painting,12,1,873,1,68,32,3,239,312,74
Contemporary_Realism,13,48,11,1,19,206,7,133,25,18
Cubism,107,105,226,10,819,468,100,211,68,82
Early_Renaissance,49,289,1,263,92,356,38,232,51,6
Expressionism,329,505,95,39,1616,1963,243,1342,272,242


## 조정된 rand지수 외 성능평가 기준 함수

In [33]:
adjusted_rand_score(labels_true=y, labels_pred=pred_str)

0.07769517162932903

In [34]:
adjusted_rand_score(labels_true=y_encoded, labels_pred=pred)

0.043476740248968304

In [35]:
homogeneity_score(y, pred_str)

0.09576844871655185

In [36]:
completeness_score(y, pred_str)

0.1512225712738543

In [37]:
v_measure_score(y, pred_str)

0.1172702640151675

In [38]:
mutual_info_score(y, pred_str)

0.2701851150361978

# 앙상블모형

In [39]:
train_X.shape, train_y.shape, test_X.shape, test_y.shape

((239708, 512), (239708,), (102733, 512), (102733,))

In [40]:
le = LabelEncoder()
train_y = le.fit_transform(train_y)
test_y = le.fit_transform(test_y)

def model_measure(model, train_X=train_X, train_y=train_y, test_X=test_X, test_y=test_y):
    model.fit(train_X, train_y)
    pred = model.predict(test_X)
    accuracy  = model.score(test_X, test_y)
    precision = precision_score(test_y, pred, average="macro")
    recall    = recall_score(test_y, pred, average="macro")
    f1score  = f1_score(test_y, pred, average="macro")
    return '정확도:{:.3f}, 정밀도:{:.3f}, 재현율:{:.3f}, f1_score:{:.3f}'.format(accuracy, precision, recall, f1score)

# 의사결정 방식

## 배깅 방식

In [42]:
rf = RandomForestClassifier(n_estimators=100,         # 의사결정나무 100개
                            max_features=len(set(y)), # 2개 특징으로 나눔
                            random_state=42).fit(train_X, train_y)
rf.score(test_X, test_y)

0.8779068069656294

## 부스팅 방식

In [44]:
xgb = model_measure(XGBClassifier())
xgb

'정확도:0.847, 정밀도:0.845, 재현율:0.847, f1_score:0.846'

In [45]:
lgb = model_measure(LGBMClassifier(force_col_wise=True))
lgb

[LightGBM] [Info] Total Bins 130560
[LightGBM] [Info] Number of data points in the train set: 239708, number of used features: 512
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295733
[LightGBM] [Info] Start training from score -3.295845
[LightGBM] [Info] Start training from score -3.295845
[Ligh

'정확도:0.801, 정밀도:0.796, 재현율:0.801, f1_score:0.798'

## 투표를 이용한 앙상블

In [48]:
# ✅ 경량화된 랜덤포레스트 모델
rf_model = RandomForestClassifier(
    n_estimators=50,      # 트리 개수 줄이기
    max_depth=5,          # 트리 깊이 제한
    max_features='sqrt',  # 최적의 특징 개수 자동 선택
    random_state=42
)

# ✅ 경량화된 XGBoost 모델
xgb_model = XGBClassifier(
    max_depth=4,          # 트리 깊이 제한
    n_estimators=50,      # 트리 개수 줄이기
    learning_rate=0.1,    # 학습 속도 증가
    subsample=0.8,        # 데이터 일부 샘플링
    colsample_bytree=0.8, # 일부 특성만 사용
    tree_method='hist',   # 히스토그램 기반 트리 (메모리 절약)
    eval_metric='logloss',
#     use_label_encoder=True,
    random_state=42
)

# ✅ 경량화된 LightGBM 모델
lgb_model = LGBMClassifier(
    n_estimators=50,      # 트리 개수 줄이기
    max_depth=4,          # 트리 깊이 제한
    num_leaves=16,        # 리프 개수 줄이기
    subsample=0.8,        # 데이터 일부 샘플링
    colsample_bytree=0.8, # 일부 특성만 사용
    verbose=-1,           # 불필요한 출력 제거
    random_state=42
)
print(model_measure(rf_model))
print(model_measure(xgb_model))
print(model_measure(lgb_model))

정확도:0.339, 정밀도:0.337, 재현율:0.339, f1_score:0.264
정확도:0.575, 정밀도:0.561, 재현율:0.575, f1_score:0.560
정확도:0.636, 정밀도:0.622, 재현율:0.636, f1_score:0.625


In [49]:
voting_model_hard = VotingClassifier(estimators=[('rfm', rf_model),
                                                 ('xgb', xgb_model),
                                                 ('lgb', lgb_model)],
                                     voting='hard') # voting='hard' 기본값
voting_model_hard.fit(X_sampled, y_sampled)

In [50]:
voting_model_hard.predict(test_X[0].reshape(1, -1))

array(['Northern_Renaissance'], dtype=object)

In [51]:
# voting 알고리즘 - soft 방식
voting_model_soft = VotingClassifier(estimators=[('rfm', rf_model),
                                                 ('xgb', xgb_model),
                                                 ('lgb', lgb_model)],
                                     voting='soft') # voting='hard' 기본값
voting_model_soft.fit(X_sampled, y_sampled)

In [52]:
voting_model_soft.predict(test_X[0].reshape(1, -1))

array(['Northern_Renaissance'], dtype=object)

In [53]:
test_y[0]

17

In [54]:
inverse_test_y = le.inverse_transform(test_y)
inverse_test_y[0]

'Northern_Renaissance'

In [55]:
model_measure(voting_model_hard)

'정확도:0.588, 정밀도:0.576, 재현율:0.588, f1_score:0.568'

In [56]:
model_measure(voting_model_soft)

'정확도:0.619, 정밀도:0.604, 재현율:0.619, f1_score:0.605'

In [57]:
# voting_model_hard의 개별 모델들 딕셔너리 형태로 반환
voting_model_hard.named_estimators_

{'rfm': RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42),
 'xgb': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.8, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=0.1, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=50,
               n_jobs=None, num_parallel_tree=None, objective='multi:softprob', ...),
 'lgb': LGBMClassifier(colsample_bytree=0.8, max_depth=4, n_estimators=50,
                num_leaves=16, random_state=42, subsample=0.8,

In [58]:
# voting_model_soft의 개별 모델들 딕셔너리 형태로 반환
voting_model_soft.named_estimators_

{'rfm': RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42),
 'xgb': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.8, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=0.1, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=50,
               n_jobs=None, num_parallel_tree=None, objective='multi:softprob', ...),
 'lgb': LGBMClassifier(colsample_bytree=0.8, max_depth=4, n_estimators=50,
                num_leaves=16, random_state=42, subsample=0.8,

In [59]:
print(model_measure(voting_model_hard.named_estimators_['rfm']))  # 단일모델로 개별모델 access 가능
print(model_measure(voting_model_hard.named_estimators_['xgb']))
print(model_measure(voting_model_hard.named_estimators_['lgb']))

정확도:0.339, 정밀도:0.337, 재현율:0.339, f1_score:0.264
정확도:0.575, 정밀도:0.561, 재현율:0.575, f1_score:0.560
정확도:0.636, 정밀도:0.622, 재현율:0.636, f1_score:0.625


In [60]:
print(model_measure(voting_model_soft.named_estimators_['rfm']))  # 단일모델로 개별모델 access 가능
print(model_measure(voting_model_soft.named_estimators_['xgb']))
print(model_measure(voting_model_soft.named_estimators_['lgb']))

정확도:0.339, 정밀도:0.337, 재현율:0.339, f1_score:0.264
정확도:0.575, 정밀도:0.561, 재현율:0.575, f1_score:0.560
정확도:0.636, 정밀도:0.622, 재현율:0.636, f1_score:0.625


In [61]:
voting_model_hard.get_params()  # 모델 내의 모든 파라미터(하이퍼파라미터)

{'estimators': [('rfm',
   RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42)),
  ('xgb',
   XGBClassifier(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bynode=None,
                 colsample_bytree=0.8, device=None, early_stopping_rounds=None,
                 enable_categorical=False, eval_metric='logloss',
                 feature_types=None, gamma=None, grow_policy=None,
                 importance_type=None, interaction_constraints=None,
                 learning_rate=0.1, max_bin=None, max_cat_threshold=None,
                 max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
                 max_leaves=None, min_child_weight=None, missing=nan,
                 monotone_constraints=None, multi_strategy=None, n_estimators=50,
                 n_jobs=None, num_parallel_tree=None, objective='multi:softprob', ...)),
  ('lgb',
   LGBMClassifier(colsample_bytree=0.8, max_depth=4, n_estimators=50,
           

In [62]:
voting_model_soft.get_params()  # 모델 내의 모든 파라미터(하이퍼파라미터)

{'estimators': [('rfm',
   RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42)),
  ('xgb',
   XGBClassifier(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bynode=None,
                 colsample_bytree=0.8, device=None, early_stopping_rounds=None,
                 enable_categorical=False, eval_metric='logloss',
                 feature_types=None, gamma=None, grow_policy=None,
                 importance_type=None, interaction_constraints=None,
                 learning_rate=0.1, max_bin=None, max_cat_threshold=None,
                 max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
                 max_leaves=None, min_child_weight=None, missing=nan,
                 monotone_constraints=None, multi_strategy=None, n_estimators=50,
                 n_jobs=None, num_parallel_tree=None, objective='multi:softprob', ...)),
  ('lgb',
   LGBMClassifier(colsample_bytree=0.8, max_depth=4, n_estimators=50,
           

## 머신러닝 모형 저장

In [64]:
# 대용량 모형일 때 : joblib 파일로 저장 (joblib 라이브러리 사용)
print(joblib.dump(voting_model_hard, 'voting_model_hard.joblib'))
print(joblib.dump(voting_model_soft, 'voting_model_soft.joblib'))

['voting_model_hard.joblib']
['voting_model_soft.joblib']


In [65]:
# 모델 load
loaded_voting_model_hard = joblib.load('voting_model_hard.joblib')
model_measure(loaded_voting_model_hard)

'정확도:0.588, 정밀도:0.576, 재현율:0.588, f1_score:0.568'

In [66]:
# 모델 load
loaded_voting_model_soft = joblib.load('voting_model_soft.joblib')
model_measure(loaded_voting_model_soft)

'정확도:0.619, 정밀도:0.604, 재현율:0.619, f1_score:0.605'