In [17]:
import os
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  # 분류분석
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import adjusted_rand_score, homogeneity_score, completeness_score, v_measure_score, mutual_info_score
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier

In [18]:
# 그래프 저장 시 해상도 높게
%config InlineBackend.figure_format = 'retina'

# 한글설정
# plt.rc('font', family='Malgun Gothic')  # 윈도우즈
# plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rc('font', family='AppleGothic') # Mac 운영체제
plt.rc('axes', unicode_minus=False)  # 축의 -(마이너스) 깨짐 방지

In [19]:
os.environ['OMP_NUM_THREADS'] = '1'

# data load

In [20]:
filepath = '../team_project/data/'

In [21]:
VGG_vectors = np.load(os.path.join(filepath,'VGG_vectors.npy'))
VGG_vectors.shape

(80158, 512)

In [22]:
painting = pd.read_csv(os.path.join(filepath,'painting.csv'))
painting.head()

Unnamed: 0,file,title,artist,genre,style
0,Realism/vincent-van-gogh_pine-trees-in-the-fen...,pine-trees-in-the-fen-1884,vincent-van-gogh,landscape,Realism
1,Baroque/rembrandt_the-angel-appearing-to-the-s...,the-angel-appearing-to-the-shepherds-1634,rembrandt,religious painting,Baroque
2,Post_Impressionism/paul-cezanne_portrait-of-th...,portrait-of-the-artist-s-son,paul-cezanne,portrait,Post_Impressionism
3,Impressionism/pierre-auguste-renoir_young-girl...,young-girl-seated-in-a-meadow-1916,pierre-auguste-renoir,genre painting,Impressionism
4,Romanticism/ivan-aivazovsky_morning-1851.jpg,morning-1851,ivan-aivazovsky,marina,Romanticism


In [23]:
X = VGG_vectors
y = painting['genre']
X, y 

(array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.13876809,  0.        ],
        [ 0.        ,  0.        ,  7.3608236 , ...,  0.        ,
          2.1667717 ,  0.        ],
        [17.689674  ,  2.8541045 ,  6.410571  , ...,  3.4064777 ,
          0.52448237, 10.3166    ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        , 20.376156  , ...,  0.6702498 ,
          1.7302468 ,  4.738069  ],
        [ 4.411827  ,  0.99009806, 33.64073   , ...,  6.051113  ,
          0.19890343, 24.475027  ]], dtype=float32),
 0                 landscape
 1        religious painting
 2                  portrait
 3            genre painting
 4                    marina
                 ...        
 80153       animal painting
 80154                design
 80155       flower painting
 80156       animal painting
 80157        genre painting
 Name: genre, Length: 80158, dtype: obj

In [24]:
print(pd.Series(y).value_counts())

portrait                    14294
landscape                   13478
genre painting              11089
religious painting           6694
abstract                     5264
cityscape                    4686
sketch and study             3959
still life                   2907
illustration                 1914
nude painting                1912
mythological painting        1367
marina                       1351
design                       1311
flower painting              1275
self-portrait                1193
figurative                   1020
animal painting               908
symbolic painting             723
sculpture                     704
history painting              616
allegorical painting          604
interior                      508
literary painting             402
battle painting               268
caricature                    263
poster                        261
installation                  256
veduta                        156
cloudscape                    155
wildlife paint

In [25]:
# 독립변수 스케일 조정
smote = SMOTE(sampling_strategy='auto', k_neighbors=2) # 특정 클래스의 샘플수가 3개 미만이라 k_neighbors=2
X, y = smote.fit_resample(X, y)

In [26]:
undersample = NearMiss(version=1)
X_sampled, y_sampled = undersample.fit_resample(X, y)

In [27]:
pd.Series(y_sampled).value_counts()

Canadian                    14294
abstract                    14294
mythological painting       14294
nude painting               14294
nude painting (nu)          14294
panorama                    14294
pastorale                   14294
photo                       14294
portrait                    14294
poster                      14294
quadratura                  14294
religious painting          14294
sculpture                   14294
self-portrait               14294
sketch and study            14294
still life                  14294
symbolic painting           14294
tessellation                14294
vanitas                     14294
veduta                      14294
wildlife painting           14294
miniature                   14294
marina                      14294
literary painting           14294
cloudscape                  14294
advertisement               14294
allegorical painting        14294
animal painting             14294
battle painting             14294
bird-and-flowe

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, 
                                                    test_size=0.3, 
                                                    shuffle=True,  # 분할 전 데이터 섞기
                                                    stratify=y_sampled     # 층화추출
                                                   )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((440255, 512), (188681, 512), (440255,), (188681,))

# 분류분석

## DecisionTreeClassifier

In [13]:
# 의사결정나무를 구현한 DecisionTreeClassifier를 통한 모형
dt_model = DecisionTreeClassifier(random_state=1)
dt_model = dt_model.fit(X_train, y_train)  # 머신러닝에서는 fit시 독립변수에 array로
dt_model

In [14]:
dt_model.score(X_test, y_test)

0.8471547214610904

In [15]:
y_dt_pred = dt_model.predict(X_test)
pd.crosstab(y_test, y_dt_pred)  # y_test : 실제값, y_dt_pred : 예측값

col_0,Canadian,abstract,advertisement,allegorical painting,animal painting,battle painting,bird-and-flower painting,capriccio,caricature,cityscape,...,sculpture,self-portrait,sketch and study,still life,symbolic painting,tessellation,vanitas,veduta,wildlife painting,yakusha-e
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Canadian,4287,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abstract,1,2679,5,34,58,27,4,2,8,65,...,76,13,86,112,59,1,6,0,5,0
advertisement,0,0,4279,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
allegorical painting,0,19,2,3786,19,14,0,0,2,10,...,4,11,18,24,30,0,0,0,0,0
animal painting,2,30,3,23,3556,7,0,2,5,32,...,8,12,36,38,41,0,4,1,6,0
battle painting,0,10,0,8,8,4151,0,0,1,12,...,2,0,7,2,8,0,0,0,0,0
bird-and-flower painting,0,0,0,0,0,0,4281,0,0,0,...,0,0,1,1,0,0,0,0,0,0
capriccio,0,0,0,0,0,0,0,4271,0,5,...,0,1,1,0,1,0,0,0,0,0
caricature,0,5,0,1,3,0,0,0,4195,5,...,0,3,11,1,3,0,0,0,0,0
cityscape,1,66,7,28,33,16,2,9,5,2631,...,26,14,79,34,33,0,0,47,9,1


In [16]:
# y_test는 실제 정답 (원-핫 인코딩된 경우 argmax로 변환)
y_test_labels = y_test
y_pred_labels = y_dt_pred

# 상세한 성능 지표 출력
print(" Classification Report:")
print(classification_report(y_test_labels, y_pred_labels, digits=4))

# 개별 지표 계산 (매크로 평균 사용)
precision = precision_score(y_test_labels, y_pred_labels, average='macro')
recall = recall_score(y_test_labels, y_pred_labels, average='macro')
f1 = f1_score(y_test_labels, y_pred_labels, average='macro')

print(f" Precision: {precision:.4f}")
print(f" Recall: {recall:.4f}")
print(f" F1-score: {f1:.4f}")

 Classification Report:
                          precision    recall  f1-score   support

                Canadian     0.9956    0.9998    0.9977      4288
                abstract     0.6912    0.6246    0.6562      4289
           advertisement     0.9821    0.9977    0.9898      4289
    allegorical painting     0.8284    0.8829    0.8548      4288
         animal painting     0.7996    0.8293    0.8142      4288
         battle painting     0.9173    0.9681    0.9420      4288
bird-and-flower painting     0.9926    0.9984    0.9955      4288
               capriccio     0.9889    0.9958    0.9923      4289
              caricature     0.9497    0.9783    0.9638      4288
               cityscape     0.6477    0.6134    0.6301      4289
              cloudscape     0.9506    0.9862    0.9681      4288
                  design     0.7711    0.8004    0.7854      4288
              figurative     0.7851    0.8120    0.7983      4288
         flower painting     0.8632    0.8839    0.

In [17]:
print(fbeta_score(y_test, y_dt_pred, beta=0.5, average='macro'))# fbeta score의 beta=1 : f1 score
print(fbeta_score(y_test, y_dt_pred, beta=1, average='macro')) # fbeta score의 2>=beta>1 : recall의 가중치가 높게 조정된 f1 score
print(fbeta_score(y_test, y_dt_pred, beta=2, average='macro')) # fbeta score의 0<=beta<1 : precision의 가중치가 높게 조정된 f1 score

0.8398610081385957
0.8422338881955447
0.8450519009428816


## MLPClassifier

In [18]:
# 다중신경망 모형
mlp_model = MLPClassifier(hidden_layer_sizes=(38, 64, 32), 
                          max_iter=500,
                         random_state=1) # 중심점 이동 
                          
mlp_model = mlp_model.fit(X_train, y_train)
mlp_model

In [19]:
mlp_model.score(X_test, y_test)

0.8397453903678695

In [20]:
y_mlp_pred = mlp_model.predict(X_test)
mlp_result = pd.crosstab(y_test, y_mlp_pred)
mlp_result

col_0,Canadian,abstract,advertisement,allegorical painting,animal painting,battle painting,bird-and-flower painting,capriccio,caricature,cityscape,...,sculpture,self-portrait,sketch and study,still life,symbolic painting,tessellation,vanitas,veduta,wildlife painting,yakusha-e
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Canadian,4288,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abstract,1,2676,1,18,75,4,4,0,3,25,...,67,27,67,129,98,0,2,0,5,0
advertisement,0,0,4289,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
allegorical painting,0,1,0,3698,14,7,0,0,4,4,...,5,23,40,6,42,0,4,0,1,0
animal painting,0,13,0,20,3631,6,0,1,0,14,...,1,12,14,34,59,0,0,0,25,0
battle painting,0,0,0,8,2,4191,0,0,0,2,...,0,0,0,0,6,0,0,0,0,0
bird-and-flower painting,0,0,0,0,0,0,4280,0,0,0,...,0,0,0,0,0,0,0,0,0,0
capriccio,0,0,0,0,0,0,0,4282,0,6,...,0,0,0,0,0,0,0,0,0,0
caricature,0,0,0,8,0,0,0,0,4177,0,...,5,11,23,0,0,0,0,0,0,0
cityscape,0,47,4,17,31,20,1,4,4,2571,...,10,14,64,49,51,1,0,40,0,0


In [21]:
# y_test는 실제 정답 (원-핫 인코딩된 경우 argmax로 변환)
y_test_labels = y_test
y_pred_labels = y_mlp_pred  # 모델 예측값도 동일하게 변환

# 상세한 성능 지표 출력
print(" Classification Report:")
print(classification_report(y_test_labels, y_pred_labels, digits=4))

# 개별 지표 계산 (매크로 평균 사용)
precision = precision_score(y_test_labels, y_pred_labels, average='macro')
recall = recall_score(y_test_labels, y_pred_labels, average='macro')
f1 = f1_score(y_test_labels, y_pred_labels, average='macro')

print(f" Precision: {precision:.4f}")
print(f" Recall: {recall:.4f}")
print(f" F1-score: {f1:.4f}")

 Classification Report:
                          precision    recall  f1-score   support

                Canadian     0.9974    1.0000    0.9987      4288
                abstract     0.7319    0.6239    0.6736      4289
           advertisement     0.9853    1.0000    0.9926      4289
    allegorical painting     0.8334    0.8624    0.8477      4288
         animal painting     0.7989    0.8468    0.8221      4288
         battle painting     0.9431    0.9774    0.9599      4288
bird-and-flower painting     0.9974    0.9981    0.9978      4288
               capriccio     0.9958    0.9984    0.9971      4289
              caricature     0.9734    0.9741    0.9738      4288
               cityscape     0.7126    0.5994    0.6511      4289
              cloudscape     0.9779    0.9916    0.9847      4288
                  design     0.6456    0.7976    0.7136      4288
              figurative     0.7037    0.7910    0.7448      4288
         flower painting     0.8543    0.8547    0.

In [22]:
print(fbeta_score(y_test, y_mlp_pred, beta=0.5, average='macro')) # fbeta score의 beta=1 : f1 score
print(fbeta_score(y_test, y_mlp_pred, beta=1, average='macro')) # 2>=beta>1 : recall의 가중치가 높게 조정된 f1 score
print(fbeta_score(y_test, y_mlp_pred, beta=2, average='macro')) # 0<=beta<1 : precision의 가중치가 높게 조정된 f1 score

0.8318878397366892
0.8331852171842804
0.836473301974166


# 군집분석

## KMeans 클러스터링

In [23]:
len(set(y))

44

In [24]:
kmeans_model = KMeans(n_clusters=len(set(y)),  # 클러스터 갯수:2개의 그룹으로 나눔
               init='random', # random(중심초기점이 random)
               n_init=5,
               max_iter=300)  # 300번 중심점 이동
kmeans_model.fit(X)

In [25]:
kmeans_model.cluster_centers_  # 최종 두 클러스터의 중심점

array([[0.21165824, 0.3688122 , 6.220317  , ..., 1.2366172 , 0.13267839,
        2.3576405 ],
       [4.2272305 , 0.715013  , 6.345366  , ..., 0.21372253, 1.9048864 ,
        1.525845  ],
       [1.8401865 , 0.75162697, 2.8953836 , ..., 0.1647445 , 1.1354598 ,
        1.2211778 ],
       ...,
       [2.015356  , 3.783264  , 4.0577197 , ..., 0.75075126, 1.9496106 ,
        1.4889381 ],
       [1.5777566 , 0.96769154, 3.105816  , ..., 0.06336528, 2.2397072 ,
        0.7422844 ],
       [0.1630435 , 0.43954897, 6.260829  , ..., 0.3842915 , 0.42215168,
        0.76618266]], dtype=float32)

In [26]:
len(kmeans_model.cluster_centers_)

44

In [27]:
pred = kmeans_model.predict(X)
print(' 예 측 값 : ', pred)
print('modelLabel: ', kmeans_model.labels_)
print(' 실제 y값 : ', y)

 예 측 값 :  [32 33  9 ... 37 37 41]
modelLabel:  [32 33  9 ... 37 37 41]
 실제 y값 :  0                  landscape
1         religious painting
2                   portrait
3             genre painting
4                     marina
                 ...        
628931             yakusha-e
628932             yakusha-e
628933             yakusha-e
628934             yakusha-e
628935             yakusha-e
Name: genre, Length: 628936, dtype: object


In [28]:
pd.crosstab(y, kmeans_model.labels_, rownames=['실제값'], colnames=['k-means값'])

k-means값,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
실제값,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Canadian,0,0,0,0,0,0,0,0,0,1129,...,0,4701,0,94,0,0,0,1760,0,0
abstract,459,108,1548,504,32,0,35,742,106,439,...,705,4,1051,696,3,121,0,54,2,2
advertisement,526,0,0,122,0,0,0,12,6061,0,...,481,0,0,9,0,0,0,0,0,154
allegorical painting,248,201,790,19,5,0,22,597,19,308,...,390,28,892,384,2,2029,0,2,3,13
animal painting,359,366,1216,12,0,564,7,965,41,717,...,306,5,1155,1028,0,744,3,40,16,11
battle painting,505,574,2198,61,0,0,163,384,0,151,...,186,0,1422,119,0,2641,0,136,0,0
bird-and-flower painting,2117,0,0,0,0,0,0,3133,0,0,...,0,0,595,206,0,106,0,0,0,0
capriccio,0,2370,1830,0,0,0,2452,0,0,0,...,827,0,0,0,0,19,0,0,2007,0
caricature,5608,158,429,0,0,12,0,1880,0,27,...,55,0,60,20,0,83,0,14,0,124
cityscape,74,2558,846,60,2,2,3219,128,27,80,...,741,29,331,81,0,82,0,158,2006,6


## 군집모형 성능평가

In [29]:
VGG_vectors = np.load(os.path.join(filepath,'VGG_vectors.npy'))
painting = pd.read_csv(os.path.join(filepath,'painting.csv'))

X = VGG_vectors
y = painting['genre']

# 라벨인코딩
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [30]:
len(le.classes_)

44

In [31]:
model = KMeans(n_clusters=len(le.classes_), n_init=10, random_state=1)
model.fit(X)

In [32]:
pred = model.predict(X)
all(pred == model.labels_)

True

In [33]:
# 예측된 클러스터 라벨
pred = model.labels_

# 각 클러스터에서 가장 많이 등장하는 실제 라벨 찾기
mapping = {}
for cluster in range(len(le.classes_)):
    mask = (pred == cluster)
    if np.any(mask):  # 해당 클러스터에 속한 샘플이 존재하는 경우만 실행
        most_common_label = np.bincount(y_encoded[mask]).argmax()
        mapping[cluster] = most_common_label

# 클러스터 인덱스를 실제 라벨과 매핑
adjusted_pred = np.vectorize(mapping.get)(pred)

# 교차표 생성
cross_tab = pd.crosstab(y_encoded, adjusted_pred, rownames=['실제값'], colnames=['예측값'])

# 결과 출력
cross_tab

예측값,1,9,13,14,17,20,30,33,36,37
실제값,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,2,0,5,0,1,2,2,0,1
1,3101,78,22,590,126,512,447,187,174,27
2,9,4,0,9,6,1,14,5,4,0
3,28,11,9,132,18,82,165,142,7,10
4,76,13,6,249,25,197,213,91,31,7
5,9,8,1,47,15,94,29,58,6,1
6,2,0,1,4,9,2,2,1,5,0
7,2,29,0,7,0,11,4,3,0,0
8,27,2,2,26,99,22,45,19,21,0
9,190,2230,0,747,56,1158,116,147,40,2


In [34]:
pred_str = le.inverse_transform(adjusted_pred)
pd.crosstab(y, pred_str)

col_0,abstract,cityscape,flower painting,genre painting,illustration,landscape,portrait,religious painting,sketch and study,still life
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Canadian,0,2,0,5,0,1,2,2,0,1
abstract,3101,78,22,590,126,512,447,187,174,27
advertisement,9,4,0,9,6,1,14,5,4,0
allegorical painting,28,11,9,132,18,82,165,142,7,10
animal painting,76,13,6,249,25,197,213,91,31,7
battle painting,9,8,1,47,15,94,29,58,6,1
bird-and-flower painting,2,0,1,4,9,2,2,1,5,0
capriccio,2,29,0,7,0,11,4,3,0,0
caricature,27,2,2,26,99,22,45,19,21,0
cityscape,190,2230,0,747,56,1158,116,147,40,2


## 조정된 rand지수 외 성능평가 기준 함수

In [35]:
adjusted_rand_score(labels_true=y, labels_pred=pred_str)

0.17784512628955776

In [36]:
adjusted_rand_score(labels_true=y_encoded, labels_pred=pred)

0.06599707465394923

In [37]:
homogeneity_score(y, pred_str)

0.15728402312823125

In [38]:
completeness_score(y, pred_str)

0.21855092825783656

In [39]:
v_measure_score(y, pred_str)

0.18292374952371843

In [40]:
mutual_info_score(y, pred_str)

0.423435287293873

# 앙상블모형

In [41]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((440255, 512), (440255,), (188681, 512), (188681,))

In [None]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

In [30]:
def model_measure(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    accuracy  = model.score(X_test, y_test)
    precision = precision_score(y_test, pred, average="macro")
    recall    = recall_score(y_test, pred, average="macro")
    f1score  = f1_score(y_test, pred, average="macro")
    return '정확도:{:.3f}, 정밀도:{:.3f}, 재현율:{:.3f}, f1_score:{:.3f}'.format(accuracy, precision, recall, f1score)

In [None]:
# LabelEncoder 객체 저장
joblib.dump(le, os.path.join(filepath,'genre_label.pkl'))

## 의사결정 방식

### 배깅 방식

In [32]:
rf = RandomForestClassifier(n_estimators=100,         # 의사결정나무 100개
                            max_features=len(set(y)),
                            random_state=42).fit(X_train, y_train)
rf.score(X_test, y_test)

0.9587557835712128

In [33]:
rf_model = model_measure(RandomForestClassifier())
rf_model

'정확도:0.958, 정밀도:0.956, 재현율:0.958, f1_score:0.956'

### 부스팅 방식

In [46]:
xgb = model_measure(XGBClassifier())
xgb

'정확도:0.942, 정밀도:0.941, 재현율:0.942, f1_score:0.941'

In [47]:
lgb = model_measure(LGBMClassifier(force_col_wise=True))
lgb

[LightGBM] [Info] Total Bins 130560
[LightGBM] [Info] Number of data points in the train set: 440255, number of used features: 512
[LightGBM] [Info] Start training from score -3.784169
[LightGBM] [Info] Start training from score -3.784269
[LightGBM] [Info] Start training from score -3.784269
[LightGBM] [Info] Start training from score -3.784169
[LightGBM] [Info] Start training from score -3.784169
[LightGBM] [Info] Start training from score -3.784169
[LightGBM] [Info] Start training from score -3.784169
[LightGBM] [Info] Start training from score -3.784269
[LightGBM] [Info] Start training from score -3.784169
[LightGBM] [Info] Start training from score -3.784269
[LightGBM] [Info] Start training from score -3.784169
[LightGBM] [Info] Start training from score -3.784169
[LightGBM] [Info] Start training from score -3.784169
[LightGBM] [Info] Start training from score -3.784269
[LightGBM] [Info] Start training from score -3.784169
[LightGBM] [Info] Start training from score -3.784169
[Ligh

'정확도:0.921, 정밀도:0.920, 재현율:0.921, f1_score:0.920'

### 투표를 이용한 앙상블

In [48]:
rf_model = RandomForestClassifier(n_estimators=100, # 의사결정나무 100개
                                  max_features=2, # 2개의 특징으로 나눔
                                  random_state=42)
xgb_model = XGBClassifier(max_depth=10,          # 트리의 최대 깊이
                          n_estimators=100,      # 트리 갯수
                          learning_rate=0.01,    # 학습률
                          eval_metric='logloss') # 평가지표(이진분류)
lgb_model = LGBMClassifier(force_col_wise=True, verbose=-1)

print(model_measure(rf_model))
print(model_measure(xgb_model))
print(model_measure(lgb_model))

정확도:0.954, 정밀도:0.951, 재현율:0.954, f1_score:0.951
정확도:0.900, 정밀도:0.896, 재현율:0.900, f1_score:0.897
정확도:0.921, 정밀도:0.920, 재현율:0.921, f1_score:0.920


In [49]:
# 경량화된 랜덤포레스트 모델
rf_model = RandomForestClassifier(
    n_estimators=50,      # 트리 개수 줄이기
    max_depth=5,          # 트리 깊이 제한
    max_features='sqrt',  # 최적의 특징 개수 자동 선택
    random_state=42)

# 경량화된 XGBoost 모델
xgb_model = XGBClassifier(
    max_depth=4,          # 트리 깊이 제한
    n_estimators=50,      # 트리 개수 줄이기
    learning_rate=0.1,    # 학습 속도 증가
    subsample=0.8,        # 데이터 일부 샘플링
    colsample_bytree=0.8, # 일부 특성만 사용
    tree_method='hist',   # 히스토그램 기반 트리 (메모리 절약)
    eval_metric='logloss',
    random_state=42)

# 경량화된 LightGBM 모델
lgb_model = LGBMClassifier(
    n_estimators=50,      # 트리 개수 줄이기
    max_depth=4,          # 트리 깊이 제한
    num_leaves=16,        # 리프 개수 줄이기
    subsample=0.8,        # 데이터 일부 샘플링
    colsample_bytree=0.8,  # 일부 특성만 사용
    verbose=-1,           # 불필요한 출력 제거
    random_state=42)

print(model_measure(rf_model))
print(model_measure(xgb_model))
print(model_measure(lgb_model))

정확도:0.432, 정밀도:0.440, 재현율:0.432, f1_score:0.342
정확도:0.758, 정밀도:0.749, 재현율:0.758, f1_score:0.749
정확도:0.811, 정밀도:0.804, 재현율:0.811, f1_score:0.806


In [50]:
# voting 알고리즘 - hard 방식
voting_model_hard = VotingClassifier(estimators=[('rfm', rf_model),
                                                 ('xgb', xgb_model),
                                                 ('lgb', lgb_model)],
                                     voting='hard') # voting='hard' 기본값

voting_model_hard.fit(X_sampled, y_sampled)

In [52]:
voting_model_hard.predict(X_test[0].reshape(1, -1))

array(['landscape'], dtype=object)

In [53]:
# voting 알고리즘 - soft 방식
voting_model_soft = VotingClassifier(estimators=[('rfm', rf_model),
                                                 ('xgb', xgb_model),
                                                 ('lgb', lgb_model)],
                                     voting='soft') # voting='hard' 기본값

voting_model_soft.fit(X_sampled, y_sampled)

In [54]:
voting_model_soft.predict(X_test[0].reshape(1, -1))

array(['landscape'], dtype=object)

In [55]:
y_test[0]

20

In [56]:
y_inverse_test = le.inverse_transform(y_test)
y_inverse_test[0]

'landscape'

In [57]:
model_measure(voting_model_hard)

'정확도:0.770, 정밀도:0.762, 재현율:0.770, f1_score:0.758'

In [58]:
model_measure(voting_model_soft)

'정확도:0.801, 정밀도:0.794, 재현율:0.801, f1_score:0.795'

In [59]:
# voting_model_hard의 개별 모델들 딕셔너리 형태로 반환
voting_model_hard.named_estimators_

{'rfm': RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42),
 'xgb': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.8, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=0.1, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=50,
               n_jobs=None, num_parallel_tree=None, objective='multi:softprob', ...),
 'lgb': LGBMClassifier(colsample_bytree=0.8, max_depth=4, n_estimators=50,
                num_leaves=16, random_state=42, subsample=0.8,

In [60]:
# voting_model_soft의 개별 모델들 딕셔너리 형태로 반환
voting_model_soft.named_estimators_

{'rfm': RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42),
 'xgb': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.8, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=0.1, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=50,
               n_jobs=None, num_parallel_tree=None, objective='multi:softprob', ...),
 'lgb': LGBMClassifier(colsample_bytree=0.8, max_depth=4, n_estimators=50,
                num_leaves=16, random_state=42, subsample=0.8,

In [61]:
# voting_model_hard.named_estimators_ 단일모델로 개별모델 access 가능
print(model_measure(voting_model_hard.named_estimators_['rfm']))  
print(model_measure(voting_model_hard.named_estimators_['xgb']))
print(model_measure(voting_model_hard.named_estimators_['lgb']))

정확도:0.432, 정밀도:0.440, 재현율:0.432, f1_score:0.342
정확도:0.758, 정밀도:0.749, 재현율:0.758, f1_score:0.749
정확도:0.811, 정밀도:0.804, 재현율:0.811, f1_score:0.806


In [62]:
# voting_model_soft.named_estimators_ 단일모델로 개별모델 access 가능
print(model_measure(voting_model_soft.named_estimators_['rfm']))  
print(model_measure(voting_model_soft.named_estimators_['xgb']))
print(model_measure(voting_model_soft.named_estimators_['lgb']))

정확도:0.432, 정밀도:0.440, 재현율:0.432, f1_score:0.342
정확도:0.758, 정밀도:0.749, 재현율:0.758, f1_score:0.749
정확도:0.811, 정밀도:0.804, 재현율:0.811, f1_score:0.806


In [63]:
# 모델 내의 모든 파라미터(하이퍼파라미터)
voting_model_hard.get_params()  

{'estimators': [('rfm',
   RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42)),
  ('xgb',
   XGBClassifier(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bynode=None,
                 colsample_bytree=0.8, device=None, early_stopping_rounds=None,
                 enable_categorical=False, eval_metric='logloss',
                 feature_types=None, gamma=None, grow_policy=None,
                 importance_type=None, interaction_constraints=None,
                 learning_rate=0.1, max_bin=None, max_cat_threshold=None,
                 max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
                 max_leaves=None, min_child_weight=None, missing=nan,
                 monotone_constraints=None, multi_strategy=None, n_estimators=50,
                 n_jobs=None, num_parallel_tree=None, objective='multi:softprob', ...)),
  ('lgb',
   LGBMClassifier(colsample_bytree=0.8, max_depth=4, n_estimators=50,
           

In [64]:
# 모델 내의 모든 파라미터(하이퍼파라미터)
voting_model_soft.get_params()  

{'estimators': [('rfm',
   RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42)),
  ('xgb',
   XGBClassifier(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bynode=None,
                 colsample_bytree=0.8, device=None, early_stopping_rounds=None,
                 enable_categorical=False, eval_metric='logloss',
                 feature_types=None, gamma=None, grow_policy=None,
                 importance_type=None, interaction_constraints=None,
                 learning_rate=0.1, max_bin=None, max_cat_threshold=None,
                 max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
                 max_leaves=None, min_child_weight=None, missing=nan,
                 monotone_constraints=None, multi_strategy=None, n_estimators=50,
                 n_jobs=None, num_parallel_tree=None, objective='multi:softprob', ...)),
  ('lgb',
   LGBMClassifier(colsample_bytree=0.8, max_depth=4, n_estimators=50,
           

# 사용할 머신러닝 학습 모형 저장

In [37]:
# 모델저장하기 
filename = 'rf.joblib'
file_path = os.path.join(filepath,filename)
print(joblib.dump(rf, file_path))

['../team_project/data/rf.joblib']


In [38]:
# 모델 load
jobilb_rf = joblib.load(os.path.join(file_path))
model_measure(jobilb_rf)

'정확도:0.959, 정밀도:0.956, 재현율:0.959, f1_score:0.957'