In [2]:
import os
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  # 분류분석
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import adjusted_rand_score, homogeneity_score, completeness_score, v_measure_score, mutual_info_score
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

filepath=r'E:\ai\Downloads\Data'

# data load

In [3]:
VGG_vectors = np.load(os.path.join(filepath,'VGG_vectors.npy'))
VGG_vectors.shape

(80158, 512)

In [4]:
painting = pd.read_csv(os.path.join(filepath,'painting.csv'))
painting.head()

Unnamed: 0,file,title,artist,genre,style
0,Realism/vincent-van-gogh_pine-trees-in-the-fen...,pine-trees-in-the-fen-1884,vincent-van-gogh,landscape,Realism
1,Baroque/rembrandt_the-angel-appearing-to-the-s...,the-angel-appearing-to-the-shepherds-1634,rembrandt,religious painting,Baroque
2,Post_Impressionism/paul-cezanne_portrait-of-th...,portrait-of-the-artist-s-son,paul-cezanne,portrait,Post_Impressionism
3,Impressionism/pierre-auguste-renoir_young-girl...,young-girl-seated-in-a-meadow-1916,pierre-auguste-renoir,genre painting,Impressionism
4,Romanticism/ivan-aivazovsky_morning-1851.jpg,morning-1851,ivan-aivazovsky,marina,Romanticism


In [5]:
X = VGG_vectors
y = painting['artist']

In [6]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X, y = ros.fit_resample(X, y)

In [7]:
from imblearn.under_sampling import RandomUnderSampler

# y가 numpy.ndarray일 경우, np.unique()를 사용하여 고유 값들을 가져옵니다
unique_classes = np.unique(y)

# 각 클래스별로 샘플 수를 200개로 설정
rus = RandomUnderSampler(sampling_strategy={class_name: 200 for class_name in unique_classes}, random_state=42)

# X와 y에 대해 언더샘플링 적용
X,y = rus.fit_resample(X, y)

In [8]:
pd.Series(y).value_counts()

a.y.-jackson            200
martin-barre            200
masaccio                200
mary-fedden             200
mary-cassatt            200
                       ... 
gerrit-dou              200
geta-bratescu           200
gheorghe-tattarescu     200
giacomo-balla           200
zinaida-serebriakova    200
Name: artist, Length: 1104, dtype: int64

In [9]:
np.save(os.path.join(filepath,'X.npy'),X)
np.save(os.path.join(filepath,'y.npy'),y)

In [10]:
X = np.load(os.path.join(filepath,'X.npy'))
y = np.load(os.path.join(filepath,'y.npy'), allow_pickle=True)

In [11]:
train_X, test_X, train_y, test_y = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    shuffle=True,  # 분할 전 데이터 섞기
                                                    stratify=y     # 층화추출
                                                   )
train_X.shape, test_X.shape, train_y.shape, test_y.shape

((154560, 512), (66240, 512), (154560,), (66240,))

In [12]:
np.save(os.path.join(filepath,'train_x.npy'),train_X)
np.save(os.path.join(filepath,'test_x.npy'),test_X)
np.save(os.path.join(filepath,'train_y.npy'),train_y)
np.save(os.path.join(filepath,'test_y.npy'),test_y)

In [13]:
train_X = np.load(os.path.join(filepath,'train_x.npy')).astype('float32')
test_X = np.load(os.path.join(filepath,'test_x.npy')).astype('float32')
train_y = np.load(os.path.join(filepath,'train_y.npy'), allow_pickle=True)
test_y = np.load(os.path.join(filepath,'test_y.npy'), allow_pickle=True)

# 분류분석

## RandomForestClassifier

In [14]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=50, max_depth=10, max_features='sqrt', random_state=1, n_jobs=-1)
rf_model.fit(train_X, train_y)
rf_model

In [15]:
# 대용량 모형일 때 : joblib 파일로 저장 (joblib 라이브러리 사용)
print(joblib.dump(rf_model, os.path.join(filepath,'rf_model.joblib')))

['E:\\ai\\Downloads\\Data\\rf_model.joblib']


In [16]:
# 모델 load
rf_model = joblib.load(os.path.join(filepath,'rf_model.joblib'))

In [17]:
rf_model.score(test_X, test_y)

0.09726751207729468

In [18]:
rf_pred_y = rf_model.predict(test_X)
pd.crosstab(test_y, rf_pred_y)  # 실제값, 예측값

col_0,a.y.-jackson,aaron-siskind,abdullah-suriosubroto,abidin-dino,adnan-coker,adolf-fleischmann,adriaen-van-de-velde,adriaen-van-de-venne,agnes-martin,agnolo-bronzino,...,william-baziotes,william-blake,william-congdon,william-merritt-chase,william-shayer,wu-guanzhong,xu-beihong,yayoi-kusama,yves-gaucher,yves-klein
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a.y.-jackson,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaron-siskind,0,0,0,0,0,0,0,0,0,0,...,0,0,6,0,0,0,0,0,0,0
abdullah-suriosubroto,0,0,0,0,0,0,0,0,0,0,...,0,0,22,0,0,0,0,0,0,0
abidin-dino,0,0,0,8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abraham-manievich,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yiannis-tsaroychis,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
yov-kondzelevych,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yves-gaucher,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8,0
yves-klein,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [19]:
# y_test는 실제 정답 (원-핫 인코딩된 경우 argmax로 변환)
y_test_labels = test_y
y_pred_labels = rf_pred_y  # 모델 예측값도 동일하게 변환

# ✅ 상세한 성능 지표 출력
print("🔹 Classification Report:")
print(classification_report(y_test_labels, y_pred_labels, digits=4))

# ✅ 개별 지표 계산 (매크로 평균 사용)
precision = precision_score(y_test_labels, y_pred_labels, average='macro')
recall = recall_score(y_test_labels, y_pred_labels, average='macro')
f1 = f1_score(y_test_labels, y_pred_labels, average='macro')

print(f"🎯 Precision: {precision:.4f}")
print(f"🔄 Recall: {recall:.4f}")
print(f"🔥 F1-score: {f1:.4f}")

🔹 Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                  precision    recall  f1-score   support

                    a.y.-jackson     0.0317    0.0333    0.0325        60
                   aaron-siskind     0.0000    0.0000    0.0000        60
           abdullah-suriosubroto     0.0000    0.0000    0.0000        60
                     abidin-dino     0.8889    0.1333    0.2319        60
               abraham-manievich     0.0000    0.0000    0.0000        60
                    ad-reinhardt     0.0000    0.0000    0.0000        60
                    adam-baltatu     0.0000    0.0000    0.0000        60
                     adnan-coker     0.0876    0.2833    0.1339        60
               adolf-fleischmann     0.2979    0.2333    0.2617        60
                    adolf-hitler     0.0000    0.0000    0.0000        60
adolphe-joseph-thomas-monticelli     0.0000    0.0000    0.0000        60
                 adriaen-brouwer     0.0000    0.0000    0.0000        60
            adriaen-van-de-velde     

  _warn_prf(average, modifier, msg_start, len(result))


🎯 Precision: 0.3198
🔄 Recall: 0.0973
🔥 F1-score: 0.0900


In [20]:
# fbeta score의 beta=1 : f1 score
# fbeta score의 2>=beta>1 : recall의 가중치가 높게 조정된 f1 score
# fbeta score의 0<=beta<1 : precision의 가중치가 높게 조정된 f1 score
print(fbeta_score(test_y, rf_pred_y, beta=0.5, average='macro'))
print(fbeta_score(test_y, rf_pred_y, beta=1, average='macro'))
print(fbeta_score(test_y, rf_pred_y, beta=2, average='macro'))

0.12776278071493863
0.0899808728893325
0.08118411589340498


## MLPClassifier

In [22]:
# 다중신경망 모형
mlp_model = MLPClassifier(hidden_layer_sizes=(38, 64, 32), max_iter=500, random_state=1)
mlp_model = mlp_model.fit(train_X, train_y)
mlp_model

In [23]:
mlp_model.score(test_X, test_y)

0.4549969806763285

In [24]:
mlp_pred_y = mlp_model.predict(test_X)
mlp_result = pd.crosstab(test_y, mlp_pred_y)
mlp_result

col_0,a.y.-jackson,aaron-siskind,abdullah-suriosubroto,abidin-dino,abraham-manievich,ad-reinhardt,adam-baltatu,adnan-coker,adolf-fleischmann,adolf-hitler,...,wolfgang-paalen,wu-guanzhong,xu-beihong,yayoi-kusama,yiannis-moralis,yiannis-tsaroychis,yov-kondzelevych,yves-gaucher,yves-klein,zinaida-serebriakova
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a.y.-jackson,27,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaron-siskind,0,27,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abdullah-suriosubroto,0,0,60,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abidin-dino,0,0,0,60,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abraham-manievich,0,0,0,0,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yiannis-tsaroychis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
yov-kondzelevych,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,60,0,0,0
yves-gaucher,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,24,2,0
yves-klein,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,47,0


In [25]:
# y_test는 실제 정답 (원-핫 인코딩된 경우 argmax로 변환)
y_test_labels = test_y
y_pred_labels = mlp_pred_y  # 모델 예측값도 동일하게 변환

# ✅ 상세한 성능 지표 출력
print("🔹 Classification Report:")
print(classification_report(y_test_labels, y_pred_labels, digits=4))

# ✅ 개별 지표 계산 (매크로 평균 사용)
precision = precision_score(y_test_labels, y_pred_labels, average='macro')
recall = recall_score(y_test_labels, y_pred_labels, average='macro')
f1 = f1_score(y_test_labels, y_pred_labels, average='macro')

print(f"🎯 Precision: {precision:.4f}")
print(f"🔄 Recall: {recall:.4f}")
print(f"🔥 F1-score: {f1:.4f}")

🔹 Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                  precision    recall  f1-score   support

                    a.y.-jackson     0.4154    0.4500    0.4320        60
                   aaron-siskind     0.3462    0.4500    0.3913        60
           abdullah-suriosubroto     0.6316    1.0000    0.7742        60
                     abidin-dino     0.6383    1.0000    0.7792        60
               abraham-manievich     0.2857    0.3333    0.3077        60
                    ad-reinhardt     0.1389    0.0833    0.1042        60
                    adam-baltatu     0.0455    0.0167    0.0244        60
                     adnan-coker     1.0000    1.0000    1.0000        60
               adolf-fleischmann     0.9524    1.0000    0.9756        60
                    adolf-hitler     0.2899    0.3333    0.3101        60
adolphe-joseph-thomas-monticelli     0.2500    0.1833    0.2115        60
                 adriaen-brouwer     0.1746    0.1833    0.1789        60
            adriaen-van-de-velde     

  _warn_prf(average, modifier, msg_start, len(result))


🎯 Precision: 0.4085
🔄 Recall: 0.4550
🔥 F1-score: 0.4113


In [26]:
# fbeta score의 beta=1 : f1 score
# fbeta score의 2>=beta>1 : recall의 가중치가 높게 조정된 f1 score
# fbeta score의 0<=beta<1 : precision의 가중치가 높게 조정된 f1 score
print(fbeta_score(test_y, mlp_pred_y, beta=0.5, average='macro'))
print(fbeta_score(test_y, mlp_pred_y, beta=1, average='macro'))
print(fbeta_score(test_y, mlp_pred_y, beta=2, average='macro'))

0.40160019696647176
0.4113096079426252
0.43268640462063523


# 군집분석

In [27]:
os.environ['OMP_NUM_THREADS'] = '1'

## KMeans 클러스터링

In [28]:
len(set(y))

1104

In [29]:
kmeans_model = KMeans(n_clusters=len(set(y)),  # 클러스터 갯수:2개의 그룹으로 나눔
               init='random', # random(중심초기점이 random), k-means++(멀리 떨어진 초기점)
               n_init=5,
               max_iter=300)  # 300번 중심점 이동
kmeans_model.fit(X)

In [30]:
kmeans_model.cluster_centers_  # 최종 두 클러스터의 중심점

array([[ 56.548378  ,   3.9741023 ,  -3.7660558 , ...,  11.927933  ,
         -3.3473277 ,  -6.7931695 ],
       [ 33.657715  ,  13.974932  , -28.182302  , ...,  -4.753823  ,
         -2.0227087 ,   2.9349484 ],
       [ 63.113914  ,  13.542155  ,   1.168807  , ...,  -0.24177289,
         -0.3052293 ,  -1.807678  ],
       ...,
       [-30.363344  ,   6.509787  , -10.315461  , ...,  -0.7998073 ,
          0.50404805,   1.0504757 ],
       [-18.69838   , -16.232683  , -28.903456  , ...,  -1.2520957 ,
          3.0923548 ,   0.6265442 ],
       [-20.526297  ,  39.26541   ,  11.346867  , ...,   2.201813  ,
         -6.982558  ,   1.6055441 ]], dtype=float32)

In [31]:
len(kmeans_model.cluster_centers_)

1104

In [32]:
pred = kmeans_model.predict(X)
print(' 예 측 값 : ', pred)
print('modelLabel: ', kmeans_model.labels_)
print(' 실제 y값 : ', y)

 예 측 값 :  [373 373 211 ... 376 591 168]
modelLabel:  [373 373 211 ... 376 591 168]
 실제 y값 :  ['a.y.-jackson' 'a.y.-jackson' 'a.y.-jackson' ... 'zinaida-serebriakova'
 'zinaida-serebriakova' 'zinaida-serebriakova']


In [33]:
pd.crosstab(y, kmeans_model.labels_, rownames=['실제값'], colnames=['k-means값'])

k-means값,0,1,2,3,4,5,6,7,8,9,...,1094,1095,1096,1097,1098,1099,1100,1101,1102,1103
실제값,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a.y.-jackson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,8,0,0,0,0,0,0
aaron-siskind,0,0,0,0,0,18,13,0,0,0,...,9,0,0,0,0,14,0,0,0,0
abdullah-suriosubroto,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abidin-dino,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abraham-manievich,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yiannis-tsaroychis,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
yov-kondzelevych,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yves-gaucher,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yves-klein,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 군집모형 성능평가

In [14]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # 문자열 라벨을 숫자로 변환

In [35]:
len(le.classes_)

1104

In [36]:
model = KMeans(n_clusters=len(le.classes_), random_state=1, n_init=10)
model.fit(X)

In [37]:
pred = model.predict(X)
all(pred == model.labels_)

True

In [38]:
# 예측된 클러스터 라벨
pred = model.labels_

# 각 클러스터에서 가장 많이 등장하는 실제 라벨 찾기
mapping = {}
for cluster in range(len(le.classes_)):
    mask = (pred == cluster)
    if np.any(mask):  # 해당 클러스터에 속한 샘플이 존재하는 경우만 실행
        most_common_label = np.bincount(y_encoded[mask]).argmax()
        mapping[cluster] = most_common_label

# 클러스터 인덱스를 실제 라벨과 매핑
adjusted_pred = np.vectorize(mapping.get)(pred)

# 교차표 생성
cross_tab = pd.crosstab(y_encoded, adjusted_pred, rownames=['실제값'], colnames=['예측값'])

# 결과 출력
cross_tab

예측값,0,2,4,7,8,13,15,16,19,20,...,1078,1079,1080,1082,1090,1094,1096,1097,1100,1102
실제값,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,68,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,28,0,27,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,19,0,0,0,0,0,0,0,...,0,0,0,0,0,4,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,4,2,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
1100,0,0,0,0,14,0,0,0,0,0,...,0,0,0,14,0,0,0,0,21,0
1101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1102,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,36


In [39]:
pred_str = le.inverse_transform(adjusted_pred)
pd.crosstab(y, pred_str)

col_0,a.y.-jackson,abdullah-suriosubroto,abraham-manievich,adnan-coker,adolf-fleischmann,adriaen-van-de-venne,aelbert-cuyp,afro,agostino-carracci,aki-kuroda,...,willem-kalf,willi-baumeister,william-adolphe-bouguereau,william-blake,william-shayer,wolfgang-paalen,xu-beihong,yayoi-kusama,yov-kondzelevych,yves-klein
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a.y.-jackson,68,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaron-siskind,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abdullah-suriosubroto,0,28,0,27,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abidin-dino,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abraham-manievich,0,0,19,0,0,0,0,0,0,0,...,0,0,0,0,0,4,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yiannis-tsaroychis,4,2,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
yov-kondzelevych,0,0,0,0,14,0,0,0,0,0,...,0,0,0,14,0,0,0,0,21,0
yves-gaucher,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yves-klein,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,36


## 조정된 rand지수 외 성능평가 기준 함수

In [40]:
adjusted_rand_score(labels_true=y, labels_pred=pred_str)

0.07772341337665915

In [41]:
adjusted_rand_score(labels_true=y_encoded, labels_pred=pred)

0.07723746587727041

In [42]:
homogeneity_score(y, pred_str)

0.47860058193020094

In [43]:
completeness_score(y, pred_str)

0.5539896148243851

In [44]:
v_measure_score(y, pred_str)

0.5135430355073454

In [45]:
mutual_info_score(y, pred_str)

3.3534084129717696

# 앙상블모형

In [15]:
train_X.shape, train_y.shape, test_X.shape, test_y.shape

((154560, 512), (154560,), (66240, 512), (66240,))

In [16]:
le = LabelEncoder()
train_y = le.fit_transform(train_y)
test_y = le.fit_transform(test_y)

def model_measure(model, test_X=test_X, test_y=test_y):
    pred = model.predict(test_X)
    accuracy  = model.score(test_X, test_y)
    precision = precision_score(test_y, pred, average="macro")
    recall    = recall_score(test_y, pred, average="macro")
    f1score  = f1_score(test_y, pred, average="macro")
    return '정확도:{:.3f}, 정밀도:{:.3f}, 재현율:{:.3f}, f1_score:{:.3f}'.format(accuracy, precision, recall, f1score)

In [17]:
# LabelEncoder 객체 저장
joblib.dump(le, os.path.join(filepath,'label_encoder.pkl'))

['E:\\ai\\Downloads\\Data\\label_encoder.pkl']

# 의사결정 방식

## 배깅 방식

In [None]:
# ✅ 경량화된 랜덤포레스트 모델
rf_model = RandomForestClassifier(
    n_estimators=50,      # 트리 개수 줄이기
    max_depth=10,          # 트리 깊이 제한
    max_features='sqrt',  # 최적의 특징 개수 자동 선택
    random_state=1,
    n_jobs=-1
)

rf_model.fit(train_X,train_y)

print(model_measure(rf_model))

## 부스팅 방식

In [18]:
# ✅ 경량화된 XGBoost 모델
xgb_model = XGBClassifier(
    max_depth=10,          # 트리 깊이 제한
    n_estimators=50,      # 트리 개수 줄이기
    learning_rate=0.1,    # 학습 속도 증가
    subsample=0.8,        # 데이터 일부 샘플링
    colsample_bytree=0.8, # 일부 특성만 사용
    tree_method='hist',   # 히스토그램 기반 트리 (메모리 절약)
    eval_metric='logloss',
#     use_label_encoder=True,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(train_X,train_y)

print(model_measure(xgb_model))

정확도:0.907, 정밀도:0.902, 재현율:0.907, f1_score:0.901


In [None]:
# ✅ 경량화된 LightGBM 모델
lgb_model = LGBMClassifier(
    n_estimators=50,      # 트리 개수 줄이기
    max_depth=10,          # 트리 깊이 제한
    num_leaves=16,        # 리프 개수 줄이기
    subsample=0.8,        # 데이터 일부 샘플링
    colsample_bytree=0.8, # 일부 특성만 사용
    verbose=-1,           # 불필요한 출력 제거
    random_state=42,
    n_jobs=-1
)

lgb_model.fit(train_X,train_y)

print(model_measure(lgb_model))

In [19]:
joblib.dump(xgb_model, os.path.join(filepath,'xgb_artist_model.joblib'))

['E:\\ai\\Downloads\\Data\\xgb_artist_model.joblib']

In [20]:
loaded_xgb = joblib.load(os.path.join(filepath,'xgb_artist_model.joblib'))

In [21]:
print(model_measure(loaded_xgb))

정확도:0.907, 정밀도:0.902, 재현율:0.907, f1_score:0.901


## 투표를 이용한 앙상블

In [8]:
X = np.load(os.path.join(filepath,'X.npy'))
y = np.load(os.path.join(filepath,'y.npy'), allow_pickle=True)

In [10]:
voting_model_hard = VotingClassifier(estimators=[('rfm', rf_model),
                                                 ('xgb', xgb_model),
                                                 ('lgb', lgb_model)],
                                     voting='hard') # voting='hard' 기본값

In [11]:
voting_model_hard.predict(test_X[0].reshape(1, -1))

array(['pablo-picasso'], dtype=object)

In [13]:
# voting 알고리즘 - soft 방식
voting_model_soft = VotingClassifier(estimators=[('rfm', rf_model),
                                                 ('xgb', xgb_model),
                                                 ('lgb', lgb_model)],
                                     voting='soft') # voting='hard' 기본값
voting_model_soft.fit(X,y)

In [14]:
voting_model_soft.predict(test_X[0].reshape(1, -1))

array(['pablo-picasso'], dtype=object)

In [15]:
test_y[0]

911

In [16]:
inverse_test_y = le.inverse_transform(test_y)
inverse_test_y[0]

'ramon-oviedo'

In [17]:
model_measure(voting_model_hard)

  _warn_prf(average, modifier, msg_start, len(result))


'정확도:0.400, 정밀도:0.677, 재현율:0.400, f1_score:0.411'

In [18]:
model_measure(voting_model_soft)

  _warn_prf(average, modifier, msg_start, len(result))


'정확도:0.102, 정밀도:0.177, 재현율:0.102, f1_score:0.100'

In [19]:
# voting_model_hard의 개별 모델들 딕셔너리 형태로 반환
voting_model_hard.named_estimators_

{'rfm': RandomForestClassifier(max_depth=10, n_estimators=50, n_jobs=-1, random_state=1),
 'xgb': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.8, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=0.1, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=10,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=50,
               n_jobs=-1, num_parallel_tree=None, objective='multi:softprob', ...),
 'lgb': LGBMClassifier(colsample_bytree=0.8, max_depth=10, n_estimators=50, n_jobs=-1,
                num_leaves=16, random_st

In [20]:
# voting_model_soft의 개별 모델들 딕셔너리 형태로 반환
voting_model_soft.named_estimators_

{'rfm': RandomForestClassifier(max_depth=10, n_estimators=50, n_jobs=-1, random_state=1),
 'xgb': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.8, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=0.1, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=10,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=50,
               n_jobs=-1, num_parallel_tree=None, objective='multi:softprob', ...),
 'lgb': LGBMClassifier(colsample_bytree=0.8, max_depth=10, n_estimators=50, n_jobs=-1,
                num_leaves=16, random_st

In [21]:
print(model_measure(voting_model_hard.named_estimators_['rfm']))  # 단일모델로 개별모델 access 가능
print(model_measure(voting_model_hard.named_estimators_['xgb']))
print(model_measure(voting_model_hard.named_estimators_['lgb']))

  _warn_prf(average, modifier, msg_start, len(result))


정확도:0.145, 정밀도:0.380, 재현율:0.145, f1_score:0.132
정확도:0.907, 정밀도:0.902, 재현율:0.907, f1_score:0.901
정확도:0.042, 정밀도:0.081, 재현율:0.042, f1_score:0.043


  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
voting_model_hard.get_params()  # 모델 내의 모든 파라미터(하이퍼파라미터)

{'estimators': [('rfm',
   RandomForestClassifier(max_depth=10, n_estimators=50, n_jobs=-1, random_state=1)),
  ('xgb',
   XGBClassifier(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bynode=None,
                 colsample_bytree=0.8, device=None, early_stopping_rounds=None,
                 enable_categorical=False, eval_metric='logloss',
                 feature_types=None, gamma=None, grow_policy=None,
                 importance_type=None, interaction_constraints=None,
                 learning_rate=0.1, max_bin=None, max_cat_threshold=None,
                 max_cat_to_onehot=None, max_delta_step=None, max_depth=10,
                 max_leaves=None, min_child_weight=None, missing=nan,
                 monotone_constraints=None, multi_strategy=None, n_estimators=50,
                 n_jobs=-1, num_parallel_tree=None, objective='multi:softprob', ...)),
  ('lgb',
   LGBMClassifier(colsample_bytree=0.8, max_depth=10, n_estimators=50, 

In [24]:
voting_model_soft.get_params()  # 모델 내의 모든 파라미터(하이퍼파라미터)

{'estimators': [('rfm',
   RandomForestClassifier(max_depth=10, n_estimators=50, n_jobs=-1, random_state=1)),
  ('xgb',
   XGBClassifier(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bynode=None,
                 colsample_bytree=0.8, device=None, early_stopping_rounds=None,
                 enable_categorical=False, eval_metric='logloss',
                 feature_types=None, gamma=None, grow_policy=None,
                 importance_type=None, interaction_constraints=None,
                 learning_rate=0.1, max_bin=None, max_cat_threshold=None,
                 max_cat_to_onehot=None, max_delta_step=None, max_depth=10,
                 max_leaves=None, min_child_weight=None, missing=nan,
                 monotone_constraints=None, multi_strategy=None, n_estimators=50,
                 n_jobs=-1, num_parallel_tree=None, objective='multi:softprob', ...)),
  ('lgb',
   LGBMClassifier(colsample_bytree=0.8, max_depth=10, n_estimators=50, 

## 머신러닝 모형 저장

In [25]:
# 대용량 모형일 때 : joblib 파일로 저장 (joblib 라이브러리 사용)
print(joblib.dump(voting_model_hard, os.path.join(filepath,'voting_model_hard.joblib')))
print(joblib.dump(voting_model_soft, os.path.join(filepath,'voting_model_soft.joblib')))

['E:\\ai\\Downloads\\Data\\voting_model_hard.joblib']
['E:\\ai\\Downloads\\Data\\voting_model_soft.joblib']


In [6]:
# 모델 load
loaded_voting_model_hard = joblib.load(os.path.join(filepath,'voting_model_hard.joblib'))
model_measure(loaded_voting_model_hard)

  _warn_prf(average, modifier, msg_start, len(result))


'정확도:0.400, 정밀도:0.677, 재현율:0.400, f1_score:0.411'

In [16]:
# 모델 load
loaded_voting_model_soft = joblib.load(os.path.join(filepath,'voting_model_soft.joblib'))
model_measure(loaded_voting_model_soft)

ValueError: Mix of label input types (string and number)