In [38]:
from IPython.display import display, HTML
# Notebook-wide CSS overrides: code editor font, extra top padding for the
# table-of-contents sidebar, and hidden TOC item numbers.
notebook_css = """
<style>
div.CodeMirror {font-family:Consolas;}
div#toc-wrapper{padding-top:120px;}
span.toc-item-num{display:none;}
</style>
"""
display(HTML(notebook_css))

In [39]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import MiniBatchKMeans
from tqdm import tqdm

In [40]:
import tensorflow as tf

# Configure TensorFlow to allocate GPU memory incrementally.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # With no GPUs detected the loop body never runs, so no separate
    # emptiness check is required.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Report the failure instead of aborting the notebook.
    print(err)

In [41]:
import torch

# NOTE(review): no torch CUDA work is visible before this point in the
# notebook, so this call is presumably a precaution — confirm it is needed.
torch.cuda.empty_cache()  # clean up GPU memory

In [42]:
# Directory that holds the input files used by this notebook.
# The location can be overridden with the PAINTING_DATA_DIR environment
# variable so the notebook runs on other machines; when the variable is not
# set, the original local path is used unchanged.
filepath = os.environ.get('PAINTING_DATA_DIR', r'E:\ai\Downloads\Data')

In [43]:
# Load the feature matrix (presumably VGG-derived vectors, judging by the
# file name) from the data directory.
vgg_vectors_path = os.path.join(filepath, 'VGG_vectors.npy')
X = np.load(vgg_vectors_path)

In [44]:
# Read the painting metadata and keep only the 'artist' column as the labels.
painting_csv_path = os.path.join(filepath, 'painting.csv')
y = pd.read_csv(painting_csv_path)['artist']
y

0             vincent-van-gogh
1                    rembrandt
2                 paul-cezanne
3        pierre-auguste-renoir
4              ivan-aivazovsky
                 ...          
80153              mary-fedden
80154          arman-manookian
80155           andre-bauchant
80156           ivan-generalic
80157       natalia-goncharova
Name: artist, Length: 80158, dtype: object

In [45]:
# Count how many samples each artist has (most frequent first).
label_series = pd.Series(y)
artist_counts = label_series.value_counts()

# Show the counts.
print(artist_counts)

vincent-van-gogh         1887
nicholas-roerich         1859
pierre-auguste-renoir    1400
claude-monet             1333
pyotr-konchalovsky        989
                         ... 
remedios-varo               1
thomas-downing              1
roman-opalka                1
takashi-murakami            1
frida-kahlo                 1
Name: artist, Length: 1104, dtype: int64


In [46]:
# Target size of the balanced dataset.
target_total_samples = 500000
num_artists = len(artist_counts)

# Equal share per artist. Integer division means the final total can end up
# slightly below target_total_samples.
target_per_artist = target_total_samples // num_artists

# Seeded generator so the random down-sampling below is reproducible across
# kernel restarts (the previous unseeded np.random.choice was not).
balance_seed = 42
rng = np.random.default_rng(balance_seed)

# Plain ndarray view of the labels so the comparison below yields positions.
y_array = np.asarray(y)

X_balanced = []
y_balanced = []

for artist, count in artist_counts.items():
    # Row positions in X that belong to this artist.
    artist_indices = np.flatnonzero(y_array == artist)
    n_available = len(artist_indices)

    if n_available > target_per_artist:
        # More data than needed: draw a random subset without replacement.
        chosen = rng.choice(artist_indices, size=target_per_artist, replace=False)
    else:
        # Not enough data: cycle through the artist's rows until the target
        # is reached. This yields the same rows, in the same order, as tiling
        # the group and appending the leading remainder.
        chosen = artist_indices[np.arange(target_per_artist) % n_available]

    # Collect this artist's balanced rows and matching labels.
    X_balanced.append(X[chosen])
    y_balanced.append([artist] * len(chosen))

# Convert the per-artist pieces into single arrays.
X_balanced = np.vstack(X_balanced)
y_balanced = np.concatenate(y_balanced)

# Report the resulting shapes.
print(f"Balanced X shape: {X_balanced.shape}")
print(f"Balanced y shape: {y_balanced.shape}")


Balanced X shape: (499008, 512)
Balanced y shape: (499008,)


In [47]:
# From here on X and y refer to the balanced arrays.
X, y = X_balanced, y_balanced

In [48]:
# Put the features and the artist label side by side in one frame.
data = pd.DataFrame(X)
data.insert(len(data.columns), 'artist', y)

# Order the rows by artist name.
data_sorted = data.sort_values(by=['artist'])

# Split the sorted frame back into a feature matrix and a label vector.
X_sorted = data_sorted.drop(columns=['artist']).values
y_sorted = data_sorted.loc[:, 'artist'].values

In [49]:
# From here on X and y refer to the artist-sorted arrays.
X, y = X_sorted, y_sorted

In [50]:
# Learn the artist-name -> integer-id mapping, then encode every label.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

# Preview every 50th artist name next to its encoded id.
(y[::50], y_enc[::50])

(array(['a.y.-jackson', 'a.y.-jackson', 'a.y.-jackson', ...,
        'zinaida-serebriakova', 'zinaida-serebriakova',
        'zinaida-serebriakova'], dtype=object),
 array([   0,    0,    0, ..., 1103, 1103, 1103]))

In [51]:
# Number of clusters: one more than the largest encoded label.
n_clusters = 1 + y_enc.max()

# Build the MiniBatchKMeans estimator.
kmeans_params = dict(n_clusters=n_clusters, random_state=1, n_init=10, batch_size=1024)
model = MiniBatchKMeans(**kmeans_params)

# Run 100 incremental updates with a tqdm progress bar.
# NOTE(review): every partial_fit call receives the full X, so batch_size and
# n_init may not govern these updates the way the constructor arguments
# suggest — confirm against the scikit-learn documentation.
progress = tqdm(range(100), desc="Clustering Progress")
for i in progress:
    model.partial_fit(X)

# Final cluster assignment for every row of X.
labels = model.predict(X)

Clustering Progress: 100%|███████████████████████████████████████████████████████████| 100/100 [03:23<00:00,  2.04s/it]


In [52]:
# Predict the cluster of every row and check that it matches the labels the
# model stored during training.
pred = model.predict(X)
# np.array_equal compares in vectorised form (the builtin all() would iterate
# the array element by element in Python) and returns False, rather than
# failing, when the shapes differ.
np.array_equal(pred, model.labels_)

True

In [53]:
# Display the cluster id predicted for each row of X.
pred

array([ 788,  561,  105, ..., 1008,  693,  373])