In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.decomposition import PCA
from PIL import Image

from imgaug import augmenters as iaa
from sklearn.model_selection import train_test_split

from glob import glob
import cv2

In [5]:
# Read the Fashion-MNIST training CSV; 'label' is the target and every other
# column is kept as a feature.
training_data = pd.read_csv('./archive/fashion-mnist_train.csv')
train_X = training_data.drop(columns='label')
train_y = training_data['label']

# Read the two held-out evaluation CSVs (public and private splits).
public_test_data = pd.read_csv('./new_public.csv')
private_test_data = pd.read_csv('./new_private.csv')

# Split each evaluation frame the same way as the training frame.
public_test_X = public_test_data.drop(columns='label')
public_test_y = public_test_data['label']

private_test_X = private_test_data.drop(columns='label')
private_test_y = private_test_data['label']

In [6]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Seed for the augmentation stream so that Restart & Run All reproduces X_pre / y_pre.
AUG_SEED = 0
# Number of images yielded per augmented batch.
AUG_BATCH_SIZE = 32

datagen = ImageDataGenerator(
    rotation_range=20,  # random rotation, in degrees, within [-20, 20]
    width_shift_range=0.1,  # random horizontal shift
    height_shift_range=0.1,  # random vertical shift
    shear_range=0.2,  # random shear
    zoom_range=0.2,  # random zoom in / out
    fill_mode='constant',
    cval=0  # value used to fill pixels created by the transforms
)

# Rebuild 28x28 single-channel images from the flat pixel rows.
# np.asarray accepts both the DataFrame produced by the loading cell and the
# flattened ndarray this cell rebinds `train_X` to below, so the cell can be
# re-run without failing on a missing `.values` attribute.
reshaped_train_X = np.asarray(train_X).reshape(-1, 28, 28, 1)

# NOTE: despite its name, `augmentation_factor` is the number of augmented
# *batches* to draw, not a per-sample multiplier. The augmented set therefore
# holds augmentation_factor * AUG_BATCH_SIZE images (60 * 32 = 1920 here).
augmentation_factor = 60

aug_X = []
aug_y = []
augmented_batches = datagen.flow(
    reshaped_train_X, train_y, batch_size=AUG_BATCH_SIZE, seed=AUG_SEED
)
# The generator loops forever; zip with range() stops it after the requested
# number of batches.
for _, (X_batch, y_batch) in zip(range(augmentation_factor), augmented_batches):
    aug_X.append(X_batch)
    aug_y.append(y_batch)

# Merge the per-batch arrays and flatten every image back to one row.
aug_X = np.concatenate(aug_X)
aug_y = np.concatenate(aug_y)
aug_X = aug_X.reshape(aug_X.shape[0], -1)

# Flatten the original images the same way. From here on `train_X` is an
# ndarray, not the DataFrame created by the loading cell.
train_X = reshaped_train_X.reshape(reshaped_train_X.shape[0], -1)

# Stack original and augmented samples into the final training matrices.
X_pre = np.concatenate([train_X, aug_X])
y_pre = np.concatenate([train_y, aug_y])

In [7]:
from lightgbm import LGBMClassifier

# Shared seed for the stochastic estimators in this cell (matches the value the
# random forest already used).
RANDOM_STATE = 0
N_PCA_COMPONENTS = 380

# Fit PCA on the combined original + augmented training matrix and project it.
# X_pre is already 2-D (samples x pixels), so no reshape is needed.
# random_state pins the result when sklearn picks a randomized SVD solver.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=RANDOM_STATE)
pca_train_X = pca.fit_transform(X_pre)

# Project the evaluation sets. They are derived from the raw frames instead of
# from `public_test_X` / `private_test_X`, because those names are rebound to
# the projected arrays here; reading them back would break a re-run of this cell.
public_test_X = pca.transform(public_test_data.drop(columns='label').to_numpy())
private_test_X = pca.transform(private_test_data.drop(columns='label').to_numpy())

best_knn_params = {'n_neighbors': 5}
best_rf_params = {'n_estimators': 200}
best_svm_params = {'C': 8, 'kernel': 'rbf', 'gamma': 'scale'}

# Individual models
best_knn = KNeighborsClassifier(**best_knn_params)
best_rf = RandomForestClassifier(**best_rf_params, random_state=RANDOM_STATE)
best_svm = SVC(**best_svm_params)
# NOTE(review): LightGBM appears to apply `subsample` only when `subsample_freq`
# is > 0; with the default of 0 the 0.87 below is likely a no-op. Confirm before
# relying on it. Left unchanged so the tuned configuration is preserved.
best_lgbm = LGBMClassifier(
    learning_rate=0.18,
    max_depth=5,
    min_child_samples=30,
    n_estimators=500,
    subsample=0.87,
    random_state=RANDOM_STATE,
    verbosity=-1,
)

# Fit the standalone base models in PCA space.
# KNN is deliberately fitted on the original (un-augmented) samples only; the
# other three see the original + augmented set.
X_train_knn_pca = pca.transform(np.asarray(train_X))
best_knn.fit(X_train_knn_pca, train_y)
best_rf.fit(pca_train_X, y_pre)
best_svm.fit(pca_train_X, y_pre)
best_lgbm.fit(pca_train_X, y_pre)


In [8]:
# Stacking ensemble: the four base classifiers feed a logistic-regression
# meta-learner.
stacking_model = StackingClassifier(
    estimators=[
        ('knn', best_knn),
        ('rf', best_rf),
        ('svm', best_svm),
        ('lgbm', best_lgbm),
    ],
    # With the default iteration limit the meta-learner stopped before
    # converging ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so give the solver
    # more room. If the warning persists, scale the stacked features as well.
    final_estimator=LogisticRegression(C=10, max_iter=1000),
)

# Train the stacking model on the full (original + augmented) PCA-projected
# training set.
stacking_model.fit(pca_train_X, y_pre)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Predict labels for both PCA-projected evaluation sets with the stacking model.
public_stacking_pred = stacking_model.predict(public_test_X)
private_stacking_pred = stacking_model.predict(private_test_X)

# Map each row position to its predicted label.
public_preds_dict = dict(enumerate(public_stacking_pred))
private_preds_dict = dict(enumerate(private_stacking_pred))

# Write one "<zero-padded 5-digit index> <label>" line per sample to each result file.
for result_path, predictions in (
    ('./stackingver9_public_testResult.txt', public_preds_dict),
    ('./stackingVer9_private_testResult.txt', private_preds_dict),
):
    with open(result_path, 'w') as result_file:
        result_file.writelines(
            f'{index:05d} {predicted}\n' for index, predicted in predictions.items()
        )