In [44]:
import cv2
import numpy as np
import os
from sklearn.decomposition import PCA  

In [45]:
# showing image
def show_image(image, title="Image", scale=5):
    h, w = image.shape
    resized_image = cv2.resize(image, (w * scale, h * scale), interpolation=cv2.INTER_NEAREST)
    cv2.imshow(title, resized_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    
# load data

def load_data(folder_paths, view = "000"):
    train, test = [], []
    for folder in os.listdir(folder_paths):
        if folder == "train":
            tr_labels = []
            for img in os.listdir(f"{folder_paths}/{folder}"):
                if view in img:
                    im = np.array(cv2.imread(f"{folder_paths}/{folder}/{img}", cv2.IMREAD_GRAYSCALE))
                    tr_labels.append(img[0:3])
                    train.append(im)
        if folder == "test":
            te_labels = []
            for img in os.listdir(f"{folder_paths}/{folder}"):
                if view in img:
                    im = np.array(cv2.imread(f"{folder_paths}/{folder}/{img}", cv2.IMREAD_GRAYSCALE))
                    te_labels.append(img[0:3])
                    test.append(im)
    print(f"Hoàn thành tải dữ liệu lên từ {folder_paths}")
    return train, test, tr_labels, te_labels

final_data = "C:/Users/HUY/Desktop/my_project/final_data"
synthetic_path = "C:/Users/HUY/Desktop/my_project/synthetic_data"          
x_rtrain, x_rtest, y_rtrain, y_rtest = load_data(final_data, "090")
x_strain, x_stest, y_strain, y_stest = load_data(synthetic_path, "090")
print(len(x_rtrain), len(x_rtest), len(y_rtrain), len(y_rtest))
print(len(x_strain), len(x_stest), len(y_strain), len(y_stest))

Hoàn thành tải dữ liệu lên từ C:/Users/HUY/Desktop/my_project/final_data
Hoàn thành tải dữ liệu lên từ C:/Users/HUY/Desktop/my_project/synthetic_data
40 20 40 20
240 120 240 120


In [46]:
x_train, y_train = x_rtrain + x_strain, y_rtrain + y_strain
x_test, y_test = x_rtest + x_stest, y_rtest + y_stest

for i in range(len(y_train)):
    match y_train[i]:
        case '001': 
            y_train[i] = 0
        case '002':
            y_train[i] = 1
        case '003':
            y_train[i] = 2
        case '004':
            y_train[i] = 3
        case '005':
            y_train[i] = 4
        case '006':
            y_train[i] = 5
        case '007':
            y_train[i] = 6
        case '008':
            y_train[i] = 7
        case '009':
            y_train[i] = 8
        case '010':
            y_train[i] = 9

for i in range(len(y_test)):
    match y_test[i]:
        case '001': 
            y_test[i] = 0
        case '002':
            y_test[i] = 1
        case '003':
            y_test[i] = 2
        case '004':
            y_test[i] = 3
        case '005':
            y_test[i] = 4
        case '006':
            y_test[i] = 5
        case '007':
            y_test[i] = 6
        case '008':
            y_test[i] = 7
        case '009':
            y_test[i] = 8
        case '010':
            y_test[i] = 9

y_train, y_test = np.array(y_train), np.array(y_test)
print(f"Kích thước tập train và labels là {len(x_train)} {len(y_train)}")
print(f"Kích thước tập test và labels là {len(x_test)} {len(y_test)}")
# print(y_train)

Kích thước tập train và labels là 280 280
Kích thước tập test và labels là 140 140


In [47]:
# 4. Implement PCA manually
# dữ liệu đưa vào PCA là dữ liệu 2 chiều (n x (64x64)) w. n = số lượng dữ liệu trong tập data
def pca_transform(data, n_components):
    # input
    #   data 
    #   number of components
    # output
    #   the transformed data
    #   eigen vectors 
    f_data = []
    for i in data:
        f_data.append(i.flatten())

    mean = np.mean(f_data, axis=0)
    centered_data = f_data - mean
    covariance_matrix = np.cov(centered_data, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvectors = eigenvectors[:, sorted_indices[:n_components]]
    transformed_data = np.dot(centered_data, eigenvectors)
    return transformed_data, eigenvectors

# pca_data , pca_eigen = pca_transform(x_train, 300)

In [48]:
# 95 -> 31
# 96 -> 36
# ==> chọn số component = 32

pca = PCA(n_components = 32)
pca_data = pca.fit_transform([x.flatten() for x in x_train])
pca_test = pca.fit_transform([x.flatten() for x in x_test])

In [49]:
i = 0
print(f"Vector đặc trưng PCA:\n{pca_data[i]},\nNhãn dán: {y_train[i]}")

x_train = pca_data
x_test = pca_test

Vector đặc trưng PCA:
[-280.38842237  191.22098639 -419.34229765 -351.17232936  307.93137939
  133.81564601  -26.76074273  215.79843571  176.24875637  326.95345706
   25.00649876   13.83983682  219.83052316 -226.99409999   36.64503411
  -57.76231176   94.60619952 -108.26024926   96.266322     36.60499107
   99.81126702   40.26100878  -25.08146437 -117.09017588   -8.76261926
   91.82293035   -7.00867712    5.15798356  -82.38126601  100.94209123
  -32.88644431   77.44861808],
Nhãn dán: 0


## Mình sẽ thử với 3 mô hình khác nhau
### Decision Tree + XGBoost
### Mạng CNN
### Supported Vector MachineMachine

In [50]:
# phân loại bằng RandomForest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

epochs = 5

for i in range(epochs):
    rf_model = RandomForestClassifier(
        n_estimators=300, 
        max_depth=20, 
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        bootstrap=True
        )
    rf_model.fit(x_train, y_train)
    y_pred = rf_model.predict(pca_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 30.71%
Accuracy: 27.14%
Accuracy: 28.57%
Accuracy: 27.14%
Accuracy: 24.29%


In [51]:
# phân loại bằng XGBoost
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=500, 
    max_depth=16, 
    learning_rate=0.01,
    # colsample_bytree=0.8,
    # subsample=0.8,
    # gamma=0.2,
    # reg_lambda=1.0,
    )
xgb_model.fit(x_train, y_train)
y_pred = xgb_model.predict(pca_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 31.43%


In [52]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

for i in range(5):
    mlp_model = MLPClassifier(hidden_layer_sizes=(256,128), activation='logistic', max_iter=1000)
    mlp_model.fit(x_train, y_train)

    y_pred = mlp_model.predict(pca_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 25.00%
Accuracy: 27.14%
Accuracy: 33.57%
Accuracy: 29.29%
Accuracy: 28.57%


In [53]:
# phân loại bằng SVM
from sklearn.svm import SVC

for i in range(5):
    svm_model = SVC(C=0.01, gamma=0.1)
    svm_model.fit(x_train, y_train)

    y_pred = svm_model.predict(pca_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 10.00%
Accuracy: 10.00%
Accuracy: 10.00%
Accuracy: 10.00%
Accuracy: 10.00%


In [None]:
# Thử dùng RVM classifier thử xem

In [None]:
print(x_train.shape, type(x_train))
print(y_train.shape, type(y_train))ss
# data_to_save = np.hstack((x_train, y_train.reshape(-1, 1)))


(280, 32) <class 'numpy.ndarray'>
(280,) <class 'numpy.ndarray'>


In [56]:
# # 5. Implement MDA manually (optional)
# def mda_transform(data, labels, n_components):
#     data = np.array(data) if not isinstance(data, np.ndarray) else data
#     labels = np.array(labels) if not isinstance(labels, np.ndarray) else labels
    
#     unique_classes = np.unique(labels)
#     mean_total = np.mean(data, axis=0)
#     S_b = np.zeros((data.shape[1], data.shape[1]))
#     S_w = np.zeros((data.shape[1], data.shape[1]))
    
#     for cls in unique_classes:
#         class_data = data[labels == cls]
#         mean_class = np.mean(class_data, axis=0)
#         S_b += len(class_data) * np.outer(mean_class - mean_total, mean_class - mean_total)
#         S_w += np.cov(class_data, rowvar=False) * (len(class_data) - 1)
    
#     eigvals, eigvecs = np.linalg.eigh(np.linalg.pinv(S_w).dot(S_b))
#     sorted_indices = np.argsort(eigvals)[::-1]
#     eigvecs = eigvecs[:, sorted_indices[:n_components]]
#     transformed_data = np.dot(data, eigvecs)
#     return transformed_data, eigvecs