In [1]:
import cv2
import numpy as np
import os
from sklearn.decomposition import PCA  

In [76]:
# showing image
def show_image(image, title="Image", scale=5):
    h, w = image.shape
    resized_image = cv2.resize(image, (w * scale, h * scale), interpolation=cv2.INTER_NEAREST)
    cv2.imshow(title, resized_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    
# load data

def load_data(folder_paths, view = "000"):
    train, test = [], []
    for folder in os.listdir(folder_paths):
        if folder == "train":
            tr_labels = []
            for img in os.listdir(f"{folder_paths}/{folder}"):
                if view in img:
                    im = np.array(cv2.imread(f"{folder_paths}/{folder}/{img}", cv2.IMREAD_GRAYSCALE))
                    tr_labels.append(img[0:3])
                    train.append(im)
        if folder == "test":
            te_labels = []
            for img in os.listdir(f"{folder_paths}/{folder}"):
                if view in img:
                    im = np.array(cv2.imread(f"{folder_paths}/{folder}/{img}", cv2.IMREAD_GRAYSCALE))
                    te_labels.append(img[0:3])
                    test.append(im)
    print(f"Hoàn thành tải dữ liệu lên từ {folder_paths}")
    return train, test, tr_labels, te_labels

final_data = "C:/Users/HUY/Desktop/my_project/final_data"
synthetic_path = "C:/Users/HUY/Desktop/my_project/synthetic_data"          
x_rtrain, x_rtest, y_rtrain, y_rtest = load_data(final_data, "090")
x_strain, x_stest, y_strain, y_stest = load_data(synthetic_path, "090")
print(len(x_rtrain), len(x_rtest), len(y_rtrain), len(y_rtest))
print(len(x_strain), len(x_stest), len(y_strain), len(y_stest))

Hoàn thành tải dữ liệu lên từ C:/Users/HUY/Desktop/my_project/final_data
Hoàn thành tải dữ liệu lên từ C:/Users/HUY/Desktop/my_project/synthetic_data
40 20 40 20
480 240 480 240


In [89]:
x_train, y_train = x_rtrain + x_strain, y_rtrain + y_strain
x_test, y_test = x_rtest + x_stest, y_rtest + y_stest

for i in range(len(y_train)):
    match y_train[i]:
        case '001': 
            y_train[i] = 0
        case '002':
            y_train[i] = 1
        case '003':
            y_train[i] = 2
        case '004':
            y_train[i] = 3
        case '005':
            y_train[i] = 4
        case '006':
            y_train[i] = 5
        case '007':
            y_train[i] = 6
        case '008':
            y_train[i] = 7
        case '009':
            y_train[i] = 8
        case '010':
            y_train[i] = 9

for i in range(len(y_test)):
    match y_test[i]:
        case '001': 
            y_test[i] = 0
        case '002':
            y_test[i] = 1
        case '003':
            y_test[i] = 2
        case '004':
            y_test[i] = 3
        case '005':
            y_test[i] = 4
        case '006':
            y_test[i] = 5
        case '007':
            y_test[i] = 6
        case '008':
            y_test[i] = 7
        case '009':
            y_test[i] = 8
        case '010':
            y_test[i] = 9

y_train, y_test = np.array(y_train), np.array(y_test)
print(f"Kích thước tập train và labels là {len(x_train)} {len(y_train)}")
print(f"Kích thước tập test và labels là {len(x_test)} {len(y_test)}")
# print(y_train)

Kích thước tập train và labels là 520 520
Kích thước tập test và labels là 260 260


In [4]:
# 4. Implement PCA manually
# dữ liệu đưa vào PCA là dữ liệu 2 chiều (n x (64x64)) w. n = số lượng dữ liệu trong tập data
def pca_transform(data, n_components):
    # input
    #   data 
    #   number of components
    # output
    #   the transformed data
    #   eigen vectors 
    f_data = []
    for i in data:
        f_data.append(i.flatten())

    mean = np.mean(f_data, axis=0)
    centered_data = f_data - mean
    covariance_matrix = np.cov(centered_data, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvectors = eigenvectors[:, sorted_indices[:n_components]]
    transformed_data = np.dot(centered_data, eigenvectors)
    return transformed_data, eigenvectors

# pca_data , pca_eigen = pca_transform(x_train, 300)

In [98]:
# 95 -> 14 ~ 13
# 96 -> 15 ~ 14
# 97 -> 16 ~ 15
# 98 -> 17 ~ 16
# 99 -> 18 ~ 17
# ==> chọn số component = 16 = 2^4

pca = PCA(n_components = 16)
pca_data = pca.fit_transform([x.flatten() for x in x_train])
pca_test = pca.transform([x.flatten() for x in x_test])

i = 0
print(pca_data.shape, pca_test.shape)
print(f"Vector đặc trưng PCA:\n{pca_data[i]},\nNhãn dán: {y_train[i]}")
x_train = pca_data
x_test = pca_test

(520, 16) (260, 16)
Vector đặc trưng PCA:
[-196.49155885  385.99839924 -230.02106919 -246.07948653 -116.64690618
    5.52249122 -301.78246617  295.2184611   170.7668675   313.93907582
  -25.24523338 -185.30831981  119.54127468 -121.13395932  -85.40413036
 -148.65615865],
Nhãn dán: 0


## Mình sẽ thử với 3 mô hình khác nhau
### Decision Tree + XGBoost
### Mạng CNN
### Supported Vector MachineMachine

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [99]:
# phân loại bằng RandomForest
accuracy = []
for i in range(10):
    rf_model = RandomForestClassifier(
        n_estimators=300, max_depth=20, 
        min_samples_split=2,min_samples_leaf=1,
        max_features='sqrt',bootstrap=True
        )
    rf_model.fit(x_train, y_train)
    y_pred = rf_model.predict(pca_test)
    
    accuracy.append(accuracy_score(y_test, y_pred))
    print(f'Accuracy: {accuracy[i] * 100:.2f}%')

print(f'Average Accuracy: {np.mean(accuracy) * 100:.2f}%')

Accuracy: 96.54%
Accuracy: 97.31%
Accuracy: 96.54%
Accuracy: 97.69%
Accuracy: 96.92%
Accuracy: 96.54%
Accuracy: 96.54%
Accuracy: 96.54%
Accuracy: 96.15%
Accuracy: 97.31%
Average Accuracy: 96.81%


In [100]:
# phân loại bằng XGBoost
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=500, 
    max_depth=16, 
    learning_rate=0.01,
    )
xgb_model.fit(x_train, y_train)
y_pred = xgb_model.predict(pca_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 93.46%


In [95]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

accuracy = []
for i in range(10):
    mlp_model = MLPClassifier(hidden_layer_sizes=(256,128,64), activation='logistic', max_iter=1000)
    mlp_model.fit(x_train, y_train)

    y_pred = mlp_model.predict(pca_test)

    accuracy.append(accuracy_score(y_test, y_pred))
    print(f'Accuracy: {accuracy[i] * 100:.2f}%')
    
print(f'Average Accuracy: {np.mean(accuracy) * 100:.2f}%')

Accuracy: 97.31%
Accuracy: 98.08%
Accuracy: 99.62%
Accuracy: 98.46%
Accuracy: 98.85%
Accuracy: 99.23%
Accuracy: 99.23%
Accuracy: 99.23%
Accuracy: 99.23%
Accuracy: 98.46%
Average Accuracy: 98.77%


In [62]:
# phân loại bằng SVM
from sklearn.svm import SVC

svm_model = SVC(C=0.01, gamma=0.1)
svm_model.fit(x_train, y_train)

y_pred = svm_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 10.00%


In [63]:
# KIỂM TRA THAM SỐ TỐI ƯU CHO Decision Tree classifier
check=[2,5,10,50,100,150,500]
for i in check:
    from sklearn.tree import DecisionTreeClassifier
    clf1 = DecisionTreeClassifier(random_state=0,min_samples_split=i)
    %time clf1.fit(x_train,y_train)
    print("Training accuracy : {:.4f}, \nTest Accuracy: {:.4f}".format(clf1.score(x_train,y_train),clf1.score(x_test,y_test)))
    print("min samples split is "+ str(i))
    print("------------------------------------------------------------")

CPU times: total: 15.6 ms
Wall time: 14.1 ms
Training accuracy : 1.0000, 
Test Accuracy: 0.5615
min samples split is 2
------------------------------------------------------------
CPU times: total: 15.6 ms
Wall time: 11 ms
Training accuracy : 0.9942, 
Test Accuracy: 0.5769
min samples split is 5
------------------------------------------------------------
CPU times: total: 15.6 ms
Wall time: 14.5 ms
Training accuracy : 0.9827, 
Test Accuracy: 0.5538
min samples split is 10
------------------------------------------------------------
CPU times: total: 15.6 ms
Wall time: 15 ms
Training accuracy : 0.9712, 
Test Accuracy: 0.5615
min samples split is 50
------------------------------------------------------------
CPU times: total: 15.6 ms
Wall time: 11 ms
Training accuracy : 0.9327, 
Test Accuracy: 0.5077
min samples split is 100
------------------------------------------------------------
CPU times: total: 15.6 ms
Wall time: 6 ms
Training accuracy : 0.7673, 
Test Accuracy: 0.4577
min sampl

In [64]:
##time to check other classifier out
from sklearn.linear_model import LogisticRegression
clf2 = LogisticRegression(C=.00001,solver='liblinear',multi_class='auto')
%time clf2 = clf2.fit(x_train, y_train)

print("Training accuracy : {:.4f}, \nTest Accuracy: {:.4f}".format(clf2.score(x_train,y_train),clf2.score(x_test,y_test)))
print("min samples split is "+ str(i))
print("------------------------------------------------------------")

CPU times: total: 15.6 ms
Wall time: 59 ms
Training accuracy : 0.9558, 
Test Accuracy: 0.4192
min samples split is 500
------------------------------------------------------------




In [65]:
from sklearn.svm import LinearSVC
for i in [1,0.8,0.3,0.1,0.03,0.001,0.0001]:
    svm = LinearSVC(C=i)
    %time svm = svm.fit(x_train, y_train)
    print("Training accuracy : {:.4f}, \nTest Accuracy: {:.4f}".format(clf2.score(x_train,y_train),clf2.score(x_test,y_test)))
    print ("C chosen is "+str(i))
    print ("-----------------------------------------------------------------------")

CPU times: total: 109 ms
Wall time: 186 ms
Training accuracy : 0.9558, 
Test Accuracy: 0.4192
C chosen is 1
-----------------------------------------------------------------------
CPU times: total: 141 ms
Wall time: 141 ms
Training accuracy : 0.9558, 
Test Accuracy: 0.4192
C chosen is 0.8
-----------------------------------------------------------------------
CPU times: total: 78.1 ms
Wall time: 102 ms
Training accuracy : 0.9558, 
Test Accuracy: 0.4192
C chosen is 0.3
-----------------------------------------------------------------------
CPU times: total: 78.1 ms
Wall time: 98.5 ms
Training accuracy : 0.9558, 
Test Accuracy: 0.4192
C chosen is 0.1
-----------------------------------------------------------------------
CPU times: total: 93.8 ms
Wall time: 98.5 ms
Training accuracy : 0.9558, 
Test Accuracy: 0.4192
C chosen is 0.03
-----------------------------------------------------------------------
CPU times: total: 46.9 ms
Wall time: 55 ms
Training accuracy : 0.9558, 
Test Accuracy:

In [28]:
clf = LogisticRegression(C=.01,solver ='lbfgs',multi_class='auto',max_iter=250)
%time clf = clf.fit(x_train, y_train)
y_train_pred=clf.predict(x_train)
y_test_pred=clf.predict(x_test)
print("Training accuracy: {:.4f}, Test Accuracy: {:.4f}".format(accuracy_score(y_train_pred, y_train), accuracy_score(y_test_pred, y_test)))

CPU times: total: 62.5 ms
Wall time: 62.6 ms
Training accuracy: 1.0000, Test Accuracy: 0.3143




In [11]:
# Thử dùng RVM classifier thử xem

In [15]:
print(x_train.shape, type(x_train))
print(y_train.shape, type(y_train))
# data_to_save = np.hstack((x_train, y_train.reshape(-1, 1)))


(280, 32) <class 'numpy.ndarray'>
(280,) <class 'numpy.ndarray'>


In [None]:
# # 5. Implement MDA manually (optional)
# def mda_transform(data, labels, n_components):
#     data = np.array(data) if not isinstance(data, np.ndarray) else data
#     labels = np.array(labels) if not isinstance(labels, np.ndarray) else labels
    
#     unique_classes = np.unique(labels)
#     mean_total = np.mean(data, axis=0)
#     S_b = np.zeros((data.shape[1], data.shape[1]))
#     S_w = np.zeros((data.shape[1], data.shape[1]))
    
#     for cls in unique_classes:
#         class_data = data[labels == cls]
#         mean_class = np.mean(class_data, axis=0)
#         S_b += len(class_data) * np.outer(mean_class - mean_total, mean_class - mean_total)
#         S_w += np.cov(class_data, rowvar=False) * (len(class_data) - 1)
    
#     eigvals, eigvecs = np.linalg.eigh(np.linalg.pinv(S_w).dot(S_b))
#     sorted_indices = np.argsort(eigvals)[::-1]
#     eigvecs = eigvecs[:, sorted_indices[:n_components]]
#     transformed_data = np.dot(data, eigvecs)
#     return transformed_data, eigvecs