# HUẤN LUYỆN VÀ ĐÁNH GIÁ MÔ HÌNH

# **Thông tin của tác giả, ngày cập nhật**

<hr>

**Thành viên nhóm**:
- **Trần Đình Khánh Đăng - 22520195**
- **Tăng Nhất - 22521027**
- **Lê Minh Nhựt - 22521060**

**Ngày cập nhật**: 22/01/2025

## Install và Import thư viện cần thiết

In [None]:
# Install the RAPIDS 24.12 wheels (the `-cu12` builds) from NVIDIA's package index.
# Each package is listed exactly once (nx-cugraph-cu12 used to appear twice).
!pip install \
    --extra-index-url=https://pypi.nvidia.com \
    "cudf-cu12==24.12.*" "dask-cudf-cu12==24.12.*" "cuml-cu12==24.12.*" \
    "cugraph-cu12==24.12.*" "nx-cugraph-cu12==24.12.*" "cuspatial-cu12==24.12.*" \
    "cuproj-cu12==24.12.*" "cuxfilter-cu12==24.12.*" "cucim-cu12==24.12.*" \
    "pylibraft-cu12==24.12.*" "raft-dask-cu12==24.12.*" "cuvs-cu12==24.12.*"

In [None]:
import os
import re
import random

import numpy as np
import pandas as pd
import seaborn as sns
import tqdm.notebook as tqdm
# The notebook calls `tqdm(...)` directly, which needs the progress-bar class;
# the alias on the previous line binds the *module*, which is not callable.
# Keep this import AFTER the alias so the name ends up bound to the class.
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.cluster import (KMeans, DBSCAN, AgglomerativeClustering,
                             MeanShift, Birch, SpectralClustering)
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve

from cuml.svm import SVC as cuSVC
from cuml.ensemble import RandomForestClassifier as cuRFClassifier
from cuml.neighbors import KNeighborsClassifier as cuKNNClassifier


## **Khởi tạo đường dẫn**

In [1]:
# Writable output folder for the CSV listings and fold files.
dataset_dir = '/kaggle/working/dataset'

cropped_file_name = 'cropped_dataset.csv'
cropped_dropdup_file_name = 'dropdup_dataset.csv'

# Class names; a name's position in this list is its CategoryID.
categories = [
    'Others', 'Honda', 'Hyundai', 'KIA', 'Mazda',
    'Mitsubishi', 'Suzuki', 'Toyota', 'VinFast',
]

# --- Original dataset ---
base_dir = '/kaggle/input/cs114-final-project-full-dataset'
dataset_name = 'CarDataset.csv'
file_name_cars = 'CarDataset-1.csv'
file_name_categories = 'CarDataset-2.csv'
extracted_features_file_name = 'extracted_features.npz'

# --- Cropped dataset ---
cropped_base_dir = '/kaggle/input/cs114-cropped-full-dataset/dataset'
cropped_dataset_name = 'cropped_CarDataset.csv'
cropped_file_name_cars = 'cropped_CarDataset-1.csv'
cropped_file_name_categories = 'cropped_CarDataset-2.csv'
cropped_extracted_features_file_name = 'cropped_extracted_features.npz'

# --- Cropped dataset with duplicates dropped (same input folder as "cropped") ---
cropped_dropdup_base_dir = '/kaggle/input/cs114-cropped-full-dataset/dataset'
cropped_dropdup_dataset_name = 'cropped_dropdup_CarDataset.csv'
cropped_dropdup_file_name_cars = 'cropped_dropdup_CarDataset-1.csv'
cropped_dropdup_file_name_categories = 'cropped_dropdup_CarDataset-2.csv'
cropped_dropdup_extracted_features_file_name = 'dropdup_extracted_features.npz'
cropped_dropdup_extracted_features_csv = 'dropdup_extracted_features.csv'

# --- Augmented dataset ---
augmented_base_dir = '/kaggle/input/cs114-augmented-dataset/augmented_images'
augmented_dataset_name = 'augmented_CarDataset.csv'
augmented_file_name_cars = 'augmented_CarDataset-1.csv'
augmented_file_name_categories = 'augmented_CarDataset-2.csv'
augmented_extracted_features_file_name = 'augmented_extracted_features.npz'

# --- Full augmented dataset ---
full_augmented_base_dir = '/kaggle/input/cs114-full-augmented-dataset/full_augmented_dataset'
full_augmented_dataset_name = 'full_augmented_CarDataset.csv'
full_augmented_file_name_cars = 'full_augmented_CarDataset-1.csv'
full_augmented_file_name_categories = 'full_augmented_CarDataset-2.csv'
full_augmented_extracted_features_file_name = 'full_augmented_extracted_features.npz'

def get_indexing(categories):
    """Build the two lookup tables between category names and integer ids.

    Args:
        categories: Iterable of category names; a name's id is its position.

    Returns:
        A ``(name_to_id, id_to_name)`` pair of dicts. ``id_to_name`` is the
        inverse of ``name_to_id``.
    """
    name_to_id = {}
    for position, name in enumerate(categories):
        name_to_id[name] = position
    id_to_name = dict(zip(name_to_id.values(), name_to_id.keys()))
    return name_to_id, id_to_name

# Module-level lookup tables used by the helpers in this notebook:
# `indexing` maps category name -> CategoryID, `invert_indexing` maps it back.
indexing, invert_indexing = get_indexing(categories)

# Number of folds: passed as `num_splits` to split_kfold_datasets and used to
# enumerate the per-fold CSV file names.
num_splits = 5

## **Khai báo một số hàm cần thiết**

In [None]:
def get_dataset(base_dir='./',
                dataset_dir='/',
                categories=['Others', 'Honda', 'Hyundai', 'KIA', 'Mazda', 'Mitsubishi', 'Suzuki', 'Toyota', 'VinFast'],
                save_csv=False,
                file_name='CarDataset.csv',
                ) -> pd.DataFrame:
    """Index the image files found under ``base_dir/<category>/``.

    Only file names of the form ``<ids>.<category>.<number>.<ext>`` are
    accepted, where ``<ids>`` is one or more 8-digit student ids joined by
    ``-``, ``<category>`` is one of ``categories`` and ``<ext>`` is ``jpg``,
    ``jpeg`` or ``png``.

    Args:
        base_dir: Folder that contains one sub-folder per category.
        dataset_dir: Output folder; created if it does not exist.
        categories: Category names to scan (also the accepted names inside
            file names).
        save_csv: When true, write the listing to ``dataset_dir/file_name``.
        file_name: Name of the CSV file written when ``save_csv`` is true.

    Returns:
        DataFrame with columns ``ImageFullPath`` (``<category>/<filename>``,
        relative to ``base_dir``) and ``CategoryID`` (looked up in the
        module-level ``indexing`` table).
    """
    os.makedirs(dataset_dir, exist_ok=True)  # make sure the output folder exists

    student_ids_pattern = r'(\d{8}(?:-\d{8})*)'  # valid student ids (8 digits each)
    # Escape the names so a category containing regex metacharacters still
    # matches literally.
    categories_pattern = '|'.join(re.escape(category) for category in categories)
    file_extension_pattern = r'\.\d+\.(jpg|jpeg|png)$'  # only .jpg/.jpeg/.png
    accepted_filename = re.compile(
        fr'{student_ids_pattern}\.({categories_pattern}){file_extension_pattern}')

    path_list = []  # relative image paths
    categoryid_list = []  # category id of each image

    for category in tqdm(categories, desc="Processing categories"):
        category_path = os.path.join(base_dir, category)
        if not os.path.isdir(category_path):  # skip categories without a folder
            continue
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore the seeded K-fold assignment) reproducible.
        for filename in sorted(os.listdir(category_path)):
            match = accepted_filename.match(filename)
            if match is None:  # ignore files whose name is not valid
                continue
            # The label comes from the file name; a match guarantees it is one
            # of `categories`.
            # NOTE(review): the stored path uses the category parsed from the
            # file name, not the folder being scanned -- confirm that files
            # always sit in the folder matching their name.
            car_category = match.group(2)
            path_list.append(os.path.join(car_category, filename))
            categoryid_list.append(indexing[car_category])

    df = pd.DataFrame({
        'ImageFullPath': path_list,
        'CategoryID': categoryid_list
    })

    if save_csv:
        output_file = os.path.join(dataset_dir, file_name)
        df.to_csv(output_file, index=False)
        print(f"{file_name} saved to {output_file}")

    return df
def augmented_get_dataset(base_dir='./',
                dataset_dir='/',
                categories=['Others', 'Honda', 'Hyundai', 'KIA', 'Mazda', 'Mitsubishi', 'Suzuki', 'Toyota', 'VinFast'],
                save_csv=False,
                file_name='augmented_CarDataset.csv',
                ) -> pd.DataFrame:
    """Index augmented image files found under ``base_dir/<category>/``.

    Accepted file names look like ``20520918.Mitsubishi.10_augmented_1.jpg``:
    ``<ids>.<category>.<tag>.<ext>`` where ``<ids>`` is one or more 8-digit
    student ids joined by ``-``, ``<tag>`` is made of word characters or ``-``
    and ``<ext>`` is ``jpg``, ``jpeg`` or ``png``.

    Args:
        base_dir: Folder that contains one sub-folder per category.
        dataset_dir: Output folder; created if it does not exist.
        categories: Category names to scan (also the accepted names inside
            file names).
        save_csv: When true, write the listing to ``dataset_dir/file_name``.
        file_name: Name of the CSV file written when ``save_csv`` is true.

    Returns:
        DataFrame with columns ``ImageFullPath`` (``<folder>/<filename>``,
        relative to ``base_dir``) and ``CategoryID`` (category parsed from the
        file name, looked up in the module-level ``indexing`` table).
    """
    os.makedirs(dataset_dir, exist_ok=True)

    student_ids_pattern = r'(\d{8}(?:-\d{8})*)'
    # Escape the names so a category containing regex metacharacters still
    # matches literally.
    categories_pattern = '|'.join(re.escape(category) for category in categories)
    file_extension_pattern = r'\.(jpg|jpeg|png)$'
    accepted_filename = re.compile(
        fr'{student_ids_pattern}\.({categories_pattern})\.[\w-]+{file_extension_pattern}')

    path_list = []
    categoryid_list = []

    for category in tqdm(categories, desc="Processing categories"):
        category_path = os.path.join(base_dir, category)
        if not os.path.isdir(category_path):
            continue
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore the seeded K-fold assignment) reproducible.
        for filename in sorted(os.listdir(category_path)):
            match = accepted_filename.match(filename)
            if match is None:
                continue
            # A match guarantees the parsed name is one of `categories`.
            car_category = match.group(2)
            path_list.append(os.path.join(category, filename))  # relative to base_dir
            categoryid_list.append(indexing[car_category])

    df = pd.DataFrame({
        'ImageFullPath': path_list,
        'CategoryID': categoryid_list
    })

    if save_csv:
        output_file = os.path.join(dataset_dir, file_name)
        df.to_csv(output_file, index=False)
        print(f"{file_name} saved to {output_file}")

    return df

In [None]:
def split_kfold_datasets(data=None,
                        num_splits=5,
                        base_dir='./',
                        dataset_dir='./',
                        save_csv=False,
                        file_name='CarDataset.csv',
                        prefix=None,
                        random_state=42,
                        ):
    """Split an image listing into shuffled K-fold train/test DataFrames.

    Args:
        data: DataFrame with ``ImageFullPath`` and ``CategoryID`` columns.
            When ``None`` the listing is read from ``dataset_dir/file_name``.
        num_splits: Number of folds.
        base_dir: Unused; kept so existing callers keep working.
        dataset_dir: Folder the listing is read from and fold CSVs are written
            to; created if it does not exist.
        save_csv: When true, write every fold to
            ``[<prefix>_]CarDataset-Splits-<k>-Train.csv`` / ``-Test.csv``.
        file_name: CSV listing to read when ``data`` is ``None``.
        prefix: Optional prefix for the fold file names.
        random_state: Seed for the shuffled ``KFold``.

    Returns:
        ``(train_splits, test_splits)``: two lists with one DataFrame per fold.
    """
    os.makedirs(dataset_dir, exist_ok=True)  # make sure the output folder exists

    # `data == None` on a DataFrame compares element-wise and cannot be used
    # as a condition (it raises ValueError), so test identity instead.
    if data is None:
        data = pd.read_csv(os.path.join(dataset_dir, file_name))

    path_list = data['ImageFullPath'].values  # image paths
    categoryid_list = data['CategoryID'].values  # matching category ids

    name_stem = 'CarDataset-Splits' if prefix is None else f'{prefix}_CarDataset-Splits'

    train_splits = []  # one training DataFrame per fold
    test_splits = []  # one test DataFrame per fold

    kfold = KFold(n_splits=num_splits, shuffle=True, random_state=random_state)

    for fold, (train_index, test_index) in enumerate(kfold.split(path_list), start=1):
        train_data = pd.DataFrame({
            'ImageFullPath': path_list[train_index],
            'CategoryID': categoryid_list[train_index]
        })
        test_data = pd.DataFrame({
            'ImageFullPath': path_list[test_index],
            'CategoryID': categoryid_list[test_index]
        })

        if save_csv:
            train_file_path = os.path.join(dataset_dir, f'{name_stem}-{fold}-Train.csv')
            test_file_path = os.path.join(dataset_dir, f'{name_stem}-{fold}-Test.csv')
            train_data.to_csv(train_file_path, index=False)
            test_data.to_csv(test_file_path, index=False)

            print(f"Train fold {fold} saved to {train_file_path}")
            print(f"Test fold {fold} saved to {test_file_path}")

        train_splits.append(train_data)
        test_splits.append(test_data)

    return train_splits, test_splits
def display_splits(train_splits, test_splits, max_files=5):
    """Print a short preview of the image paths in every train/test fold.

    Args:
        train_splits: Per-fold training DataFrames with an ``ImageFullPath``
            column.
        test_splits: Per-fold test DataFrames, paired with ``train_splits``.
        max_files: Number of paths printed per listing; the remainder is
            summarised as a count.
    """
    def _preview(heading, frame):
        # Print the heading, the first `max_files` paths, then how many were left out.
        print(heading)
        paths = frame['ImageFullPath'].tolist()
        for path in paths[:max_files]:
            print(path)
        hidden = len(paths) - max_files
        if hidden > 0:
            print(f"... and {hidden} more")

    for number, (train_data, test_data) in enumerate(zip(train_splits, test_splits), start=1):
        print(f"\n=================== Split {number} ===================")
        _preview("Train Files:", train_data)
        _preview("\nTest Files:", test_data)

def display_images(csv_file='CarDataset-Splits-1-Train.csv',
                   base_dir='./',
                   dataset_dir='./',
                   num_imgs_per_row=10,
                   img_height=150,
                   img_width=150,):
    """Show a grid of randomly chosen sample images, one row per category.

    Args:
        csv_file: CSV in ``dataset_dir`` with ``ImageFullPath`` and
            ``CategoryID`` columns.
        base_dir: Folder the ``ImageFullPath`` values are relative to.
        dataset_dir: Folder that contains ``csv_file``.
        num_imgs_per_row: Maximum number of images drawn per category.
        img_height: Height each image is resized to, in pixels.
        img_width: Width each image is resized to, in pixels.
    """
    df = pd.read_csv(os.path.join(dataset_dir, csv_file))

    # Category ids present in this file; each one gets its own row.
    categories = df['CategoryID'].unique()

    # Matplotlib expects figsize=(width, height): the width follows the number
    # of images per row and the height the number of category rows.
    fig_height = len(categories) * (img_height / 100)
    fig_width = num_imgs_per_row * (img_width / 100)
    plt.figure(figsize=(fig_width, fig_height))

    num_cols = num_imgs_per_row + 1  # first column holds the category label

    for i, category in enumerate(tqdm(categories, desc="Displaying images")):
        category_imgs = df[df['CategoryID'] == category]['ImageFullPath'].tolist()

        # Random sample, capped at the number of images available.
        selected_imgs = random.sample(category_imgs, min(len(category_imgs), num_imgs_per_row))

        # Label cell at the start of the row.
        ax = plt.subplot(len(categories), num_cols, i * num_cols + 1)
        ax.text(0.5, 0.5, invert_indexing[category],
                ha='center', va='center', fontsize=12, fontweight='bold')
        ax.axis("off")

        for j, img_path in enumerate(selected_imgs):
            ax = plt.subplot(len(categories), num_cols, i * num_cols + j + 2)
            try:
                # NOTE(review): `Image` is presumably PIL.Image, but no import
                # of it is visible in this notebook; without it every image
                # falls into the error branch below -- confirm and import it.
                img = Image.open(os.path.join(base_dir, img_path))
                img = img.resize((img_width, img_height))
                ax.imshow(img)
            except Exception as e:
                # Best effort: a broken image leaves an empty cell.
                print(f"Error loading image {img_path}: {e}")
            ax.axis("off")

    plt.subplots_adjust(wspace=0.2, hspace=0.4)
    plt.tight_layout()
    plt.show()
def plot_class_distributions(splits, mode='train'):
    """Draw one bar chart of per-class image counts for every fold.

    Args:
        splits: List of per-fold DataFrames with a ``CategoryID`` column.
        mode: ``'train'`` labels the figure as training splits; any other
            value labels it as testing splits.
    """
    num_splits = len(splits)
    is_train = mode == 'train'

    # squeeze=False keeps `axes` two-dimensional, so a single split (where
    # plt.subplots would otherwise return a bare Axes) can still be iterated.
    fig, axes = plt.subplots(1, num_splits, figsize=(5 * num_splits, 5),
                             sharey=True, squeeze=False)
    title = f"Class Distributions Across {'Training' if is_train else 'Testing'} Splits"
    fig.suptitle(title)

    for i, (ax, split) in enumerate(zip(axes[0], splits)):
        class_counts = split['CategoryID'].value_counts().sort_index()
        class_counts.plot(kind='bar', ax=ax)
        ax.set_title(f'Car-Splits-{i + 1}-Train' if is_train else f'Car-Splits-{i + 1}-Test')
        ax.set_xlabel('CategoryID')
        if i == 0:  # the y axis is shared, so label it once
            ax.set_ylabel('Số lượng ảnh')

    plt.tight_layout()
    plt.show()

In [None]:
def load_features(file_path):
    """Load the extracted-feature records stored in an ``.npz`` archive.

    The archive must contain an ``extracted_features`` entry whose items are
    mappings with the keys ``ImageFullPath``, ``CategoryID`` and
    ``Extracted Features``.

    Args:
        file_path: Path of the ``.npz`` file.

    Returns:
        A list of dicts with exactly those three keys, or ``None`` when the
        file cannot be read (the error is printed).
    """
    try:
        # NOTE(security): allow_pickle=True unpickles arbitrary Python objects;
        # only load feature files from a trusted source.
        # The `with` block closes the archive's file handle once the array has
        # been read into memory.
        with np.load(file_path, allow_pickle=True) as data:
            extracted_features = data['extracted_features']

        formatted_features = [
            {
                'ImageFullPath': item['ImageFullPath'],
                'CategoryID': item['CategoryID'],
                'Extracted Features': item['Extracted Features'],
            }
            for item in extracted_features
        ]

        print(f"Loaded extracted features from {file_path}")
        return formatted_features
    except Exception as e:
        # Deliberately best effort: report the problem and let the caller
        # decide what to do with a missing feature set.
        print(f"Error loading features from {file_path}: {e}")
        return None
    
def load_split_with_features(csv_file,
                             features_df):
    """Read one fold CSV and attach the extracted features of every image.

    Args:
        csv_file: Path of a fold CSV with ``ImageFullPath`` and ``CategoryID``.
        features_df: DataFrame with ``ImageFullPath`` and
            ``Extracted Features`` columns.

    Returns:
        DataFrame with the columns ``ImageFullPath``, ``CategoryID`` and
        ``Extracted Features`` for the images present in both inputs.
    """
    wanted_columns = ['ImageFullPath', 'CategoryID', 'Extracted Features']
    fold_listing = pd.read_csv(csv_file)
    return fold_listing.merge(features_df, on='ImageFullPath')[wanted_columns]

## **Load các đặc trưng đã được trích xuất**

In [None]:
# Features of the original dataset, tabulated with one row per record
# (tqdm only reports progress while the records are collected).
extracted_features = load_features('/kaggle/input/cs114-extracted-features/fulldata_extracted_features.npz')
df_extracted_features = pd.DataFrame(list(tqdm(extracted_features)))

In [None]:
# Features of the cropped dataset, tabulated with one row per record.
cropped_extracted_features = load_features('/kaggle/input/cs114-cropped-full-dataset/dataset/cropped_extracted_features.npz')
df_cropped_extracted_features = pd.DataFrame(list(tqdm(cropped_extracted_features)))

In [None]:
# Features of the cropped, de-duplicated dataset, tabulated with one row per record.
cropped_dropdup_extracted_features = load_features('/kaggle/input/cs114-extracted-features/dropdup_extracted_features.npz')
df_cropped_dropdup_extracted_features = pd.DataFrame(list(tqdm(cropped_dropdup_extracted_features)))

In [None]:
# Features of the augmented dataset, tabulated with one row per record.
augmented_extracted_features = load_features('/kaggle/input/cs114-augmented-dataset/augmented_images/augmented_extracted_features.npz')
df_augmented_extracted_features = pd.DataFrame(list(tqdm(augmented_extracted_features)))

In [None]:
# Features of the full augmented dataset, tabulated with one row per record.
full_augmented_extracted_features = load_features('/kaggle/input/cs114-full-augmented-dataset/full_augmented_extracted_features.npz')
df_full_augmented_extracted_features = pd.DataFrame(list(tqdm(full_augmented_extracted_features)))

In [None]:
# NOTE(review): this cell re-declares augmented_get_dataset, which is already
# defined in an earlier cell; being the later definition, it is the one in
# effect for the calls below.
def augmented_get_dataset(base_dir='./',
                dataset_dir='/',
                categories=['Others', 'Honda', 'Hyundai', 'KIA', 'Mazda', 'Mitsubishi', 'Suzuki', 'Toyota', 'VinFast'],
                save_csv=False,
                file_name='augmented_CarDataset.csv',
                ) -> pd.DataFrame:
    """Index augmented image files found under ``base_dir/<category>/``.

    Accepted file names look like ``20520918.Mitsubishi.10_augmented_1.jpg``:
    ``<ids>.<category>.<tag>.<ext>`` where ``<ids>`` is one or more 8-digit
    student ids joined by ``-``, ``<tag>`` is made of word characters or ``-``
    and ``<ext>`` is ``jpg``, ``jpeg`` or ``png``.

    Args:
        base_dir: Folder that contains one sub-folder per category.
        dataset_dir: Output folder; created if it does not exist.
        categories: Category names to scan (also the accepted names inside
            file names).
        save_csv: When true, write the listing to ``dataset_dir/file_name``.
        file_name: Name of the CSV file written when ``save_csv`` is true.

    Returns:
        DataFrame with columns ``ImageFullPath`` (``<folder>/<filename>``,
        relative to ``base_dir``) and ``CategoryID`` (category parsed from the
        file name, looked up in the module-level ``indexing`` table).
    """
    os.makedirs(dataset_dir, exist_ok=True)

    student_ids_pattern = r'(\d{8}(?:-\d{8})*)'
    # Escape the names so a category containing regex metacharacters still
    # matches literally.
    categories_pattern = '|'.join(re.escape(category) for category in categories)
    file_extension_pattern = r'\.(jpg|jpeg|png)$'
    accepted_filename = re.compile(
        fr'{student_ids_pattern}\.({categories_pattern})\.[\w-]+{file_extension_pattern}')

    path_list = []
    categoryid_list = []

    for category in tqdm(categories, desc="Processing categories"):
        category_path = os.path.join(base_dir, category)
        if not os.path.isdir(category_path):
            continue
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore the seeded K-fold assignment) reproducible.
        for filename in sorted(os.listdir(category_path)):
            match = accepted_filename.match(filename)
            if match is None:
                continue
            # A match guarantees the parsed name is one of `categories`.
            car_category = match.group(2)
            path_list.append(os.path.join(category, filename))  # relative to base_dir
            categoryid_list.append(indexing[car_category])

    df = pd.DataFrame({
        'ImageFullPath': path_list,
        'CategoryID': categoryid_list
    })

    if save_csv:
        output_file = os.path.join(dataset_dir, file_name)
        df.to_csv(output_file, index=False)
        print(f"{file_name} saved to {output_file}")

    return df

## **Tạo các Dataframe để tiến hành Split dữ liệu**

In [None]:
# Build (and save as CSV) the image listing of every dataset variant.
# NOTE(review): the "original" listing below is read from `cropped_base_dir`,
# not from `base_dir` -- confirm this is intended.
data = get_dataset(
    base_dir=cropped_base_dir,
    dataset_dir=dataset_dir,
    file_name=dataset_name,
    save_csv=True,
)
cropped_data = get_dataset(
    base_dir=cropped_base_dir,
    dataset_dir=dataset_dir,
    file_name=cropped_dataset_name,
    save_csv=True,
)
cropped_dropdup_data = get_dataset(
    base_dir=cropped_base_dir,
    dataset_dir=dataset_dir,
    file_name=cropped_dropdup_dataset_name,
    save_csv=True,
)
augmented_data = augmented_get_dataset(
    base_dir=augmented_base_dir,
    dataset_dir=dataset_dir,
    file_name=augmented_dataset_name,
    save_csv=True,
)
full_augmented_data = augmented_get_dataset(
    base_dir=full_augmented_base_dir,
    dataset_dir=dataset_dir,
    file_name=full_augmented_dataset_name,
    save_csv=True,
)

In [None]:
# Split every saved listing into `num_splits` folds with the same seed and
# write the per-fold CSV files (prefixed with the dataset variant's name).
train_splits, test_splits = split_kfold_datasets(
    num_splits=num_splits,
    dataset_dir=dataset_dir,
    file_name=dataset_name,
    save_csv=True,
    random_state=42,
)
cropped_train_splits, cropped_test_splits = split_kfold_datasets(
    num_splits=num_splits,
    dataset_dir=dataset_dir,
    file_name=cropped_dataset_name,
    save_csv=True,
    prefix='cropped',
    random_state=42,
)
cropped_dropdup_train_splits, cropped_dropdup_test_splits = split_kfold_datasets(
    num_splits=num_splits,
    dataset_dir=dataset_dir,
    file_name=cropped_dropdup_dataset_name,
    save_csv=True,
    prefix='cropped_dropdup',
    random_state=42,
)
augmented_train_splits, augmented_test_splits = split_kfold_datasets(
    num_splits=num_splits,
    dataset_dir=dataset_dir,
    file_name=augmented_dataset_name,
    prefix='augmented',
    save_csv=True,
    random_state=42,
)
full_augmented_train_splits, full_augmented_test_splits = split_kfold_datasets(
    num_splits=num_splits,
    dataset_dir=dataset_dir,
    file_name=full_augmented_dataset_name,
    prefix='full_augmented',
    save_csv=True,
    random_state=42,
)

In [None]:
# Preview the first image paths of every fold of the original dataset.
display_splits(train_splits, test_splits)

In [None]:
# Per-class image counts in every train / test fold of the original dataset.
plot_class_distributions(train_splits, mode='train')
plot_class_distributions(test_splits, mode='test')

In [None]:
# Preview the first image paths of every fold of the cropped dataset.
display_splits(cropped_train_splits, cropped_test_splits)

In [None]:
# Per-class image counts in every train / test fold of the cropped dataset.
plot_class_distributions(cropped_train_splits, mode='train')
plot_class_distributions(cropped_test_splits, mode='test')

In [None]:
# Preview the first image paths of every fold of the cropped, de-duplicated dataset.
display_splits(cropped_dropdup_train_splits, cropped_dropdup_test_splits)

In [None]:
# Per-class image counts in every train / test fold of the cropped, de-duplicated dataset.
plot_class_distributions(cropped_dropdup_train_splits, mode='train')
plot_class_distributions(cropped_dropdup_test_splits, mode='test')

In [None]:
# Preview the first image paths of every fold of the augmented dataset.
display_splits(augmented_train_splits, augmented_test_splits)

In [None]:
# Per-class image counts in every train / test fold of the augmented dataset.
plot_class_distributions(augmented_train_splits, mode='train')
plot_class_distributions(augmented_test_splits, mode='test')

In [None]:
# Preview the first image paths of every fold of the full augmented dataset.
display_splits(full_augmented_train_splits, full_augmented_test_splits)

In [None]:
# Per-class image counts in every train / test fold of the full augmented dataset.
plot_class_distributions(full_augmented_train_splits, mode='train')
plot_class_distributions(full_augmented_test_splits, mode='test')

## **Đưa dữ liệu từ các splits vào các biến**

In [None]:
def load_and_prepare_data(train_files, test_files, features_df):
    """Load every fold CSV and attach the extracted features to its rows.

    Args:
        train_files: Paths of the per-fold training CSV files.
        test_files: Paths of the per-fold test CSV files.
        features_df: DataFrame with ``ImageFullPath`` and
            ``Extracted Features`` columns.

    Returns:
        ``(train_splits, test_splits)``: lists of DataFrames (one per input
        file, in order) with the columns ``ImageFullPath``, ``CategoryID`` and
        ``Extracted Features``.
    """
    wanted_columns = ['ImageFullPath', 'CategoryID', 'Extracted Features']

    def _attach_features(csv_path):
        # Keep only the images that appear in both the fold file and the feature table.
        fold_listing = pd.read_csv(csv_path)
        return fold_listing.merge(features_df, on='ImageFullPath')[wanted_columns]

    train_frames = [_attach_features(path) for path in train_files]
    test_frames = [_attach_features(path) for path in test_files]
    return train_frames, test_frames

In [None]:
# The fold CSVs already carry CategoryID, so drop it from the feature tables
# before they are merged on ImageFullPath.
df_features = df_extracted_features.drop(columns='CategoryID')
df_cropped_features = df_cropped_extracted_features.drop(columns='CategoryID')
df_cropped_dropdup_features = df_cropped_dropdup_extracted_features.drop(columns='CategoryID')
df_augmented_features = df_augmented_extracted_features.drop(columns='CategoryID')
df_full_augmented_features = df_full_augmented_extracted_features.drop(columns='CategoryID')

In [None]:
# For every dataset variant: list the per-fold CSV files written earlier and
# attach the matching feature table to each fold.

# Original Dataset
train_files = [f'{dataset_dir}/CarDataset-Splits-{i + 1}-Train.csv' for i in tqdm(range(num_splits), desc="Generating Original Train File")]
test_files = [f'{dataset_dir}/CarDataset-Splits-{i + 1}-Test.csv' for i in tqdm(range(num_splits), desc="Generating Original Test File")]
train_splits_with_features, test_splits_with_features = load_and_prepare_data(train_files, test_files, df_features)

# Cropped Dataset
cropped_train_files = [f'{dataset_dir}/cropped_CarDataset-Splits-{i + 1}-Train.csv' for i in tqdm(range(num_splits), desc="Generating Cropped Train File")]
cropped_test_files = [f'{dataset_dir}/cropped_CarDataset-Splits-{i + 1}-Test.csv' for i in tqdm(range(num_splits), desc="Generating Cropped Test File")]
cropped_train_splits_with_features, cropped_test_splits_with_features = load_and_prepare_data(cropped_train_files, cropped_test_files, df_cropped_features)

# Cropped and Deduplicated Dataset
cropped_dropdup_train_files = [f'{dataset_dir}/cropped_dropdup_CarDataset-Splits-{i + 1}-Train.csv' for i in tqdm(range(num_splits), desc="Generating Cropped Dropdup Train File")]
cropped_dropdup_test_files = [f'{dataset_dir}/cropped_dropdup_CarDataset-Splits-{i + 1}-Test.csv' for i in tqdm(range(num_splits), desc="Generating Cropped Dropdup Test File")]
cropped_dropdup_train_splits_with_features, cropped_dropdup_test_splits_with_features = load_and_prepare_data(cropped_dropdup_train_files, cropped_dropdup_test_files, df_cropped_dropdup_features)

# Augmented Dataset
augmented_train_files = [f'{dataset_dir}/augmented_CarDataset-Splits-{i + 1}-Train.csv' for i in tqdm(range(num_splits), desc="Generating Augmented Train File")]
augmented_test_files = [f'{dataset_dir}/augmented_CarDataset-Splits-{i + 1}-Test.csv' for i in tqdm(range(num_splits), desc="Generating Augmented Test File")]
augmented_train_splits_with_features, augmented_test_splits_with_features = load_and_prepare_data(augmented_train_files, augmented_test_files, df_augmented_features)

# Full Augmented Dataset
full_augmented_train_files = [f'{dataset_dir}/full_augmented_CarDataset-Splits-{i + 1}-Train.csv' for i in tqdm(range(num_splits), desc="Generating Full Augmented Train File")]
full_augmented_test_files = [f'{dataset_dir}/full_augmented_CarDataset-Splits-{i + 1}-Test.csv' for i in tqdm(range(num_splits), desc="Generating Full Augmented Test File")]
# Use the full-augmented test files and feature table here; the augmented
# ones were previously passed by mistake.
full_augmented_train_splits_with_features, full_augmented_test_splits_with_features = load_and_prepare_data(full_augmented_train_files, full_augmented_test_files, df_full_augmented_features)

In [None]:
# Full Dataset
train_df_1 = train_splits_with_features[0]
train_df_2 = train_splits_with_features[1]
train_df_3 = train_splits_with_features[2]
train_df_4 = train_splits_with_features[3]
train_df_5 = train_splits_with_features[4]

test_df_1 = test_splits_with_features[0]
test_df_2 = test_splits_with_features[1]
test_df_3 = test_splits_with_features[2]
test_df_4 = test_splits_with_features[3]
test_df_5 = test_splits_with_features[4]

# Cropped Dataset
cropped_train_df_1 = cropped_train_splits_with_features[0]
cropped_train_df_2 = cropped_train_splits_with_features[1]
cropped_train_df_3 = cropped_train_splits_with_features[2]
cropped_train_df_4 = cropped_train_splits_with_features[3]
cropped_train_df_5 = cropped_train_splits_with_features[4]

cropped_test_df_1 = cropped_test_splits_with_features[0]
cropped_test_df_2 = cropped_test_splits_with_features[1]
cropped_test_df_3 = cropped_test_splits_with_features[2]
cropped_test_df_4 = cropped_test_splits_with_features[3]
cropped_test_df_5 = cropped_test_splits_with_features[4]

# Cropped + Drop Duplicate Dataset
cropped_dropdup_train_df_1 = cropped_dropdup_train_splits_with_features[0]
cropped_dropdup_train_df_2 = cropped_dropdup_train_splits_with_features[1]
cropped_dropdup_train_df_3 = cropped_dropdup_train_splits_with_features[2]
cropped_dropdup_train_df_4 = cropped_dropdup_train_splits_with_features[3]
cropped_dropdup_train_df_5 = cropped_dropdup_train_splits_with_features[4]

cropped_dropdup_test_df_1 = cropped_dropdup_test_splits_with_features[0]
cropped_dropdup_test_df_2 = cropped_dropdup_test_splits_with_features[1]
cropped_dropdup_test_df_3 = cropped_dropdup_test_splits_with_features[2]
cropped_dropdup_test_df_4 = cropped_dropdup_test_splits_with_features[3]
cropped_dropdup_test_df_5 = cropped_dropdup_test_splits_with_features[4]

# Augmented Dataset
augmented_train_df_1 = augmented_train_splits_with_features[0]
augmented_train_df_2 = augmented_train_splits_with_features[1]
augmented_train_df_3 = augmented_train_splits_with_features[2]
augmented_train_df_4 = augmented_train_splits_with_features[3]
augmented_train_df_5 = augmented_train_splits_with_features[4]

augmented_test_df_1 = augmented_test_splits_with_features[0]
augmented_test_df_2 = augmented_test_splits_with_features[1]
augmented_test_df_3 = augmented_test_splits_with_features[2]
augmented_test_df_4 = augmented_test_splits_with_features[3]
augmented_test_df_5 = augmented_test_splits_with_features[4]

# Full Augmented Dataset
full_augmented_train_df_1 = full_augmented_train_splits_with_features[0]
full_augmented_train_df_2 = full_augmented_train_splits_with_features[1]
full_augmented_train_df_3 = full_augmented_train_splits_with_features[2]
full_augmented_train_df_4 = full_augmented_train_splits_with_features[3]
full_augmented_train_df_5 = full_augmented_train_splits_with_features[4]

full_augmented_test_df_1 = full_augmented_test_splits_with_features[0]
full_augmented_test_df_2 = full_augmented_test_splits_with_features[1]
full_augmented_test_df_3 = full_augmented_test_splits_with_features[2]
full_augmented_test_df_4 = full_augmented_test_splits_with_features[3]
full_augmented_test_df_5 = full_augmented_test_splits_with_features[4]

In [None]:

# Original dataset: turn every fold into NumPy arrays, with X built from the
# per-image feature entries and y from the category ids.
X_train_1, X_train_2, X_train_3, X_train_4, X_train_5 = (
    np.array(fold['Extracted Features'].apply(lambda x: x).tolist())
    for fold in (train_df_1, train_df_2, train_df_3, train_df_4, train_df_5)
)

X_test_1, X_test_2, X_test_3, X_test_4, X_test_5 = (
    np.array(fold['Extracted Features'].apply(lambda x: x).tolist())
    for fold in (test_df_1, test_df_2, test_df_3, test_df_4, test_df_5)
)

y_train_1, y_train_2, y_train_3, y_train_4, y_train_5 = (
    np.array(fold['CategoryID'].tolist())
    for fold in (train_df_1, train_df_2, train_df_3, train_df_4, train_df_5)
)

y_test_1, y_test_2, y_test_3, y_test_4, y_test_5 = (
    np.array(fold['CategoryID'].tolist())
    for fold in (test_df_1, test_df_2, test_df_3, test_df_4, test_df_5)
)


In [None]:

# Cropped dataset: turn every fold into NumPy arrays, with X built from the
# per-image feature entries and y from the category ids.
(cropped_X_train_1, cropped_X_train_2, cropped_X_train_3,
 cropped_X_train_4, cropped_X_train_5) = (
    np.array(fold['Extracted Features'].apply(lambda x: x).tolist())
    for fold in (cropped_train_df_1, cropped_train_df_2, cropped_train_df_3,
                 cropped_train_df_4, cropped_train_df_5)
)

(cropped_X_test_1, cropped_X_test_2, cropped_X_test_3,
 cropped_X_test_4, cropped_X_test_5) = (
    np.array(fold['Extracted Features'].apply(lambda x: x).tolist())
    for fold in (cropped_test_df_1, cropped_test_df_2, cropped_test_df_3,
                 cropped_test_df_4, cropped_test_df_5)
)

(cropped_y_train_1, cropped_y_train_2, cropped_y_train_3,
 cropped_y_train_4, cropped_y_train_5) = (
    np.array(fold['CategoryID'].tolist())
    for fold in (cropped_train_df_1, cropped_train_df_2, cropped_train_df_3,
                 cropped_train_df_4, cropped_train_df_5)
)

(cropped_y_test_1, cropped_y_test_2, cropped_y_test_3,
 cropped_y_test_4, cropped_y_test_5) = (
    np.array(fold['CategoryID'].tolist())
    for fold in (cropped_test_df_1, cropped_test_df_2, cropped_test_df_3,
                 cropped_test_df_4, cropped_test_df_5)
)


In [None]:
# Cropped + de-duplicated dataset: turn every fold into NumPy arrays, with X
# built from the per-image feature entries and y from the category ids.
(cropped_dropdup_X_train_1, cropped_dropdup_X_train_2, cropped_dropdup_X_train_3,
 cropped_dropdup_X_train_4, cropped_dropdup_X_train_5) = (
    np.array(fold['Extracted Features'].apply(lambda x: x).tolist())
    for fold in (cropped_dropdup_train_df_1, cropped_dropdup_train_df_2,
                 cropped_dropdup_train_df_3, cropped_dropdup_train_df_4,
                 cropped_dropdup_train_df_5)
)

(cropped_dropdup_X_test_1, cropped_dropdup_X_test_2, cropped_dropdup_X_test_3,
 cropped_dropdup_X_test_4, cropped_dropdup_X_test_5) = (
    np.array(fold['Extracted Features'].apply(lambda x: x).tolist())
    for fold in (cropped_dropdup_test_df_1, cropped_dropdup_test_df_2,
                 cropped_dropdup_test_df_3, cropped_dropdup_test_df_4,
                 cropped_dropdup_test_df_5)
)

(cropped_dropdup_y_train_1, cropped_dropdup_y_train_2, cropped_dropdup_y_train_3,
 cropped_dropdup_y_train_4, cropped_dropdup_y_train_5) = (
    np.array(fold['CategoryID'].tolist())
    for fold in (cropped_dropdup_train_df_1, cropped_dropdup_train_df_2,
                 cropped_dropdup_train_df_3, cropped_dropdup_train_df_4,
                 cropped_dropdup_train_df_5)
)

(cropped_dropdup_y_test_1, cropped_dropdup_y_test_2, cropped_dropdup_y_test_3,
 cropped_dropdup_y_test_4, cropped_dropdup_y_test_5) = (
    np.array(fold['CategoryID'].tolist())
    for fold in (cropped_dropdup_test_df_1, cropped_dropdup_test_df_2,
                 cropped_dropdup_test_df_3, cropped_dropdup_test_df_4,
                 cropped_dropdup_test_df_5)
)


In [None]:
# Augmented dataset: for each of the five splits, turn the 'Extracted Features'
# column into an array (X) and the 'CategoryID' column into an array (y).
# The column values are converted directly with tolist().
(augmented_X_train_1, augmented_X_train_2, augmented_X_train_3,
 augmented_X_train_4, augmented_X_train_5) = (
    np.array(df['Extracted Features'].tolist())
    for df in (augmented_train_df_1, augmented_train_df_2, augmented_train_df_3,
               augmented_train_df_4, augmented_train_df_5)
)

(augmented_X_test_1, augmented_X_test_2, augmented_X_test_3,
 augmented_X_test_4, augmented_X_test_5) = (
    np.array(df['Extracted Features'].tolist())
    for df in (augmented_test_df_1, augmented_test_df_2, augmented_test_df_3,
               augmented_test_df_4, augmented_test_df_5)
)

(augmented_y_train_1, augmented_y_train_2, augmented_y_train_3,
 augmented_y_train_4, augmented_y_train_5) = (
    np.array(df['CategoryID'].tolist())
    for df in (augmented_train_df_1, augmented_train_df_2, augmented_train_df_3,
               augmented_train_df_4, augmented_train_df_5)
)

(augmented_y_test_1, augmented_y_test_2, augmented_y_test_3,
 augmented_y_test_4, augmented_y_test_5) = (
    np.array(df['CategoryID'].tolist())
    for df in (augmented_test_df_1, augmented_test_df_2, augmented_test_df_3,
               augmented_test_df_4, augmented_test_df_5)
)

In [None]:
# Fully augmented dataset: for each of the five splits, turn the
# 'Extracted Features' column into an array (X) and the 'CategoryID' column
# into an array (y). The column values are converted directly with tolist().
(full_augmented_X_train_1, full_augmented_X_train_2, full_augmented_X_train_3,
 full_augmented_X_train_4, full_augmented_X_train_5) = (
    np.array(df['Extracted Features'].tolist())
    for df in (full_augmented_train_df_1, full_augmented_train_df_2,
               full_augmented_train_df_3, full_augmented_train_df_4,
               full_augmented_train_df_5)
)

(full_augmented_X_test_1, full_augmented_X_test_2, full_augmented_X_test_3,
 full_augmented_X_test_4, full_augmented_X_test_5) = (
    np.array(df['Extracted Features'].tolist())
    for df in (full_augmented_test_df_1, full_augmented_test_df_2,
               full_augmented_test_df_3, full_augmented_test_df_4,
               full_augmented_test_df_5)
)

(full_augmented_y_train_1, full_augmented_y_train_2, full_augmented_y_train_3,
 full_augmented_y_train_4, full_augmented_y_train_5) = (
    np.array(df['CategoryID'].tolist())
    for df in (full_augmented_train_df_1, full_augmented_train_df_2,
               full_augmented_train_df_3, full_augmented_train_df_4,
               full_augmented_train_df_5)
)

(full_augmented_y_test_1, full_augmented_y_test_2, full_augmented_y_test_3,
 full_augmented_y_test_4, full_augmented_y_test_5) = (
    np.array(df['CategoryID'].tolist())
    for df in (full_augmented_test_df_1, full_augmented_test_df_2,
               full_augmented_test_df_3, full_augmented_test_df_4,
               full_augmented_test_df_5)
)


## **Tiến hành huấn luyện và đánh giá**

### GPU
Chỉ hoạt động với T4 GPU

In [None]:
# Every list below holds one (X_train, y_train, X_test, y_test) tuple per
# numbered split, in split order 1..5. zip() pairs up the i-th entry of each
# row, so the result is the same list of 4-tuples as writing them out by hand.
datasets = list(zip(
    (X_train_1, X_train_2, X_train_3, X_train_4, X_train_5),
    (y_train_1, y_train_2, y_train_3, y_train_4, y_train_5),
    (X_test_1, X_test_2, X_test_3, X_test_4, X_test_5),
    (y_test_1, y_test_2, y_test_3, y_test_4, y_test_5),
))

cropped_datasets = list(zip(
    (cropped_X_train_1, cropped_X_train_2, cropped_X_train_3,
     cropped_X_train_4, cropped_X_train_5),
    (cropped_y_train_1, cropped_y_train_2, cropped_y_train_3,
     cropped_y_train_4, cropped_y_train_5),
    (cropped_X_test_1, cropped_X_test_2, cropped_X_test_3,
     cropped_X_test_4, cropped_X_test_5),
    (cropped_y_test_1, cropped_y_test_2, cropped_y_test_3,
     cropped_y_test_4, cropped_y_test_5),
))

cropped_dropdup_datasets = list(zip(
    (cropped_dropdup_X_train_1, cropped_dropdup_X_train_2, cropped_dropdup_X_train_3,
     cropped_dropdup_X_train_4, cropped_dropdup_X_train_5),
    (cropped_dropdup_y_train_1, cropped_dropdup_y_train_2, cropped_dropdup_y_train_3,
     cropped_dropdup_y_train_4, cropped_dropdup_y_train_5),
    (cropped_dropdup_X_test_1, cropped_dropdup_X_test_2, cropped_dropdup_X_test_3,
     cropped_dropdup_X_test_4, cropped_dropdup_X_test_5),
    (cropped_dropdup_y_test_1, cropped_dropdup_y_test_2, cropped_dropdup_y_test_3,
     cropped_dropdup_y_test_4, cropped_dropdup_y_test_5),
))

augmented_datasets = list(zip(
    (augmented_X_train_1, augmented_X_train_2, augmented_X_train_3,
     augmented_X_train_4, augmented_X_train_5),
    (augmented_y_train_1, augmented_y_train_2, augmented_y_train_3,
     augmented_y_train_4, augmented_y_train_5),
    (augmented_X_test_1, augmented_X_test_2, augmented_X_test_3,
     augmented_X_test_4, augmented_X_test_5),
    (augmented_y_test_1, augmented_y_test_2, augmented_y_test_3,
     augmented_y_test_4, augmented_y_test_5),
))

full_augmented_datasets = list(zip(
    (full_augmented_X_train_1, full_augmented_X_train_2, full_augmented_X_train_3,
     full_augmented_X_train_4, full_augmented_X_train_5),
    (full_augmented_y_train_1, full_augmented_y_train_2, full_augmented_y_train_3,
     full_augmented_y_train_4, full_augmented_y_train_5),
    (full_augmented_X_test_1, full_augmented_X_test_2, full_augmented_X_test_3,
     full_augmented_X_test_4, full_augmented_X_test_5),
    (full_augmented_y_test_1, full_augmented_y_test_2, full_augmented_y_test_3,
     full_augmented_y_test_4, full_augmented_y_test_5),
))

def fitting_datasets(datasets, prefix, model_name):
    """Fit one cuML classifier on every split and report its test accuracy.

    The same estimator object is re-fitted on each split in turn, so the
    returned model reflects the fit on the last split only.

    Args:
        datasets: Iterable of ``(X_train, y_train, X_test, y_test)`` tuples.
        prefix: Dataset label, used only in progress-bar and printed text.
        model_name: ``'SVM'`` or ``'RF'``; any other value selects the KNN
            classifier.

    Returns:
        A ``(model, results, average_accuracy)`` tuple: the estimator as left
        by the final fit, the per-split accuracies in split order, and their
        arithmetic mean.

    Raises:
        ValueError: If ``datasets`` yields no splits.
    """
    # The notebook binds the name `tqdm` to the `tqdm.notebook` *module*, which
    # is not callable. Import the progress-bar class under a local name instead.
    from tqdm.notebook import tqdm as progress_bar

    results = []
    if model_name == 'SVM':
        model = cuSVC(kernel='rbf', random_state=42)
    elif model_name == 'RF':
        model = cuRFClassifier(random_state=42)
    else:
        model = cuKNNClassifier(random_state=42)

    for i, (X_train, y_train, X_test, y_test) in enumerate(
            progress_bar(datasets, desc=f"Training {prefix} models")):
        # Inner bar tracks the two steps of a split: fit, then predict.
        with progress_bar(total=2,
                          desc=f"Fitting {model_name} on {prefix} Dataset {i+1}",
                          leave=False) as pbar:
            model.fit(X_train, y_train)
            pbar.update(1)

            y_pred = model.predict(X_test)
            pbar.update(1)

        accuracy = accuracy_score(y_test, y_pred)
        results.append(accuracy)
        print(f'{model_name} Accuracy for {prefix} dataset {i+1}: {accuracy:.6f}')

    # Fail with a clear message instead of a ZeroDivisionError on the mean.
    if not results:
        raise ValueError(f"No datasets were provided for {prefix} / {model_name}")

    average_accuracy = sum(results) / len(results)
    print(f'Average {model_name} Accuracy for {prefix} datasets: {average_accuracy:.6f}')

    return model, results, average_accuracy

In [None]:
# Train and score the SVM on each dataset variant. Every call returns
# (model from the last split, per-split accuracies, mean accuracy).
svm_original_model, svm_original_results, svm_average_accuracy = fitting_datasets(
    datasets=datasets, prefix='original', model_name='SVM')
svm_cropped_model, svm_cropped_results, svm_cropped_average_accuracy = fitting_datasets(
    datasets=cropped_datasets, prefix='cropped', model_name='SVM')
svm_cropped_dropdup_model, svm_cropped_dropdup_results, svm_cropped_dropdup_average_accuracy = fitting_datasets(
    datasets=cropped_dropdup_datasets, prefix='cropped_dropdup', model_name='SVM')
svm_augmented_model, svm_augmented_results, svm_augmented_average_accuracy = fitting_datasets(
    datasets=augmented_datasets, prefix='augmented', model_name='SVM')
svm_full_augmented_model, svm_full_augmented_results, svm_full_augmented_average_accuracy = fitting_datasets(
    datasets=full_augmented_datasets, prefix='full_augmented', model_name='SVM')

In [None]:
# Train and score the random forest on each dataset variant. Every call
# returns (model from the last split, per-split accuracies, mean accuracy).
rf_original_model, rf_original_results, rf_average_accuracy = fitting_datasets(
    datasets=datasets, prefix='original', model_name='RF')
rf_cropped_model, rf_cropped_results, rf_cropped_average_accuracy = fitting_datasets(
    datasets=cropped_datasets, prefix='cropped', model_name='RF')
rf_cropped_dropdup_model, rf_cropped_dropdup_results, rf_cropped_dropdup_average_accuracy = fitting_datasets(
    datasets=cropped_dropdup_datasets, prefix='cropped_dropdup', model_name='RF')
rf_augmented_model, rf_augmented_results, rf_augmented_average_accuracy = fitting_datasets(
    datasets=augmented_datasets, prefix='augmented', model_name='RF')
rf_full_augmented_model, rf_full_augmented_results, rf_full_augmented_average_accuracy = fitting_datasets(
    datasets=full_augmented_datasets, prefix='full_augmented', model_name='RF')

In [None]:
# Train and score the KNN classifier on each dataset variant. Every call
# returns (model from the last split, per-split accuracies, mean accuracy).
knn_original_model, knn_original_results, knn_average_accuracy = fitting_datasets(
    datasets=datasets, prefix='original', model_name='KNN')
knn_cropped_model, knn_cropped_results, knn_cropped_average_accuracy = fitting_datasets(
    datasets=cropped_datasets, prefix='cropped', model_name='KNN')
knn_cropped_dropdup_model, knn_cropped_dropdup_results, knn_cropped_dropdup_average_accuracy = fitting_datasets(
    datasets=cropped_dropdup_datasets, prefix='cropped_dropdup', model_name='KNN')
knn_augmented_model, knn_augmented_results, knn_augmented_average_accuracy = fitting_datasets(
    datasets=augmented_datasets, prefix='augmented', model_name='KNN')
knn_full_augmented_model, knn_full_augmented_results, knn_full_augmented_average_accuracy = fitting_datasets(
    datasets=full_augmented_datasets, prefix='full_augmented', model_name='KNN')

## **Visualize Kết quả**

In [None]:
def plot_confusion_matrix(y_true,
                          y_pred,
                          labels,
                          title,
                          prefix=None):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        labels: Label values to include; they fix the row/column order and
            are also used as the tick labels on both axes.
        title: Figure title.
        prefix: Optional tag prepended to the title as ``"<prefix>_<title>"``.
            When omitted, ``title`` is used unchanged.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
    # Only decorate the title when a prefix was given; without one the plain
    # title must survive (it previously collapsed to None).
    if prefix is not None:
        title = f'{prefix}_{title}'
    plt.title(title)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()
    

In [None]:
# Predict the last split's test set with each model trained on the original data.
_, _, X_test, y_test = datasets[-1]
svm_y_pred = svm_original_model.predict(X_test)
rf_y_pred = rf_original_model.predict(X_test)
knn_y_pred = knn_original_model.predict(X_test)

# One heatmap per classifier, all over the full range of class indices.
_class_ids = list(range(len(categories)))
for _model_label, _model_pred in (('SVM', svm_y_pred),
                                  ('Random Forest', rf_y_pred),
                                  ('KNN', knn_y_pred)):
    plot_confusion_matrix(y_test,
                          _model_pred,
                          labels=_class_ids,
                          title=f'Confusion Matrix for {_model_label} (Dataset 5)',
                          prefix='full')


In [None]:
# Predict the last split's test set with each model trained on the cropped data.
_, _, cropped_X_test, cropped_y_test = cropped_datasets[-1]
svm_y_pred = svm_cropped_model.predict(cropped_X_test)
rf_y_pred = rf_cropped_model.predict(cropped_X_test)
knn_y_pred = knn_cropped_model.predict(cropped_X_test)

# One heatmap per classifier. All three use every class index, so the KNN
# matrix covers the same classes as the SVM and random-forest ones instead of
# being restricted to labels 0 and 1.
_class_ids = list(range(len(categories)))
for _model_label, _model_pred in (('SVM', svm_y_pred),
                                  ('Random Forest', rf_y_pred),
                                  ('KNN', knn_y_pred)):
    plot_confusion_matrix(cropped_y_test,
                          _model_pred,
                          labels=_class_ids,
                          title=f'Confusion Matrix for {_model_label} (Dataset 5)',
                          prefix='cropped')

In [None]:
# Predict the last split's test set with each model trained on the
# cropped + de-duplicated data.
_, _, cropped_dropdup_X_test, cropped_dropdup_y_test = cropped_dropdup_datasets[-1]
svm_y_pred = svm_cropped_dropdup_model.predict(cropped_dropdup_X_test)
rf_y_pred = rf_cropped_dropdup_model.predict(cropped_dropdup_X_test)
knn_y_pred = knn_cropped_dropdup_model.predict(cropped_dropdup_X_test)

# One heatmap per classifier, all over the full range of class indices.
_class_ids = list(range(len(categories)))
for _model_label, _model_pred in (('SVM', svm_y_pred),
                                  ('Random Forest', rf_y_pred),
                                  ('KNN', knn_y_pred)):
    plot_confusion_matrix(cropped_dropdup_y_test,
                          _model_pred,
                          labels=_class_ids,
                          title=f'Confusion Matrix for {_model_label} (Dataset 5)',
                          prefix='cropped_dropdup')


In [None]:
# Predict the last split's test set with each model trained on the augmented data.
_, _, augmented_X_test, augmented_y_test = augmented_datasets[-1]
svm_y_pred = svm_augmented_model.predict(augmented_X_test)
rf_y_pred = rf_augmented_model.predict(augmented_X_test)
knn_y_pred = knn_augmented_model.predict(augmented_X_test)

# One heatmap per classifier, all over the full range of class indices.
_class_ids = list(range(len(categories)))
for _model_label, _model_pred in (('SVM', svm_y_pred),
                                  ('Random Forest', rf_y_pred),
                                  ('KNN', knn_y_pred)):
    plot_confusion_matrix(augmented_y_test,
                          _model_pred,
                          labels=_class_ids,
                          title=f'Confusion Matrix for {_model_label} (Dataset 5)',
                          prefix='augmented')

In [None]:
# Predict the last split's test set with each model trained on the fully
# augmented data.
_, _, full_augmented_X_test, full_augmented_y_test = full_augmented_datasets[-1]
svm_y_pred = svm_full_augmented_model.predict(full_augmented_X_test)
rf_y_pred = rf_full_augmented_model.predict(full_augmented_X_test)
knn_y_pred = knn_full_augmented_model.predict(full_augmented_X_test)

# One heatmap per classifier, all over the full range of class indices.
_class_ids = list(range(len(categories)))
for _model_label, _model_pred in (('SVM', svm_y_pred),
                                  ('Random Forest', rf_y_pred),
                                  ('KNN', knn_y_pred)):
    plot_confusion_matrix(full_augmented_y_test,
                          _model_pred,
                          labels=_class_ids,
                          title=f'Confusion Matrix for {_model_label} (Dataset 5)',
                          prefix='full_augmented')

In [None]:
def visualize_predictions(base_dir,
                          image_paths,
                          y_true,
                          model,
                          X_test,
                          categories,
                          num_images=5):
    """Show random samples of correctly and incorrectly classified images.

    Args:
        base_dir: Directory joined in front of every entry of ``image_paths``.
        image_paths: Image paths relative to ``base_dir``, indexed by the same
            positions as ``y_true`` and the predictions.
        y_true: Ground-truth class indices, compared element-wise against
            ``model.predict(X_test)``.
        model: Fitted estimator exposing ``predict``.
        X_test: Features passed to ``model.predict``.
        categories: Sequence mapping a class index to its display name.
        num_images: Maximum number of images drawn per figure.
    """
    # NOTE(review): `Image` is expected to be PIL.Image imported elsewhere in
    # the notebook - confirm that import exists.
    y_pred = model.predict(X_test)

    correct_indices = np.where(y_pred == y_true)[0]
    incorrect_indices = np.where(y_pred != y_true)[0]

    def display_images(indices, title):
        # Nothing to sample from: say so rather than showing an empty figure.
        if len(indices) == 0:
            print(f"No images to display for: {title}")
            return

        chosen = random.sample(list(indices), min(num_images, len(indices)))
        plt.figure(figsize=(12, 5))
        plt.suptitle(title, fontsize=16)
        for i, idx in enumerate(chosen):
            plt.subplot(1, num_images, i + 1)
            img_path = os.path.join(base_dir, image_paths[idx])
            # Best-effort loading: only the open/draw step is guarded, so a
            # failure here is genuinely an image problem. The context manager
            # releases the file handle once the pixels have been drawn.
            try:
                with Image.open(img_path) as img:
                    plt.imshow(img)
            except Exception as e:
                print(f"Error loading image {img_path}: {e}")
                continue
            # int() lets predictions returned as floats still index `categories`.
            true_name = categories[int(y_true[idx])]
            pred_name = categories[int(y_pred[idx])]
            plt.title(f"True Label: {true_name}\nPredicted Label: {pred_name}")
            plt.axis("off")
        plt.tight_layout()
        plt.show()

    print("Visualizing Correctly Classified Images:")
    display_images(correct_indices, "Correctly Classified Images")

    print("Visualizing Incorrectly Classified Images:")
    display_images(incorrect_indices, "Incorrectly Classified Images")

In [None]:
# fitting_datasets returns the estimator as left by its fit on the last split,
# so inspect that split's held-out rows (as the confusion-matrix cells do with
# datasets[-1]); split 1's test rows are not guaranteed to be unseen by it.
# NOTE(review): assumes test_df_5 is the frame X_test_5 was built from - confirm.
visualize_predictions(base_dir=base_dir,
                      image_paths=test_df_5['ImageFullPath'].tolist(),
                      y_true=y_test_5,
                      model=svm_original_model,
                      X_test=X_test_5,
                      categories=categories,
                      num_images=5)

In [None]:
# fitting_datasets returns the estimator as left by its fit on the last split,
# so inspect that split's held-out rows (as the confusion-matrix cells do with
# cropped_datasets[-1]); split 1's test rows are not guaranteed to be unseen by it.
visualize_predictions(base_dir=cropped_base_dir,
                      image_paths=cropped_test_df_5['ImageFullPath'].tolist(),
                      y_true=cropped_y_test_5,
                      model=svm_cropped_model,
                      X_test=cropped_X_test_5,
                      categories=categories,
                      num_images=5)

In [None]:
# fitting_datasets returns the estimator as left by its fit on the last split,
# so inspect that split's held-out rows (as the confusion-matrix cells do with
# cropped_dropdup_datasets[-1]); split 1's test rows are not guaranteed to be
# unseen by it.
visualize_predictions(base_dir=cropped_dropdup_base_dir,
                      image_paths=cropped_dropdup_test_df_5['ImageFullPath'].tolist(),
                      y_true=cropped_dropdup_y_test_5,
                      model=svm_cropped_dropdup_model,
                      X_test=cropped_dropdup_X_test_5,
                      categories=categories,
                      num_images=5)

In [None]:
# fitting_datasets returns the estimator as left by its fit on the last split,
# so inspect that split's held-out rows (as the confusion-matrix cells do with
# augmented_datasets[-1]); split 1's test rows are not guaranteed to be unseen by it.
visualize_predictions(base_dir=augmented_base_dir,
                      image_paths=augmented_test_df_5['ImageFullPath'].tolist(),
                      y_true=augmented_y_test_5,
                      model=svm_augmented_model,
                      X_test=augmented_X_test_5,
                      categories=categories,
                      num_images=5)

In [None]:
# fitting_datasets returns the estimator as left by its fit on the last split,
# so inspect that split's held-out rows (as the confusion-matrix cells do with
# full_augmented_datasets[-1]); split 1's test rows are not guaranteed to be
# unseen by it.
visualize_predictions(base_dir=full_augmented_base_dir,
                      image_paths=full_augmented_test_df_5['ImageFullPath'].tolist(),
                      y_true=full_augmented_y_test_5,
                      model=svm_full_augmented_model,
                      X_test=full_augmented_X_test_5,
                      categories=categories,
                      num_images=5)