# SỬ DỤNG ĐẶC TRƯNG ĐÃ ĐƯỢC TRÍCH XUẤT ĐỂ LOẠI BỎ ẢNH TRÙNG

# **Thông tin của tác giả, ngày cập nhật**

<hr>

**Thành viên nhóm**:
- **Trần Đình Khánh Đăng - 22520195**
- **Tăng Nhất - 22521027**
- **Lê Minh Nhựt - 22521060**

**Ngày cập nhật**: 22/01/2025

## Import thư viện cần thiết

In [None]:
import gc
import os
import sys
import time

# Third-party imports keep their original relative order on purpose:
# the load order of cv2 / torch / tensorflow can matter for native libraries.
import cv2
import torch
import torchvision


import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt

from matplotlib import patches
from scipy.spatial.distance import cdist, euclidean

from PIL import Image
from tqdm.notebook import tqdm
from tensorflow.keras.applications import (MobileNet, MobileNetV2, MobileNetV3Small, MobileNetV3Large, 
                                           ResNet50, ResNet101, ResNet152,
                                           VGG16, VGG19,
                                           EfficientNetB0, EfficientNetB1, EfficientNetB7,
                                           InceptionV3, Xception)
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.mobilenet import preprocess_input

from skimage.io import imread
from skimage.color import rgb2gray
from skimage.metrics import structural_similarity as ssim
from sklearn.metrics.pairwise import cosine_similarity

## Khởi tạo đường dẫn

In [None]:
# Working directory where this notebook reads/writes its artifacts.
dataset_dir = "/kaggle/working/dataset"
extracted_features_file_name = "extracted_features.npz"

# CSV file names for the cropped dataset, before and after duplicate removal.
cropped_file_name = "cropped_dataset.csv"
cropped_dropdup_file_name = "dropdup_dataset.csv"

# Input dataset of cropped images and its accompanying CSV / feature files.
cropped_base_dir = "/kaggle/input/cs114-cropped-full-dataset/dataset"
cropped_dataset_name = "cropped_CarDataset.csv"
cropped_file_name_cars = "cropped_CarDataset-1.csv"
cropped_file_name_categories = "cropped_CarDataset-2.csv"
cropped_extracted_features_file_name = "cropped_extracted_features.npz"

# Output names for the de-duplicated feature set (.npz and .csv).
cropped_dropdup_extracted_features_file_name = "dropdup_extracted_features.npz"
cropped_dropdup_extracted_features_csv = "dropdup_extracted_features.csv"


In [None]:
def extract_feature_one_img(image_path, model, input_shape=(224, 224), preprocess_fn=None):
    """Return the flattened feature vector `model` predicts for one image.

    Args:
        image_path: Path of the image file to load.
        model: Model object exposing ``predict``.
        input_shape: ``target_size`` the image is resized to when loaded.
        preprocess_fn: Callable applied to the ``(1, H, W, C)`` array before
            prediction. Defaults to the module-level MobileNet
            ``preprocess_input`` (the previous hard-coded behaviour).

    Returns:
        The prediction flattened to a 1-D array.
    """
    if preprocess_fn is None:
        preprocess_fn = preprocess_input

    img = image.load_img(image_path, target_size=input_shape)
    if img.mode != 'RGB':
        img = img.convert('RGB')
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)  # add the batch dimension expected by predict()
    x = preprocess_fn(x)
    features = model.predict(x, verbose=0)
    return features.flatten()


def extract_features(data=None,
                     base_dir='./',
                     dataset_dir='./',
                     file_csv='CarDataset-Splits-1-Train.csv',
                     model_name='MobileNet',
                     input_shape=(224, 224),
                     partition=False,
                     partition_size=1000,
                     random_state=42,
                     save_result=False,
                     save_name='extracted_features-Splits-1.npz'):
    """Extract one feature vector per image listed in a DataFrame or CSV.

    Args:
        data: DataFrame with "ImageFullPath" and "CategoryID" columns. When
            None, it is read from ``os.path.join(dataset_dir, file_csv)``.
        base_dir: Directory joined in front of every "ImageFullPath" value.
        dataset_dir: Directory holding `file_csv` and receiving `save_name`.
        file_csv: CSV file name used only when `data` is None.
        model_name: Key of the backbone to use. Unknown names fall back to
            'MobileNet' (a warning is printed).
        input_shape: Size every image is resized to.
        partition: When True, only a random sample of rows is processed.
        partition_size: Maximum number of rows sampled when `partition`.
        random_state: Seed passed to ``DataFrame.sample``.
        save_result: When True, the result list is written with ``np.savez``.
        save_name: File name of the saved ``.npz`` inside `dataset_dir`.

    Returns:
        A list of dicts with keys 'ImageFullPath', 'CategoryID' and
        'Extracted Features'. Images that raise while being processed are
        reported and skipped.
    """
    # Each backbone is paired with its own preprocessing function: the
    # MobileNet scaling is not what e.g. the ResNet/VGG or the
    # EfficientNet/MobileNetV3 families expect.
    apps = tf.keras.applications
    models = {
        'MobileNet': (MobileNet, apps.mobilenet.preprocess_input),
        'MobileNetV2': (MobileNetV2, apps.mobilenet_v2.preprocess_input),
        'MobileNetV3Small': (MobileNetV3Small, apps.mobilenet_v3.preprocess_input),
        'MobileNetV3Large': (MobileNetV3Large, apps.mobilenet_v3.preprocess_input),
        'ResNet50': (ResNet50, apps.resnet.preprocess_input),
        'ResNet101': (ResNet101, apps.resnet.preprocess_input),
        'ResNet152': (ResNet152, apps.resnet.preprocess_input),
        'VGG16': (VGG16, apps.vgg16.preprocess_input),
        'VGG19': (VGG19, apps.vgg19.preprocess_input),
        'EfficientNetB0': (EfficientNetB0, apps.efficientnet.preprocess_input),
        'EfficientNetB1': (EfficientNetB1, apps.efficientnet.preprocess_input),
        'EfficientNetB7': (EfficientNetB7, apps.efficientnet.preprocess_input),
        'InceptionV3': (InceptionV3, apps.inception_v3.preprocess_input),
        'Xception': (Xception, apps.xception.preprocess_input),
    }

    if model_name not in models:
        # Keep the historical fallback, but make it visible instead of silent.
        print(f"Warning: unknown model_name '{model_name}'. "
              f"Falling back to 'MobileNet'. Available: {sorted(models)}")
        model_name = 'MobileNet'
    model_builder, preprocess_fn = models[model_name]

    device = '/device:GPU:0' if tf.config.list_physical_devices('GPU') else '/device:CPU:0'
    with tf.device(device):
        model = model_builder(weights='imagenet', include_top=False, pooling='avg')

    if data is None:
        data = pd.read_csv(os.path.join(dataset_dir, file_csv))

    if partition:
        sampled_data = data.sample(n=min(partition_size, len(data)), random_state=random_state).reset_index(drop=True)
        print(f"Processing {len(sampled_data)} images out of {len(data)} available.")
    else:
        sampled_data = data
    print("Extracting features...")

    result = []
    rows = zip(sampled_data["ImageFullPath"], sampled_data["CategoryID"])
    for image_path, categoryid in tqdm(rows, desc="Extracting Features", total=len(sampled_data), file=sys.stdout, leave=True):
        full_path = os.path.join(base_dir, image_path)
        try:
            extracted_features = extract_feature_one_img(full_path, model,
                                                         input_shape=input_shape,
                                                         preprocess_fn=preprocess_fn)
        except Exception as e:
            # Best effort: one unreadable image must not abort the whole run.
            print(f"Error processing image {full_path}: {e}. Skipping...")
        else:
            result.append({'ImageFullPath': image_path, 'CategoryID': categoryid, 'Extracted Features': extracted_features})

    print(f"Successfully processed {len(result)} images")

    if save_result:
        save_path = os.path.join(dataset_dir, save_name)
        # Create the target directory so the save cannot fail after the
        # (potentially long) extraction has already finished.
        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
        np.savez(save_path, extracted_features=result)
        print(f"Extracted features saved to {save_path}")

    return result

def load_features(file_path):
    """Load the feature records stored under 'extracted_features' in a .npz file.

    Args:
        file_path: Path of the ``.npz`` archive to read.

    Returns:
        A list of dicts with keys 'ImageFullPath', 'CategoryID' and
        'Extracted Features', or None when loading fails (the error is
        printed).
    """
    try:
        # NOTE: allow_pickle=True unpickles arbitrary objects; only load
        # archives from a trusted source.
        # The context manager closes the underlying file handle, which the
        # lazily-reading NpzFile otherwise keeps open.
        with np.load(file_path, allow_pickle=True) as data:
            extracted_features = data['extracted_features']

        # Rebuild every record as a plain dict in the same format used when saving.
        formatted_features = [
            {
                'ImageFullPath': item['ImageFullPath'],
                'CategoryID': item['CategoryID'],
                'Extracted Features': item['Extracted Features'],
            }
            for item in extracted_features
        ]

        print(f"Loaded extracted features from {file_path}")
        return formatted_features
    except Exception as e:
        print(f"Error loading features from {file_path}: {e}")
        return None

In [None]:
# Read the cropped-dataset CSV into a DataFrame.
# NOTE(review): `cropped_file_name` is a bare file name, so it is resolved
# against the current working directory, whereas extract_features joins the
# CSV name with `dataset_dir` — confirm which location is intended.
df_cropped = pd.read_csv(cropped_file_name)

In [None]:
# Run feature extraction over the whole cropped dataset and report the
# elapsed wall-clock time in hours.
st = time.time()
extracted_features = extract_features(base_dir=cropped_base_dir,
                                      dataset_dir=dataset_dir,
                                      file_csv=cropped_file_name,
                                      # The registry key is 'Xception'; the former
                                      # 'XceptionNet' matched no key and silently
                                      # selected the MobileNet fallback.
                                      model_name='Xception',
                                      input_shape=(224, 224),
                                      partition=False,
                                      partition_size=1000,
                                      save_result=True,
                                      save_name=extracted_features_file_name
                                      )
print(f"Total time: {(time.time()-st)/3600:.2f}h")

# Alternative: load previously extracted features instead of recomputing them
# extracted_features = load_features('/kaggle/input/cs114-cropped-full-dataset/dataset/cropped_extracted_features.npz')
# df_extracted_features = pd.DataFrame([feature for feature in tqdm(extracted_features, desc="Turning to DataFrame")])

In [None]:
# Torch device used by find_duplicate: CUDA when available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def to_grayscale(features_list):
    """Stack the given arrays, converting 3-dimensional ones with rgb2gray.

    Arrays whose ``ndim`` is not 3 are passed through unchanged.

    Args:
        features_list: Iterable of NumPy arrays.

    Returns:
        A NumPy array built from the processed items.
    """
    return np.array([
        rgb2gray(item) if item.ndim == 3 else item
        for item in features_list
    ])

def find_duplicate(extracted_features=None,
                   base_dir='./',  # Used for 'phash' (images are re-opened from disk)
                   dataset_dir='./',
                   extracted_file='extracted_features.npz',
                   similarity_threshold=0.9,
                   metric='cosine',
                   save_result=False,
                   save_name='duplicate_images.csv'):
    """Compare every pair of records and report the ones considered duplicates.

    Args:
        extracted_features: Sequence of records with keys 'ImageFullPath',
            'CategoryID' and 'Extracted Features'. When None, it is loaded
            from ``os.path.join(dataset_dir, extracted_file)``.
        base_dir: Directory joined with each image path when `metric` is
            'phash'.
        dataset_dir: Directory of `extracted_file` and of the saved CSV.
        extracted_file: ``.npz`` file name used when `extracted_features`
            is None.
        similarity_threshold: A pair matches when its similarity is greater
            than this value ('cosine', 'ssim', 'phash') or when its distance
            is smaller than ``1 - similarity_threshold`` ('euclidean').
        metric: One of 'cosine', 'euclidean', 'ssim', 'phash'.
        save_result: When True, the pairs are written to a CSV file.
        save_name: Name of that CSV file inside `dataset_dir`.

    Returns:
        ``(duplicate_images, duplicate_indices)``: a list of
        ``(path_i, path_j, score_or_distance)`` tuples and the matching list
        of ``(i, j)`` positions (``i < j``) relative to `extracted_features`.

    Raises:
        ValueError: If `metric` is not one of the supported names.
    """
    supported_metrics = ('cosine', 'euclidean', 'ssim', 'phash')
    if metric not in supported_metrics:
        # Previously an unknown metric silently produced an empty result.
        raise ValueError(f"Invalid metric: {metric}. Expected one of {supported_metrics}")

    if extracted_features is None:
        file_path = os.path.join(dataset_dir, extracted_file)
        # NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
        with np.load(file_path, allow_pickle=True) as data:
            extracted_features = data['extracted_features']

    features_list = [x['Extracted Features'] for x in extracted_features]
    image_path_list = [x['ImageFullPath'] for x in extracted_features]
    features_list = to_grayscale(features_list)

    uses_tensor = metric in ('cosine', 'euclidean')
    if uses_tensor:
        # Only the vectorised metrics need the features on the module-level `device`.
        features_tensor = torch.tensor(features_list, device=device)

    if metric == 'phash':
        # NOTE(review): `phash` is not imported in the visible part of this
        # file; presumably `imagehash.phash` — confirm and import it.
        precomputed_hashes = {}
        for img_path in tqdm(image_path_list, desc='Computing hashes', unit='image'):
            # Close each image as soon as it is hashed to avoid leaking file handles.
            with Image.open(os.path.join(base_dir, img_path)) as img:
                precomputed_hashes[img_path] = phash(img)

    duplicate_images = []
    duplicate_indices = []
    num_images = len(features_list)

    for i in tqdm(range(num_images), desc="Processing images", unit="image"):
        if uses_tensor:
            current_feature = features_tensor[i].unsqueeze(0)  # add batch dimension
            remaining = features_tensor[i + 1:]  # only later items, so each pair is visited once

            if metric == 'cosine':
                values = torch.nn.functional.cosine_similarity(current_feature, remaining, dim=1)
                hits = torch.nonzero(values > similarity_threshold).squeeze(1)
                label = "cosine similarity"
            else:
                values = torch.cdist(current_feature, remaining, p=2).squeeze(0)
                hits = torch.nonzero(values < (1.0 - similarity_threshold)).squeeze(1)
                label = "Euclidean distance"

            # Transfer all matches of this row at once instead of one .item() per match.
            for offset, value in zip(hits.tolist(), values[hits].tolist()):
                j = offset + i + 1  # position relative to the full list
                print(f"Found duplicate images: {image_path_list[i]} and {image_path_list[j]} with {label} {value}")
                duplicate_images.append((image_path_list[i], image_path_list[j], value))
                duplicate_indices.append((i, j))

        elif metric == 'ssim':
            # SSIM is computed pair by pair on the flattened features.
            for j in range(i + 1, num_images):
                similarity = ssim(features_list[i].reshape(-1), features_list[j].reshape(-1), data_range=1)
                if similarity > similarity_threshold:
                    print(f"Found duplicate images: {image_path_list[i]} and {image_path_list[j]} with SSIM {similarity}")
                    duplicate_images.append((image_path_list[i], image_path_list[j], similarity))
                    duplicate_indices.append((i, j))

        else:  # metric == 'phash'
            hash1 = precomputed_hashes[image_path_list[i]]
            for j in range(i + 1, num_images):
                hash2 = precomputed_hashes[image_path_list[j]]
                # Hash difference normalised by the hash length, turned into a similarity.
                similarity = 1 - (hash1 - hash2) / len(hash1.hash)
                if similarity > similarity_threshold:
                    duplicate_images.append((image_path_list[i], image_path_list[j], similarity))
                    duplicate_indices.append((i, j))
                    print(f"Found duplicate images: {image_path_list[i]} and {image_path_list[j]} with pHash similarity {similarity}")

    if save_result:
        save_path = os.path.join(dataset_dir, save_name)
        value_column = "Distance" if metric == 'euclidean' else "Score"
        df = pd.DataFrame(duplicate_images, columns=["Image1", "Image2", value_column])
        df.to_csv(save_path, index=False)
        print(f"Saved to {save_path}")

    return duplicate_images, duplicate_indices

In [None]:
# duplicate_images = find_duplicate(extracted_features=extracted_features,
#                                   base_dir=base_dir,
#                                   dataset_dir=dataset_dir,
#                                   save_result = True,
#                                   metric='cosine') # Xịn nhất nhưng lâu nhất
# duplicate_images_2 = find_duplicate(extracted_features=extracted_features,
#                                   base_dir=base_dir,
#                                   dataset_dir=dataset_dir,
#                                   save_result = True,
#                                   metric='ssim') # Tàm tạm, không xịn
# duplicate_images_3, duplicate_indices_3 = find_duplicate(extracted_features=extracted_features[:10000],
#                                   base_dir=base_dir,
#                                   dataset_dir=dataset_dir,
#                                   save_result = True,
#                                   metric='euclidean') # Phế, nhưng nhanh
# duplicate_images_4 = find_duplicate(extracted_features=extracted_features,
#                                   base_dir=base_dir,
#                                   dataset_dir=dataset_dir,
#                                   save_result = True,
#                                   metric='phash') # Ổn

## Tiến hành loại bỏ ảnh trùng

In [None]:
# Cosine-similarity duplicate search, run chunk by chunk, with the per-chunk
# results merged into one table.
# NOTE: pairs are only compared inside a chunk, so duplicates that fall into
# two different chunks are not detected.
all_duplicate_images = []
all_duplicate_indices = []

num_images = len(extracted_features)
chunk_size = 36740

for i in tqdm(range(0, num_images, chunk_size), desc='Processing Batches', unit='batch'):
    start_idx = i
    end_idx = min(i + chunk_size, num_images)
    chunk = extracted_features[start_idx:end_idx]

    # Find duplicates in the current chunk
    # NOTE(review): the euclidean cell writes per-chunk CSVs with the same
    # file names, so whichever cell runs last overwrites them.
    duplicate_images, duplicate_indices = find_duplicate(
        extracted_features=chunk,
        base_dir=cropped_base_dir,
        dataset_dir=dataset_dir,
        save_result=True,
        save_name=f'duplicate_images_{start_idx + 1}_{end_idx}.csv',
        metric='cosine',
        similarity_threshold=0.95
    )

    all_duplicate_images.extend(duplicate_images)
    # find_duplicate reports positions relative to `chunk`; shift them by the
    # chunk start so they index into the full `extracted_features` list.
    all_duplicate_indices.extend(
        (first + start_idx, second + start_idx) for first, second in duplicate_indices
    )

    print(f"Processed images {start_idx + 1} to {end_idx}")

# The third column keeps its historical name "Distance" although it holds the
# cosine similarity of each pair.
df_combined = pd.DataFrame(all_duplicate_images, columns=["Image1", "Image2", "Distance"])

df_combined['Index1'] = [idx[0] for idx in all_duplicate_indices]
df_combined['Index2'] = [idx[1] for idx in all_duplicate_indices]

# Save the combined results to a CSV file
combined_save_path = os.path.join(dataset_dir, "cosine_combined_duplicate_images.csv")
df_combined.to_csv(combined_save_path, index=False)
print(f"Combined duplicate image results saved to {combined_save_path}")

In [None]:
# Euclidean-distance duplicate search, run chunk by chunk, with the per-chunk
# results merged into one table.
# NOTE: this cell rebinds all_duplicate_images / all_duplicate_indices, so the
# removal step further down uses the results of whichever search ran last.
# NOTE: pairs are only compared inside a chunk, so duplicates that fall into
# two different chunks are not detected.
all_duplicate_images = []
all_duplicate_indices = []

num_images = len(extracted_features)
chunk_size = 36740

for i in tqdm(range(0, num_images, chunk_size), desc='Processing Batches', unit='batch'):
    start_idx = i
    end_idx = min(i + chunk_size, num_images)
    chunk = extracted_features[start_idx:end_idx]

    # Find duplicates in the current chunk
    # NOTE(review): the cosine cell writes per-chunk CSVs with the same file
    # names, so whichever cell runs last overwrites them.
    duplicate_images, duplicate_indices = find_duplicate(
        extracted_features=chunk,
        base_dir=cropped_base_dir,
        dataset_dir=dataset_dir,
        save_result=True,
        save_name=f'duplicate_images_{start_idx + 1}_{end_idx}.csv',
        metric='euclidean',
        similarity_threshold=0.95
    )

    all_duplicate_images.extend(duplicate_images)
    # find_duplicate reports positions relative to `chunk`; shift them by the
    # chunk start so they index into the full `extracted_features` list.
    all_duplicate_indices.extend(
        (first + start_idx, second + start_idx) for first, second in duplicate_indices
    )

    print(f"Processed images {start_idx + 1} to {end_idx}")

df_combined = pd.DataFrame(all_duplicate_images, columns=["Image1", "Image2", "Distance"])

df_combined['Index1'] = [idx[0] for idx in all_duplicate_indices]
df_combined['Index2'] = [idx[1] for idx in all_duplicate_indices]

# Save the combined results to a CSV file
combined_save_path = os.path.join(dataset_dir, "euclidean_combined_duplicate_images.csv")
df_combined.to_csv(combined_save_path, index=False)
print(f"Combined duplicate image results saved to {combined_save_path}")

In [None]:
def remove_duplicates(df_extracted_features,
                      duplicate_indices,
                      removal_strategy='keep_first',
                      dataset_dir='./',
                      save_results=False,
                      extracted_features_name='extracted_features.npz',
                      extracted_dataframe_name='dropdup_extracted_features.csv'):
    """Drop one row of every duplicate pair and return the surviving records.

    Args:
        df_extracted_features: DataFrame with "ImageFullPath", "CategoryID"
            and "Extracted Features" columns.
        duplicate_indices: Iterable of ``(i, j)`` row positions (used with
            ``.iloc``) that were detected as duplicates of each other.
        removal_strategy: 'keep_first' drops ``j``, 'keep_second' drops
            ``i``, 'keep_smaller' drops the row whose file (looked up under
            `dataset_dir`) is larger, dropping ``j`` on equal sizes or when
            a file is missing.
        dataset_dir: Directory joined with the image paths for the file-size
            lookup, and where results are saved.
        save_results: When True, write the DataFrame as CSV and the records
            as ``.npz`` into `dataset_dir`.
        extracted_features_name: File name of the saved ``.npz``.
        extracted_dataframe_name: File name of the saved CSV.

    Returns:
        ``(new_df, records, removed_image_paths)``: the DataFrame without the
        dropped rows (original index labels are kept), the same rows as a
        list of dicts, and the paths of the dropped rows, each listed once
        in the order it was first selected for removal.

    Raises:
        ValueError: If `removal_strategy` is not a supported name.
        KeyError: If a required column is missing.
        IndexError: If a position in `duplicate_indices` is out of range.
    """
    if removal_strategy not in ('keep_first', 'keep_second', 'keep_smaller'):
        raise ValueError(f"Invalid removal_strategy: {removal_strategy}")

    image_paths = df_extracted_features['ImageFullPath']
    indices_to_remove = set()
    removed_image_paths = []

    def _mark_removed(position):
        # A row can be the duplicate of several others; record it only once.
        if position not in indices_to_remove:
            removed_image_paths.append(image_paths.iloc[position])
            indices_to_remove.add(position)

    for i, j in duplicate_indices:
        if removal_strategy == 'keep_first':
            _mark_removed(j)
        elif removal_strategy == 'keep_second':
            _mark_removed(i)
        else:  # 'keep_smaller'
            path_i = os.path.join(dataset_dir, image_paths.iloc[i])
            path_j = os.path.join(dataset_dir, image_paths.iloc[j])
            try:
                loser = j if os.path.getsize(path_i) <= os.path.getsize(path_j) else i
            except FileNotFoundError:
                print(f"Warning: One or both files not found: {path_i}, {path_j}. Defaulting to 'keep_first'.")
                loser = j
            _mark_removed(loser)

    # Translate row positions into index labels, which is what drop() expects.
    labels_to_drop = [df_extracted_features.index[pos] for pos in sorted(indices_to_remove)]
    new_df_extracted_features = df_extracted_features.drop(labels_to_drop)

    # Rebuild the list-of-dicts representation from the surviving rows.
    extracted_features_list = [
        {'ImageFullPath': image_path, 'CategoryID': categoryid, 'Extracted Features': extracted_feature}
        for image_path, categoryid, extracted_feature in zip(
            new_df_extracted_features['ImageFullPath'],
            new_df_extracted_features['CategoryID'],
            new_df_extracted_features['Extracted Features'],
        )
    ]
    print(f"Successfully processed {len(extracted_features_list)} images")

    if save_results:
        os.makedirs(dataset_dir, exist_ok=True)
        df_file_path = os.path.join(dataset_dir, extracted_dataframe_name)
        features_file_path = os.path.join(dataset_dir, extracted_features_name)

        # NOTE: array-valued cells are written to CSV through their string
        # form, which NumPy may abbreviate for long arrays; the .npz file is
        # the lossless copy.
        new_df_extracted_features.to_csv(df_file_path, index=False)
        print(f"Saved DataFrame to: {df_file_path}")

        np.savez(features_file_path, extracted_features=extracted_features_list)
        print(f"Saved extracted features to: {features_file_path}")

    return new_df_extracted_features, extracted_features_list, removed_image_paths

In [None]:
# Load previously extracted features and turn the records into a DataFrame.
# NOTE(review): the duplicate indices computed above refer to positions in the
# `extracted_features` list that was in memory at that time — confirm this
# file contains the same records in the same order.
extracted_features = load_features('/kaggle/input/cs114-extracted-features/fulldata_extracted_features.npz')
if extracted_features is None:
    # load_features returns None on failure; stop here with a clear message
    # instead of failing later while iterating over None.
    raise RuntimeError("Could not load extracted features; see the error printed above.")
df_extracted_features = pd.DataFrame(list(tqdm(extracted_features)))

In [None]:
# Drop the second image of every detected duplicate pair and save the
# de-duplicated DataFrame (.csv) and feature records (.npz) to dataset_dir.
# NOTE(review): the third value returned by remove_duplicates is the list of
# removed image paths, although it is bound to `indices_to_remove` here.
# NOTE(review): `all_duplicate_indices` holds the result of whichever chunked
# search cell ran last (cosine or euclidean) — confirm the intended one.
new_df_extracted_features, extracted_features_list, indices_to_remove = remove_duplicates(
    df_extracted_features=df_extracted_features,
    duplicate_indices=all_duplicate_indices,
    removal_strategy='keep_first', 
    dataset_dir=dataset_dir,
    save_results=True,
    extracted_features_name=cropped_dropdup_extracted_features_file_name,
    extracted_dataframe_name=cropped_dropdup_extracted_features_csv
)

## Visualization

In [None]:
def plot_duplicate_images(image_pairs, base_dir):
    """Show every duplicate pair side by side, one pair per row.

    Args:
        image_pairs: Sequence of ``(image_path1, image_path2, score)``
            tuples; the score is not displayed.
        base_dir: Directory joined in front of both image paths.

    Returns:
        None. The figure is displayed with ``plt.show()``. Nothing is drawn
        when `image_pairs` is empty.
    """
    rows = len(image_pairs)
    if rows == 0:
        # plt.subplots cannot create a grid with zero rows.
        print("No duplicate image pairs to display.")
        return

    # squeeze=False always yields a 2-D array of axes, including for one row.
    _, axes = plt.subplots(rows, 2, figsize=(10, rows * 3), squeeze=False)

    progress = tqdm(zip(axes, image_pairs), desc="Displaying images", total=rows, unit='image')
    for row_axes, (image_path1, image_path2, _score) in progress:
        for ax, image_path in zip(row_axes, (image_path1, image_path2)):
            # imshow converts the image to array data, so the file can be
            # closed right after the call.
            with Image.open(os.path.join(base_dir, image_path)) as img:
                ax.imshow(img)
            ax.axis('off')
            ax.set_title(image_path.split('/')[-1])

    plt.tight_layout()
    plt.show()

In [None]:
# Display the first 10 duplicate pairs found by the most recent chunked search.
plot_duplicate_images(all_duplicate_images[:10], base_dir=cropped_base_dir)