In [None]:
import os
import pandas as pd
import cv2
import numpy as np
from tqdm import tqdm
from PIL import Image
import imagehash
import requests
from io import BytesIO

# Input and output locations used by this cell
csv_path = "metadata.csv"  # CSV loaded into `df` below
image_folder = "/content/drive/MyDrive/images"  # images folder in Google Drive
mapping_csv_path = "image_mapping.csv"  # output CSV file

# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=csv_path)

# Function to compute perceptual hash
def compute_hash(image_path):
    """Return the perceptual hash of the image at ``image_path`` as a string.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        ``str(imagehash.phash(image))`` for the opened image, or ``None`` if
        opening or hashing fails for any reason (the error is printed rather
        than raised, so callers must handle a ``None`` result).
    """
    try:
        # The context manager closes the underlying file as soon as the hash
        # is computed instead of leaving the handle open until garbage
        # collection, which matters when hashing a whole folder in one go.
        with Image.open(image_path) as image:
            return str(imagehash.phash(image))  # Perceptual hash
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

# Compute hashes for all local images.
# compute_hash returns None for anything it cannot hash; skip those entries so
# they never share a bogus None key in the lookup table. Iterating in sorted
# order makes the result deterministic when two files have the same hash
# (the last file name in sorted order wins).
local_images = {}
for f in sorted(os.listdir(image_folder)):
    local_hash = compute_hash(os.path.join(image_folder, f))
    if local_hash is None:
        continue
    local_images[local_hash] = f

# Seconds to wait for each download; without a timeout a single stalled
# server would block the whole loop indefinitely.
request_timeout_s = 30

# Match images from CSV
matched_images = {}
for index, row in tqdm(df.iterrows(), total=len(df)):
    image_url = row["url"]
    try:
        # Download and compute hash for CSV image.
        # The whole body is read via `.content`, so streaming is not needed.
        response = requests.get(image_url, timeout=request_timeout_s)
        # Fail early on 4xx/5xx instead of trying to decode an error page.
        response.raise_for_status()
        with Image.open(BytesIO(response.content)) as img:
            csv_image_hash = str(imagehash.phash(img))

        # Find matching local image
        if csv_image_hash in local_images:
            matched_images[row["file"]] = local_images[csv_image_hash]  # Store mapping
    except Exception as e:
        # Best effort: report the failing URL and continue with the next row.
        print(f"Error downloading {image_url}: {e}")

# Convert mapping to a DataFrame (one row per matched CSV image)
mapping_df = pd.DataFrame(list(matched_images.items()), columns=["original_name", "local_name"])

# Save to CSV
mapping_df.to_csv(mapping_csv_path, index=False)

print(f"Image mapping saved to: {mapping_csv_path}")


In [None]:
import pandas as pd

# File paths
x_test_path = "X_test.csv"
mapping_path = "image_mapping.csv"
metadata_path = "metadata.csv"
submission_path = "submission.csv"

# Load the datasets
x_test = pd.read_csv(x_test_path)  # Contains ID, file
mapping = pd.read_csv(mapping_path)  # Contains original_name, local_name
metadata = pd.read_csv(metadata_path)  # Contains file, boxes, Emotion?, Person or creature?

# 1️⃣ Merge X_test with image_mapping.csv to get original image names
x_test = x_test.merge(mapping, left_on="file", right_on="local_name", how="left")

# 2️⃣ Merge with metadata.csv to get bounding boxes, Emotion?, and Person or creature?
final_df = x_test.merge(metadata[["file", "boxes", "Emotion?", "Person or creature?"]],
                        left_on="original_name", right_on="file", how="left")

# 3️⃣ Remove duplicate IDs (keeping the first occurrence); the left merges above
# can produce several rows for one ID.
final_df = final_df.drop_duplicates(subset=["ID"])

# 4️⃣ Select required columns. The explicit copy makes final_df an independent
# frame, so the fills below cannot trigger SettingWithCopyWarning.
final_df = final_df[["ID", "boxes", "Emotion?", "Person or creature?"]].copy()

# 5️⃣ Fill missing labels with the most frequent value of each column.
# mode() ignores NaN and returns an empty Series when a column has no values
# at all (e.g. nothing matched), so guard before taking its first element.
person_modes = final_df["Person or creature?"].mode()
if person_modes.empty:
    print("⚠️ No 'Person or creature?' values available; missing entries left unfilled")
else:
    most_common = person_modes.iloc[0]
    final_df["Person or creature?"] = final_df["Person or creature?"].fillna(most_common)

emotion_modes = final_df["Emotion?"].mode()
if emotion_modes.empty:
    print("⚠️ No 'Emotion?' values available; missing entries left unfilled")
else:
    most_common_emotion = emotion_modes.iloc[0]
    final_df["Emotion?"] = final_df["Emotion?"].fillna(most_common_emotion)

# 6️⃣ Save the submission file
final_df.to_csv(submission_path, index=False)

print(f"✅ Submission file saved successfully at: {submission_path}")
