In [1]:
import os
import pandas as pd
import random
from tqdm import tqdm

# Load the filtered metadata
metadata_df = pd.read_csv('data/processed/fiw/train/fiw_metadata_filtered.csv')

# Load the train-pairs
pairs_df = pd.read_csv('data/raw/fiw/train/train-pairs.csv')


In [2]:
# Create a dictionary to store available images for each member
available_images = {}

def is_unrelated(member):
    return 'unrelated' in str(member).lower()

def get_images(member_identifier):
    if member_identifier in available_images:
        return available_images[member_identifier]
    
    family, member = member_identifier.split('/')
    member_images = metadata_df[(metadata_df['Family'] == family) & 
                                (metadata_df['Member'] == member)]
    
    # For unrelated members, keep all images regardless of Is_Kept status
    if is_unrelated(member):
        image_paths = member_images['Aligned_Image_Path'].tolist()
    else:
        # For regular members, only keep images where Is_Kept is True
        image_paths = member_images[member_images['Is_Kept'] == True]['Aligned_Image_Path'].tolist()
    
    # Filter out non-existent images
    existing_images = [path for path in image_paths if os.path.exists(path)]
    
    available_images[member_identifier] = existing_images
    return existing_images

def generate_triplets(pairs_df):
    triplets = []
    skipped_pairs = 0
    
    for idx, row in tqdm(pairs_df.iterrows(), total=pairs_df.shape[0], desc="Generating triplets"):
        p1 = row['p1']
        p2 = row['p2']
        nsamples = row['nsamples']
        ptype = row['ptype']
        
        p1_images = get_images(p1)
        p2_images = get_images(p2)
        
        if len(p1_images) == 0 or len(p2_images) == 0:
            skipped_pairs += 1
            continue
        
        for _ in range(int(nsamples)):
            anchor = random.choice(p1_images)
            positive = random.choice(p2_images)
            
            # Generate negative sample
            negative = None
            while negative is None or negative in p1_images or negative in p2_images:
                random_family = random.choice(metadata_df['Family'].unique())
                random_member = random.choice(metadata_df[metadata_df['Family'] == random_family]['Member'].unique())
                negative_images = get_images(f"{random_family}/{random_member}")
                if negative_images:
                    negative = random.choice(negative_images)
            
            triplets.append((anchor, positive, negative, ptype))
    
    print(f"Skipped {skipped_pairs} pairs due to missing or filtered out images.")
    return triplets

In [3]:
# Generate triplets
triplets = generate_triplets(pairs_df)
print(f"\nTotal triplets generated: {len(triplets)}")

# Save triplets to CSV
OUTPUT_ROOT = 'data/processed/fiw/train'
TRIPLET_CSV_PATH = os.path.join(OUTPUT_ROOT, 'filtered_triplets_with_labels.csv')
triplet_df = pd.DataFrame(triplets, columns=['Anchor', 'Positive', 'Negative', 'ptype'])
triplet_df['Triplet_ID'] = triplet_df.index + 1
triplet_df = triplet_df[['Triplet_ID', 'Anchor', 'Positive', 'Negative', 'ptype']]
triplet_df.to_csv(TRIPLET_CSV_PATH, index=False)

print(f"\nTriplets with labels saved to {TRIPLET_CSV_PATH}")
print("Triplet generation completed successfully!")


Generating triplets: 100%|██████████████████████████████████████████████████████████████████████| 6983/6983 [10:40<00:00, 10.90it/s]


Skipped 2570 pairs due to missing or filtered out images.

Total triplets generated: 189550

Triplets with labels saved to data/processed/fiw/train/filtered_triplets_with_labels.csv
Triplet generation completed successfully!


In [4]:
# Print additional statistics
print("\nAdditional Statistics:")
print(f"Number of unique families: {metadata_df['Family'].nunique()}")
print(f"Number of unique members: {metadata_df['Member'].nunique()}")
print(f"Total images: {len(metadata_df)}")
print(f"Images kept (Is_Kept=True): {metadata_df['Is_Kept'].sum()}")
print(f"Images filtered out (Is_Kept=False): {len(metadata_df) - metadata_df['Is_Kept'].sum()}")
print("\nTop 10 families by image count (after filtering):")
print(metadata_df[metadata_df['Is_Kept'] == True]['Family'].value_counts().head(10))
print("\nTop 10 members by image count (after filtering):")
print(metadata_df[metadata_df['Is_Kept'] == True]['Member'].value_counts().head(10))

# Compare triplet counts
original_triplets = pd.read_csv(os.path.join(OUTPUT_ROOT, 'triplets_with_labels.csv'))
print(f"\nOriginal triplet count: {len(original_triplets)}")
print(f"New filtered triplet count: {len(triplet_df)}")
print(f"Difference: {len(original_triplets) - len(triplet_df)}")


Additional Statistics:
Number of unique families: 571
Number of unique members: 42
Total images: 20342
Images kept (Is_Kept=True): 10778
Images filtered out (Is_Kept=False): 9564

Top 10 families by image count (after filtering):
Family
F0601    235
F0987     91
F0009     87
F0992     84
F0686     61
F0064     60
F1018     55
F0303     50
F0173     47
F0118     47
Name: count, dtype: int64

Top 10 members by image count (after filtering):
Member
MID1     3725
MID2     2491
MID3     1718
MID4     1219
MID5      730
MID6      351
MID7      153
MID8      104
MID9       59
MID20      36
Name: count, dtype: int64

Original triplet count: 201399
New filtered triplet count: 189550
Difference: 11849


In [5]:
# Compare triplet types
import pandas as pd
import os

OUTPUT_ROOT = 'data/processed/fiw/train'
TRIPLET_CSV_PATH = os.path.join(OUTPUT_ROOT, 'filtered_triplets_with_labels.csv')
original_triplets = pd.read_csv(os.path.join(OUTPUT_ROOT, 'triplets_with_labels.csv'))
triplet_df = pd.read_csv(TRIPLET_CSV_PATH)
print("\nOriginal triplet types:")
print(original_triplets['ptype'].value_counts())
print("Total:", len(original_triplets))
print("\nNew filtered triplet types:")
print(triplet_df['ptype'].value_counts())
print("Total:", len(triplet_df))


Original triplet types:
ptype
ms      46969
fs      46583
fd      30746
md      29730
bb      16325
sibs    15721
ss      11488
gfgd     2003
gmgs     1834
Name: count, dtype: int64
Total: 201399

New filtered triplet types:
ptype
ss      34230
bb      34230
ms      30874
fs      25801
fd      23708
md      23321
sibs    12131
gfgs     1570
gmgs     1368
gfgd     1277
gmgd     1040
Name: count, dtype: int64
Total: 189550
