In [1]:
import nltk
from nltk.corpus import wordnet as wn
from collections import defaultdict

from classes import IMAGENET2012_CLASSES

print('len of classes', len(IMAGENET2012_CLASSES))
wnids = list(IMAGENET2012_CLASSES.keys())
print(wnids[:5])
print('lend of wnids - ', len(wnids))

# A WNID is "n" followed by the WordNet noun offset, so drop the leading
# letter and look the synset up by its numeric offset.
imagenet_synsets = []
for wnid in wnids:
    imagenet_synsets.append(wn.synset_from_pos_and_offset('n', int(wnid[1:])))

# Every class is given the same image count (50).
image_counts = {wnid: 50 for wnid in wnids}
print(list(image_counts.items())[:5])

# Count how often each synset appears on the hypernym paths of the ImageNet
# classes. A synset that lies on several paths is counted once per path.
hypernym_freq = defaultdict(int)
for node in (
    step
    for leaf in imagenet_synsets
    for chain in leaf.hypernym_paths()
    for step in chain
):
    hypernym_freq[node] += 1

len of classes 1000
['n01440764', 'n01443537', 'n01484850', 'n01491361', 'n01494475']
lend of wnids -  1000
[('n01440764', 50), ('n01443537', 50), ('n01484850', 50), ('n01491361', 50), ('n01494475', 50)]


In [2]:
def get_imagenet_wnids(val_path):
    """
    Read ImageNet validation class WNIDs from a file.

    The file is expected to hold one WNID per line. Surrounding whitespace is
    stripped, and lines that are empty after stripping (for example an extra
    blank line at the end of the file) are skipped, so the result never
    contains empty strings.

    Parameters:
    val_path (str): Path of the text file to read

    Returns:
    list: WNID strings in file order
    """
    with open(val_path, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        # An empty entry would break callers that parse the offset via int(wnid[1:]).
        return [wnid for wnid in stripped_lines if wnid]

def get_ancestor_synset(synset, level_up):
    """
    Get the ancestor synset 'level_up' levels up in the hierarchy.

    At each step the parent is chosen among the current synset's direct
    hypernyms: the one with the highest count in the module-level
    `hypernym_freq` table wins. A hypernym that is missing from that table
    counts as 0, and ties go to the hypernym listed first.

    Parameters:
    synset: Starting synset (must provide a `hypernyms()` method)
    level_up (int): Number of levels to climb; values <= 0 return `synset`

    Returns:
    The synset reached after `level_up` steps. If a synset without hypernyms
    is reached earlier, climbing stops and that synset is returned.
    """
    current = synset
    for _ in range(level_up):
        # Get hypernyms (parent nodes)
        hypernyms = current.hypernyms()
        if not hypernyms:
            return current
        # Follow the most frequent parent. The explicit default matters:
        # a bare `hypernym_freq.get` returns None for unseen synsets, and
        # comparing None with an int raises TypeError inside max().
        current = max(hypernyms, key=lambda candidate: hypernym_freq.get(candidate, 0))
    return current

def analyze_hierarchy(wnids, level_up, image_counts=None):
    """
    Collapse ImageNet classes onto their ancestors `level_up` levels higher.

    Parameters:
    wnids (list): WordNet IDs, each "n" followed by the noun offset
    level_up (int): Number of levels to move up in hierarchy
    image_counts (dict): Optional mapping from WNID to image count

    Returns:
    tuple: (number of resulting classes,
            dict mapping each ancestor synset to the list of original synsets,
            dict mapping each ancestor synset to the summed image count;
            empty when `image_counts` is None or empty)
    """
    # Resolve every WNID to its noun synset via the numeric offset.
    leaf_synsets = [wn.synset_from_pos_and_offset('n', int(wnid[1:])) for wnid in wnids]

    # Keyed by synset, so a WNID listed twice contributes a single entry.
    leaf_to_ancestor = {
        leaf: get_ancestor_synset(leaf, level_up) for leaf in leaf_synsets
    }

    # Invert the mapping: ancestor -> original classes that collapse onto it.
    members_by_ancestor = {}
    for leaf, ancestor in leaf_to_ancestor.items():
        members_by_ancestor.setdefault(ancestor, []).append(leaf)

    # Sum image counts per ancestor; classes missing from image_counts add 0.
    totals_by_ancestor = {}
    if image_counts:
        for ancestor, members in members_by_ancestor.items():
            totals_by_ancestor[ancestor] = sum(
                # Rebuild the WNID ("n" + zero-padded 8-digit offset) for the lookup.
                image_counts.get(f"n{str(member.offset()).zfill(8)}", 0)
                for member in members
            )

    return len(members_by_ancestor), members_by_ancestor, totals_by_ancestor

def print_analysis_results(num_classes, class_mapping, image_counts):
    """
    Write a human-readable report of the merged class hierarchy to stdout.

    Parameters:
    num_classes (int): Number of classes after merging
    class_mapping (dict): Ancestor synset -> list of original synsets
    image_counts (dict): Ancestor synset -> total image count; ancestors
        missing from it are reported with a count of 0
    """
    print(f"Number of resulting classes: {num_classes}\n")
    print("Class distribution:")
    for ancestor in class_mapping:
        members = class_mapping[ancestor]
        # Build the whole section for this ancestor, then emit it in one go.
        section = [
            f"\nAncestor: {ancestor.name()} ({len(members)} original classes)",
            f"Image count: {image_counts.get(ancestor, 0)}",
            "Original classes:",
        ]
        section.extend(f"  - {member.name()}" for member in members)
        print("\n".join(section))

# Analyze the hierarchy after moving every class LEVELS_UP levels up.
# `results` is (number of merged classes, ancestor -> original classes,
# ancestor -> total image count).
LEVELS_UP = 4
results = analyze_hierarchy(wnids, LEVELS_UP, image_counts)
print(results[0])
print_analysis_results(*results)


110
Number of resulting classes: 110

Class distribution:

Ancestor: teleost_fish.n.01 (5 original classes)
Image count: 250
Original classes:
  - tench.n.01
  - goldfish.n.01
  - rock_beauty.n.01
  - anemone_fish.n.01
  - lionfish.n.01

Ancestor: cartilaginous_fish.n.01 (2 original classes)
Image count: 100
Original classes:
  - great_white_shark.n.01
  - tiger_shark.n.01

Ancestor: fish.n.01 (6 original classes)
Image count: 300
Original classes:
  - hammerhead.n.03
  - electric_ray.n.01
  - stingray.n.01
  - eel.n.02
  - sturgeon.n.01
  - gar.n.01

Ancestor: animal.n.01 (17 original classes)
Image count: 850
Original classes:
  - cock.n.05
  - hen.n.02
  - harvestman.n.01
  - scorpion.n.03
  - black_grouse.n.01
  - ptarmigan.n.01
  - ruffed_grouse.n.01
  - prairie_chicken.n.01
  - quail.n.02
  - partridge.n.03
  - tusker.n.01
  - sea_anemone.n.01
  - conch.n.01
  - snail.n.01
  - slug.n.07
  - sea_slug.n.01
  - chambered_nautilus.n.01

Ancestor: chordate.n.01 (12 original classes)
I

In [3]:
import os
import shutil

# Output directory for the merged per-ancestor CSV files.
dir_name = "val_merged_csv"

# Start from a clean slate: drop whatever a previous run left behind,
# then recreate the directory empty.
if os.path.exists(dir_name):
    shutil.rmtree(dir_name)
os.makedirs(dir_name)


In [4]:
import pandas as pd

In [5]:
classes_by_ancestor = results[1]
ancestor_image_counts = results[2]
val_dir = 'val_csv'
print(type(classes_by_ancestor))
print(list(classes_by_ancestor.items())[0])

# Build one merged CSV per ancestor, named after the ancestor's WNID
# ("n" + zero-padded 8-digit offset), from the CSVs of its original classes.
for ancestor, members in classes_by_ancestor.items():
    merged_path = os.path.join(dir_name, f"n{str(ancestor.offset()).zfill(8)}.csv")
    # Create/truncate the target so the appends below start from an empty file.
    open(merged_path, "w").close()

    for member in members:
        member_path = os.path.join(val_dir, f"n{str(member.offset()).zfill(8)}.csv")
        # Read without a header and drop row 0 before appending
        # (presumably row 0 is the per-class header line; verify against val_csv).
        member_rows = pd.read_csv(member_path, header=None).iloc[1:]
        member_rows.to_csv(merged_path, mode="a", header=False, index=False)

    # Sanity check: merged row count must equal the summed image count.
    merged_row_count = len(pd.read_csv(merged_path, header=None))
    assert merged_row_count == ancestor_image_counts[ancestor]

print(f"done - {len(classes_by_ancestor)}  classes")

<class 'dict'>
(Synset('teleost_fish.n.01'), [Synset('tench.n.01'), Synset('goldfish.n.01'), Synset('rock_beauty.n.01'), Synset('anemone_fish.n.01'), Synset('lionfish.n.01')])
done - 110  classes


In [6]:
import random

# Target directory for the images sampled from each class (20 per class).
img_dir_name = "val_merged_imgs"

# Remove any directory left over from an earlier run, then recreate it empty.
if os.path.exists(img_dir_name):
    shutil.rmtree(img_dir_name)
os.makedirs(img_dir_name)



In [7]:
random.seed(42)
val_images = 'val_images'
num_samples = 20  # images drawn per merged class file

# os.listdir returns entries in arbitrary order. Sorting fixes the order in
# which files consume the seeded random stream, so the same images are picked
# on every run and every machine.
for file in sorted(os.listdir(dir_name)):
    df = pd.read_csv(os.path.join(dir_name, file), header=None)

    # Randomly select row indices; never request more rows than the file
    # holds, because random.sample raises ValueError in that case.
    sample_indices = random.sample(range(len(df)), min(num_samples, len(df)))
    for idx in sample_indices:
        # Column 0 is used as the image file name inside val_images.
        image_path = os.path.join(val_images, df.iloc[idx, 0])
        shutil.copy(image_path, img_dir_name)

print(len(os.listdir(img_dir_name)))

2200
