In [70]:
import os
import pandas as pd
import re

# ===================== PATHS =====================
asl_csv = "asl_citizen_gloss_counts.csv"
wlasl_folder = "landmarks_mapped"
msasl_folder = "MSASL_Keypoints"
output_csv = "all_counts.csv"

# ===================== NORMALIZE (Unified) =====================
def normalize_gloss(name):
    """
    Normalize a gloss name by:
    1) Lowercasing
    2) Taking only part before the first underscore
    3) Removing trailing numbers
    """
    name = str(name).lower()
    name = name.split('_')[0]          # part before first underscore
    name = re.sub(r'\d+$', '', name)   # remove trailing digits
    return name

# ===================== COLLECT GLOSSES FROM FOLDERS =====================
def collect_glosses(folder):
    """
    Return a set of unique normalized glosses from .npy files.
    """
    glosses = set()
    for root, _, files in os.walk(folder):
        for f in files:
            if f.endswith(".npy"):
                base = os.path.splitext(f)[0]
                glosses.add(normalize_gloss(base))
    return glosses

# ===================== COUNT FILES PER GLOSS =====================
def count_folder(folder):
    counts = {}
    for root, _, files in os.walk(folder):
        for f in files:
            if f.endswith(".npy"):
                gloss = normalize_gloss(os.path.splitext(f)[0])
                if gloss:
                    counts[gloss] = counts.get(gloss, 0) + 1
    return pd.Series(counts)

# ===================== LOAD ASL CITIZEN =====================
asl_df = pd.read_csv(asl_csv)
asl_df["clean"] = asl_df["gloss"].apply(normalize_gloss)
asl_counts = asl_df["clean"].value_counts().rename("count_ASL")
asl_glosses = set(asl_df["clean"])

# ===================== LOAD WLASL & MSASL =====================
wlasl_glosses = collect_glosses(wlasl_folder)
msasl_glosses = collect_glosses(msasl_folder)

wlasl_counts = count_folder(wlasl_folder).rename("count_WLASL")
msasl_counts = count_folder(msasl_folder).rename("count_MSASL")

# ===================== UNIQUE SET COMPARISONS =====================
asl_unique = asl_glosses - wlasl_glosses - msasl_glosses
wlasl_unique = wlasl_glosses - asl_glosses - msasl_glosses
msasl_unique = msasl_glosses - asl_glosses - wlasl_glosses

# Total distinct glosses across all datasets
all_unique_glosses = asl_glosses | wlasl_glosses | msasl_glosses

print("\n========= UNIQUE GLOSSES =========")
print(f"ASL Citizen unique: {len(asl_unique)}")
print(f"WLASL unique:       {len(wlasl_unique)}")
print(f"MSASL unique:       {len(msasl_unique)}")
print(f"TOTAL distinct glosses (union of all 3): {len(all_unique_glosses)}")
print("=================================\n")

# ===================== OVERLAP BREAKDOWN =====================

# Glosses in exactly TWO datasets (but not the third)
asl_wlasl_only = (asl_glosses & wlasl_glosses) - msasl_glosses
asl_msasl_only = (asl_glosses & msasl_glosses) - wlasl_glosses
wlasl_msasl_only = (wlasl_glosses & msasl_glosses) - asl_glosses

# Glosses common to ALL THREE datasets
common_all_three = asl_glosses & wlasl_glosses & msasl_glosses

# Neat print of all categories
print("========= FULL OVERLAP SUMMARY =========")
print(f"Unique glosses per dataset:")
print(f"  ASL only:           {len(asl_unique)}")
print(f"  WLASL only:         {len(wlasl_unique)}")
print(f"  MSASL only:         {len(msasl_unique)}\n")

print(f"Glosses shared by exactly 2 datasets:")
print(f"  ASL & WLASL only:   {len(asl_wlasl_only)}")
print(f"  ASL & MSASL only:   {len(asl_msasl_only)}")
print(f"  WLASL & MSASL only: {len(wlasl_msasl_only)}\n")

print(f"Glosses common to ALL THREE datasets: {len(common_all_three)}\n")

# Verify totals
total_calculated = (
    len(asl_unique) +
    len(wlasl_unique) +
    len(msasl_unique) +
    len(asl_wlasl_only) +
    len(asl_msasl_only) +
    len(wlasl_msasl_only) +
    len(common_all_three)
)
print(f"TOTAL distinct glosses (calculated): {total_calculated}")
print(f"TOTAL distinct glosses (union):      {len(all_unique_glosses)}")
print("========================================\n")

# ===================== MERGE COUNTS =====================
df = pd.concat([asl_counts, wlasl_counts, msasl_counts], axis=1).fillna(0).astype(int)
df["total_count"] = df.sum(axis=1)
df = df.sort_values("total_count", ascending=False)

df.to_csv(output_csv)
print(f"MERGED COUNT CSV SAVED → {output_csv}")
print(f"Total merged glosses: {len(df)}")
print(df.head(20))



ASL Citizen unique: 903
WLASL unique:       533
MSASL unique:       129
TOTAL distinct glosses (union of all 3): 3068

Unique glosses per dataset:
  ASL only:           903
  WLASL only:         533
  MSASL only:         129

Glosses shared by exactly 2 datasets:
  ASL & WLASL only:   643
  ASL & MSASL only:   36
  WLASL & MSASL only: 105

Glosses common to ALL THREE datasets: 719

TOTAL distinct glosses (calculated): 3068
TOTAL distinct glosses (union):      3068

MERGED COUNT CSV SAVED → all_counts.csv
Total merged glosses: 3067
         count_ASL  count_WLASL  count_MSASL  total_count
bird             1           10           31           42
fish             1           10           30           41
eat              1            7           33           41
teacher          1            8           31           40
orange           1           10           29           40
black            1           10           29           40
school           1            9           30           4