In [2]:
import os
import re
import pandas as pd
from collections import Counter

# =========================
# CONFIG
# =========================
# Folder that is scanned for .mp4 files. Set the ASL_CITIZEN_VIDEO_DIR
# environment variable to point the notebook at another location; when it is
# unset the original hard-coded path is used, so existing runs are unaffected.
VIDEO_DIR = os.environ.get("ASL_CITIZEN_VIDEO_DIR", r"E:\ASL_Citizen\videos")
# Destination of the gloss/count table. A bare filename, so it is resolved
# against the current working directory.
OUTPUT_CSV = "ASL_Citizen_gloss_counts.csv"

# =========================
# ROBUST GLOSS EXTRACTION
# =========================
def extract_gloss(filename):
    """Return the gloss label encoded in a video filename.

    Steps applied to ``filename``:

    1. The file extension is dropped.
    2. If the remaining name contains a dash, only the text after the FIRST
       dash is kept.
    3. A trailing number, together with any whitespace in front of it, is
       removed (e.g. ``"HEAD 2"`` -> ``"HEAD"``).
    4. The last run of ASCII capital letters in what is left is returned.

    Because only the final uppercase run is kept, a multi-word or hyphenated
    label is reduced to its last word (``"ICE CREAM"`` -> ``"CREAM"``,
    ``"T-SHIRT"`` -> ``"SHIRT"``), and any lowercase prefix attached to the
    label is discarded.

    Args:
        filename: File name (with or without extension) to parse.

    Returns:
        The extracted uppercase label, or ``None`` when the name contains no
        ASCII capital letters after the steps above.
    """
    stem, _extension = os.path.splitext(filename)

    # Discard everything up to and including the first dash, when there is one.
    _prefix, dash, remainder = stem.partition('-')
    label = remainder if dash else stem

    # Strip a trailing number and the whitespace before it (e.g. "HEAD 2").
    label = re.sub(r'\s*\d+$', '', label)

    # Walk every uppercase run and remember only the final one.
    last_run = None
    for hit in re.finditer(r'[A-Z]+', label):
        last_run = hit.group()
    return last_run

# =========================
# PROCESS FILES
# =========================
# Names in VIDEO_DIR whose extension is .mp4 (matched case-insensitively).
mp4_names = (entry for entry in os.listdir(VIDEO_DIR) if entry.lower().endswith(".mp4"))

# One gloss per video; names for which extract_gloss returns nothing are dropped.
glosses = [label for label in map(extract_gloss, mp4_names) if label]

# =========================
# COUNT & SAVE
# =========================
# Tally how many videos were found for each gloss.
counts = Counter(glosses)

# One row per gloss, built directly from the (gloss, count) pairs.
df = pd.DataFrame(list(counts.items()), columns=["GLOSS", "COUNT"])

# Most frequent gloss first. GLOSS is a secondary key so that rows with equal
# counts always come out in the same (alphabetical) order; sorting on COUNT
# alone left tied rows in an order that depended on the directory listing.
df = df.sort_values(by=["COUNT", "GLOSS"], ascending=[False, True]).reset_index(drop=True)

df.to_csv(OUTPUT_CSV, index=False)

print("CSV saved:", OUTPUT_CSV)
print("Unique glosses:", len(df))


CSV saved: ASL_Citizen_gloss_counts.csv
Unique glosses: 2091
