In [78]:
import pandas as pd
import json
import subprocess
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import os


In [79]:
def clean_json_file(input_path, output_dir, max_workers=20, timeout=None):
    """
    Filter a JSON file of video records down to those whose YouTube URL is still reachable.

    The file is loaded with pandas, an optional 'review' column is dropped, every
    value of the 'url' column is probed with yt-dlp (in parallel, with a tqdm
    progress bar), and the rows whose probe succeeded are written as a list of
    records to ``<output_dir>/<input stem>_clean.json``. The written records keep
    the helper column 'available' (always true for the rows that are kept).

    :param input_path: Path to the input file (.json)
    :param output_dir: Target directory for the cleaned file; created if missing
    :param max_workers: Number of parallel yt-dlp processes (default: 20)
    :param timeout: Optional per-URL limit in seconds for one yt-dlp call. A probe
        that exceeds it counts as "not available". ``None`` (default) waits forever.
    :raises KeyError: if the input has rows but no 'url' column
    """

    def is_youtube_video_available(url):
        # A missing URL arrives as NaN/None; subprocess would raise TypeError on it
        # and abort the whole run, so treat it as an unavailable video instead.
        if not isinstance(url, str) or not url:
            return False
        try:
            return subprocess.call(
                # "--" ends option parsing, so a URL starting with "-" cannot be
                # interpreted as a yt-dlp command-line option.
                ["yt-dlp", "--skip-download", "--quiet", "--no-warnings", "--print", "title", "--", url],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=timeout,
            ) == 0
        except subprocess.TimeoutExpired:
            # subprocess.call has already killed the child process at this point.
            return False

    # Load the file
    df = pd.read_json(input_path)
    if 'review' in df.columns:
        df = df.drop(columns='review')

    if df.empty:
        # An empty input (e.g. "[]") has no 'url' column to probe; the cleaned
        # result is simply an empty list.
        records = []
    else:
        # Probe the URLs (in parallel, with a progress bar)
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            df['available'] = list(tqdm(executor.map(is_youtube_video_available, df['url']), total=len(df)))

        # Keep only the reachable entries
        records = df[df['available']].to_dict(orient="records")

    # Build the target path. splitext only touches the real extension, so a ".json"
    # elsewhere in the name is left alone and the output name always differs from
    # the input name.
    os.makedirs(output_dir, exist_ok=True)
    stem, _ext = os.path.splitext(os.path.basename(input_path))
    output_path = os.path.join(output_dir, f"{stem}_clean.json")

    # Save as JSON
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

    print(f"Gespeichert: {output_path} ({len(records)} gültige Einträge)")


In [None]:
# Clean every MS-ASL split in turn (train, then val, then test); all cleaned
# files are written to the same target directory.
cleaned_dir = '/Users/sami/Desktop/MS-ASL/meta/cleaned'
for split_file in ('meta/MSASL_train.json', 'meta/MSASL_val.json', 'meta/MSASL_test.json'):
    clean_json_file(input_path=split_file, output_dir=cleaned_dir)


100%|██████████| 5287/5287 [1:06:02<00:00,  1.33it/s]


Gespeichert: /Users/sami/Desktop/MS-ASL/meta/cleaned/MSASL_val_clean.json (3297 gültige Einträge)


100%|██████████| 4172/4172 [55:25<00:00,  1.25it/s]  

Gespeichert: /Users/sami/Desktop/MS-ASL/meta/cleaned/MSASL_test_clean.json (2847 gültige Einträge)



