In [None]:
import os
import shutil
import json
from google.colab import drive

# Mount Google Drive at /content/drive so the dataset folders below are reachable.
drive.mount('/content/drive', force_remount=True)

# Define paths
# Source dataset, the filtered copy written by this script, and the metadata
# JSON that lives inside each of them.
BASE_DIR = "/content/drive/My Drive/capstone"
SOURCE_FOLDER = os.path.join(BASE_DIR, "held-out-data")
TARGET_FOLDER = os.path.join(BASE_DIR, "held-out-data-V2")
SOURCE_JSON_PATH = os.path.join(SOURCE_FOLDER, "digital_assistant_metadata.json")
TARGET_JSON_PATH = os.path.join(TARGET_FOLDER, "digital_assistant_metadata.json")

print(f"Loading metadata from: {SOURCE_JSON_PATH}")

# Load the full metadata file. Without it nothing below can run, so both
# failure modes stop the script with a non-zero status.
# `raise SystemExit` is used instead of `exit()`: the latter is an interactive
# helper injected by the `site` module and is not meant for use in programs.
try:
    # JSON is read as UTF-8 explicitly rather than relying on the platform default.
    with open(SOURCE_JSON_PATH, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
except FileNotFoundError:
    print(f"Error: Metadata file not found at {SOURCE_JSON_PATH}. Please check the path.")
    raise SystemExit(1)
except json.JSONDecodeError:
    print(f"Error: Failed to decode JSON from {SOURCE_JSON_PATH}. Check file content.")
    raise SystemExit(1)

# Keep only the records that carry ratings.
# filtered_metadata maps a record key to the record itself; files_to_copy
# collects the 'Filename' values of the kept records.
filtered_metadata = {}
files_to_copy = []
total_records = len(metadata)

print(f"Total records found in metadata: {total_records}")

# A record is kept only when its 'Ratings' value is a non-empty list.
for index, record in enumerate(metadata):
    record_ratings = record.get('Ratings')
    if not isinstance(record_ratings, list) or not record_ratings:
        continue

    # Keyed by 'Filename'; the positional fallback applies only when the key is absent.
    record_key = record.get('Filename', f"record_{index}")
    filtered_metadata[record_key] = record

    # A kept record with a missing or empty 'Filename' stays in the metadata
    # but has no file to copy.
    audio_name = record.get('Filename')
    if not audio_name:
        print(f"Warning: Record {index} is missing a 'Filename'. Skipping file copy for this record.")
        continue

    files_to_copy.append(audio_name)

print(f"Found {len(filtered_metadata)} records with ratings out of {total_records} total.")

# Make sure the destination folder exists before anything is written into it.
if not os.path.exists(TARGET_FOLDER):
    print(f"Creating target directory: {TARGET_FOLDER}")
    # exist_ok avoids a failure if the folder appears between the check and this call.
    os.makedirs(TARGET_FOLDER, exist_ok=True)
else:
    print(f"Target directory already exists: {TARGET_FOLDER}")

# Save the new filtered metadata JSON
# An existing file at TARGET_JSON_PATH is overwritten ('w' mode).
print(f"Saving filtered metadata to: {TARGET_JSON_PATH}")
with open(TARGET_JSON_PATH, 'w') as f:
    json.dump(filtered_metadata, f, indent=4)

# Copy every file named in files_to_copy from SOURCE_FOLDER to TARGET_FOLDER.
# Problems with individual files are reported and skipped so that one bad
# entry does not abort the whole run.
print(f"Starting file copy for {len(files_to_copy)} files...")

# Count successful copies so the final report does not hide missing or failed files.
copied_count = 0

for filename in files_to_copy:
    source_file_path = os.path.join(SOURCE_FOLDER, filename)
    target_file_path = os.path.join(TARGET_FOLDER, filename)

    if not os.path.exists(source_file_path):
        print(f"Warning: Audio file not found in source folder: {source_file_path}")
        continue

    try:
        # copy2 copies the file contents and attempts to preserve file metadata.
        shutil.copy2(source_file_path, target_file_path)
    except OSError as e:
        # File-system failures from copy2 (including shutil.Error) are OSError
        # subclasses; anything else is a programming error and should surface.
        print(f"Error copying file {filename}: {e}")
    else:
        copied_count += 1

print("-" * 30)
print(f"Copied {copied_count} of {len(files_to_copy)} files.")
print(f"Filtering and copying complete! New dataset is in: {TARGET_FOLDER}")