In [1]:
!pip install pandas ffmpeg-python
import os
import pandas as pd
from ffmpeg import input as ffmpeg_input



In [25]:
import os
import shutil
import pandas as pd
from tqdm import tqdm  # For progress bars

# ====== CONFIGURATION ======
folder_paths = [
    "/media/storage/fishcount/SER/MELD.Raw/train"
    #"/media/storage/fishcount/SER/MELD.Raw/dev_splits_complete", 
    #"/media/storage/fishcount/SER/MELD.Raw/output_repeated_splits_test"
]

csv_paths = [
    "/media/storage/fishcount/SER/MELD.Raw/train_sent_emo_clean.csv"
    #"/media/storage/fishcount/SER/MELD.Raw/dev_sent_emo.csv",
    #"/media/storage/fishcount/SER/MELD.Raw/test_sent_emo.csv"  # Fixed typo in path
]

output_wav_folder = "/media/storage/fishcount/SER/MELD.Raw/combined_wavs"
output_csv = "/media/storage/fishcount/SER/MELD.Raw/merged_data.csv"

# ====== AUDIO FILE MERGING ======
print("=== Processing Audio Files ===")
os.makedirs(output_wav_folder, exist_ok=True)
audio_files_copied = set()

for folder in tqdm(folder_paths, desc="Checking folders"):
    if not os.path.exists(folder):
        print(f"\n⚠️ Folder not found: {folder}")
        continue
        
    for file in tqdm(os.listdir(folder), desc=f"Processing {os.path.basename(folder)}", leave=False):
        if file.lower().endswith('.wav'):
            dest_path = os.path.join(output_wav_folder, file)
            if file not in audio_files_copied:
                try:
                    shutil.copy2(os.path.join(folder, file), dest_path)
                    audio_files_copied.add(file)
                except Exception as e:
                    print(f"\n⚠️ Failed to copy {file}: {str(e)}")

# ====== CSV DATA MERGING ======
print("\n=== Processing CSV Files ===")
dfs = []
required_columns = [
    'Sr No.', 'Utterance', 'Speaker', 'Emotion', 
    'Sentiment', 'Dialogue_ID', 'Utterance_ID',
    'Season', 'Episode', 'StartTime', 'EndTime'
]

for csv_file in tqdm(csv_paths, desc="Merging CSVs"):
    if not os.path.exists(csv_file):
        print(f"\n⚠️ CSV not found: {csv_file}")
        continue
    
    try:
        df = pd.read_csv(csv_file)
        # Validate required columns exist
        if not all(col in df.columns for col in required_columns):
            missing = [col for col in required_columns if col not in df.columns]
            print(f"\n⚠️ Missing columns in {csv_file}: {missing}")
            continue
            
        dfs.append(df)
    except Exception as e:
        print(f"\n⚠️ Error reading {csv_file}: {str(e)}")

if dfs:
    merged_df = pd.concat(dfs, ignore_index=True)
    
    # Advanced duplicate removal (considering audio may have multiple references)
    # Keep first occurrence based on Utterance_ID + Dialogue_ID
    merged_df = merged_df.drop_duplicates(
        subset=['Utterance_ID', 'Dialogue_ID'], 
        keep='first'
    )
    
    # Save with proper header
    merged_df.to_csv(output_csv, index=False)
    print(f"\n✅ Successfully merged {len(merged_df)} entries")
    print(f"✅ Audio: Copied {len(audio_files_copied)} WAV files")
else:
    print("\n❌ Error: No valid CSV data was processed")

# Final validation
if os.path.exists(output_csv):
    print(f"\nMerged CSV saved to: {output_csv}")
if os.path.exists(output_wav_folder):
    print(f"Combined WAVs in: {output_wav_folder} ({len(os.listdir(output_wav_folder))} files)")

=== Processing Audio Files ===


Checking folders: 100%|██████████| 1/1 [00:07<00:00,  7.42s/it]



=== Processing CSV Files ===


Merging CSVs: 100%|██████████| 1/1 [00:00<00:00, 58.56it/s]


✅ Successfully merged 9988 entries
✅ Audio: Copied 9988 WAV files

Merged CSV saved to: /media/storage/fishcount/SER/MELD.Raw/merged_data.csv
Combined WAVs in: /media/storage/fishcount/SER/MELD.Raw/combined_wavs (21686 files)





In [29]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

# ====== CONFIGURATION ======
# Path to your existing merged data
merged_csv = "/media/storage/fishcount/SER/MELD.Raw/merged_data.csv"
combined_audio = "/media/storage/fishcount/SER/MELD.Raw/combined_wavs"

# Output directories
# output_base = "/media/storage/fishcount/SER/MELD_Split"
# train_audio = os.path.join(output_base, "train_now/wavs")
# test_audio = os.path.join(output_base, "test_now/wavs")
# train_csv = os.path.join(output_base, "train_now/train_sent_emo.csv")
# test_csv = os.path.join(output_base, "test_now/test_sent_emo.csv")

train_audio = "/media/storage/fishcount/SER/MELD.Raw/trainsplit"
test_audio = "/media/storage/fishcount/SER/MELD.Raw/testsplit"
train_csv = "/media/storage/fishcount/SER/MELD.Raw/trainsplitter.csv
test_csv = "/media/storage/fishcount/SER/MELD.Raw/testsplitter.csv



# Create directories
os.makedirs(train_audio, exist_ok=True)
os.makedirs(test_audio, exist_ok=True)

# ====== 1. LOAD MERGED DATA ======
print("Loading merged data...")
df = pd.read_csv(merged_csv)

# ====== 2. STRATIFIED SPLIT ======
print("Performing 80/20 split...")
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Emotion']  # Preserve emotion balance
)

# ====== 3. SAVE SPLIT CSVs ======
print("Saving split CSVs...")
train_df.to_csv(train_csv, index=False)
test_df.to_csv(test_csv, index=False)

# ====== 4. ORGANIZE AUDIO FILES ======
print("Organizing audio files...")

def copy_audio_files(df, target_dir):
    """Copy audio files referenced in dataframe"""
    for _, row in tqdm(df.iterrows(), total=len(df)):
        utt_id = str(row['Utterance_ID'])
        src = os.path.join(combined_audio, f"dia{row['Dialogue_ID']}_utt{utt_id}.wav")
        if os.path.exists(src):
            shutil.copy2(src, target_dir)

copy_audio_files(train_df, train_audio)
copy_audio_files(test_df, test_audio)

print(f"""
✅ Successfully created split dataset:
Train: {len(train_df)} samples | {len(os.listdir(train_audio))} audio files
Test:  {len(test_df)} samples | {len(os.listdir(test_audio))} audio files
""")

SyntaxError: unterminated string literal (detected at line 20) (257249400.py, line 20)