In [7]:
import pandas as pd
import os
import shutil

# Label table for the training split; each row is read later for its
# 'video', 'start_time', 'end_time', 'text' and emotion columns.
csv_file = r"/Users/dinesh/College/final proj/attempt3/mosei/CMU-MOSEI-20230514T151450Z-001/CMU-MOSEI/Labels/Data_Train_modified.csv"

# Load the whole table into memory with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=csv_file)

In [8]:
# Source directory holding the pre-cut .wav chunks that the rows are matched against.
audio_dir = "/Users/dinesh/College/final proj/attempt3/mosei/CMU-MOSEI-20230514T151450Z-001/CMU-MOSEI/Audio_chunk/Train_modified"

# Destination directory for the renumbered copies of the matched chunks.
new_audio_dir = r"/Users/dinesh/College/final proj/attempt3/updatedMoseiData/audio"

# Create the destination (and any missing parents); do nothing if it already exists.
os.makedirs(name=new_audio_dir, exist_ok=True)

In [9]:
# Accumulator for the output rows: one dict per CSV row that found an audio match.
new_csv_data = list()

# Slack applied to both ends of an interval when comparing CSV times with the
# times encoded in audio file names (presumably seconds -- TODO confirm units).
tolerance = 0.3

In [10]:
def _find_matching_audio(video, start_time, end_time, wav_files, tolerance):
    """Return the first name in ``wav_files`` whose time span matches the row.

    A candidate must start with ``video``. The text after the video name plus
    one separator character, minus the ``.wav`` suffix, must split on ``_``
    into exactly two floats (chunk start and end); names that do not parse are
    skipped. A candidate matches when either interval lies inside the other
    once both ends are widened by ``tolerance``.

    Parameters:
        video: Video identifier taken from the CSV row (a string).
        start_time: Segment start from the CSV row.
        end_time: Segment end from the CSV row.
        wav_files: Iterable of ``.wav`` file names to search, in priority order.
        tolerance: Slack applied to both ends of the interval comparison.

    Returns:
        The matching file name, or ``None`` when nothing matches.
    """
    for audio_file in wav_files:
        if not audio_file.startswith(video):
            continue

        # Skip the video name and the single separator character that follows
        # it, drop the 4-character ".wav" suffix, then parse "<start>_<end>".
        try:
            audio_start_time, audio_end_time = map(
                float, audio_file[len(video) + 1:-4].split('_')
            )
        except ValueError:
            # Not a "<start>_<end>" name (wrong field count or non-numeric).
            continue

        # CSV segment lies within the audio chunk (with tolerance).
        csv_within_audio = (
            start_time >= audio_start_time - tolerance
            and end_time <= audio_end_time + tolerance
        )
        # Audio chunk lies within the CSV segment (with tolerance).
        audio_within_csv = (
            audio_start_time >= start_time - tolerance
            and audio_end_time <= end_time + tolerance
        )

        if csv_within_audio or audio_within_csv:
            return audio_file

    return None


# List the directory once instead of once per CSV row: the listing does not
# change while the loop runs. Sorting makes the "first match wins" choice
# reproducible, because os.listdir returns names in arbitrary order.
wav_files = sorted(f for f in os.listdir(audio_dir) if f.endswith('.wav'))

# Start from an empty list so that re-running this cell on its own does not
# append duplicate rows or skew the sequential output file numbering.
new_csv_data = []

for _, row in df.iterrows():
    matched_file = _find_matching_audio(
        row['video'],
        float(row['start_time']),
        float(row['end_time']),
        wav_files,
        tolerance,
    )
    if matched_file is None:
        # No audio chunk lines up with this row; it is left out of the output.
        continue

    # Output files are numbered sequentially from 1 in match order.
    new_file_name = f"{len(new_csv_data) + 1}.wav"

    # Copy the matched audio file to the new directory under its new name.
    shutil.copy(
        os.path.join(audio_dir, matched_file),
        os.path.join(new_audio_dir, new_file_name),
    )

    # Record the new file name together with the row's text and labels.
    new_csv_data.append({
        'file_name': new_file_name,
        'text': row['text'],
        'sentiment': row['sentiment'],
        'happy': row['happy'],
        'sad': row['sad'],
        'anger': row['anger'],
        'surprise': row['surprise'],
        'disgust': row['disgust'],
        'fear': row['fear']
    })


In [11]:
# Destination of the label table that accompanies the copied audio files.
new_csv_file = "/Users/dinesh/College/final proj/attempt3/updatedMoseiData/new_mosei.csv"

# Build the table in one shot from the accumulated row dicts, then write it
# without the positional index column.
new_df = pd.DataFrame(data=new_csv_data)
new_df.to_csv(path_or_buf=new_csv_file, index=False)

In [12]:
# Report how many rows were matched and where the outputs were written.
print(
    f"\nTotal matched audio files: {len(new_csv_data)}",
    f"New audio files saved to {new_audio_dir}",
    f"New CSV file saved to {new_csv_file}",
    sep="\n",
)



Total matched audio files: 2089
New audio files saved to /Users/dinesh/College/final proj/attempt3/updatedMoseiData/audio
New CSV file saved to /Users/dinesh/College/final proj/attempt3/updatedMoseiData/new_mosei.csv
