In [38]:
'''
THIS NOTEBOOK CONTAINS ALL THE CODE FOR DATASET CREATION. 
CHECK main.ipynb FOR MODEL TRAINING CODE
'''

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
from tqdm import tqdm
import copy
import zipfile

from config import (
    DEVICE, 
    LATENT_DIM, 
    EMOTION_DIM, 
    LEARNING_RATE, 
    EPOCHS, 
    BETA,
    MODEL_DIR, 
    PLOT_OUTPUT_DIR,
    N_MELS,
    MAX_AUDIO_LENGTH,
    HOP_LENGTH,
    N_FFT,
    SAMPLE_RATE,
    EMOTION_CATEGORIES,
    GL_ITERS
)

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [10]:
import torchaudio

In [4]:
!pip install kagglehub pygame



In [7]:
"""import kagglehub

# Download latest version
path = kagglehub.dataset_download("uldisvalainis/audio-emotions")

print("Path to dataset files:", path)
"""

'import kagglehub\n\n# Download latest version\npath = kagglehub.dataset_download("uldisvalainis/audio-emotions")\n\nprint("Path to dataset files:", path)\n'

In [40]:


import pygame
import os
from IPython.display import display, Audio
import time

def play_audio_jupyter(file_path, use_pygame=False):
    """
    Play an audio file in a Jupyter notebook.

    Args:
        file_path (str): Path to the audio file.
        use_pygame (bool): If True, play through the pygame mixer and wait for
            the length of the sound before returning. If False (default),
            display an IPython ``Audio`` widget with autoplay enabled.

    Returns:
        None. Every problem (missing file, playback failure) is reported with
        ``print`` rather than raised.
    """
    # Nothing to play: report and bail out early.
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    if use_pygame:
        try:
            # The mixer is initialised inside the try block so that a failing
            # init() (e.g. no audio device on the machine running the kernel)
            # is reported like any other playback error instead of escaping
            # as an uncaught exception.
            pygame.mixer.init()

            sound = pygame.mixer.Sound(file_path)
            print(f"Playing: {file_path}")
            print(f"Duration: {sound.get_length():.2f} seconds")

            sound.play()

            # play() returns immediately; wait for the clip length (ms) so the
            # mixer is not shut down while the sound is still playing.
            pygame.time.wait(int(sound.get_length() * 1000))

        except Exception as e:
            print(f"Error playing audio with pygame: {e}")
        finally:
            # Release the mixer whether or not playback succeeded.
            pygame.mixer.quit()
    else:
        # IPython's Audio display widget (preferred for Jupyter).
        try:
            print(f"Playing: {file_path}")
            display(Audio(file_path, autoplay=True))
        except Exception as e:
            print(f"Error playing audio with IPython: {e}")



pygame 2.6.1 (SDL 2.28.4, Python 3.11.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [41]:
# Example usage
dataset_path = "./recon_check"
file_name = "original.wav"
full_path = os.path.join(dataset_path, file_name)

play_audio_jupyter(full_path)

Error: File not found at ./recon_check/original.wav


In [None]:
from IPython.display import Audio

# Path to your MP3
mp3_path = "./outputs/inference/03-01-05-01-01-01-08_original.mp3"

# Display an audio player
Audio(mp3_path)

In [None]:
!pip install transformers

In [17]:
import os
import librosa
import numpy as np
import tqdm

def get_audio_duration(audio_path):
    """Return the duration of ``audio_path`` in seconds, or None if it cannot be read.

    Any exception raised while querying the file is printed and swallowed so
    that one bad file does not abort a batch scan.
    """
    try:
        return librosa.get_duration(path=audio_path)
    except Exception as err:
        print(f"Error processing {audio_path}: {err}")
    return None

def analyze_audio_directory(directory_path):
    """
    Analyze all .wav files directly inside a directory and return duration statistics.

    Args:
        directory_path (str): Folder to scan (not recursive).

    Returns:
        tuple: ``(longest_file, longest_duration, shortest_file,
        shortest_duration, mean_duration)`` with durations in seconds.
        All five values are None when the folder has no .wav files *or* when
        none of them could be read, so callers only need a single check.
    """
    no_stats = (None, None, None, None, None)

    wav_files = [f for f in os.listdir(directory_path) if f.endswith('.wav')]
    if not wav_files:
        return no_stats

    print(f"Analyzing {len(wav_files)} files in {directory_path}...")

    # (duration, file name) for every file whose duration could be read.
    measured = []
    for wav_file in tqdm.tqdm(wav_files):
        duration = get_audio_duration(os.path.join(directory_path, wav_file))
        if duration is not None:
            measured.append((duration, wav_file))

    # Every file failed to load: return the same "nothing found" value as for
    # an empty folder instead of leaking the 0 / inf search sentinels.
    if not measured:
        return no_stats

    # max()/min() return the first extreme element, so ties resolve to the
    # earliest file in listing order; unlike a search seeded with 0, a
    # zero-length clip can still be reported as the longest file.
    longest_duration, longest_file = max(measured, key=lambda item: item[0])
    shortest_duration, shortest_file = min(measured, key=lambda item: item[0])
    mean_duration = np.mean([duration for duration, _ in measured])

    return longest_file, longest_duration, shortest_file, shortest_duration, mean_duration

def main():
    """
    Print duration statistics for the Happy and Angry folders under ``datasets``.

    For each emotion folder that exists, prints the longest, shortest and mean
    clip duration, then prints the same statistics across both folders
    together with the total number of readable files.
    """
    dataset_dir = "datasets"
    emotions = ["Happy", "Angry"]

    all_durations = []
    overall_longest_file = None
    overall_longest_duration = 0
    overall_shortest_file = None
    overall_shortest_duration = float('inf')

    for emotion in emotions:
        input_dir = os.path.join(dataset_dir, emotion)

        if not os.path.exists(input_dir):
            print(f"Directory not found: {input_dir}")
            continue

        print(f"\nAnalyzing {emotion} audio files...")
        (longest_file, longest_duration,
         shortest_file, shortest_duration,
         mean_duration) = analyze_audio_directory(input_dir)

        # analyze_audio_directory always returns a 5-tuple, and a non-empty
        # tuple is truthy even when it only holds None values. Testing the
        # tuple itself therefore never detects "nothing found"; check the mean,
        # which is None exactly when no duration could be measured.
        if mean_duration is None:
            print(f"No WAV files found in {input_dir}")
            continue

        # Second pass over the folder: the helper only returns aggregates, but
        # the overall mean and file count need the individual durations.
        for wav_file in os.listdir(input_dir):
            if not wav_file.endswith('.wav'):
                continue
            duration = get_audio_duration(os.path.join(input_dir, wav_file))
            if duration is not None:
                all_durations.append(duration)

        # Update overall longest
        if longest_duration > overall_longest_duration:
            overall_longest_duration = longest_duration
            overall_longest_file = f"{emotion}/{longest_file}"

        # Update overall shortest
        if shortest_duration < overall_shortest_duration:
            overall_shortest_duration = shortest_duration
            overall_shortest_file = f"{emotion}/{shortest_file}"

        # Print results for this emotion
        print(f"\nResults for {emotion} audio files:")
        print(f"Longest file: {longest_file} ({longest_duration:.2f} seconds)")
        print(f"Shortest file: {shortest_file} ({shortest_duration:.2f} seconds)")
        print(f"Mean duration: {mean_duration:.2f} seconds")

    # Print overall results
    if all_durations:
        overall_mean = np.mean(all_durations)
        print(f"\nOverall Results:")
        print(f"Longest file: {overall_longest_file} ({overall_longest_duration:.2f} seconds)")
        print(f"Shortest file: {overall_shortest_file} ({overall_shortest_duration:.2f} seconds)")
        print(f"Mean duration across all files: {overall_mean:.2f} seconds")
        print(f"Total files analyzed: {len(all_durations)}")

if __name__ == "__main__":
    main()


Analyzing Happy audio files...
Analyzing 2167 files in datasets/Happy...


100%|██████████| 2167/2167 [00:00<00:00, 19527.50it/s]



Results for Happy audio files:
Longest file: d15.wav (7.01 seconds)
Shortest file: 1075_IEO_HAP_HI.wav (1.33 seconds)
Mean duration: 2.67 seconds

Analyzing Angry audio files...
Analyzing 2167 files in datasets/Angry...


100%|██████████| 2167/2167 [00:00<00:00, 11230.49it/s]



Results for Angry audio files:
Longest file: su15.wav (5.86 seconds)
Shortest file: OAF_whip_angry.wav (1.29 seconds)
Mean duration: 2.77 seconds

Overall Results:
Longest file: Happy/d15.wav (7.01 seconds)
Shortest file: Angry/OAF_whip_angry.wav (1.29 seconds)
Mean duration across all files: 2.72 seconds
Total files analyzed: 4334


In [31]:
import os
from gtts import gTTS
from pathlib import Path
import argparse

def text_to_audio(input_text_file, output_wav_file):
    """
    Convert text from a file to speech and save as a WAV file using Google Text-to-Speech.

    gTTS only produces MP3, so the speech is first written to an MP3 next to
    the requested output and then converted to WAV with pydub. The temporary
    MP3 is removed after a successful conversion.

    Args:
        input_text_file: Path to the input text file
        output_wav_file: Path to save the output WAV file

    Returns:
        bool: True if the WAV file was written. False if anything failed,
        including pydub not being installed (the MP3 is kept in that case).
    """
    try:
        # Read text from the input file
        with open(input_text_file, 'r', encoding='utf-8') as file:
            text = file.read()
        print(f"Read {len(text)} characters from {input_text_file}")

        # The intermediate MP3 is written into the output directory, so the
        # directory has to exist before anything is saved.
        output_dir = os.path.dirname(output_wav_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Create gTTS object
        tts = gTTS(text=text, lang='en', slow=False)

        # Swap only the final extension; a plain str.replace would also
        # rewrite any ".wav" that happens to appear in a directory name.
        mp3_file = os.path.splitext(str(output_wav_file))[0] + '.mp3'
        tts.save(mp3_file)
        print(f"Saved speech as MP3: {mp3_file}")

        # pydub is optional: without it, keep the MP3 and tell the user how to
        # finish the conversion instead of failing with a generic error.
        try:
            from pydub import AudioSegment
        except ImportError:
            print("Warning: pydub not installed. Could not convert MP3 to WAV.")
            print(f"MP3 file saved at: {mp3_file}")
            print("Install pydub with 'pip install pydub' and ffmpeg for MP3 to WAV conversion.")
            return False

        # Convert MP3 to WAV
        sound = AudioSegment.from_mp3(mp3_file)
        sound.export(output_wav_file, format="wav")
        print(f"Successfully converted to WAV: {output_wav_file}")

        # Remove temporary MP3 file
        os.remove(mp3_file)
        return True

    except Exception as e:
        print(f"Error in text to speech conversion: {e}")
        return False



if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert text file to speech')
    input_file='Text/Angry/su04.txt'
    output_file='output.wav'
    
    
    
    
    text_to_audio(input_file, output_file)

Read 47 characters from Text/Angry/su04.txt
Saved speech as MP3: output.mp3
Successfully converted to WAV: output.wav


In [26]:
# Install required packages if needed (uncomment if necessary)
# !pip install IPython pygame

import pygame
import os
from IPython.display import display, Audio
import time

def play_audio_jupyter(file_path, use_pygame=False):
    """
    Play an audio file in a Jupyter notebook.

    Args:
        file_path (str): Path to the audio file.
        use_pygame (bool): If True, play through the pygame mixer and wait for
            the length of the sound before returning. If False (default),
            display an IPython ``Audio`` widget with autoplay enabled.

    Returns:
        None. Every problem (missing file, playback failure) is reported with
        ``print`` rather than raised.
    """
    # Nothing to play: report and bail out early.
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    if use_pygame:
        try:
            # The mixer is initialised inside the try block so that a failing
            # init() (e.g. no audio device on the machine running the kernel)
            # is reported like any other playback error instead of escaping
            # as an uncaught exception.
            pygame.mixer.init()

            sound = pygame.mixer.Sound(file_path)
            print(f"Playing: {file_path}")
            print(f"Duration: {sound.get_length():.2f} seconds")

            sound.play()

            # play() returns immediately; wait for the clip length (ms) so the
            # mixer is not shut down while the sound is still playing.
            pygame.time.wait(int(sound.get_length() * 1000))

        except Exception as e:
            print(f"Error playing audio with pygame: {e}")
        finally:
            # Release the mixer whether or not playback succeeded.
            pygame.mixer.quit()
    else:
        # IPython's Audio display widget (preferred for Jupyter).
        try:
            print(f"Playing: {file_path}")
            display(Audio(file_path, autoplay=True))
        except Exception as e:
            print(f"Error playing audio with IPython: {e}")

# Example usage
dataset_path = ""
file_name = "output.wav"
full_path = os.path.join(dataset_path, file_name)

play_audio_jupyter(full_path)

pygame 2.6.1 (SDL 2.28.4, Python 3.11.7)
Hello from the pygame community. https://www.pygame.org/contribute.html
Error: File not found at output.wav


In [27]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting click>=8.1.8 (from jiwer)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: rapidfuzz, click, jiwer
  Attempting uninstall: click
    Found existing installation: click 8.1.7
    Uninstalling click-8.1.7:
      Successfully uninstalled click-8.1.7
Successfully

In [22]:
import os
import librosa
import numpy as np

def get_audio_durations(folder_path):
    """Return the durations (seconds) of every .wav file under ``folder_path``.

    The folder is walked recursively. Each file is loaded at its original
    sampling rate; files that fail to load are reported with ``print`` and
    left out of the result.
    """
    wav_paths = (
        os.path.join(current_dir, entry)
        for current_dir, _subdirs, entries in os.walk(folder_path)
        for entry in entries
        if entry.endswith('.wav')
    )

    collected = []
    for wav_path in wav_paths:
        try:
            # sr=None keeps the original sampling rate
            samples, rate = librosa.load(wav_path, sr=None)
            collected.append(librosa.get_duration(y=samples, sr=rate))
        except Exception as err:
            print(f"Error loading {wav_path}: {err}")
    return collected

def print_stats(name, durations):
    """Print min / max / mean of ``durations`` (seconds) under the label ``name``.

    Prints a "no audio files" notice instead when ``durations`` is empty.
    """
    if not durations:
        print(f"No audio files found in {name}.\n")
        return

    print(f"Stats for {name}:")
    shortest = np.min(durations)
    print(f"  Min duration: {shortest:.2f} seconds")
    longest = np.max(durations)
    print(f"  Max duration: {longest:.2f} seconds")
    average = np.mean(durations)
    print(f"  Mean duration: {average:.2f} seconds\n")

if __name__ == "__main__":
    base_folder = "datasets"
    emotions = ["Happy_padded", "Angry_padded"]

    for emotion in emotions:
        folder_path = os.path.join(base_folder, emotion)
        durations = get_audio_durations(folder_path)
        print_stats(emotion, durations)


Stats for Happy_padded:
  Min duration: 5.00 seconds
  Max duration: 5.00 seconds
  Mean duration: 5.00 seconds

Stats for Angry_padded:
  Min duration: 5.00 seconds
  Max duration: 5.00 seconds
  Mean duration: 5.00 seconds



In [7]:
! pip install mutagen

Collecting mutagen
  Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)
Downloading mutagen-1.47.0-py3-none-any.whl (194 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: mutagen
Successfully installed mutagen-1.47.0


In [24]:
import os
import statistics
from mutagen.mp3 import MP3

def folder_duration_stats(path):
    """
    Recursively scan `path` for .mp3 files (case-insensitive extension) and
    summarise their durations in seconds.

    Returns a 6-tuple:
      count          number of files that could be read,
      min_dur        shortest duration,
      max_dur        longest duration,
      mean_dur       mean duration,
      shortest_file  absolute path of the shortest file,
      longest_file   absolute path of the longest file.
    When no file could be read the result is (0, 0.0, 0.0, 0.0, None, None).
    Unreadable or vanished files are reported with print and skipped.
    """
    found = []  # (absolute_path, duration) per readable file
    for current_dir, _subdirs, names in os.walk(path):
        for name in names:
            if name.lower().endswith('.mp3'):
                absolute = os.path.abspath(os.path.join(current_dir, name))
                if os.path.exists(absolute):
                    try:
                        found.append((absolute, MP3(absolute).info.length))
                    except Exception as err:
                        print(f"  Warning: could not read {absolute}: {err}")
                else:
                    print(f"  Skipping missing file: {absolute}")

    if len(found) == 0:
        return 0, 0.0, 0.0, 0.0, None, None

    # Strict comparisons keep the first file seen on ties.
    shortest = longest = found[0]
    for candidate in found[1:]:
        if candidate[1] < shortest[1]:
            shortest = candidate
        if candidate[1] > longest[1]:
            longest = candidate

    average = statistics.mean([length for _path, length in found])

    shortest_file, min_dur = shortest
    longest_file, max_dur = longest
    return (len(found), min_dur, max_dur, average, shortest_file, longest_file)

def format_time(seconds):
    """
    Convert seconds to an H:MM:SS.ss string.

    The value is rounded to two decimals *before* it is split into hours,
    minutes and seconds. Rounding only at formatting time lets the seconds
    field round up to 60 without carrying (59.999 -> "0:00:60.00").
    """
    seconds = round(seconds, 2)
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return f"{int(h)}:{int(m):02d}:{s:05.2f}"

if __name__ == "__main__":
    print("CWD:", os.getcwd(), "\n")

    base = "Audio_transformed"
    for emotion in ["Happy_padded", "Angry_padded"]:
        folder = os.path.join(base, emotion)
        print(f"Scanning: {os.path.abspath(folder)}")
        count, mn, mx, mean, shortest_file, longest_file = folder_duration_stats(folder)

        print(f"{emotion}:")
        print(f"  Files found:       {count}")
        print(f"  Shortest duration: {format_time(mn)}  —  {shortest_file}")
        print(f"  Longest duration:  {format_time(mx)}  —  {longest_file}")
        print(f"  Average duration:  {format_time(mean)}\n")


CWD: /home/jovyan/teaching_material/DL project 

Scanning: /home/jovyan/teaching_material/DL project/Audio_transformed/Happy_padded
Happy_padded:
  Files found:       1701
  Shortest duration: 0:00:05.07  —  /home/jovyan/teaching_material/DL project/Audio_transformed/Happy_padded/03-01-03-01-01-02-03.mp3
  Longest duration:  0:00:05.07  —  /home/jovyan/teaching_material/DL project/Audio_transformed/Happy_padded/03-01-03-01-01-02-03.mp3
  Average duration:  0:00:05.07

Scanning: /home/jovyan/teaching_material/DL project/Audio_transformed/Angry_padded
Angry_padded:
  Files found:       2108
  Shortest duration: 0:00:05.07  —  /home/jovyan/teaching_material/DL project/Audio_transformed/Angry_padded/03-02-05-02-01-02-22.mp3
  Longest duration:  0:00:05.07  —  /home/jovyan/teaching_material/DL project/Audio_transformed/Angry_padded/03-02-05-02-01-02-22.mp3
  Average duration:  0:00:05.07



In [None]:
'''
THIS CODE CONVERTS THE TEXT FILES TO AUDIO USING GTTS
'''

import os
import time
import shutil
from gtts import gTTS
from pathlib import Path
from pydub import AudioSegment
from pydub.effects import speedup
from tqdm import tqdm
import hashlib

def get_file_hash(file_path):
    """
    Return the MD5 hex digest of a text file's content.

    The file is read as UTF-8 text and the digest is computed over the decoded
    text, so two files with identical text get the same digest. Used to spot
    duplicate text files.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        digest = hashlib.md5(handle.read().encode())
    return digest.hexdigest()

def text_to_audio(input_text_file, output_mp3_file):
    """
    Read a UTF-8 text file and save its speech rendering as an MP3 via gTTS.

    The output directory is created when it does not exist yet.

    Returns:
        bool: True once the MP3 has been saved; False if any step raised (the
        error is printed together with the input file name).
    """
    try:
        with open(input_text_file, 'r', encoding='utf-8') as source:
            spoken_text = source.read()

        # Indian English voice
        speech = gTTS(text=spoken_text, lang='en-in', slow=False, tld='co.in')

        target_dir = os.path.dirname(output_mp3_file)
        needs_dir = bool(target_dir) and not os.path.exists(target_dir)
        if needs_dir:
            os.makedirs(target_dir)

        # gTTS writes MP3 directly
        speech.save(output_mp3_file)
    except Exception as err:
        print(f"Error in text-to-speech conversion for {input_text_file}: {err}")
        return False
    return True

def process_all_files(delay_seconds=3):
    """
    Process all text files in the Text folder and its Happy/Angry subfolders,
    and save the corresponding MP3 files in the Audio folder.
    Avoids duplicate API calls by reusing MP3 files for identical text content.

    Args:
        delay_seconds: Number of seconds to wait between API calls

    Returns:
        None. Progress and a final summary are printed. The summary counts the
        MP3s that were actually copied from an earlier identical text, and
        separately reports files whose conversion failed.
    """
    emotions = ["Happy", "Angry"]
    text_base_path = "Text"

    # Create Audio folder if it doesn't exist
    if not os.path.exists("Audio"):
        os.makedirs("Audio")
        print("Created Audio folder")

    # Create Audio/Happy and Audio/Angry folders if they don't exist
    for emotion in emotions:
        emotion_folder = os.path.join("Audio", emotion)
        if not os.path.exists(emotion_folder):
            os.makedirs(emotion_folder)
            print(f"Created {emotion_folder} folder")

    # Content hash -> path of the MP3 already generated for that text
    content_hash_to_mp3 = {}

    # Counted while processing. Deriving "saved calls" as total - unique would
    # also count failed conversions as saved API calls.
    total_files = 0
    reused_files = 0
    failed_files = 0

    for emotion in emotions:
        emotion_path = os.path.join(text_base_path, emotion)

        # Skip if folder doesn't exist
        if not os.path.exists(emotion_path):
            print(f"Warning: {emotion_path} folder not found")
            continue

        text_files = [f for f in os.listdir(emotion_path) if f.endswith(".txt")]

        if not text_files:
            print(f"No text files found in {emotion_path}")
            continue

        total_files += len(text_files)
        print(f"\nProcessing {emotion} files:")

        for text_file in tqdm(text_files, desc=f"{emotion}", unit="file"):
            input_path = os.path.join(emotion_path, text_file)
            output_path = os.path.join("Audio", emotion, text_file.replace(".txt", ".mp3"))

            file_hash = get_file_hash(input_path)

            if file_hash in content_hash_to_mp3:
                # Same text was already converted: copy that MP3 instead of
                # making a new API call.
                shutil.copy2(content_hash_to_mp3[file_hash], output_path)
                reused_files += 1
                tqdm.write(f"Copied existing MP3 for duplicate content: {text_file}")
            elif text_to_audio(input_path, output_path):
                # Remember the MP3 so later duplicates can reuse it
                content_hash_to_mp3[file_hash] = output_path

                # Rate-limit only after real API calls; no need to wait after
                # the last file of the folder.
                if text_file != text_files[-1]:
                    time.sleep(delay_seconds)
            else:
                failed_files += 1

    # Report stats
    unique_files = len(content_hash_to_mp3)
    print(f"\nProcessing complete! {unique_files} unique text contents found out of {total_files} total files.")
    print(f"Saved {reused_files} API calls by reusing existing MP3 files.")
    if failed_files:
        print(f"{failed_files} files could not be converted.")

if __name__ == "__main__":
    process_all_files(delay_seconds=3)

In [None]:
'''
Modifies the pitch of the audio files and saves the modified files to Audio_transformed
'''

def convert_to_male_voice(input_audio_file, output_audio_file):
    """
    Write a lower-pitched version of ``input_audio_file`` to ``output_audio_file``.

    Input and output formats are taken from the file extensions. The clip is
    re-tagged with a reduced frame rate, passed through ``speedup`` and then a
    low-pass filter before being exported.

    Returns:
        bool: True on success; False if any step raised (the error is printed).
    """
    rate_scale = 0.85   # frame-rate multiplier, i.e. 15% slower
    speed_factor = 1.25  # second argument handed to speedup()
    chunk_arg = 150      # third argument handed to speedup()
    cutoff = 300         # argument handed to low_pass_filter()

    try:
        # Format comes from the extension without its leading dot
        source_ext = os.path.splitext(input_audio_file)[1][1:]
        original = AudioSegment.from_file(input_audio_file, format=source_ext)

        # Same raw samples, lower declared frame rate -> lower pitch
        lowered_rate = int(original.frame_rate * rate_scale)
        deepened = original._spawn(original.raw_data, overrides={
            "frame_rate": lowered_rate
        })

        # Speed up tempo without affecting pitch
        deepened = speedup(deepened, speed_factor, chunk_arg)

        # Low-pass filter for a more masculine sound
        deepened = deepened.low_pass_filter(cutoff)

        target_ext = os.path.splitext(output_audio_file)[1][1:]
        deepened.export(output_audio_file, format=target_ext)
    except Exception as err:
        print(f"Error processing {input_audio_file}: {str(err)}")
        return False
    return True

In [None]:
'''
Evaluation of the TTS model by checking for non-English files
'''



import os
import pandas as pd
from langdetect import detect, LangDetectException
from tqdm import tqdm

def is_english(text):
    """
    Tell whether ``text`` is detected as English.

    Returns False for empty or whitespace-only text, and also when language
    detection raises LangDetectException; True only when the detected
    language code is 'en'.
    """
    has_content = bool(text) and text.strip() != ""
    if not has_content:
        return False  # nothing to run detection on

    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False  # detection failed: treat as non-English

def check_non_english_files(directory_path):
    """
    Check all .txt files directly inside a directory to identify non-English content.

    Args:
        directory_path (str): Folder to scan (not recursive).

    Returns:
        list[dict]: One record per file that is not detected as English, with
        keys 'file', 'path', 'detected_language' and 'content_sample' (first
        100 characters). 'detected_language' is "empty" for blank files and
        "unknown" when detection fails. Files that raise any other error get a
        record with 'error' and 'detected_language' set to 'error'.
    """
    results = []

    # Get all txt files in the directory
    txt_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]

    print(f"Checking {len(txt_files)} files in {directory_path}...")

    # Process files with progress bar
    for txt_file in tqdm(txt_files):
        file_path = os.path.join(directory_path, txt_file)

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Run detection exactly once per file. Detecting a second time to
            # obtain the label can disagree with the first verdict (langdetect
            # is not deterministic unless seeded), which would list a file as
            # non-English with the label 'en'.
            if not content.strip():
                detected_lang = "empty"
            else:
                try:
                    detected_lang = detect(content)
                except LangDetectException:
                    detected_lang = "unknown"

            if detected_lang != 'en':
                results.append({
                    'file': txt_file,
                    'path': file_path,
                    'detected_language': detected_lang,
                    'content_sample': content[:100] + '...' if len(content) > 100 else content
                })

        except Exception as e:
            # Unreadable file (or any unexpected failure): record it rather
            # than aborting the whole scan.
            results.append({
                'file': txt_file,
                'path': file_path,
                'error': str(e),
                'detected_language': 'error'
            })

    return results

def main():
    """
    Report which transcripts under ``Text_evaluation_tts`` are not English.

    Scans the Happy and Angry subfolders, prints a summary of the flagged
    files by detected language and by emotion, writes the records to
    ``non_english_files.csv`` and prints the share of flagged files among all
    .txt files in the scanned folders.
    """
    text_dir = "Text_evaluation_tts"

    # Nothing to do without the base directory
    if not os.path.exists(text_dir):
        print(f"Directory not found: {text_dir}")
        return

    emotions = ["Happy", "Angry"]
    all_results = []

    for emotion in emotions:
        input_dir = os.path.join(text_dir, emotion)
        if not os.path.exists(input_dir):
            print(f"Directory not found: {input_dir}")
            continue

        print(f"\nChecking {emotion} text files...")
        flagged = check_non_english_files(input_dir)

        # Tag each record with the folder it came from
        for record in flagged:
            record['emotion'] = emotion
        all_results += flagged

    if not all_results:
        print("\nAll files appear to be in English.")
        return

    df = pd.DataFrame(all_results)

    print("\n===== SUMMARY =====")
    print(f"Total non-English files found: {len(df)}")

    if not df.empty:
        # Breakdown by detected language
        lang_counts = df['detected_language'].value_counts()
        print("\nDetected languages:")
        print(lang_counts)

        # Breakdown by emotion folder
        emotion_counts = df['emotion'].value_counts()
        print("\nNon-English files by emotion:")
        print(emotion_counts)

        output_file = "non_english_files.csv"
        df.to_csv(output_file, index=False)
        print(f"\nDetailed results saved to {output_file}")

    # Count every .txt file in the emotion folders that exist
    total_files = 0
    for emotion in emotions:
        emotion_dir = os.path.join(text_dir, emotion)
        if os.path.exists(emotion_dir):
            total_files += len([f for f in os.listdir(emotion_dir) if f.endswith('.txt')])

    if total_files > 0:
        percentage = (len(df) / total_files) * 100
        print(f"\nPercentage of non-English files: {percentage:.2f}% ({len(df)} out of {total_files})")

if __name__ == "__main__":
    main()
    print("\nLanguage detection completed!")

In [None]:
'''
Whisper to convert audio to text
'''



import os
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
import tqdm
# Set up device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Load the Whisper model 
model_id = "openai/whisper-large"
print(f"Loading {model_id}...")
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)
print("Model loaded successfully")
def transcribe_audio(audio_path):
    """Transcribe one audio file with the loaded Whisper model.

    The audio is loaded at 16 kHz, converted to model input features, decoded
    with gradients disabled, and the first decoded sequence is returned.
    Returns an empty string (after printing the error) if any step fails.
    """
    target_rate = 16000
    try:
        waveform, _loaded_rate = librosa.load(audio_path, sr=target_rate)

        encoded = processor(waveform, sampling_rate=target_rate, return_tensors="pt")
        features = encoded.input_features.to(device)

        with torch.no_grad():
            token_ids = model.generate(input_features=features)

        decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
        return decoded[0]
    except Exception as err:
        print(f"Error processing {audio_path}: {err}")
        return ""
def process_directory(directory_path, output_text_dir):
    """Transcribe every .mp3 file in ``directory_path`` into ``output_text_dir``.

    Each transcription is written as UTF-8 to a .txt file that shares the
    audio file's base name. The output directory is created if needed.
    """
    os.makedirs(output_text_dir, exist_ok=True)

    # Only .mp3 files directly inside the directory are picked up
    mp3_names = [name for name in os.listdir(directory_path) if name.endswith('.mp3')]

    print(f"Processing {len(mp3_names)} files in {directory_path}...")

    for mp3_name in tqdm.tqdm(mp3_names):
        stem, _extension = os.path.splitext(mp3_name)
        transcript = transcribe_audio(os.path.join(directory_path, mp3_name))

        transcript_path = os.path.join(output_text_dir, stem + '.txt')
        with open(transcript_path, 'w', encoding='utf-8') as out:
            out.write(transcript)
def main():
    """Transcribe the Happy and Angry folders under ``Audio`` into ``Text_evaluation_tts``.

    The output root is always created; an emotion folder that is missing from
    the input is reported and skipped.
    """
    audio_root = "Audio"
    transcript_root = "Text_evaluation_tts"

    os.makedirs(transcript_root, exist_ok=True)

    for emotion in ("Happy", "Angry"):
        input_dir = os.path.join(audio_root, emotion)
        output_dir = os.path.join(transcript_root, emotion)

        if not os.path.exists(input_dir):
            print(f"Directory not found: {input_dir}")
            continue

        print(f"\nProcessing {emotion} audio files...")
        process_directory(input_dir, output_dir)
# Script entry point. The dunder name must be spelled with underscores:
# `**name**` (markdown bold left over from a copy/paste) is a SyntaxError that
# prevents the whole cell from running.
if __name__ == "__main__":
    main()
    print("\nTranscription process completed!")
