In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/indian-emotional-speech-corpora-iesc/Indian Emotional Speech Corpora (IESC)/Speaker-7/Fear/F-3-7-1.wav
/kaggle/input/indian-emotional-speech-corpora-iesc/Indian Emotional Speech Corpora (IESC)/Speaker-7/Fear/F-7-7-2.wav
/kaggle/input/indian-emotional-speech-corpora-iesc/Indian Emotional Speech Corpora (IESC)/Speaker-7/Fear/F-7-7-1.wav
/kaggle/input/indian-emotional-speech-corpora-iesc/Indian Emotional Speech Corpora (IESC)/Speaker-7/Fear/F-4-7-1.wav
/kaggle/input/indian-emotional-speech-corpora-iesc/Indian Emotional Speech Corpora (IESC)/Speaker-7/Fear/F-5-7-2.wav
/kaggle/input/indian-emotional-speech-corpora-iesc/Indian Emotional Speech Corpora (IESC)/Speaker-7/Fear/F-1-7-2.wav
/kaggle/input/indian-emotional-speech-corpora-iesc/Indian Emotional Speech Corpora (IESC)/Speaker-7/Fear/F-2-7-1.wav
/kaggle/input/indian-emotional-speech-corpora-iesc/Indian Emotional Speech Corpora (IESC)/Speaker-7/Fear/F-6-7-2.wav
/kaggle/input/indian-emotional-speech-corpora-iesc/Indian Emotio

In [11]:
import os
import pandas as pd
import librosa
from datasets import Dataset
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm


In [16]:
import os
import pandas as pd

# Path to the IESC dataset folder (root folder that contains subdirectories)
IESC_PATH = "/kaggle/input/indian-emotional-speech-corpora-iesc/Indian Emotional Speech Corpora (IESC)"

# Initialize lists to store emotion, gender, and file paths
emotion = []
gender = []  # Gender can be deduced from speaker number
path = []

# Emotion mapping
emotion_map = {'A': 'angry', 'F': 'fear', 'H': 'happy', 'N': 'neutral', 'S': 'sad'}

# Use os.walk to traverse directories and subdirectories
for root, dirs, files in os.walk(IESC_PATH):
    for file in files:
        if file.endswith(".wav"):
            # Split the filename into parts (e.g., "H-4-5-1.wav")
            parts = file.split('-')
            
            # Get the emotion from the first part of the filename
            emotion_label = parts[0]  # First part is emotion
            emotion.append(emotion_map[emotion_label])
            
            # Gender determination based on speaker number (assuming speakers 1-5 are male, 6-8 are female)
            speaker = int(parts[2])  # Speaker ID is the 3rd part
            gender.append('male' if speaker <= 5 else 'female')
            
            # Get the full path to the audio file
            path.append(os.path.join(root, file))

# Create a DataFrame with emotion, gender, and path columns
IESC_df = pd.DataFrame({
    'emotion': emotion,
    'gender': gender,
    'path': path
})

# Display emotion counts
print(IESC_df['emotion'].value_counts())

# Display first few rows of the DataFrame
print(IESC_df.head())


emotion
fear       120
neutral    120
sad        120
happy      120
angry      120
Name: count, dtype: int64
  emotion  gender                                               path
0    fear  female  /kaggle/input/indian-emotional-speech-corpora-...
1    fear  female  /kaggle/input/indian-emotional-speech-corpora-...
2    fear  female  /kaggle/input/indian-emotional-speech-corpora-...
3    fear  female  /kaggle/input/indian-emotional-speech-corpora-...
4    fear  female  /kaggle/input/indian-emotional-speech-corpora-...


In [17]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing
train_df, test_df = train_test_split(IESC_df, test_size=0.2, random_state=42)

# Check the size of the train and test sets
print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")


Training set size: 480
Testing set size: 120


In [36]:
pip install SpeechRecognition


[0m^C
Note: you may need to restart the kernel to use updated packages.


In [34]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import librosa
import speech_recognition as sr

# Path to your fine-tuned Wav2Vec2 model
model_path = "/kaggle/input/wav2vec2_fine_tuned/transformers/default/1/w2v2_fine_tuned_model"

# Try to load the Wav2Vec2 processor and model
try:
    processor = Wav2Vec2Processor.from_pretrained(model_path)
    model = Wav2Vec2ForCTC.from_pretrained(model_path)
    print("Loaded fine-tuned Wav2Vec2 model.")
except OSError:
    print("Preprocessor or model config is missing. Falling back to SpeechRecognition API.")
    # Initialize recognizer for Google Web Speech API fallback
    recognizer = sr.Recognizer()
    audio_path = "/kaggle/input/your-audio-file.wav"  # Replace with your audio file path

    # Load the audio file using SpeechRecognition
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)
    
    # Recognize speech using Google's Web Speech API
    try:
        transcription = recognizer.recognize_google(audio)
        print("Transcription using Google Speech API:", transcription)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand the audio.")
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
    exit()  # Exit if fallback is used

# If fine-tuned model is loaded, process the audio
audio_path = "/kaggle/input/your-audio-file.wav"  # Replace with your audio file path
audio_input, _ = librosa.load(audio_path, sr=16000)  # Resample to 16kHz

# Preprocess the audio for the Wav2Vec2 model
input_values = processor(audio_input, return_tensors="pt").input_values

# Perform inference with the fine-tuned Wav2Vec2 model
with torch.no_grad():
    logits = model(input_values).logits

# Get the predicted token ids
predicted_ids = torch.argmax(logits, dim=-1)

# Decode the predicted ids to text
transcription = processor.decode(predicted_ids[0])

# Print the transcription
print("Transcription using Wav2Vec2 model:", transcription)


ModuleNotFoundError: No module named 'speech_recognition'

In [None]:
import torch

class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Map emotions to numerical labels
        emotion_mapping = {'neutral': 0, 'happy': 1, 'angry': 2, 'sad': 3, 'fear': 4}
        self.labels = [emotion_mapping[label] for label in labels]

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create train and test datasets
train_dataset = EmotionDataset(train_encodings, list(train_data["emotion"]))
test_dataset = EmotionDataset(test_encodings, list(test_data["emotion"]))


In [None]:
from transformers import Wav2Vec2ForSequenceClassification
from torch.optim import AdamW

# Load the model (Hubert or Wav2Vec2 for speech classification)
model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-large-960h", num_labels=5)
model.to(device)

# Initialize the optimizer
optim = AdamW(model.parameters(), lr=1e-5)

# Set the number of epochs
epoch = 3

# Training loop
for epoch_i in range(epoch):
    print(f"Epoch {epoch_i+1}/{epoch}")
    model.train()

    # Train with DataLoader
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

    correct = 0
    total = 0
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs['loss']
        loss.backward()
        optim.step()

        # Make predictions and calculate accuracy
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == labels).sum().item()
        total += len(labels)

    accuracy = correct / total
    print(f"Training Accuracy: {accuracy:.3f}")
