In [None]:
!pip install audiomentations
!pip install imbalanced-learn



In [None]:
import pandas as pd
import os
import librosa
import numpy as np
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
from sklearn.model_selection import train_test_split

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load dataset
train_data_path = '/content/drive/MyDrive/data/extracted_files-3/en/validated_new.tsv'
audio_base_path = '/content/drive/MyDrive/data/extracted_files-3/en/input_audio_files'
df = pd.read_csv(train_data_path, sep='\t')
df.head(5)

Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment,broader_accents
0,01e8ea298cdecf26e273f5baac3915eb992c493f229686...,common_voice_en_39751075.mp3,e5e7d4694b7160add018a08876327f254690c1ab4c39ea...,Madin was a significant figure of post-war Bir...,,2,0,,,American English Accent,,en,,North American English
1,03b62f72067ec967c423852bef03d1b61e63c156d86f6e...,common_voice_en_40087973.mp3,e90c361c9684d01d31bc6e8df3060bc97e536ca707bef4...,No runoff was necessary.,,2,0,teens,transgender,British English,,en,,British English
2,05d33ad00cc2754da8e542a33a5255f9346535ef1d8619...,common_voice_en_40117514.mp3,e9475052b6e625f8c5890389e4ffc17a1078dec1483592...,It was a sickening sight.,,2,0,twenties,male_masculine,Australian English,,en,,British English
3,08072f2de4dcc2bfec5058dca41eb9535b61ccd193ecc4...,common_voice_en_39603786.mp3,e4657d8d47be955eb14e04cd1c2a2b9ef89d310f639678...,It is made by mounting a sidecar to a regular ...,,2,0,sixties,male_masculine,American English Accent,,en,,North American English
4,083af8bc921baf15ad5d8c8c876f4ecaf4f52bf6370161...,common_voice_en_39603175.mp3,e443f322884c5440d7f5072f21c5b0e1f0433ba6147471...,"Within his genre, Di Giorgio is respected for ...",,2,0,,,British English,,en,,British English


In [None]:
# Check the number of samples (rows) in the DataFrame
num_samples = df.shape[0]  # Returns the number of rows
print(f"Number of samples in df: {num_samples}")

Number of samples in df: 1643


In [None]:
# 1. Filter for 'North American English'
df_na = df[df['broader_accents'] == 'North American English']

# 2. Filter for 'British English'
df_br = df[df['broader_accents'] == 'British English']

# 3. Sample 10 random rows from each
df_na_sample = df_na.sample(10)
df_br_sample = df_br.sample(10)

# 4. Combine both samples into one DataFrame
df2 = pd.concat([df_na_sample, df_br_sample])
# 5. Drop the rows in df that are present in df2
df = df.drop(df2.index)


In [None]:
# Check the number of samples (rows) in the DataFrame
num_samples = df.shape[0]  # Returns the number of rows
print(f"Number of samples in df: {num_samples}")


# Check the number of samples (rows) in the DataFrame
num_samples = df2.shape[0]  # Returns the number of rows
print(f"Number of samples in df2: {num_samples}")

Number of samples in df: 1623
Number of samples in df2: 20


In [None]:
# Map accent labels to numeric IDs
accent_labels = df['broader_accents'].unique()
accent_to_id = {accent: idx for idx, accent in enumerate(accent_labels)}
df['accent_id'] = df['broader_accents'].apply(lambda x: accent_to_id[x])

# Function to preprocess audio files
def preprocess_audio(file_path, sr=16000):
    audio, _ = librosa.load(file_path, sr=sr)
    audio_trimmed, _ = librosa.effects.trim(audio)
    audio_normalized = librosa.util.normalize(audio_trimmed)
    return audio_normalized

# Process audio files
processed_audio = []
accent_targets = []
for index, row in df.iterrows():
    file_path = os.path.join(audio_base_path, row['path'])
    try:
        audio = preprocess_audio(file_path)
        processed_audio.append(audio)
        accent_targets.append(row['accent_id'])
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# Pad audio sequences to the same length
max_length = max(len(audio) for audio in processed_audio)
def pad_audio_sequence(audio, max_length):
    return np.pad(audio, (0, max_length - len(audio)), 'constant')

padded_audio = [pad_audio_sequence(audio, max_length) for audio in processed_audio]
X = np.array(padded_audio)
y = np.array(accent_targets)


In [None]:
# Define augmentation pipeline
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(min_shift=-0.5, max_shift=0.5, p=0.5)
])

# Apply augmentation to audio data
augmented_audio = [augment(samples=audio, sample_rate=16000) for audio in processed_audio]

# Pad augmented audio sequences
max_length_augmented = max(len(audio) for audio in augmented_audio)
padded_augmented_audio = [pad_audio_sequence(audio, max_length_augmented) for audio in augmented_audio]

# Combine original and augmented data
X_combined = np.concatenate((X, padded_augmented_audio), axis=0)
y_combined = np.concatenate((y, y), axis=0)


In [None]:

# Use the oversampled data for training
X_train, X_val, y_train, y_val = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

# Extract MFCC features for SVM model
def extract_mfcc_features(audio_data, sr=16000, n_mfcc=13):
    return [librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc).mean(axis=1) for audio in audio_data]

# Prepare MFCC features
X_train_mfcc = extract_mfcc_features(X_train)
X_val_mfcc = extract_mfcc_features(X_val)

# Apply Regularization by using a smaller C value
svm_model = SVC(kernel='rbf', C=0.1, gamma='auto')
svm_model.fit(X_train_mfcc, y_train)

# Evaluate SVM model using cross-validation
cv_scores = cross_val_score(svm_model, X_train_mfcc, y_train, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cv_scores))

# Evaluate the SVM model
y_pred_svm = svm_model.predict(X_val_mfcc)
print("SVM Model Accuracy:", accuracy_score(y_val, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_svm))
print("Classification Report:\n", classification_report(y_val, y_pred_svm))


Cross-Validation Scores: [0.55961538 0.55876686 0.55876686 0.56069364 0.56069364]
Mean Cross-Validation Accuracy: 0.5597072773084334
SVM Model Accuracy: 0.5553846153846154
Confusion Matrix:
 [[361   0   0   0   0]
 [222   0   0   0   0]
 [ 53   0   0   0   0]
 [  5   0   0   0   0]
 [  9   0   0   0   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.56      1.00      0.71       361
           1       0.00      0.00      0.00       222
           2       0.00      0.00      0.00        53
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         9

    accuracy                           0.56       650
   macro avg       0.11      0.20      0.14       650
weighted avg       0.31      0.56      0.40       650



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Map accent labels to numeric IDs for user-specific data
accent_labels_user = df2['broader_accents'].unique()
accent_to_id_user = {accent: idx for idx, accent in enumerate(accent_labels_user)}
df2['accent_id'] = df2['broader_accents'].apply(lambda x: accent_to_id_user[x])

# Process user-specific audio files
processed_audio_user = []
accent_targets_user = []
for index, row in df2.iterrows():
    file_path = os.path.join(audio_base_path, row['path'])
    try:
        audio = preprocess_audio(file_path)
        processed_audio_user.append(audio)
        accent_targets_user.append(row['accent_id'])
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# Pad user-specific audio sequences to the same length
max_length_user = max(len(audio) for audio in processed_audio_user)
padded_audio_user = [pad_audio_sequence(audio, max_length_user) for audio in processed_audio_user]
X_user = np.array(padded_audio_user)
y_user = np.array(accent_targets_user)

# Extract MFCC features for user-specific data
X_user_mfcc = extract_mfcc_features(X_user)

# Fine-tune SVM on user-specific data
svm_model.fit(X_user_mfcc, y_user)

# Evaluate fine-tuned SVM model on user-specific data
y_pred_user_svm = svm_model.predict(X_user_mfcc)
print("User-Specific SVM Model Accuracy:", accuracy_score(y_user, y_pred_user_svm))
print("Confusion Matrix (User-Specific):\n", confusion_matrix(y_user, y_pred_user_svm))
print("Classification Report (User-Specific):\n", classification_report(y_user, y_pred_user_svm))


User-Specific SVM Model Accuracy: 1.0
Confusion Matrix (User-Specific):
 [[10  0]
 [ 0 10]]
Classification Report (User-Specific):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

