# American Dialect Classification with Extreme Learning Machines

This notebook is my implementation of the approach described in *"Word Based Dialect Classification using Extreme Learning Machines"* by Rizwan et al. (2016).

The acoustic characteristics that define American dialects have been a long standing interest of mine, and building a classifier seemed like a fun way to explore this. However, I struggled with the limitations that TIMIT presented and wasnâ€™t able to make much progress with my models. To overcome this, I decided to work through this paper as a learning exercise; to deepen my understanding of the techniques the authors used, and to serve as a useful reference point for experimenting with other models.


| Region # | Region name              | # Speakers |
|----------|--------------------------|------------|
| DR1      | New England              | 49         |
| DR2      | Northern                 | 102        |
| DR3      | North Midland            | 102        |
| DR4      | South Midland            | 100        |
| DR5      | Southern                 | 98         |
| DR6      | New York City            | 46         |
| DR7      | Western                  | 100        |

### Feature Extraction

In [1]:
import glob

DATA_PATH = "../data/processed/words"
N_REGIONS = 7 # ignore army brat

train_files = {}
test_files = {}
words = ["dark", "water", "greasy", "suit", "wash"]

# create dicts for the training and test set 
# where the key is a word, value is a list of lists
# each element of the list are the word files for each dialect
for word in words:
    train_files[word] = []
    test_files[word] = []
    for i in range(1, N_REGIONS + 1):
        train_list = glob.glob(f'{DATA_PATH}/TRAIN/DR{i}/*/*_{word}.wav')
        test_list = glob.glob(f'{DATA_PATH}/TEST/DR{i}/*/*_{word}.wav')

        train_files[word].append(train_list)
        test_files[word].append(test_list)

# first 5 samples of "dark" for DR1
train_files["dark"][0][:5]

['../data/processed/words/TRAIN/DR1/FETB0/FETB0_SA1_0003_dark.wav',
 '../data/processed/words/TRAIN/DR1/MMRP0/MMRP0_SA1_0003_dark.wav',
 '../data/processed/words/TRAIN/DR1/FMEM0/FMEM0_SA1_0003_dark.wav',
 '../data/processed/words/TRAIN/DR1/FSMA0/FSMA0_SA1_0003_dark.wav',
 '../data/processed/words/TRAIN/DR1/FJSP0/FJSP0_SA1_0003_dark.wav']

In [14]:
import librosa
import numpy as np

n_mfcc = 12
sr = 16000
n_fft = int(0.025 * sr)      # 25 ms
hop_length = int(0.01 * sr)  # 10 ms
win_length = n_fft

def extract_features(wav):
    """
    Extract MFCC + log-energy + delta + delta-delta
    Output shape: (39, T)
    """
    
    y, _ = librosa.load(wav, sr=sr)
    y = librosa.util.normalize(y) # amplitude normalization
    mfcc = librosa.feature.mfcc(y=y,
                                sr=sr,
                                n_mfcc=n_mfcc,
                                n_fft=n_fft,
                                hop_length=hop_length,
                                win_length=win_length,
                                window="hamming",
                                htk=True,
                                center=False) # (12, T)

    # log energy
    frames = librosa.util.frame(y, frame_length=n_fft, hop_length=hop_length)
    energy = np.sum(frames ** 2, axis=0)
    energy = energy / np.max(energy + 1e-8) # normalize
    log_energy = np.log(energy + 1e-10) # (T,)

    # ensure frame alignment
    T = min(mfcc.shape[1], log_energy.shape[0])
    mfcc = mfcc[:, :T]
    log_energy = log_energy[:T]

    # concatenate to get 13 cepstral features
    mfcc = np.vstack([mfcc, log_energy])

    delta = librosa.feature.delta(mfcc, width=5, order=1)
    delta2 = librosa.feature.delta(mfcc, width=5, order=2)

    # concatenate to get (39, T)
    features = np.vstack([mfcc, delta, delta2])

    return features


def get_word_features(files):
    """
    files: dict[word] = list of lists of wavs of word per dialect
    returns: dict[word] = list of arrays, each array is one word sample (pooled)
    """
    all_features = {}

    for word in files:
        all_features[word] = []

        for wavs_per_dialect in files[word]:
            dialect_word_instances = []

            for wav in wavs_per_dialect:

                features = extract_features(wav)   # (39, T)
                frames = features.T                # (T, 39)

                dialect_word_instances.append(frames)

            all_features[word].append(dialect_word_instances)

    return all_features


train_features = get_word_features(train_files)
test_features = get_word_features(test_files)

In [15]:
# shape should be (num_speakers_in_region, 39)
train_features["dark"][0][0].shape

(31, 39)

### Single Model Experiment

In [16]:
from hpelm import ELM
import os
import sys
from contextlib import redirect_stdout
from sklearn.preprocessing import StandardScaler

n_features = (n_mfcc + 1) * 3 # 39

def append_class_samples(features_1, features_2):

    X = []
    y = []
    
    for sample in features_1:
        X.append(sample)
        y.append(np.full(sample.shape[0], 0))

    for sample in features_2:
        X.append(sample)
        y.append(np.full(sample.shape[0], 1))

    X = np.vstack(X)
    y = np.hstack(y)

    return X, y

def onehot(y, num_classes=2):
    y_onehot = y_onehot = np.zeros((y.size, num_classes))
    y_onehot[np.arange(y.size), y] = 1
    
    return y_onehot

def get_weights(class_1, class_2):
    return np.array([sum([sample.shape[0] for sample in class_1]), sum([sample.shape[0] for sample in class_2])])

class_1 = train_features["dark"][0] # list of all class 1 samples, shape (T, 39)
class_2 = train_features["dark"][1]
weights = get_weights(class_1, class_2)

X_train, y_train = append_class_samples(class_1, class_2)
X_test, y_test = append_class_samples(test_features["dark"][0],
                                      test_features["dark"][1])

# scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# train
elm_d1_d2 = ELM(n_features, 2)
elm_d1_d2.add_neurons(500, "sigm")
with open(os.devnull, "w") as f, redirect_stdout(f):
    loss = elm_d1_d2.train(X_train_scaled, onehot(y_train), 'CV', 'wc', w=weights, k=10)

# eval
y_pred = elm_d1_d2.predict(X_test_scaled)
y_pred_labels = np.argmax(y_pred, axis=1)

accuracy = np.mean(y_pred_labels == y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%\nLoss: {loss}")

Test Accuracy: 67.12%
Loss: 0.31942863232632646


### Single Word Pairwise ELM Classification

In [17]:
def build_pairwise_elms(word, hidden_range=range(100, 1000 + 1, 100)):
    pairwise_elms = {}

    for i in range(N_REGIONS):
        for j in range(i + 1, N_REGIONS):

            elm_name = f'DR{i + 1}, DR{j + 1}'

            class_1_features = train_features[word][i] # 0
            class_2_features = train_features[word][j] # 1
            
            X, y = append_class_samples(class_1_features, class_2_features)

            # handle class imbalance
            weights = get_weights(class_1_features, class_2_features)
            
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)
            y_oh = onehot(y)

            elm = ELM(n_features, 2)
            elm.add_neurons(1000, "sigm")
            with open(os.devnull, "w") as f, redirect_stdout(f):
                error = elm.train(X_scaled, y_oh, 'CV', 'wc', w=weights, k=10)

            pairwise_elms[elm_name] = (elm, scaler, i, j)
            
            print(f'{elm_name} : {error}%')

    return pairwise_elms

In [18]:
elms_dark = build_pairwise_elms("dark")

DR1, DR2 : 0.3059641987302921%
DR1, DR3 : 0.287922343770176%
DR1, DR4 : 0.2943430300066737%
DR1, DR5 : 0.2822035070320056%
DR1, DR6 : 0.36451905746023616%
DR1, DR7 : 0.30645509550893735%
DR2, DR3 : 0.3938684746311273%
DR2, DR4 : 0.34092383737179055%
DR2, DR5 : 0.3129571123961613%
DR2, DR6 : 0.3130293649142041%
DR2, DR7 : 0.38058839348619355%
DR3, DR4 : 0.3992642040476337%
DR3, DR5 : 0.35593795769693526%
DR3, DR6 : 0.3058620516719931%
DR3, DR7 : 0.4075103217426428%
DR4, DR5 : 0.4130840799821466%
DR4, DR6 : 0.3251543298918204%
DR4, DR7 : 0.37831161078397285%
DR5, DR6 : 0.3359222416706203%
DR5, DR7 : 0.32582889268451%
DR6, DR7 : 0.3263431069645981%


In [19]:
def count_to_score(count):
    """
    Assumes class will have 1 - 6 votes.
    """
    mapping = {6 : 2 ** 1,
               5 : 2 ** 0,
               4 : 2 ** -1,
               3 : 2 ** -2,
               2 : 2 ** -3,
               1 : 2 ** -4,
               0 : 0}

    return np.array(list(map(lambda x: mapping[x], count)))


def classify_sample(X, pairwise_elms):
    """
    Tallies binary classifier votes for a word sample.
    Selects class with the highest score.
    """
    count = np.array([0, 0, 0, 0, 0, 0, 0])
    
    for elm, scaler, class_1, class_2 in pairwise_elms.values():

        X_scaled = scaler.transform(X)
        y_pred = elm.predict(X_scaled)
        y_pred = np.argmax(y_pred, axis=1) # frame level predictions
        y_pred_label = np.bincount(y_pred).argmax() # overall prediction

        count[(class_1, class_2)[y_pred_label]] += 1

    scores = count_to_score(count)

    return np.argmax(scores)

def eval_word(test_features, pairwise_elms):
    decisions = []
    
    for i, dialect in enumerate(test_features):
        for sample in dialect:
            decision = classify_sample(sample, pairwise_elms)
            decisions.append(1 if decision == i else 0)

    decisions = np.array(decisions)
    accuracy = np.mean(decisions)

    print(f'{accuracy * 100}')


eval_word(test_features["dark"], elms_dark)

31.210191082802545


### References

1. Garofolo, John S., et al. TIMIT Acoustic-Phonetic Continuous Speech Corpus LDC93S1. Web 
Download. Philadelphia: Linguistic Data Consortium, 1993.

2. Rizwan, M., Odelowo, B. O., & Anderson, D. V. (2016, July). Word based dialect classification using extreme learning machines. In 2016 International Joint Conference on Neural Networks (IJCNN) (pp. 2625-2629). IEEE.