# American Dialect Classification with Extreme Learning Machines

This notebook is my implementation of the approach described in *"Word Based Dialect Classification using Extreme Learning Machines"* by Rizwan et al. (2016).

The acoustic characteristics that define American dialects have been a long-standing interest of mine, and building a classifier seemed like a fun way to explore this. However, I struggled with the limitations that TIMIT presented and wasn't able to make much progress with my models. To overcome this, I decided to work through this paper as a learning exercise: to deepen my understanding of the techniques the authors used, and to serve as a useful reference point for experimenting with other approaches.


| Region # | Region name              | # Speakers |
|----------|--------------------------|------------|
| DR1      | New England              | 49         |
| DR2      | Northern                 | 102        |
| DR3      | North Midland            | 102        |
| DR4      | South Midland            | 100        |
| DR5      | Southern                 | 98         |
| DR6      | New York City            | 46         |
| DR7      | Western                  | 100        |

### Feature Extraction

In [1]:
import glob

DATA_PATH = "../data/processed/words"
N_REGIONS = 7  # ignore army brat

words = ["dark", "water", "greasy", "suit", "wash"]

# Both dicts share one layout: files[word] is a list with one entry per
# dialect region (DR1 first), and each entry is the list of wav paths for
# that word in that region.
train_files = {word: [] for word in words}
test_files = {word: [] for word in words}

for word in words:
    for i in range(1, N_REGIONS + 1):
        train_list = glob.glob(f'{DATA_PATH}/TRAIN/DR{i}/*/*_{word}.wav')
        test_list = glob.glob(f'{DATA_PATH}/TEST/DR{i}/*/*_{word}.wav')

        train_files[word] += [train_list]
        test_files[word] += [test_list]

# peek at the first 5 "dark" recordings found for DR1
train_files["dark"][0][:5]

['../data/processed/words/TRAIN/DR1/FETB0/FETB0_SA1_0003_dark.wav',
 '../data/processed/words/TRAIN/DR1/MMRP0/MMRP0_SA1_0003_dark.wav',
 '../data/processed/words/TRAIN/DR1/FMEM0/FMEM0_SA1_0003_dark.wav',
 '../data/processed/words/TRAIN/DR1/FSMA0/FSMA0_SA1_0003_dark.wav',
 '../data/processed/words/TRAIN/DR1/FJSP0/FJSP0_SA1_0003_dark.wav']

In [11]:
import librosa
import numpy as np

# Analysis settings shared by the feature extractor.
sr = 16000                   # sampling rate used when loading audio
n_mfcc = 12                  # number of MFCC rows requested from librosa
hop_length = int(0.01 * sr)  # 10 ms frame shift, in samples
n_fft = int(0.025 * sr)      # 25 ms frame, in samples
win_length = n_fft           # window covers the whole FFT frame

def extract_features(wav):
    """
    Extract MFCC + log-energy + delta + delta-delta for one wav file.

    The audio is loaded at `sr`, peak-normalised and analysed with the
    module-level frame settings (`n_fft`, `hop_length`, `win_length`) and
    a Hamming window. Every feature row is then mean/variance normalised
    over the frames of this file.

    Args:
        wav: audio file to load; passed straight to librosa.load.

    Returns:
        Array of shape (39, T): 13 static rows (12 MFCC + log-energy)
        followed by their deltas and their delta-deltas.
    """
    y, _ = librosa.load(wav, sr=sr)

    # peak normalization; an all-zero signal is left as it is
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    # NOTE(review): librosa's MFCC rows presumably start at coefficient 0,
    # which is itself energy-like -- confirm this is the intended
    # "12 MFCC + energy" layout.
    mfcc = librosa.feature.mfcc(y=y,
                                sr=sr,
                                n_mfcc=n_mfcc,
                                n_mels=26,
                                n_fft=n_fft,
                                hop_length=hop_length,
                                win_length=win_length,
                                window="hamming",
                                norm=None) # (12, T)

    # log energy, measured with the same window and window length as the
    # MFCCs (librosa.stft would otherwise fall back to its default window)
    S = librosa.stft(y,
                     n_fft=n_fft,
                     hop_length=hop_length,
                     win_length=win_length,
                     window="hamming")
    energy = np.sum(np.abs(S) ** 2, axis=0)
    log_energy = np.log(energy + 1e-8)  # offset keeps silent frames finite

    # concatenate to get 13 cepstral features
    mfcc = np.vstack([mfcc, log_energy])

    delta = librosa.feature.delta(mfcc, width=5, order=1)
    delta2 = librosa.feature.delta(mfcc, width=5, order=2)

    # concatenate to get (39, T)
    features = np.vstack([mfcc, delta, delta2])

    # cmvn: per-row statistics over the frames of this file only
    mean = features.mean(axis=1, keepdims=True)
    std = features.std(axis=1, keepdims=True) + 1e-8  # avoid division by zero
    features = (features - mean) / std

    return features


def get_word_features(files):
    """
    Run extract_features over every wav file of a files[word][dialect] dict.

    Args:
        files: dict mapping each word to a list with one entry per
            dialect, each entry being a list of wav files.

    Returns:
        dict with the same word -> dialect nesting in which every wav
        file is replaced by its features, transposed to shape (T_i, 39)
        so that rows are frames.
    """
    return {
        word: [
            # extract_features gives (39, T); .T makes it (T, 39)
            [extract_features(wav).T for wav in wavs_per_dialect]
            for wavs_per_dialect in files[word]
        ]
        for word in files
    }


# Extract features for both splits up front; the resulting dicts keep the
# files[word][dialect] layout, with one (T, 39) array per wav file.
train_features = get_word_features(train_files)
test_features = get_word_features(test_files)

In [12]:
# Sanity check on the "dark" training samples of the first dialect (DR1).
# len should be n speakers for dialect
print(len(train_features["dark"][0]))

# shape should be (n frames, 39)
print(train_features["dark"][0][0].shape)

38
(33, 39)


### Single Model Experiment

In [59]:
from hpelm import ELM
import os
import sys
from contextlib import redirect_stdout
from sklearn.preprocessing import StandardScaler

# Input width of every ELM: (n_mfcc + 1) static rows (the extra one is
# log-energy), tripled by the delta and delta-delta rows.
n_features = (n_mfcc + 1) * 3 # 39

def append_class_samples(features_1, features_2):
    """
    Stack the frames of two classes into one labelled frame-level data set.

    Args:
        features_1: list of per-sample arrays for class 0, rows are frames.
        features_2: list of per-sample arrays for class 1, rows are frames.

    Returns:
        (X, y): X is every sample stacked row-wise, class 0 first;
        y holds one label (0 or 1) per row of X.
    """
    frame_blocks = []
    label_blocks = []

    for label, samples in enumerate((features_1, features_2)):
        for frames in samples:
            frame_blocks.append(frames)
            # every frame inherits the label of the sample it came from
            label_blocks.append(np.full(frames.shape[0], label))

    return np.vstack(frame_blocks), np.hstack(label_blocks)

def onehot(y, num_classes=2):
    """
    One-hot encode integer class labels.

    Args:
        y: array-like of integer labels, each expected in
            [0, num_classes). Lists are accepted and multi-dimensional
            input is flattened.
        num_classes: number of columns of the encoding.

    Returns:
        Float array of shape (number of labels, num_classes) holding a
        single 1 per row, in the column given by that row's label.
    """
    labels = np.asarray(y).ravel()  # accept lists and non-1-D input
    y_onehot = np.zeros((labels.size, num_classes))
    y_onehot[np.arange(labels.size), labels] = 1

    return y_onehot

def get_weights(class_1, class_2):
    """
    Inverse class frequency weights, counted in frames.

    Each class gets total_frames / (2 * frames_in_class), so the class
    with fewer frames receives the larger weight.

    Returns:
        Array of two weights, ordered (class_1, class_2).
    """
    frame_counts = [
        sum(sample.shape[0] for sample in samples)
        for samples in (class_1, class_2)
    ]
    total = sum(frame_counts)
    return np.array([total / (2 * n_frames) for n_frames in frame_counts])

# Single binary experiment on the word "dark": first dialect (DR1, index 0)
# against the second (DR2, index 1), scored per frame.
class_1 = train_features["dark"][0] # list of all class 1 samples, each shape (T, 39)
class_2 = train_features["dark"][1]
# one weight per class, inversely proportional to its number of frames
weights = get_weights(class_1, class_2)

# stack every frame into one matrix; labels are 0 for class_1 and 1 for class_2
X_train, y_train = append_class_samples(class_1, class_2)
X_test, y_test = append_class_samples(test_features["dark"][0],
                                      test_features["dark"][1])

# scale features (the scaler is fitted on the training frames only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# train
elm_d1_d2 = ELM(n_features, 2)
elm_d1_d2.add_neurons(500, "sigm")
# whatever the training call prints is redirected to os.devnull
# NOTE(review): 'CV', 'wc', w and k are hpelm training options -- presumably
# 10-fold cross-validation with class-weighted classification; confirm
# against the hpelm documentation.
with open(os.devnull, "w") as f, redirect_stdout(f):
    error = elm_d1_d2.train(X_train, onehot(y_train), 'CV', 'wc', w=weights, k=10)

# eval
y_pred = elm_d1_d2.predict(X_test)
# the predicted class of a frame is the output column with the larger value
y_pred_labels = np.argmax(y_pred, axis=1)

# fraction of test frames whose predicted label matches the true label
accuracy = np.mean(y_pred_labels == y_test)

print(f"Frame level accuracy: {accuracy * 100:.2f}%\nError: {error}")

Frame level accuracy: 65.30%
Error: 0.4542728329086879


### Single Word Pairwise ELM Classification

In [60]:
def build_pairwise_elms(word, log=False, n_neurons=1000):
    """
    Train one binary ELM for every pair of dialect regions on a single word.

    Args:
        word: key into train_features selecting the word to train on.
        log: when True, print the value returned by training for each pair.
        n_neurons: number of sigmoid hidden neurons given to each ELM.
            Defaults to 1000, the value that used to be hard-coded.

    Returns:
        dict mapping "DR<i+1>, DR<j+1>" to (elm, scaler, i, j), where
        i < j are zero-based region indices. Region i is encoded as
        class 0 and region j as class 1; scaler is the StandardScaler
        fitted on that pair's training frames and must be applied to any
        input given to elm.
    """
    from itertools import combinations

    pairwise_elms = {}

    # combinations() yields (0, 1), (0, 2), ... in the same order as the
    # nested "for i / for j in range(i + 1, ...)" loops would
    for i, j in combinations(range(N_REGIONS), 2):

        elm_name = f'DR{i + 1}, DR{j + 1}'

        class_1_features = train_features[word][i] # 0
        class_2_features = train_features[word][j] # 1

        X, y = append_class_samples(class_1_features, class_2_features)

        # handle class imbalance
        weights = get_weights(class_1_features, class_2_features)

        # scale
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        y_oh = onehot(y)

        elm = ELM(n_features, 2)
        elm.add_neurons(n_neurons, "sigm")
        # whatever the training call prints is redirected to os.devnull
        with open(os.devnull, "w") as f, redirect_stdout(f):
            error = elm.train(X_scaled, y_oh, 'CV', 'wc', w=weights, k=10)

        pairwise_elms[elm_name] = (elm, scaler, i, j)

        if log:
            print(f'{elm_name} : {error}')

    return pairwise_elms

In [61]:
def count_to_score(count):
    """
    Convert per-class vote counts into scores.

    Every count must be an integer from 0 to 6. Zero votes score 0;
    one vote scores 2 ** -4 and each further vote doubles the score,
    up to 2 ** 1 for six votes. Any other count raises KeyError.
    """
    mapping = {votes: 2 ** (votes - 5) for votes in range(1, 7)}
    mapping[0] = 0

    return np.array([mapping[votes] for votes in count])


def classify_sample(X, pairwise_elms):
    """
    Tallies binary classifier votes for a word sample.
    Selects class with the highest score.

    Args:
        X: frames of one word sample, in the unscaled form the pairwise
            scalers were fitted on.
        pairwise_elms: dict whose values are (elm, scaler, class_1,
            class_2) tuples, as built by build_pairwise_elms.

    Returns:
        Zero-based index of the region with the highest score; on a tie
        np.argmax picks the lowest index.
    """
    # one tally per region, sized from N_REGIONS so it always matches the
    # class indices stored alongside each ELM
    count = np.zeros(N_REGIONS, dtype=int)

    for elm, scaler, class_1, class_2 in pairwise_elms.values():

        # each ELM was trained on data scaled by its own scaler
        X_scaled = scaler.transform(X)

        y_pred = elm.predict(X_scaled)
        y_pred = np.argmax(y_pred, axis=1) # frame level predictions
        # majority vote over frames: 0 means class_1, 1 means class_2
        y_pred_label = np.bincount(y_pred).argmax() # overall prediction

        count[(class_1, class_2)[y_pred_label]] += 1

    scores = count_to_score(count)

    return np.argmax(scores)

def eval_word(test_features, pairwise_elms):
    """
    Word-level accuracy of a set of pairwise ELMs, in percent.

    Args:
        test_features: list with one entry per dialect, in class-index
            order; each entry is the list of word samples of that dialect.
        pairwise_elms: classifiers handed on to classify_sample.

    Returns:
        Percentage of samples whose predicted class equals the index of
        the dialect they were listed under.
    """
    hits = [
        1 if classify_sample(sample, pairwise_elms) == true_class else 0
        for true_class, samples in enumerate(test_features)
        for sample in samples
    ]

    return np.mean(np.array(hits)) * 100

In [62]:
# Train every pairwise ELM for "dark" (printing the training result of each
# pair), then print the word-level test accuracy in percent.
elms_dark = build_pairwise_elms("dark", True)
print(eval_word(test_features["dark"], elms_dark))

DR1, DR2 : 0.4062790112563349
DR1, DR3 : 0.4050611536502899
DR1, DR4 : 0.36183703462133837
DR1, DR5 : 0.3806832387355392
DR1, DR6 : 0.3861589507268691
DR1, DR7 : 0.3946966015210175
DR2, DR3 : 0.42102999833842986
DR2, DR4 : 0.35621334859521564
DR2, DR5 : 0.35060531435948705
DR2, DR6 : 0.38500665612104135
DR2, DR7 : 0.40619071135784657
DR3, DR4 : 0.4161590602975426
DR3, DR5 : 0.3784060665526682
DR3, DR6 : 0.40339433196902763
DR3, DR7 : 0.43583159716432196
DR4, DR5 : 0.44817176256575125
DR4, DR6 : 0.4045036042897037
DR4, DR7 : 0.4007569525794503
DR5, DR6 : 0.44814528019306776
DR5, DR7 : 0.3578442984406571
DR6, DR7 : 0.42576190976745865
24.840764331210192


In [63]:
# Rebuild and re-evaluate the "dark" classifiers 10 times, then summarise
# how the word-level test accuracy (in percent) spreads across the runs.
accuracies = []

for i in range(10):
    elms = build_pairwise_elms("dark")
    accuracy = eval_word(test_features["dark"], elms)

    accuracies.append(accuracy)

print(f'Mean: {np.mean(accuracies)} \nStd: {np.std(accuracies)} \nMax: {max(accuracies)} \nMin: {min(accuracies)} \n')

Mean: 30.95541401273885 
Std: 2.5027876056545857 
Max: 35.6687898089172 
Min: 26.751592356687897 



### References

1. Garofolo, John S., et al. TIMIT Acoustic-Phonetic Continuous Speech Corpus LDC93S1. Web 
Download. Philadelphia: Linguistic Data Consortium, 1993.

2. Rizwan, M., Odelowo, B. O., & Anderson, D. V. (2016, July). Word based dialect classification using extreme learning machines. In 2016 International Joint Conference on Neural Networks (IJCNN) (pp. 2625-2629). IEEE.