In [1]:
#@title Mount drive

# Mount Google Drive under /content/gdrive/ (Colab only); the paths used by the rest of the notebook live below it.
from google.colab import drive
drive.mount('/content/gdrive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive/


In [2]:
#@title Install dependencies

!pip install python_speech_features

Collecting python_speech_features
  Downloading https://files.pythonhosted.org/packages/ff/d1/94c59e20a2631985fbd2124c45177abaa9e0a4eee8ba8a305aa26fc02a8e/python_speech_features-0.6.tar.gz
Building wheels for collected packages: python-speech-features
  Building wheel for python-speech-features (setup.py) ... [?25l[?25hdone
  Created wheel for python-speech-features: filename=python_speech_features-0.6-cp36-none-any.whl size=5887 sha256=7577a4508fc37fae8ae5b5152cee501fff3b15f084c5d821679a77a8ece9b732
  Stored in directory: /root/.cache/pip/wheels/3c/42/7c/f60e9d1b40015cd69b213ad90f7c18a9264cd745b9888134be
Successfully built python-speech-features
Installing collected packages: python-speech-features
Successfully installed python-speech-features-0.6


In [30]:
#@title Data pre-processing

import json
import pandas as pd
import python_speech_features as pss
import numpy as np
from pathlib import Path
from pysndfile import sndio
from intervaltree import IntervalTree
from sklearn.preprocessing import StandardScaler

WIN_LEN = 0.03      # analysis window length, in seconds
WIN_SHIFT = 0.01    # shift between consecutive windows, in seconds


def get_root_dir():
    """
    Returns the root directory of the project (a folder below the mounted drive).

    :return: pathlib.Path of the project root
    """
    # TODO: adjust your path here
    return Path('/', 'content', 'gdrive', 'My Drive', 'Dual Student')


def load_audio(filepath):
    """
    Reads the samples and the sampling rate of an utterance from an audio file.

    Source: lab3 of DT2119 Speech and Speaker Recognition at KTH, by prof. Giampiero Salvi (slightly modified)

    :param filepath: path to the utterance file (.wav)
    :return: (samples, sample rate), where samples is a numpy array of shape (n_samples,)
    """
    # the reader returns a sequence whose first two items are the samples and the sampling rate
    read_result = sndio.read(filepath, dtype=np.int16)
    return np.array(read_result[0]), read_result[1]


def load_transcription(filepath):
    """
    Reads the phonetic transcription of an utterance.

    Every valid line of the file has three space-separated fields: begin sample, end sample and phone. Lines with a
    different number of fields (e.g. the empty line after the last newline) are skipped.

    :param filepath: path to the transcription file (.phn), as a pathlib.Path
    :return: list of tuples (begin_sample, end_sample, phone)
    """
    with filepath.open() as handle:
        content = handle.read()

    segments = []
    for line in content.split('\n'):
        fields = line.split(' ')
        if len(fields) != 3:    # remove invalid lines
            continue
        segments.append((int(fields[0]), int(fields[1]), fields[2]))
    return segments


def get_number_of_frames(n_samples, sample_rate, win_len, win_shift):
    """
    Returns the number of frames for which the window is fully contained.

    :param n_samples: number of samples
    :param sample_rate: sampling rate
    :param win_len: window length (in seconds)
    :param win_shift: window shift (in seconds)
    :return: number of frames; 0 if the signal is shorter than a single window
    """
    # seconds -> samples
    win_len = round(win_len * sample_rate)
    win_shift = round(win_shift * sample_rate)

    # not even one full window fits: without this guard the formula below would yield a negative count
    if n_samples < win_len:
        return 0

    # integer division: exact for any signal length, no round trip through floats
    return 1 + int((n_samples - win_len) // win_shift)


def extract_features(samples, sample_rate, win_len, win_shift, win_fun=np.hamming):
    """
    Computes the MFCC of an utterance together with their first and second order deltas (13 MFCC + delta +
    delta-delta).

    :param samples: samples of the utterance, numpy array of shape (n_samples,)
    :param sample_rate: sampling rate
    :param win_len: window length (in seconds)
    :param win_shift: window shift (in seconds)
    :param win_fun: window function
    :return: numpy array of shape (n_frames, n_features), where n_features=39
    """
    static = pss.mfcc(samples, sample_rate, winlen=win_len, winstep=win_shift, winfunc=win_fun)
    first_order = pss.delta(static, 3)
    second_order = pss.delta(first_order, 3)    # delta of the delta

    # features of the same frame are placed side by side
    return np.concatenate([static, first_order, second_order], axis=1)


def extract_labels(transcription, sample_rate, n_frames, win_len, win_shift):
    """
    Assigns a phone label to every frame of an utterance. A frame takes the label of the segment that contains the
    middle sample of its window, which settles the frames that span more than one segment.

    :param transcription: phone transcription, list of tuples (begin_sample, end_sample, label)
    :param sample_rate: sampling rate
    :param n_frames: number of frames of the utterance
    :param win_len: window length (in seconds)
    :param win_shift: window shift (in seconds)
    :return: list of length n_frames
    """
    # index the segments by sample interval
    segments = IntervalTree()
    for segment in transcription:
        begin_sample, end_sample, label = segment[0], segment[1], segment[2]
        assert not segments[begin_sample:end_sample]    # no overlaps
        segments[begin_sample:end_sample] = label

    # seconds -> samples
    win_len = round(win_len * sample_rate)
    win_shift = round(win_shift * sample_rate)

    # the first window starts at the first transcribed sample; each following window is win_shift samples later
    first_mid_sample = transcription[0][0] + int(win_len / 2)
    return [segments[first_mid_sample + frame * win_shift].pop().data for frame in range(n_frames)]


def stack_acoustic_context(features, n):
    """
    For each feature vector (frame), stack the n feature vectors on its left and the n on its right to get an
    acoustic context (dynamic features). At the borders of the utterance the missing neighbours are obtained by
    mirroring the frames around the first/last frame (the border frame itself is not repeated).

    :param features: original features, numpy array of shape (n_frames, n_features)
    :param n: how many frames on the left and on the right to stack (acoustic context or dynamic features)
    :return: features with acoustic context, numpy array of shape (n_frames, (2*n+1)*n_features); the input array
        itself when n is 0
    :raises ValueError: if n is negative, or if n > 0 and n >= n_frames (mirroring needs n frames besides the
        border one)
    """
    if n < 0:
        raise ValueError('Invalid context size')
    if n == 0:
        return features
    length = features.shape[0]
    # n == length used to pass the check although only length-1 frames are available for mirroring, which produced
    # context windows of different sizes
    if n >= length:
        raise ValueError('Invalid context size')

    # indices of the frames after mirroring n frames on each side
    frame_idx = np.arange(length)
    padded_idx = np.concatenate((frame_idx[1:1 + n][::-1], frame_idx, frame_idx[-1 - n:-1][::-1]))

    # row i holds the indices of the 2n+1 frames centred on frame i
    window_idx = padded_idx[frame_idx[:, np.newaxis] + np.arange(2 * n + 1)]
    return features[window_idx].reshape(length, -1)


def normalize(train_set, test_set, mode='full'):
    """
    Standardizes the features of the dataset according to the specified mode. The utterances are updated in place.

    With mode 'full', a single scaler is fitted on all the frames of the training set and then applied to both the
    training and the test utterances.

    :param train_set: list of utterances, each utterance is a dictionary containing utterance info useful for
        normalization, feature vectors, and phone labels.
    :param test_set: list of utterances, each utterance is a dictionary containing utterance info useful for
        normalization, feature vectors, and phone labels.
    :param mode: normalization mode. Support for: 'full', 'speaker', 'utterance'.
    :return: (train_set, test_set), normalized
    """
    if mode in ('speaker', 'utterance'):
        # TODO
        raise NotImplementedError('Normalization mode ' + mode + ' not yet supported')
    if mode != 'full':
        raise ValueError('Invalid normalization mode')

    # fit scaler on the training frames only
    training_frames = np.concatenate([utterance['features'] for utterance in train_set])
    scaler = StandardScaler()
    scaler.fit(training_frames)

    # normalize
    for subset in (train_set, test_set):
        for utterance in subset:
            utterance['features'] = scaler.transform(utterance['features'])

    return train_set, test_set


def path_to_info(path):
    """
    Decodes the information about an utterance that is encoded in its path.

    Path format: .../<USAGE>/<DIALECT>/<SEX><SPEAKER_ID>/<TEXT_TYPE><SENTENCE_NUMBER>.<FILE_TYPE>
    Example: .../train/dr1/mwar0/sx415.wav

    See timit/readme.doc for an explanation of each field.

    :param path: path to the utterance file (pathlib.Path)
    :return: dictionary with utterance information
    """
    # file name: two letters of text type followed by the sentence number
    stem = path.stem
    text_type = stem[:2]
    sentence_number = int(stem[2:])

    # directories: the speaker directory starts with the sex letter
    parts = path.parts
    speaker_dir = parts[-2]
    dialect = parts[-3]
    usage = parts[-4]

    info = dict()
    info['file_type'] = path.suffix
    info['text_type'] = text_type
    info['sentence_number'] = sentence_number
    info['sex'] = speaker_dir[0]
    info['speaker_id'] = speaker_dir[1:]
    info['dialect'] = dialect
    info['usage'] = usage
    return info


def get_core_test_speakers():
    """
    Loads the speakers of the core test set from data/timit_core_test_set.json (below the project root).

    :return: dictionary (dialect -> list of speaker_id)
    """
    core_test_file = get_root_dir() / 'data' / 'timit_core_test_set.json'
    with core_test_file.open() as handle:
        speakers_by_dialect = json.load(handle)
    return speakers_by_dialect


def get_phone_mapping():
    """
    Builds two dictionaries from the phone map file (data/timit_phones_60-48-39.map below the project root, a
    tab-separated table with the columns 'origin', 'train' and 'test'):
    - (origin phone -> train label), to load targets for the model from transcriptions. Different phones can be
        mapped to the same label, as a subset of phones is used for training (48 phones).
    - (train label -> test label), to evaluate the model on a subset of the training phones (39 phones).

    Rows with a missing value are dropped, so the phones they describe do not appear in the first dictionary.

    The training and test phone subsets are chosen according to standard recipes for TIMIT.

    :return: tuple (phone_labels, evaluation_mapping), containing the described dictionaries.
    """
    # read file
    map_file = get_root_dir() / 'data' / 'timit_phones_60-48-39.map'
    with map_file.open() as handle:
        table = pd.read_csv(handle, sep='\t').dropna()

    # phone -> phone mappings
    origins = list(table['origin'])
    train_phone_of = dict(zip(origins, table['train']))
    test_phone_of = dict(zip(origins, table['test']))

    # integer labels (phones are sorted, so that every call generates the same dictionaries)
    train_labels = {phone: label for label, phone in enumerate(sorted(table['train'].unique()))}
    test_labels = {phone: label for label, phone in enumerate(sorted(table['test'].unique()))}

    # origin phone -> train label, to generate targets from transcriptions
    origin_to_train_label = {origin: train_labels[train_phone_of[origin]] for origin in origins}

    # train label -> test label, to evaluate the model using a subset of phones
    train_label_to_test_label = {
        origin_to_train_label[origin]: test_labels[test_phone_of[origin]] for origin in origins
    }

    return origin_to_train_label, train_label_to_test_label


def _preprocess_data(dataset_path, core_test=False):
    """
    Extracts features and frame-level phone labels for the utterances (.wav files, each with a .phn transcription
    next to it) found recursively under a directory.

    :param dataset_path: pathlib.Path of the directory to scan
    :param core_test: if True, only the utterances of the speakers returned by get_core_test_speakers() are kept
    :return: numpy array of utterances; each utterance is a dictionary with keys 'dialect', 'sex', 'speaker_id',
        'features' (numpy array with one row per frame) and 'labels' (numpy array with one integer per frame)
    """
    # get phone labels
    phone_labels, _ = get_phone_mapping()

    # get speakers in core test (needed only if core_test=True)
    core_test_speakers = get_core_test_speakers() if core_test else None

    # prepare dataset
    dataset = []
    # sorted, so that the order of the utterances does not depend on the file system
    for filepath in sorted(dataset_path.glob('**/*.wav')):
        info = path_to_info(filepath)

        # check sentence and speaker
        if info['text_type'] == 'sa':
            continue
        if core_test and info['speaker_id'] not in core_test_speakers[info['dialect']]:
            continue

        # load audio and transcription
        print('Processing ', filepath, '...', sep='', end=' ')
        samples, sample_rate = load_audio(filepath)
        filepath = filepath.with_suffix('.phn')
        transcription = load_transcription(filepath)

        # drop leading and trailing samples not in the transcription
        samples = samples[transcription[0][0]:transcription[-1][1]]

        # extract features and labels
        features = extract_features(samples, sample_rate, WIN_LEN, WIN_SHIFT)
        n_frames = get_number_of_frames(samples.shape[0], sample_rate, WIN_LEN, WIN_SHIFT)
        assert features.shape[0] - n_frames <= 1
        # the last frame may have the window not fully inside, we drop it (the slice used to be computed and then
        # discarded, leaving features one frame longer than labels)
        features = features[:n_frames]
        labels = extract_labels(transcription, sample_rate, n_frames, WIN_LEN, WIN_SHIFT)

        # drop frames with ignored phones as target (glottal stop /q/)
        labels = np.array([(phone_labels[label] if label in phone_labels else -1) for label in labels])
        valid_idx = np.where(labels != -1)[0]
        features = features[valid_idx]
        labels = labels[valid_idx]
        print('done')

        # add to dataset
        dataset.append({
            'dialect': info['dialect'],
            'sex': info['sex'],
            'speaker_id': info['speaker_id'],
            'features': features,
            'labels': labels
        })

    return np.array(dataset)


def _load_or_preprocess(cache_path, key, description, force_preprocess, preprocess):
    """
    Loads a set of utterances cached in an .npz file, or builds it and writes the cache when the file is missing or
    pre-processing is forced.

    :param cache_path: pathlib.Path of the .npz cache file
    :param key: name of the array inside the .npz file
    :param description: name of the set used in the progress messages
    :param force_preprocess: ignore an existing cache file
    :param preprocess: function without arguments that builds the set
    :return: numpy array of utterances
    """
    if cache_path.is_file() and not force_preprocess:
        print('Loading ' + description + '...', end=' ')
        # NOTE: allow_pickle unpickles arbitrary objects; acceptable only because the cache is written by this
        # notebook. The archive is closed as soon as the array has been read.
        with np.load(cache_path, allow_pickle=True) as archive:
            data = archive[key]
        print('done')
    else:
        print('Preparing ' + description + '...')
        data = preprocess()
        np.savez(cache_path, **{key: data})
    return data


def load_data(dataset_path, core_test=True, force_preprocess=False):
    """
    Returns training and test set containing features (13 MFCC + delta + delta-delta) and labels (phones encoded as
    integers).

    The pre-processed sets are cached as .npz files in the data directory below the project root, and loaded from
    there on the following calls.

    The split in training and test sets is the recommended one (see timit/readme.doc and timit/doc/testset.doc).

    :param dataset_path: path to the dataset. Since the TIMIT dataset is protected by copyright, it is not distributed
        with the package.
    :param core_test: whether to use the core test set (see timit/doc/testset.doc) instead of the complete test set
    :param force_preprocess: force to pre-process again, even if saved data can be loaded
    :return: tuple (train_set, test_set), where train_set and test_set are numpy arrays of utterances. Each utterance
        is a dictionary containing utterance info useful for normalization, feature vectors, and phone labels.
    :raises ValueError: if dataset_path is not a directory
    """
    dataset_path = Path(dataset_path)
    if not dataset_path.is_dir():
        raise ValueError('Invalid dataset path')

    data_dir = get_root_dir() / 'data'

    # training set
    train_set = _load_or_preprocess(data_dir / 'timit_train.npz', 'train_set', 'training set', force_preprocess,
                                    lambda: _preprocess_data(dataset_path / 'train'))

    # test set (the core and the complete test set are cached in different files)
    test_file = 'timit_' + ('core_' if core_test else '') + 'test.npz'
    test_set = _load_or_preprocess(data_dir / test_file, 'test_set', 'test set', force_preprocess,
                                   lambda: _preprocess_data(dataset_path / 'test', core_test))

    return train_set, test_set



In [33]:
#@title Data input pipeline

import tensorflow as tf

# set this according to your path
# NOTE(review): not referenced by the code visible in this notebook, which derives the dataset path from
# get_root_dir() instead -- confirm whether it is still needed
DATASET_PATH = '/content/gdrive/My Drive/Dual Student/data/timit'
BUFFER_SIZE = 1024  # size of the shuffle buffer, in utterances
BATCH_SIZE = 32     # number of utterances per padded batch

def _get_tf_dataset(dataset, padding_values, shuffle=False):
    """
    Builds a tf.data pipeline that yields padded batches of utterances.

    :param dataset: sequence of utterances, each a dictionary with 'features' (numpy array of shape
        (n_frames, n_features)) and 'labels' (numpy array of shape (n_frames,))
    :param padding_values: tuple (features padding value, labels padding value) used to pad the utterances of a
        batch to the length of the longest one
    :param shuffle: whether to shuffle the utterances (with a buffer of BUFFER_SIZE utterances)
    :return: tf.data.Dataset of batches (features, labels), of BATCH_SIZE utterances each
    """
    features = [utterance['features'] for utterance in dataset]
    labels = [utterance['labels'] for utterance in dataset]
    n_features = features[0].shape[1]

    # the number of frames changes from utterance to utterance, hence the unknown first dimension
    x_dataset = tf.data.Dataset.from_generator(lambda: features, output_types=tf.float64,
                                               output_shapes=(None, n_features))
    y_dataset = tf.data.Dataset.from_generator(lambda: labels, output_types=tf.int32, output_shapes=(None,))
    dataset = tf.data.Dataset.zip((x_dataset, y_dataset))
    # the flag used to be ignored and every dataset was shuffled, evaluation data included
    if shuffle:
        dataset = dataset.shuffle(buffer_size=BUFFER_SIZE)      # before padding, less memory used!
    dataset = dataset.padded_batch(batch_size=BATCH_SIZE, padding_values=padding_values,
                                   padded_shapes=((None, n_features), (None,)))
    return dataset


train_set, test_set = load_data(get_root_dir() / 'data' / 'timit')
train_set, test_set = normalize(train_set, test_set)

# define input pipeline
# TODO: choose padding values not present in the dataset
padding_values = (tf.constant(-50, dtype=tf.float64), tf.constant(50, dtype=tf.int32))
train_ds = _get_tf_dataset(train_set, padding_values, shuffle=True)
# the test pipeline must read the held-out utterances (it used to be built from the training set)
test_ds = _get_tf_dataset(test_set, padding_values)

Loading training set... done
Loading test set... done


In [35]:
#@title Model

%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
from numpy import newaxis


class DualStudent():
    """
    Two LSTM classifiers ("students") trained jointly with a classification loss, a consistency loss and a
    stabilization loss (Dual Student training scheme).

    The training data is loaded from disk when the object is created (see get_data).

    :param nr_of_units: units of each LSTM layer
    :param nr_of_layers: depth of each student: max(2, nr_of_layers - 1) LSTM layers followed by a softmax dense layer
    :param nr_of_classes: number of output classes
    :param student_version: "Mono_directional" (two unidirectional students) or "Imbalanced" (student1
        unidirectional, student2 bidirectional); any other value gives two bidirectional students
    :param show_summary: whether to print the summary of each student when it is built
    :param epsilon: confidence threshold used to decide whether a sample is stable
    :param lambda_1: weight of the consistency loss
    :param lambda_2: weight of the stabilization loss
    """

    def __init__(self, nr_of_units=768, nr_of_layers=6, nr_of_classes=61, student_version="Mono_directional", show_summary=True, epsilon=0.016395, lambda_1=1, lambda_2=10000):
        self.nr_of_units = nr_of_units
        self.nr_of_layers = nr_of_layers
        self.nr_of_classes = nr_of_classes
        self.student_version = student_version
        self.x = None
        self.y = None
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.epsilon = epsilon
        self.cce = tf.keras.losses.CategoricalCrossentropy()
        self.mse = tf.keras.losses.MeanSquaredError()
        self.show_summary = show_summary
        self.get_data()     # the input shape of the students is taken from the loaded data

        # (LSTM version of student1, LSTM version of student2); unknown versions fall back to two bidirectional students
        lstm_versions = {
            "Mono_directional": ("Mono_directional", "Mono_directional"),
            "Imbalanced": ("Mono_directional", "Bi_directional"),
        }.get(self.student_version, ("Bi_directional", "Bi_directional"))
        self.student1 = self.get_model("student1", lstm_version=lstm_versions[0])
        self.student2 = self.get_model("student2", lstm_version=lstm_versions[1])

        self.models = {"student1": self.student1, "student2": self.student2}

    def get_data(self):
        """
        Loads inputs and targets (two arrays saved one after the other in the same .npy file) into self.x and
        self.y. A time axis of length 1 is inserted into the inputs.

        :raises FileNotFoundError: if the data file does not exist
        """
        with open('/content/gdrive/My Drive/train_xspeech.npy', 'rb') as f:
            train_x = np.load(f)
            self.y = np.load(f)

        # (n_samples, n_features) -> (n_samples, 1, n_features); presumably the inputs are 2-D -- TODO confirm
        self.x = train_x[:, newaxis, :]

    def get_model(self, name_="", lstm_version="Mono_directional"):
        """
        Builds and compiles a student: a stack of LSTM layers followed by a softmax dense layer.

        :param name_: name of the student, appended to the model name
        :param lstm_version: "Bi_directional" to wrap every LSTM layer in a Bidirectional layer; any other value
            gives unidirectional layers
        :return: compiled tf.keras.Model
        """
        def lstm_layer(return_sequences):
            layer = tf.keras.layers.LSTM(units=self.nr_of_units, return_sequences=return_sequences)
            if lstm_version == "Bi_directional":
                layer = tf.keras.layers.Bidirectional(layer)
            return layer

        inputs = tf.keras.Input(shape=np.shape(self.x)[1:])
        x = lstm_layer(return_sequences=True)(inputs)
        for _ in range(self.nr_of_layers - 3):
            x = lstm_layer(return_sequences=True)(x)
        # the last recurrent layer returns only its final output, which feeds the classifier
        x = lstm_layer(return_sequences=False)(x)
        outputs = tf.keras.layers.Dense(units=self.nr_of_classes, activation="softmax")(x)

        model = tf.keras.Model(inputs=inputs, outputs=outputs, name=lstm_version + "_" + name_)
        if self.show_summary:
            model.summary()
            print("\n\n")
        optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, name='SGD')
        model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])
        return model

    def train(self, x=None, y=None, nr_epochs=100, batch_size=100, unlabeled_x=None):
        """
        Trains the two students.

        For each batch, the loss of a student is L_cls + lambda_1 * L_con + lambda_2 * L_sta: classification loss on
        the labeled batch, consistency loss between two noisy copies of the batch, and stabilization loss that pulls
        the student towards the other one on the unlabeled samples where the other one is the more stable.

        :param x: labeled inputs; the data loaded by get_data is used when both x and y are None
        :param y: targets of x (presumably one-hot encoded, as a categorical cross-entropy is used -- TODO confirm)
        :param nr_epochs: number of passes over the labeled data
        :param batch_size: number of labeled samples per batch
        :param unlabeled_x: unlabeled inputs; when None, the first 100 labeled inputs are used as a placeholder
        """
        # identity check: `==` on numpy arrays is element-wise and cannot be used as a condition
        if x is None and y is None:
            x = self.x
            y = self.y

        #change this when we get real data
        if unlabeled_x is None:
            unlabeled_x = x[0:100]

        self.epochs = nr_epochs
        losses = {}
        stable_samples = {}
        for epoch in range(1, self.epochs + 1):
            for i in range(int(np.ceil(np.size(x, 0) / batch_size))):
                x_batch = x[i * batch_size:(i + 1) * batch_size]
                y_true = y[i * batch_size:(i + 1) * batch_size]
                B_1 = x_batch + np.random.random(np.shape(x_batch)) * 0.1
                B_2 = x_batch + np.random.random(np.shape(x_batch)) * 0.1

                noisy_augmentation = unlabeled_x + np.random.random(np.shape(unlabeled_x))

                # persistent: the same tape is used to compute the gradients of both students
                with tf.GradientTape(persistent=True) as tape:
                    for model in self.models:

                        # Calculate L_cls on labeled samples
                        y_pred = self.models[model](x_batch)
                        loss_cls = self.cce(y_true, y_pred)

                        # Calculate L_con by Eq. 1 between B1 and B2
                        y_B_1 = self.models[model](B_1)
                        y_B_2 = self.models[model](B_2)
                        loss_con = self.mse(y_B_1, y_B_2)
                        # lambda_1 is applied exactly once (it used to be applied twice)
                        losses[model + "_loss"] = loss_cls + self.lambda_1 * loss_con

                        # Determine whether x is stable by Eq. 3
                        U_pred = self.models[model](unlabeled_x)
                        noisy_pred = self.models[model](noisy_augmentation)

                        P_i = tf.argmax(U_pred, axis=1)
                        P_j = tf.argmax(noisy_pred, axis=1)
                        M_i = tf.math.reduce_max(U_pred, axis=1)
                        M_j = tf.math.reduce_max(noisy_pred, axis=1)
                        # 1 where at least one of the two predictions is confident enough
                        M_i_j = tf.where(tf.where(M_i > self.epsilon, 1, 0) + tf.where(M_j > self.epsilon, 1, 0) > 0, 1, 0)

                        # stable: same predicted class with and without noise, and confident enough
                        stable_samples[model] = tf.where(P_i == P_j, 1, 0) * M_i_j
                        stable_samples[model + "_pred"] = U_pred
                        stable_samples[model + "_noise"] = noisy_pred

                    # R_1, R_2, R_i, R_j and R_12 does not mean the same thing as in the paper
                    # R_1: stable only for student 1, R_2: stable only for student 2, R_12: stable for both
                    R_1 = tf.where(stable_samples["student1"] - stable_samples["student2"] > 0, True, False)
                    R_2 = tf.where(stable_samples["student2"] - stable_samples["student1"] > 0, True, False)
                    R_12 = tf.where(stable_samples["student1"] + stable_samples["student2"] == 2, True, False)

                    # where both R_1 and R_2 are equal to one (R_12) measure prediction consistency with Euclidean distance
                    epsilon_i = tf.math.reduce_euclidean_norm(stable_samples["student1_pred"][R_12] - stable_samples["student1_noise"][R_12], axis=1)
                    epsilon_j = tf.math.reduce_euclidean_norm(stable_samples["student2_pred"][R_12] - stable_samples["student2_noise"][R_12], axis=1)
                    R_i = epsilon_i > epsilon_j
                    R_j = epsilon_i <= epsilon_j

                    # loss_sta for student 1
                    sample1_update1 = tf.concat([stable_samples["student1_pred"][R_2], stable_samples["student1_pred"][R_12][R_i]], axis=0)
                    sample2_update1 = tf.concat([stable_samples["student2_pred"][R_2], stable_samples["student2_pred"][R_12][R_i]], axis=0)
                    loss_sta = self.mse(sample1_update1, sample2_update1)
                    losses["student1_loss"] = losses["student1_loss"] + self.lambda_2 * loss_sta

                    # loss_sta for student 2
                    sample1_update2 = tf.concat([stable_samples["student1_pred"][R_1], stable_samples["student1_pred"][R_12][R_j]], axis=0)
                    sample2_update2 = tf.concat([stable_samples["student2_pred"][R_1], stable_samples["student2_pred"][R_12][R_j]], axis=0)
                    loss_sta = self.mse(sample1_update2, sample2_update2)
                    losses["student2_loss"] = losses["student2_loss"] + self.lambda_2 * loss_sta

                # update the model parameters
                for model in self.models:
                    trainable_vars = self.models[model].trainable_variables
                    gradients = tape.gradient(losses[model + "_loss"], trainable_vars)
                    self.models[model].optimizer.apply_gradients(zip(gradients, trainable_vars))
                # a persistent tape keeps its resources until it is deleted
                del tape

In [36]:
#@title Training

# train model
# One DualStudent is built per student configuration; each constructor loads the training data and builds (and, by
# default, prints the summary of) its two students.
models={}
for version_ in ["Mono_directional", "Imbalanced", "Bi_directional"]:
    models[version_]=DualStudent(student_version=version_)
    print("\n\n\n")

# only the pair of mono-directional students is trained here
models["Mono_directional"].train(nr_epochs=5)
# TODO: with TIMIT dataset...

# evaluate model
# TODO

FileNotFoundError: ignored