# Download the corpus

In [21]:
from os import system
import glob

In [None]:
# Downloading the audios
# wget --mirror recreates the remote directory tree below the -P prefix, so
# the .wav files end up several folders deep.
system("wget --accept wav --mirror --no-parent https://media.talkbank.org/ca/CallHome/eng/0wav/ -P ./corpus/original_audio/")
# Flatten the tree: move every downloaded file into ./corpus/original_audio/,
# the folder the Speech class builds its audio paths from.
for filename in glob.iglob('./corpus/original_audio/media.talkbank.org/ca/CallHome/eng/0wav/*'):
    system('mv {} ./corpus/original_audio/'.format(filename))
# The mirror tree is no longer needed once the files have been moved.
system('rm -r ./corpus/original_audio/media.talkbank.org')


In [None]:
# Downloading the transcripts
# Fetch the archive, then unpack it flat into the same folder.
for command in (
    'wget https://ca.talkbank.org/data/CallHome/eng.zip -P ./corpus/original_text/',
    'unzip -j ./corpus/original_text/eng.zip -d ./corpus/original_text/',
):
    system(command)
# Only the .cha transcripts are kept; everything else in the folder is deleted.
for filename in glob.iglob('./corpus/original_text/*'):
    if filename.endswith('.cha'):
        continue
    system('rm {}'.format(filename))

# Sentence level alignment

In [None]:
from os import walk, listdir
from pydub import AudioSegment
import re
import soundfile as sf

In [None]:
# Registry of all recordings, keyed by identifier
speechs = dict()


class Speech():
    """A recording: its identifier, file locations and extracted sentences."""

    def __init__(self, identifier):
        self._id = identifier
        # Audio and transcript share the identifier as their file stem
        self.audio_path = './corpus/original_audio/' + identifier + '.wav'
        self.transc_path = './corpus/original_text/' + identifier + '.cha'
        # Filled later with dicts holding 'start' (ms), 'end' and 'text'
        self.sentences = list()

In [None]:
# creating instances
# One Speech per .wav file; the identifier is the file name up to its first dot
for filename in listdir('./corpus/original_audio/'):
    if not filename.endswith('.wav'):
        continue
    _id = filename.partition('.')[0]
    speechs[_id] = Speech(_id)

In [None]:
def get_clean_lines(content):
    """Split a transcript into one cleaned string per timestamped utterance.

    Only the text between the "@Media ... audio" header line and the "@End"
    line is considered. Each returned string is the utterance text followed
    by its "\\x15...\\x15" timestamp, with newlines, tabs and the "*A:"/"*B:"
    speaker prefixes removed.

    Args:
        content: full text of a transcript file.

    Returns:
        A list of cleaned utterance strings (empty when the body holds no
        timestamp), or None when there is no "@Media ... audio" header line.
    """
    sections = re.split('@Media.*audio\n', content)
    if len(sections) < 2:
        # No "@Media ... audio" line: nothing can be aligned for this file.
        return None
    body = re.split('\n@End\n', sections[1])[0]

    # The capturing group keeps the timestamps, so the pieces alternate:
    # text, timestamp, text, timestamp, ..., trailing text.
    pieces = re.split('(\x15.*\x15)', body)

    cleaned = []
    # Pair each text piece with the timestamp that follows it; the trailing
    # text after the last timestamp has no partner and is dropped.
    for text, stamp in zip(pieces[0::2], pieces[1::2]):
        stamp = stamp.replace('*B:', ' ').replace('*A:', ' ')
        elem = text + ' ' + stamp
        # Continuation lines start with "\n\t": join them with a space
        elem = elem.replace('\n\t', ' ')
        elem = elem.replace('\n', '')
        elem = elem.replace('\t', '')
        elem = elem.replace('*A:', ' ')
        elem = elem.replace('*B:', ' ')
        # Drop the first character (the space left by the speaker prefix)
        cleaned.append(elem[1:])
    return cleaned

In [None]:
# Spotting the sentences
for speech in speechs.values():
    # Read explicitly as UTF-8, as the filtering cell does for the derived
    # text files, instead of relying on the platform default encoding.
    with open(speech.transc_path, encoding='utf-8') as transc_file:
        content = transc_file.read()

    # splitting after each timestamp
    lines = get_clean_lines(content)
    if lines is None:
        # No usable "@Media ... audio" header in this transcript
        continue

    for line in lines:
        line = line.rstrip()
        stamp_match = re.search("\x15(.*)\x15", line)
        if stamp_match is None:
            # Skip rather than stop in a debugger and carry on with a stale
            # or undefined timestamp.
            print('No timestamp found in {}: {!r}'.format(speech.transc_path, line))
            continue
        timestamp = stamp_match.group(1)
        # Timestamps look like "<start>_<end>", both in milliseconds
        bounds = timestamp.split("_")
        start = int(bounds[0])
        end = int(bounds[1])
        # Everything before the timestamp is the utterance text
        text = re.search("(.*)\x15{}".format(timestamp), line).group(1)
        # writing in the speech object
        speech.sentences.append({'start': start, 'end': end, 'text': text})
print("Sentences retrieved")

In [None]:
# Creating files
for speech in speechs.values():
    print('Cutting {}'.format(speech.audio_path))
    data, samplerate = sf.read(speech.audio_path)
    for sent in speech.sentences:
        title = "{}_{}_{}".format(speech._id, sent['start'], sent['end'])
        # Milliseconds -> sample index. Multiply before dividing so that
        # sample rates that are not a multiple of 1000 (e.g. 44100) are not
        # truncated to a whole number of samples per millisecond.
        first_sample = sent['start'] * samplerate // 1000
        last_sample = sent['end'] * samplerate // 1000
        cut = data[first_sample:last_sample]
        sf.write('./corpus/alignment/sentence_level_audio/{}.wav'.format(title), cut, samplerate)
        # Written as UTF-8 because the filtering cell reads these files as UTF-8
        with open('./corpus/alignment/sentence_level_text/{}.txt'.format(title), 'w', encoding='utf-8') as outfile:
            print(sent['text'], file=outfile)

# Filtering out utterances without fillers

In [None]:
import glob
import re

In [None]:
from os.path import basename, splitext

# Fillers are matched as whole words anywhere in the utterance, so that "um"
# does not match inside "number" and fillers in mid-sentence are found too.
pattern = re.compile(r'\b(mhm|uhhuh|mm|um|eh|em|ah|huh|ha|er|oof|hee|ach|eee|ew)\b')

# glob.glob (a list) rather than iglob: files are deleted while looping.
for file in glob.glob('./corpus/alignment/sentence_level_text/*.txt'):
    # File stem, shared by the .txt and its .wav counterpart
    filename = splitext(basename(file))[0]
    with open(file, 'r', encoding='utf-8') as infile:
        content = infile.read()
    has_filler = pattern.search(content) is not None
    if not has_filler:
        # Remove both the text and the matching audio of filler-free utterances
        system('rm {}'.format(file))
        system('rm ./corpus/alignment/sentence_level_audio/{}.wav'.format(filename))

# Word level alignment

In [None]:
from os import system

In [None]:
# Converting the audio sentences into mono
path = './corpus/alignment/sentence_level_audio/'
for filename in listdir(path):
    source = path + filename
    # SoX must not be given the same file as input and output, so the
    # downmix goes to a temporary file that then replaces the original.
    mono = path + 'mono_tmp_' + filename
    if system('sox {} {} remix 1,2'.format(source, mono)) == 0:
        system('mv {} {}'.format(mono, source))
    else:
        # Conversion failed: keep the original, drop any partial output
        system('rm -f {}'.format(mono))

In [None]:
# Preparing the setup for MFA
# (converting .txt to .lab, and moving .lab and .wav sentence level into the same folder)

# Each utterance's .wav is moved as-is, its .txt is moved and renamed to .lab
for filename in listdir('./corpus/alignment/sentence_level_audio'):
    _id = filename.partition('.')[0]
    for template in (
        'mv ./corpus/alignment/sentence_level_audio/{}.wav ./corpus/alignment/mfa_setup/{}.wav ',
        'mv ./corpus/alignment/sentence_level_text/{}.txt ./corpus/alignment/mfa_setup/{}.lab',
    ):
        system(template.format(_id, _id))
    

In [None]:
# Placeholder: replace with the installation directory of Montreal Forced Aligner
mfa_path = '/path/to/montreal-forced-aligner'
# Pronunciation lexicon handed to the aligner (variable name kept as is)
lm_path = './corpus/alignment/librispeech-lexicon.txt'
# Folder the setup cell moves the .lab/.wav pairs into
lab_wav_path = './corpus/alignment/mfa_setup'
# Folder the data-preparation cell reads the TextGrids from
output_path = './corpus/alignment/textgrids'


In [None]:
# mfa command
# Raw string: in a normal literal '\b' is a backspace character, which
# corrupts the executable path.
# Argument order follows the MFA 1.x CLI: corpus directory, dictionary,
# acoustic model, output directory -- confirm against your installed version.
system(r'{}\bin\mfa_align.exe -v {} {} english {}'.format(mfa_path, lab_wav_path, lm_path, output_path))


# Data preparation

In [None]:
import textgrids as tg
import pandas as pd

In [None]:
# Word labels counted as fillers
hesitations = [
    'mhm', 'uhhuh', 'mm', 'um', 'eh',
    'em', 'ah', 'huh', 'ha', 'er',
    'oof', 'hee', 'ach', 'eee', 'ew',
]

# Word labels counted as a pause (the empty label included)
silence = ['sp', 'sil', '']


def tag_word(word):
    """Return 'hesitation', 'silence' or 'speech' for one word label."""
    # Fillers are checked first, then pauses; anything else is speech
    for label, vocabulary in (('hesitation', hesitations), ('silence', silence)):
        if word in vocabulary:
            return label
    return 'speech'

**If you already have the pickles, you can skip ahead to the "Loading" cell at the end of this section**

In [None]:
# Empty tables: one row per word interval, and one row per file
words_df = pd.DataFrame(
    columns=['file', 'word', 'tag', 'xmin', 'xmax'],
)
files_df = pd.DataFrame(
    columns=['file', 'xmin', 'xmax'],
)

In [None]:
# Rows are collected in plain lists and each DataFrame is built once at the
# end: growing a DataFrame one .loc row at a time gets slower with every row.
file_rows = []
word_rows = []

for filename in listdir('./corpus/alignment/textgrids'):
    if not filename.endswith('.TextGrid'):
        continue
    name = filename.split('.')[0]
    the_tg = tg.TextGrid('./corpus/alignment/textgrids/'+filename)

    # One row per file: its overall time span
    file_rows.append({'file': name,
                      'xmin': the_tg.xmin,
                      'xmax': the_tg.xmax})

    # One row per interval of the 'words' tier, tagged as it is read
    for word in the_tg['words']:
        word_rows.append({'file': name,
                          'word': word.text,
                          'tag': tag_word(word.text),
                          'xmin': word.xmin,
                          'xmax': word.xmax})

files_df = pd.DataFrame(file_rows, columns=['file', 'xmin', 'xmax'])
words_df = pd.DataFrame(word_rows, columns=['file', 'word', 'tag', 'xmin', 'xmax'])

## Words dataframe

In [None]:
words_df.head()

In [None]:
# Length of each word interval, still in the TextGrid's own time unit here
# (a later cell multiplies the time columns by 1000 to get milliseconds)
words_df['duration'] = words_df['xmax']-words_df['xmin']

In [None]:
words_df['duration'].plot.box()

In [None]:
# converting to ms
# Round instead of truncating: a product that comes out as e.g.
# 289.99999999999994 because of float representation would lose 1 ms with int().
words_df['xmin'] = words_df['xmin'].apply(lambda x: int(round(x*1000)))
words_df['xmax'] = words_df['xmax'].apply(lambda x: int(round(x*1000)))
# Derived from the converted bounds so that xmin + duration == xmax always holds
words_df['duration'] = words_df['xmax'] - words_df['xmin']

In [None]:
words_df.groupby('tag').count().file

## Files dataframe

In [None]:
files_df.head()

In [None]:
# Length of each file, still in the TextGrid's own time unit here
# (a later cell multiplies the time columns by 1000 to get milliseconds)
files_df['duration'] = files_df['xmax']-files_df['xmin']

In [None]:
# converting to ms
# Round instead of truncating: a product that comes out as e.g.
# 289.99999999999994 because of float representation would lose 1 ms with int().
files_df['xmin'] = files_df['xmin'].apply(lambda x: int(round(x*1000)))
files_df['xmax'] = files_df['xmax'].apply(lambda x: int(round(x*1000)))
# Derived from the converted bounds so that xmin + duration == xmax always holds
files_df['duration'] = files_df['xmax'] - files_df['xmin']

## Saving

In [None]:
# Persist both tables so the TextGrid parsing can be skipped on later runs
words_df.to_pickle('./pickles/words_df.pkl')
files_df.to_pickle('./pickles/files_df.pkl')

## Loading

In [None]:
# Restore the tables written by the saving cell.
# NOTE(review): unpickling can execute code embedded in the file -- only
# load pickles that you produced yourself or fully trust.
words_df = pd.read_pickle('./pickles/words_df.pkl')
files_df = pd.read_pickle('./pickles/files_df.pkl')

# Framing audios

In [None]:
import torch

In [None]:
def tag_decision_med(filename, xmin, xmax):
    """Return the tag of the word of `filename` that covers the window midpoint.

    The midpoint of [xmin, xmax] is truncated to an int and looked up in the
    global words_df; the first row whose [xmin, xmax] interval contains it
    decides the tag. The lookup fails (iloc[0] on an empty result) when no
    row matches.
    """
    midpoint = int((xmin + xmax) / 2)
    condition = "file=='{}' and {} >= xmin and {} <= xmax ".format(filename, midpoint, midpoint)
    covering = words_df.query(condition)
    return covering.iloc[0]['tag']

In [None]:
# Integer class id of every tag, numbered in this order; 0 is '<dummy>'
tag2int = {
    label: index
    for index, label in enumerate(('<dummy>', 'speech', 'silence', 'hesitation'))
}

In [None]:
# NOTE(review): sampling_rate is not referenced by any other visible cell
sampling_rate = 16000
# Step between two consecutive frames, compared against files_df['duration']
win_shift = 12.5
# Frame length; equal to the shift, so consecutive frames do not overlap
win_len = 12.5
numb_files = len(files_df)
# Number of whole frames in the longest file
max_len = int(files_df['duration'].max() // win_len)

**Skip the following cells (up to "Loading") if you already have `Y.pt`**

In [None]:
# One row per file, one column per frame. Cells that are never written stay
# 0, which is the '<dummy>' id in tag2int.
Y = torch.zeros((numb_files,max_len))

In [None]:
# Tag every frame of every file with the class of the word at its midpoint.
for i, (filename, duration) in enumerate(zip(files_df['file'], files_df['duration'])):
    j = 0
    head = 0
    while head + win_shift <= duration:
        try:
            Y[i][j] = tag2int[tag_decision_med(filename, head, head + win_len)]
        except (IndexError, KeyError):
            # No word covers this frame's midpoint, or its tag is unknown:
            # leave the cell at 0 ('<dummy>') and move on. The window must
            # still advance, otherwise the same frame fails forever.
            pass
        head += win_shift
        j += 1
    print('{} / {}'.format(i + 1, numb_files))

In [None]:
print(Y.shape)

In [None]:
print(Y)

In [None]:
torch.unique(Y, return_counts = True)

## Saving

In [None]:
# The tag tensor built above is named Y (there is no `tags` variable); it is
# saved under the path the loading cell reads back.
torch.save(Y,'./pickles/Y.pt')

## Loading

In [None]:
# Reload the tag tensor written by the saving cell.
# NOTE(review): torch.load unpickles the file -- only load files you trust.
Y = torch.load('./pickles/Y.pt')

# MFCC Extraction

In [None]:
import torchaudio

In [None]:
# One MFCC tensor per file, in the order of files_df
mfcc_tensors = []

for audiofile in files_df['file']:
    wav_path = './corpus/alignment/mfa_setup/'+audiofile+'.wav'
    waveform, sample_rate = torchaudio.load(wav_path)
    # NOTE(review): the transform is built with its default settings; the
    # sample_rate returned by load() is not passed to it -- confirm they agree.
    extractor = torchaudio.transforms.MFCC(n_mfcc=13)
    torch.set_printoptions(sci_mode=False)
    mfcc_tensors.append(extractor(waveform))

In [None]:
# Zero-pad every MFCC tensor on its last axis to a common length
padded_tensors = []

for tensor in mfcc_tensors:
    n_frames = tensor.shape[2]
    # Canvas of max_len+1 frames; the MFCC fills its first n_frames frames
    target = torch.zeros(1, 13, max_len+1)
    target[:, :, :n_frames] = tensor
    padded_tensors.append(target)

In [None]:
# Stack the per-file (1, 13, max_len+1) tensors along the first axis
X = torch.cat((padded_tensors), dim=0)

In [None]:
X.shape

In [None]:
# Keep the first max_len entries of axis 2, so X has as many frames as Y has columns
X = torch.narrow(X, 2, 0, max_len)

In [None]:
X.shape

In [None]:
Y.shape

In [None]:
# Insert a singleton axis at position 1; the training section removes it
# again with squeeze(1) after loading
Y = Y.unsqueeze(1)

In [None]:
Y.shape

## Saving

In [None]:
# Persist features and tags; this overwrites the Y.pt saved in the framing section
torch.save(X,'./pickles/X.pt')
torch.save(Y,'./pickles/Y.pt')

## Loading

In [None]:
# Reload features and tags written by the saving cell.
# NOTE(review): torch.load unpickles the files -- only load files you trust.
X = torch.load('./pickles/X.pt')
Y = torch.load('./pickles/Y.pt')

# RNN Tagging

In [None]:
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
import torch.optim as optim
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Reload the saved tensors; Y was stored with an extra axis at position 1,
# which is removed here.
# NOTE(review): torch.load unpickles the files -- only load files you trust.
X = torch.load('./pickles/X.pt')
Y = torch.load('./pickles/Y.pt').squeeze(1)

In [None]:
print(X.shape)
print(Y.shape)

In [None]:
# Batch size used by both data loaders
BATCH_SIZE = 32
# Length of X's last axis
N_STEPS = X.shape[2]
# Same value as n_mfcc in the MFCC extraction cell
N_INPUTS = 13
# Hidden size handed to nn.RNN
N_NEURONS = 150
# One output per entry of tag2int, '<dummy>' included
N_OUTPUTS = 4
# Number of training epochs (spelling kept: the training loop uses this name)
N_EPHOCS = 50
# Learning rate given to Adam
LR = 0.05

In [None]:
# Hold out 20 % of the files for validation, with a fixed seed
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

# Pair features with tags, then batch them; only the training set is shuffled
train_set, valid_set = TensorDataset(X_train, Y_train), TensorDataset(X_valid, Y_valid)

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE)

In [None]:
class RNN(nn.Module):
    """Single-layer nn.RNN followed by a linear layer applied at every step.

    Args:
        batch_size: initial batch size (overwritten by each forward call).
        n_steps: sequence length; stored but not used by the layers.
        n_inputs: number of features per step.
        n_neurons: hidden size of the recurrent layer.
        n_outputs: number of classes scored at each step.
    """

    def __init__(self, batch_size, n_steps, n_inputs, n_neurons, n_outputs):
        super(RNN, self).__init__()

        self.n_neurons = n_neurons
        self.batch_size = batch_size
        self.n_steps = n_steps
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs

        self.basic_rnn = nn.RNN(self.n_inputs, self.n_neurons)

        self.FC = nn.Linear(self.n_neurons, self.n_outputs)

    def init_hidden(self,):
        """Return a zero hidden state of shape (num_layers, batch_size, n_neurons)."""
        # Created on the same device as the weights so the model keeps
        # working after model.to(device).
        return torch.zeros(1, self.batch_size, self.n_neurons,
                           device=self.FC.weight.device)

    def forward(self, X):
        """Score every step of every sequence.

        Args:
            X: tensor of shape (batch, n_steps, n_inputs).

        Returns:
            Tensor of shape (batch * n_steps, n_outputs). Rows are ordered
            batch-major (all steps of sample 0, then sample 1, ...), which is
            the order produced by labels.view(-1) on (batch, n_steps) labels.
        """
        # nn.RNN without batch_first expects (n_steps, batch, n_inputs)
        X = X.permute(1, 0, 2)

        self.batch_size = X.size(1)
        self.hidden = self.init_hidden()

        rnn_out, self.hidden = self.basic_rnn(X, self.hidden)
        out = self.FC(rnn_out)  # (n_steps, batch, n_outputs)

        # Back to batch-major before flattening; flattening the time-major
        # tensor directly would interleave the samples and misalign the
        # logits with the flattened labels.
        out = out.permute(1, 0, 2)
        return out.reshape(-1, self.n_outputs)

In [None]:
# Device
# NOTE(review): nothing in the visible cells moves the model or the batches
# to this device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Model instance
model = RNN(BATCH_SIZE, N_STEPS, N_INPUTS, N_NEURONS, N_OUTPUTS)
# Frames labelled 0 ('<dummy>' padding) do not contribute to the loss.
# reduction='mean' is the current spelling of the deprecated size_average=True.
criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LR)

def get_accuracy(logit, target, batch_size):
    ''' Return 100 * (number of correct predictions) / batch_size as a float '''
    # Predicted class = index of the largest logit in each row
    predicted = torch.max(logit, 1)[1].view(target.size())
    n_correct = (predicted.data == target.data).sum()
    return (100 * n_correct / batch_size).item()

In [None]:
for epoch in range(N_EPHOCS):  # loop over the dataset multiple times
    train_running_loss = 0.0
    train_acc = 0.0
    outs = []
    golds = []
    model.train()

    # TRAINING ROUND
    for i, data in enumerate(train_loader):
        # zero the parameter gradients
        optimizer.zero_grad()

        # get the inputs
        inputs, labels = data
        # X is stored as (batch, n_mfcc, frames). Swap the last two axes to
        # get (batch, frames, n_mfcc); view() would only reinterpret the
        # memory and mix coefficients from different frames.
        inputs = inputs.transpose(1, 2)
        targets = labels.long().view(-1)

        # forward + backward + optimize (the model resets its hidden state
        # itself at the start of every forward pass)
        outputs = model(inputs)

        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        train_running_loss += loss.detach().item()
        # Use the real size of this batch: the last one may be smaller
        train_acc += get_accuracy(outputs, targets, labels.size(0))

        outs.append(torch.max(outputs, 1)[1].view(targets.size()).data)
        golds.append(targets.data)

    model.eval()
    # Average over the number of batches, not over the last batch index
    n_batches = len(train_loader)
    print('Epoch:  %d | Loss: %.4f | Train Accuracy: %.2f'
          %(epoch+1, train_running_loss/n_batches, train_acc/(n_batches*N_STEPS)))

    y_pred = torch.cat(outs)
    y_gold = torch.cat(golds)

    # Class 0 ('<dummy>' padding) is left out of the reports
    print(confusion_matrix(y_gold, y_pred, labels=[1, 2, 3]))
    print(classification_report(y_gold, y_pred, labels=[1, 2, 3]))

In [None]:
test_acc = 0.0
outs = []
golds = []

# Evaluation only: no gradients are needed
model.eval()
with torch.no_grad():
    for i, data in enumerate(valid_loader, 0):
        inputs, labels = data
        # X is stored as (batch, n_mfcc, frames). Swap the last two axes to
        # get (batch, frames, n_mfcc); view() would only reinterpret the
        # memory and mix coefficients from different frames.
        inputs = inputs.transpose(1, 2)
        targets = labels.long().view(-1)

        outputs = model(inputs)

        # Use the real size of this batch: the last one may be smaller
        test_acc += get_accuracy(outputs, targets, labels.size(0))
        outs.append(torch.max(outputs, 1)[1].view(targets.size()).data)
        golds.append(targets.data)

# Average over the number of batches, not over the last batch index
n_batches = len(valid_loader)
print('Test Accuracy: %.2f'%(test_acc/(n_batches*N_STEPS)))

y_pred = torch.cat(outs)
y_gold = torch.cat(golds)

# Class 0 ('<dummy>' padding) is left out of the reports
print(confusion_matrix(y_gold, y_pred, labels=[1, 2, 3]))
print(classification_report(y_gold, y_pred, labels=[1, 2, 3]))

In [None]:
# `sns` is used below but never imported in the visible cells
import seaborn as sns

# Hard-coded confusion counts -- presumably copied from a confusion_matrix
# printout; TODO confirm which run they come from.
cm1 = pd.DataFrame({'speech':[307140,673,894],'silence':[67293,146,147],'filler':[77599,198,292]}, index=["speech", "silence", "filler"])
# After the transpose each dict key becomes a row (labelled "true label")
matrix = sns.heatmap(cm1.T, annot=True, fmt='d', linewidths=.5, cmap='coolwarm')
matrix.set(xlabel='predicted label', ylabel='true label')

In [None]:
# `sns` is used below but never imported in the visible cells
import seaborn as sns

# Hard-coded confusion counts -- presumably copied from a confusion_matrix
# printout; TODO confirm which run they come from.
cm = pd.DataFrame({'speech':[38602,0,0],'silence':[8888,0,0],'filler':[11674,0,0]}, index=["speech", "silence", "filler"])
# After the transpose each dict key becomes a row (labelled "true label")
matrix = sns.heatmap(cm.T, annot=True, fmt='d', linewidths=.5, cmap='coolwarm')
matrix.set(xlabel='predicted label', ylabel='true label')