In [None]:
import numpy as np
from tqdm import tqdm
from time import time
import json, pickle, os, string, tqdm, kenlm, json
from collections import defaultdict, Counter
from itertools import groupby
import Levenshtein as Lev

In [None]:
#s1 = True text
#s2 = predicted text

def wer_(s1, s2):
    """
    Word-level edit distance between two sentences.

    Both sentences are split on whitespace, every distinct word is mapped to
    its own single character, and the Levenshtein distance between the two
    encoded strings is returned. The result is the raw number of word edits;
    no normalisation by the sentence length happens here.

    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    tokens1, tokens2 = s1.split(), s2.split()

    # give every distinct word an integer id
    word2char = {}
    for idx, word in enumerate(set(tokens1 + tokens2)):
        word2char[word] = idx

    # the Levenshtein package only accepts strings, so each word is encoded
    # as the character whose code point equals its id
    encoded1 = ''.join(chr(word2char[word]) for word in tokens1)
    encoded2 = ''.join(chr(word2char[word]) for word in tokens2)

    return Lev.distance(encoded1, encoded2)

def cer_(s1, s2):
    """
    Character-level edit distance between two sentences.

    Space characters are removed from both sentences before the Levenshtein
    distance is computed. The result is the raw number of character edits;
    no normalisation by the sentence length happens here.

    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # splitting on ' ' and re-joining drops every space character
    chars1 = ''.join(s1.split(' '))
    chars2 = ''.join(s2.split(' '))

    return Lev.distance(chars1, chars2)




#When using the above implementation, use the code below to calculate the WER in percentage:
#pred = list of output predictions of the model (the decoded text) # example [" MY NAME IS HEMANT", " I AM A GOD"]
#Note: wer_ returns a raw edit distance, so the average below is only a true percentage once it is divided by the number of reference words and multiplied by 100.
# total_wer = 0
# for x in range(len(pred)):
#     transcript, reference = data_[x][1], pred[x]
#     wer_inst = wer_(transcript, reference)
#     total_wer += float(wer_inst)
# print("WER is : ",total_wer/len(pred),"%")


In [None]:
def ctc_best_path(out,labels):
    """
    Implements best path (greedy) CTC decoding as shown by Graves.

    For every row of ``out`` the highest-scoring label is taken, consecutive
    repeats are collapsed into one, and finally the blank symbol "_" is
    removed.

    Arguments:
        out: 2-D array indexed as out[row][label_index]; rows are presumably
            time steps -- TODO confirm against the model output
        labels: string of labels, where "_" is the CTC blank
    Returns:
        The decoded string ('' when ``out`` has no rows).
    """
    # The previous version filtered with `i != labels[-1]`, which compared an
    # integer index with a label character and therefore never removed
    # anything. No filtering is wanted here anyway: blanks must survive until
    # after the collapse so that genuine double letters separated by a blank
    # are kept.
    best_chars = [labels[i] for i in np.argmax(out, axis=1)]
    collapsed = "".join(char for char, _ in groupby(best_chars))
    return collapsed.replace("_","")

In [None]:
# Greedy decoding of a single CTC output.
# NOTE(review): `out` and `labels` are not assigned in any earlier cell visible
# here, so this cell raises NameError on a fresh kernel -- confirm where they
# are meant to come from.
gred_txt = ctc_best_path(out,labels)

### WORD LM Implementation

In [None]:
# lm_w = kenlm.LanguageModel('/home/hemant/deep/lm/libri_lm/3-gram.binary')

In [None]:
def sort_beam(ptot,k):
    """
    Return at most ``k`` beams from ``ptot``, highest score first.

    Arguments:
        ptot: mapping of beam (string) -> score
        k: beam width, i.e. the maximum number of beams to keep
    Returns:
        List of beams ordered by descending score. Beams with equal score
        keep their insertion order.
    """
    # Sort on the score only. The earlier implementation inverted the mapping
    # to {score: beam}, which silently dropped beams sharing a score, and it
    # returned the beams unsorted when fewer than k were available, so the
    # first element was not guaranteed to be the best beam.
    ranked = sorted(ptot.items(), key=lambda item: item[1], reverse=True)
    return [beam for beam, _ in ranked[:k]]

#using WORD LM
def ctc_beam_search(out,labels, prune=0.0001, k=20, lm=None,alpha=0.3,beta=12):
    """
    Implements CTC Prefix Search Decoding Algorithm as shown by Graves,
    with an optional word language model applied at word boundaries.

    Arguments:
        out: ctc output, indexed as out[t][label_index]; the scores are
            multiplied together, so they are presumably probabilities rather
            than log-probabilities -- TODO confirm
        labels: string of labels ("_" is the blank, " " the word separator)
        prune: labels whose score at a step is not greater than this value
            are skipped at that step
        k: beam width
        lm: word language model exposing kenlm's ``full_scores``; when None
            no language-model weighting is applied
        alpha: exponent applied to the language-model probability
        beta: exponent of the (len(beam) + 1) length bonus
    Returns:
        The highest-scoring beam after the last step, or '' when every
        candidate was pruned away.
    """
    F = out.shape[1]
    # Prepend an all-zero frame so that real frames start at t = 1 and
    # t - 1 = 0 refers to the initial state set up below.
    out = np.vstack((np.zeros(F), out))
    steps = out.shape[0]

    # pb[t][beam] / pnb[t][beam]: accumulated score of `beam` at step t over
    # paths ending in a blank / in a non-blank label.
    pb, pnb = defaultdict(Counter), defaultdict(Counter)
    pb[0][''], pnb[0][''] = 1, 0
    prev_beams = ['']
    ptot = Counter({'': 1})
    for t in range(1,steps):
        frame = out[t]
        # keep the label index next to its character instead of looking it up
        # again with labels.index() for every beam
        candidates = [(i, labels[i]) for i in np.where(frame > prune)[0]]
        known_beams = set(prev_beams)
        for b in prev_beams:
            # pb[t-1] / pnb[t-1] are not modified during step t
            prev_total = pb[t-1][b] + pnb[t-1][b]
            for index, c_t in candidates:
                p_c = frame[index]
                if c_t == "_": #Extending with a blank
                    pb[t][b] += p_c*prev_total
                    continue

                i_plus = b + c_t
                if len(b) > 0 and c_t == b[-1]: #Extending with the same character as the last one
                    pnb[t][b] += p_c*pnb[t-1][b]
                    pnb[t][i_plus] += p_c*pb[t-1][b]
                elif c_t == " " and lm is not None and len(b.replace(' ', '')) > 0: # LM constraints
                    # score of the last token of the prefix; 10**prob below
                    # presumes a log10 score -- TODO confirm against kenlm
                    prob = list(lm.full_scores(i_plus,eos=False,bos=False))[-1][0]
                    lm_p = (10**prob)**alpha
                    pnb[t][i_plus] += lm_p*p_c*prev_total
                else:
                    #expanding the beam (extend case as the last character is different)
                    pnb[t][i_plus] += p_c*prev_total

                if i_plus not in known_beams:
                    # NOTE(review): the published prefix-search formulation
                    # appears to weight the blank-path term by the blank
                    # probability of this frame; the character probability is
                    # used here as in the original code -- confirm intended.
                    pb[t][i_plus] += p_c * (pb[t - 1][i_plus] + pnb[t - 1][i_plus])
                    pnb[t][i_plus] += p_c * pnb[t - 1][i_plus]

        ptot = pb[t] + pnb[t]
        for beam in ptot.keys():
            ptot[beam] = ptot[beam]*(len(beam)+1)**beta
        prev_beams = sort_beam(ptot,k)

    if not prev_beams:
        # every label was pruned at some step, so no beam survived
        return ''
    # choose the best beam explicitly rather than relying on the order in
    # which sort_beam returns the beams
    return max(prev_beams, key=lambda beam: ptot[beam])

In [None]:
# beam_txt=ctc_beam_search(out,labels,0.001,k=10,lm=lm_w)

### CHARACTER LM Implementation

In [None]:
# lm_c = kenlm.LanguageModel('/home/hemant/decode_humonics/3_char_gram.arpa')

In [None]:
def sort_beam(ptot,k):
    """
    Return at most ``k`` beams from ``ptot``, highest score first.

    Arguments:
        ptot: mapping of beam (string) -> score
        k: beam width, i.e. the maximum number of beams to keep
    Returns:
        List of beams ordered by descending score. Beams with equal score
        keep their insertion order.
    """
    # Sort on the score only. The earlier implementation inverted the mapping
    # to {score: beam}, which silently dropped beams sharing a score, and it
    # returned the beams unsorted when fewer than k were available, so the
    # first element was not guaranteed to be the best beam.
    ranked = sorted(ptot.items(), key=lambda item: item[1], reverse=True)
    return [beam for beam, _ in ranked[:k]]

#using CHARACTER LM
def ctc_beam_search_clm(out,labels, prune=0.001, k=20, lm=None,alpha=0.3,beta=12):
    """
    Implements CTC Prefix Search Decoding Algorithm as shown by Graves
    (character language model variant).

    Arguments:
        out: ctc output, indexed as out[t][label_index]; the scores are
            multiplied together, so they are presumably probabilities rather
            than log-probabilities -- TODO confirm
        labels: string of labels ("_" is the blank)
        prune: labels whose score at a step is not greater than this value
            are skipped at that step
        k: beam width
        lm: character language model; currently NOT used, see the note in the
            body
        alpha: language-model weight; currently NOT used for the same reason
        beta: exponent of the (len(beam) + 1) length bonus
    Returns:
        The highest-scoring beam after the last step, or '' when every
        candidate was pruned away.
    """
    F = out.shape[1]
    # Prepend an all-zero frame so that real frames start at t = 1 and
    # t - 1 = 0 refers to the initial state set up below.
    out = np.vstack((np.zeros(F), out))
    steps = out.shape[0]

    # pb[t][beam] / pnb[t][beam]: accumulated score of `beam` at step t over
    # paths ending in a blank / in a non-blank label.
    pb, pnb = defaultdict(Counter), defaultdict(Counter)
    pb[0][''], pnb[0][''] = 1, 0
    prev_beams = ['']
    ptot = Counter({'': 1})
    for t in range(1,steps):
        frame = out[t]
        # keep the label index next to its character instead of looking it up
        # again with labels.index() for every beam
        candidates = [(i, labels[i]) for i in np.where(frame > prune)[0]]
        known_beams = set(prev_beams)
        for b in prev_beams:
            # pb[t-1] / pnb[t-1] are not modified during step t
            prev_total = pb[t-1][b] + pnb[t-1][b]
            for index, c_t in candidates:
                p_c = frame[index]
                if c_t == "_": #Extending with a blank
                    pb[t][b] += p_c*prev_total
                    continue

                i_plus = b + c_t
                #Extending with the same character as the last one
                if len(b) > 0 and c_t == b[-1]:
                    pnb[t][b] += p_c*pnb[t-1][b]
                    pnb[t][i_plus] += p_c*pb[t-1][b]
                #expanding the beam (extend case as the last character is different)
                elif len(b.replace(' ', '')) > 0:
                    # NOTE(review): the language-model weight was already
                    # disabled in the original code (`lm_p = 1`, with
                    # `(10**prob)**alpha` commented out) although the model
                    # was still queried on every extension. The unused query
                    # is dropped: it cost time and crashed with the default
                    # lm=None. Decoding results are unchanged. Confirm
                    # whether the weighting should be re-enabled.
                    lm_p = 1
                    pnb[t][i_plus] += lm_p*p_c*prev_total
                else:
                    pnb[t][i_plus] += p_c*prev_total

                if i_plus not in known_beams:
                    # NOTE(review): the published prefix-search formulation
                    # appears to weight the blank-path term by the blank
                    # probability of this frame; the character probability is
                    # used here as in the original code -- confirm intended.
                    pb[t][i_plus] += p_c * (pb[t - 1][i_plus] + pnb[t - 1][i_plus])
                    pnb[t][i_plus] += p_c * pnb[t - 1][i_plus]

        ptot = pb[t] + pnb[t]
        for beam in ptot.keys():
            ptot[beam] = ptot[beam]*(len(beam)+1)**beta
        prev_beams = sort_beam(ptot,k)

    if not prev_beams:
        # every label was pruned at some step, so no beam survived
        return ''
    # choose the best beam explicitly rather than relying on the order in
    # which sort_beam returns the beams
    return max(prev_beams, key=lambda beam: ptot[beam])

In [None]:
# beam_txt=ctc_beam_search_clm(out,labels,0.001,k=10,lm=lm_c)

# Implementation on the DeepSpeech model

In [None]:
import os
os.chdir("/home/hemant/deep/")

import pickle
import json
import os.path
from data.data_loader import SpectrogramParser
import torch
from decoder import GreedyDecoder
import argparse

from tqdm import tqdm
import warnings

from opts import add_decoder_args, add_inference_args
from utils import load_model

### Reading data

In [None]:
# Each manifest line is "<audio path> <transcript words ...>"; it is parsed
# into [audio_path, transcript], with the transcript words re-joined by single
# spaces.
with open("/home/hemant/decode_humonics/updatedAfricanNames/wav_utterance.txt", "r") as f:
    # Blank lines are skipped; previously they raised IndexError on `[0]`.
    data_ = [[fields[0], " ".join(fields[1:])] for fields in map(str.split, f) if fields]

In [None]:
# Run on the GPU when one is available and fall back to the CPU otherwise, so
# the notebook does not hard-depend on CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the meaning of the third positional argument (False) is defined
# by utils.load_model, which is not visible here -- confirm.
model = load_model(device, "/home/hemant/decode_humonics/updatedAfricanNames/deepspeech_final.pth",False)
# Spectrogram front-end configured from the audio settings stored on the model.
spect_parser = SpectrogramParser(model.audio_conf, normalize=True)

In [None]:
# Language models passed as `lm` to the beam-search decoders.
# NOTE(review): the commented-out example in the character-LM section refers
# to '3_char_gram.arpa' while '2_char_gram.arpa' is loaded here -- confirm
# which model is intended.
lm_w = kenlm.LanguageModel("/home/hemant/deep/lm/libri_lm/3-gram.binary") #word lm
lm_c= kenlm.LanguageModel('/home/hemant/2_char_gram.arpa') #character lm

# Use the transcribe function to run the code
### Choose the decoding method by keeping exactly one of the three decoder lines inside `transcribe` uncommented

In [None]:
#data_ is a list containing the audio path 

def transcribe(audio_path,alpha,beta,k=25,p=0.00001):
    """
    Transcribe one audio file with the loaded model and print the time taken.

    Relies on the notebook globals `spect_parser`, `model`, `device` and
    `labels`.
    NOTE(review): `labels` is not assigned in any cell visible here -- confirm
    where it is defined.

    Arguments:
        audio_path: path of the audio file to transcribe
        alpha, beta: language-model hyper-parameters; only used by the
            beam-search alternatives that are commented out below
        k: beam width (beam-search alternatives only)
        p: pruning threshold (beam-search alternatives only)
    Returns:
        The decoded text.
    """
    start_time = time()
    spect = spect_parser.parse_audio(audio_path).contiguous()
    # add two leading singleton dimensions before feeding the model
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    spect = spect.to(device)

    input_sizes = torch.IntTensor([spect.size(3)]).int()
    # inference only: skip building the autograd graph
    with torch.no_grad():
        out, output_sizes = model(spect, input_sizes)
    out = out.cpu().detach().numpy()[0] # model output for the single item in the batch
    out = ctc_best_path(out,labels) # greedy decoding
#     out = ctc_beam_search(out,labels,p,k,lm=lm_w,alpha=alpha,beta=beta) # word lm decoding
#     out = ctc_beam_search_clm(out,labels,p,k,lm=lm_c,alpha=alpha,beta=beta) # character lm decoding

    end_time = time()
    print(f"It took {(end_time - start_time)/60} mins" )
    return out

In [None]:
# Example call with alpha=0.1 and beta=11.6.
# NOTE(review): `audio_path` is not assigned at notebook level in any cell
# visible here, so this raises NameError on a fresh kernel -- set it to a real
# file first.
transcribe(audio_path,0.1,11.6)

## To tune the alpha and beta hyper parameters

In [None]:
#data_ is a list of [audio path, transcript] pairs, i.e.
#data_[n][0] is the audio path and data_[n][1] is the corresponding transcript

def evalu_(data_,a,b,k=25,p=0.00001):
    """
    Decode the first 5000 utterances of ``data_`` and report WER and CER.

    Relies on the notebook globals `spect_parser`, `model`, `device` and
    `labels`.
    NOTE(review): `labels` is not assigned in any cell visible here -- confirm
    where it is defined.

    Arguments:
        data_: list of [audio_path, transcript] pairs
        a, b: alpha / beta language-model hyper-parameters; only used by the
            beam-search alternatives that are commented out below
        k: beam width (beam-search alternatives only)
        p: pruning threshold (beam-search alternatives only)
    Returns:
        (wer, cer) as percentages over all successfully decoded utterances.
    Raises:
        RuntimeError: when no reference words were accumulated, e.g. because
            every utterance failed.
    """
    total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0
    failures = 0
    start_time = time()
    for i in data_[:5000]:
        audio_path = i[0]

        try:
            spect = spect_parser.parse_audio(audio_path).contiguous()
            spect = spect.view(1, 1, spect.size(0), spect.size(1))
            spect = spect.to(device)

            input_sizes = torch.IntTensor([spect.size(3)]).int()
            # inference only: skip building the autograd graph
            with torch.no_grad():
                out, output_sizes = model(spect, input_sizes)
            out = out.cpu().detach().numpy()[0]
            out = ctc_best_path(out,labels) # greedy decoding
#             out = ctc_beam_search(out,labels,p,k,lm=lm_w,alpha=a,beta=b) # word lm decoding
#             out = ctc_beam_search_clm(out,labels,p,k,lm=lm_c,alpha=a,beta=b) # character lm decoding
            transcript, reference = out, i[1]
            wer_inst = wer_(transcript, reference)
            cer_inst = cer_(transcript, reference)
        except Exception as err:
            # Best effort: skip utterances that cannot be processed, but make
            # the failure visible. A bare `except: pass` also hid programming
            # errors such as an undefined global and swallowed Ctrl-C.
            failures += 1
            print(f"Skipping {audio_path}: {err!r}")
            continue

        total_wer += wer_inst
        total_cer += cer_inst
        num_tokens += len(reference.split())
        # cer_ ignores spaces, so the denominator must ignore them as well
        num_chars += len(reference.replace(' ', ''))

    if num_tokens == 0:
        raise RuntimeError(
            f"No utterance could be evaluated ({failures} failed); WER/CER are undefined"
        )
    wer = float(total_wer) / num_tokens
    cer = float(total_cer) / num_chars
    print("WER :-",wer*100)
    print("CER :-:",cer*100)
    if failures:
        print(f"{failures} utterance(s) were skipped because of errors")
    end_time = time()
    print(f"It took {(end_time - start_time)/60} mins" )
    return wer*100,cer*100

In [None]:
# Grid search: 20 alpha values in [0.1, 2] x 20 beta values in [2, 25],
# i.e. 400 calls to evalu_.
alpha, beta = np.linspace(0.1,2,20), np.linspace(2,25,20)
# Maps WER -> [alpha, beta]. Because WER is the key, two settings that reach
# exactly the same WER overwrite each other; only the later one is kept.
best = {}
for i in alpha:
    for j in beta:
        print(f"alpha :- {i} and beta :- {j}")
        wer,cer = evalu_(data_,i,j)
        print()
        best[wer] = [i,j]
        # rewrite the results file after every run so partial progress
        # survives an interrupted search
        with open("best.json","w") as f: 
            json.dump(best,f)