In [None]:
import train_Q2
import glob
import os
import pandas as pd
import numpy as np
import pyctcdecode
import nemo.collections.asr as nemo_asr
import multiprocessing

In [31]:
test_dir = 'q2_dev_data'
# label -> interp mapping for "semiotic drift"
remapping = {
    "all":"all",
    "dark":"dark",
    "greasy":"greasy",
    "had":"in",
    "in": "she",
    "she":  "had",
    "suit": "year",
    "wash": "wash",
    "water":"water",
    "year": "suit",
    "your": "your"
}

reverse_remapping = {val: key for (key, val) in remapping.items()}

In [None]:
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained("nvidia/stt_en_conformer_ctc_large")

In [None]:
def recognize_wavs_in_dir(input, asr_model, n = None, alpha= 2.0, beta=1.5, print_outputs=True):

    if type(input) is list:
        test_files = pd.DataFrame({"filename":input})
    
    elif input.endswith(".wav"):
        test_files = pd.DataFrame({"filename":[input]})
    
    else:
        test_files = pd.DataFrame({"filename":glob.glob(os.path.join(input, '*.wav'))})
    
    
    test_files['word'] = [os.path.basename(x).split('_')[0] for x in test_files.filename]
    test_files['remapped_word'] = [reverse_remapping[x] for x in test_files['word']]
    
    if n is not None:
        test_files = pd.concat([y.sample(n) for x,y in test_files.groupby('word')])
    
    lm_path = 'LM/timit.LM'
    unigrams = "she had your suit in dark greasy wash water all year".split(' ')
    decoder = pyctcdecode.build_ctcdecoder(
            asr_model.decoder.vocabulary,
            unigrams = unigrams,
            kenlm_model_path=lm_path,  # either .arpa or .bin file
            alpha=alpha,  # tuned on a val set
            beta=beta,  # tuned on a val set
            unk_score_offset=-50.
        )
    
    num_cores = multiprocessing.cpu_count()
    timit_words = "she had your suit in dark greasy wash water all year".split(' ')+['UNK']
    
    candidates, guesses = train_Q2.batch_transcribe_with_pyctcdecoder(test_files.filename, lm_path, unigrams, asr_model, decoder, num_cores, timit_words)
    test_files['dist'] = guesses.tolist()
    test_files['candidates'] = candidates
    
    
    test_files['p_correct'] = [x['dist'][timit_words.index(x['remapped_word'])] for x in test_files.to_dict('records')]
    
    total_score = np.mean(test_files['p_correct'])
    agg_scores = test_files.groupby(['word']).p_correct.agg(np.mean)

    if print_outputs:
        print('Score:')
        print(total_score)
        print('Score by word')
        print(agg_scores)

    rdict = {
        "agg_scores": agg_scores,
        "total_score": total_score,
        "df": test_files
    }
    
    return(rdict)

In [None]:
import imp
imp.reload(train_Q2)

In [None]:
%%time
test = recognize_wavs_in_dir(test_dir, asr_model, n=10)

# Inspect Candidates

In [None]:
i = 5
target_word = 'water'
test.loc[test.remapped_word == target_word].iloc[i]

In [None]:
test.loc[test.remapped_word == target_word].iloc[i].candidates

# LM from TIMIT unigrams

In [None]:
import random
if not os.path.exists('LM'):
    os.makedirs('LM')

vocab = "she had your suit in dark greasy wash water all year".split(' ')
with open('LM/timit.txt', 'w') as f:
    for i in range(10000):
        single_sentence = random.choice(vocab) + ' '+ random.choice(vocab) + '\n'
        f.write(single_sentence)

In [None]:
command = "/usr/share/srilm/bin/i686-m64/ngram-count -order 2 -tolower -text LM/timit.txt -lm LM/timit.LM"
os.system(command)

# Search alpha and beta

In [None]:
alphas = np.arange(start=3.5, stop=7, step=.25)
betas = np.arange(start=.25, stop=3.5, step=.25)
total_scores = np.zeros([len(alphas), len(betas)])
full_results= {}

parameter_combos = len(alphas) * len(betas)
projected_time= (parameter_combos * 20) /60
print(str(parameter_combos) + ' parameter combos will take '+str(projected_time)+' minutes')

for i in range(len(alphas)):
    full_results[i] = {}
    for j in range(len(betas)):
        recognition_results = recognize_wavs_in_dir(test_dir, asr_model, 20, alphas[i], betas[j], False)
        total_scores[i,j] = recognition_results['total_score']
        full_results[i][j] = recognition_results['df']

In [None]:
total_scores

In [None]:
from matplotlib import pyplot as plt
plt.imshow(total_scores, interpolation='nearest')
plt.show()

In [None]:
np.max(total_scores)

In [None]:
best = np.nonzero(total_scores == np.max(total_scores))
print(best)
print(alphas[best[0]])
print(betas[best[1]])

In [None]:
full_results[2][11]

# How to Handle UNKs

In [None]:
test = recognize_wavs_in_dir(test_dir, asr_model, n=10, alpha=4, beta=3)

In [None]:
test['df'].loc[test['df'].remapped_word == 'had'].iloc[3].candidates

# Weird Water Example

In [None]:
test = recognize_wavs_in_dir("temp/500/water_1136bf14-c829-41f7-9e86-52c40a1c6de9.wav", asr_model)


In [None]:
test['df'].iloc[0].candidates

# Whisper With Prefix

In [94]:
import whisper
import glob
import pandas as pd
import os
import numpy as np
import string
from Levenshtein import distance as lev
import signal
from contextlib import contextmanager
import time

In [85]:
class TimeoutException(Exception): pass

@contextmanager
def time_limit(seconds):
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")
    signal.signal(signal.SIGALRM, signal_handler)
    #signal.alarm(seconds)
    signal.setitimer(signal.ITIMER_REAL,seconds) 
    try:
        yield
    finally:
        signal.alarm(0)


In [2]:
whisper_model = whisper.load_model("medium.en")

In [3]:
remapping = {
    "all":"all",
    "dark":"dark",
    "greasy":"greasy",
    "had":"in",
    "in": "she",
    "she":  "had",
    "suit": "year",
    "wash": "wash",
    "water":"water",
    "year": "suit",
    "your": "your"
}

reverse_remapping = {val: key for (key, val) in remapping.items()}
timit_words = "she had your suit in dark greasy wash water all year".split(' ')

In [4]:
vocab = pd.read_csv('data/vocab.csv')
vocab = vocab.loc[vocab['count'] > 20]
vocab['probability'] = vocab['count'] / np.sum(vocab['count'])
vocab.word = vocab.word.astype('str')
vocab.shape

(9528, 4)

In [5]:
vocab

Unnamed: 0.1,Unnamed: 0,word,count,probability
0,0,you,557716,0.037764
1,1,the,448231,0.030351
2,2,a,374563,0.025363
3,3,it,331390,0.022439
4,4,i,320912,0.021730
...,...,...,...,...
9523,9523,supper's,21,0.000001
9524,9524,superman's,21,0.000001
9525,9525,hitted,21,0.000001
9526,9526,rebel,21,0.000001


In [6]:
[x in set(vocab.word) for x in timit_words]

[True, True, True, True, True, True, True, True, True, True, True]

In [69]:
def whisper_recognize_wav(filename, whisper_model, timit_words):    
    
    ip = 'A person on the radio is saying one of the following words: '+', '.join(timit_words)+'. The word the person said was '
    
    try:
        with time_limit(.5):
            transcription= whisper_model.transcribe(filename, language="en", initial_prompt = ip)
            timeout = False

    except TimeoutException as e:
        timeout = True
        
    if not timeout:
    
        best_guess_of_string = transcription['text'].lower().strip().replace(' ','')
        best_guess_of_string = best_guess_of_string.translate(str.maketrans('', '', string.punctuation))
    
        #print(transcription)

    
        # likelihoods: compute levenshtein distance to all n
        distances = np.array([lev(x,best_guess_of_string) for x in timit_words])
    
        alpha = 4
        likelihoods = np.exp(-1. * alpha * distances)
        
        # priors: 
        priors = np.ones(len(timit_words)) * 1./len(timit_words)
    
        unnormalized = priors * likelihoods
        posteriors = unnormalized / np.sum(unnormalized)
        rdf = pd.DataFrame({"word":timit_words,"prob":posteriors})    

        rdict = {
            'df': rdf,
            'lexical_probs': rdf.prob.values,        
            'decoding_prob':np.exp(transcription['segments'][0]['avg_logprob']),
            'no_speech_prob': transcription['segments'][0]['no_speech_prob'],
            'timeout':timeout
        }

    else:
        rdict = {
            'df': None,
            'lexical_probs': None,        
            'decoding_prob':None,
            'no_speech_prob': None,
            'timeout': timeout
        }
            
    return(rdict)
    


def whisper_recognize_wavs(input, whisper_model, reverse_remapping, vocab, timit_words, n = None, print_outputs=True):        
    
    if type(input) is list:
        test_files = pd.DataFrame({"filename":input})
    
    elif input.endswith(".wav"):
        test_files = pd.DataFrame({"filename":[input]})
    
    else:
        test_files = pd.DataFrame({"filename":glob.glob(os.path.join(input, '*.wav'))})
    
    
    test_files['word'] = [os.path.basename(x).split('_')[0] for x in test_files.filename]
    test_files['remapped_word'] = [reverse_remapping[x] for x in test_files['word']]
    
    if n is not None:
        test_files = pd.concat([y.sample(n) for x,y in test_files.groupby('word')])
    
    
    results= pd.concat([fast_whisper_recognize_wav(x, whisper_model, timit_words, vocab) for x in test_files.filename])
    
    test_files = test_files.merge(results)    

    test_files['p_correct'] = [x['prob'][timit_words.index(x['remapped_word'])] for x in test_files.to_dict('records')]
    
    total_score = np.mean(test_files['p_correct'])
    agg_scores = test_files.groupby(['remapped_word']).p_correct.agg(np.mean)

    if print_outputs:
        print('Score:')
        print(total_score)
        print('Score by word')
        print(agg_scores)

    rdict = {
        "agg_scores": agg_scores,
        "total_score": total_score,
        "df": test_files
    }
    
    return(rdict)

In [8]:
%pdb

Automatic pdb calling has been turned ON


In [100]:
def fast_whisper_recognize_wav(filename, whisper_model, timit_words, vocab):    
    
    ip = 'A person on the radio just said one of the following words: "'+'", "'.join(timit_words)+'." The word was "'
    
    try:
        with time_limit(.4):
            segments, info = whisper_model.transcribe(filename, language="en", initial_prompt = ip)
            transcription = [x for x in segments][0]    
            timeout = False

    except TimeoutException as e:
        timeout = True
    
    if not timeout:
        best_guess_of_string = transcription.text.lower().strip().replace(' ','')
        best_guess_of_string = best_guess_of_string.translate(str.maketrans('', '', string.punctuation))
    
        #likelihoods: compute levenshtein distance to all n
    
        distances = np.array([lev(x,best_guess_of_string) for x in vocab.word])
    
        alpha = 4
        likelihoods = np.exp(-1. * alpha * distances)
        
        # priors: 
        # priors = np.ones(len(timit_words)) * 1./len(timit_words)
        priors = vocab.probability
    
        unnormalized = priors * likelihoods
        posteriors = unnormalized / np.sum(unnormalized)
    
        rdf = pd.DataFrame({"hypothesis":vocab.word,"prob":posteriors})
        rdf = rdf.sort_values(by=['prob'], ascending=False)
        candidates = rdf.iloc[0:10]

        limited_probs = candidates.loc[candidates.hypothesis.isin(timit_words)]
        remainder_prob =  np.sum(candidates.loc[~candidates.hypothesis.isin(timit_words)].prob)
        remainder_row = pd.DataFrame({'hypothesis':["UNK"], "logit_score":[np.nan], 
                                  "combined_score":[np.nan], "prob":[remainder_prob]})

        simplified = pd.concat([limited_probs, remainder_row])
    
        # make sure all timit words are present
        simplified = pd.DataFrame({'hypothesis':timit_words+['UNK']}).merge(simplified[['hypothesis', 'prob']], how="left") 
        simplified = simplified.fillna(0) 

        rdf = pd.DataFrame.from_records([{
            'candidates': candidates,
            'simplified': simplified,
            'prob': simplified.prob.values,        
            'decoding_prob':np.exp(transcription.avg_logprob),
            'no_speech_prob': transcription.no_speech_prob,
            'best_guess_of_string': best_guess_of_string,
            'filename':filename,
            'unk_prob': simplified.prob.values[-1],
            "timeout": timeout
            
        }], index=[0])
        
    else:
        dummy_prob = np.zeros(len(timit_words))
        dummy_prob[-1] = 1 
        rdf = pd.DataFrame.from_records([{
            'candidates': None,
            'simplified': None,
            'prob': dummy_prob,        
            'decoding_prob':None,
            'no_speech_prob': None,
            'best_guess_of_string': None,
            'filename':filename,
            'unk_prob': None,
            "timeout": timeout
            
        }], index=[0])
        
        
    return(rdf)

In [10]:
whisper_recognize_wav("/home/stephan/notebooks/ciwganfiwgan-pytorch/temp/500/water_1136bf14-c829-41f7-9e86-52c40a1c6de9.wav", whisper_model, timit_words)

{'df':       word          prob
 0      she  1.125344e-07
 1      had  1.125344e-07
 2     your  1.125344e-07
 3     suit  2.061140e-09
 4       in  2.061140e-09
 5     dark  1.125344e-07
 6   greasy  3.775109e-11
 7     wash  6.144170e-06
 8    water  9.999932e-01
 9      all  1.125344e-07
 10    year  1.125344e-07,
 'lexical_probs': array([1.12534407e-07, 1.12534407e-07, 1.12534407e-07, 2.06113956e-09,
        2.06113956e-09, 1.12534407e-07, 3.77510878e-11, 6.14417043e-06,
        9.99993176e-01, 1.12534407e-07, 1.12534407e-07]),
 'decoding_prob': 0.48548717139804465,
 'no_speech_prob': 0.0803394615650177}

In [43]:
# weirdly this only works after we load whisper
import faster_whisper
faster_whisper_model = faster_whisper.WhisperModel('medium.en', device="cuda", compute_type="int8")

In [101]:
%%time
fast_whisper_recognize_wav("/home/stephan/notebooks/ciwganfiwgan-pytorch/temp/500/water_1136bf14-c829-41f7-9e86-52c40a1c6de9.wav", faster_whisper_model, timit_words, vocab)

CPU times: user 1.41 s, sys: 3.75 s, total: 5.16 s
Wall time: 400 ms


Unnamed: 0,candidates,simplified,prob,decoding_prob,no_speech_prob,best_guess_of_string,filename,unk_prob,timeout
0,hypothesis prob 174 water 0.9...,hypothesis prob 0 she 0.00000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.991...",0.403339,0.006167,water,/home/stephan/notebooks/ciwganfiwgan-pytorch/t...,0.006827,False


In [56]:
whisper_recognize_wav("/home/stephan/notebooks/ciwganfiwgan-pytorch/temp/500/water_1136bf14-c829-41f7-9e86-52c40a1c6de9.wav", whisper_model, timit_words)

{'df':       word          prob
 0      she  1.125344e-07
 1      had  1.125344e-07
 2     your  1.125344e-07
 3     suit  2.061140e-09
 4       in  2.061140e-09
 5     dark  1.125344e-07
 6   greasy  3.775109e-11
 7     wash  6.144170e-06
 8    water  9.999932e-01
 9      all  1.125344e-07
 10    year  1.125344e-07,
 'lexical_probs': array([1.12534407e-07, 1.12534407e-07, 1.12534407e-07, 2.06113956e-09,
        2.06113956e-09, 1.12534407e-07, 3.77510878e-11, 6.14417043e-06,
        9.99993176e-01, 1.12534407e-07, 1.12534407e-07]),
 'decoding_prob': 0.48548717139804465,
 'no_speech_prob': 0.0803394615650177}

In [None]:
whisper_recognize_wav("/home/stephan/notebooks/ciwganfiwgan-pytorch/temp/500/water_1136bf14-c829-41f7-9e86-52c40a1c6de9.wav", whisper_model, timit_words)

In [None]:
whisper_recognize_wav("/home/stephan/notebooks/ciwganfiwgan-pytorch/temp/500/water_426cd806-b512-4aba-a2bd-7c358da4103c.wav", model, timit_words)

In [None]:
# should not be recognizable
test = whisper_recognize_wav("/home/stephan/notebooks/ciwganfiwgan-pytorch/temp/500/water_42d73bda-64c9-4867-95bc-d78f7a4c1ef2.wav", model, timit_words)
print(test)

In [None]:
# should not be recognizable
test = whisper_recognize_wav("/home/stephan/notebooks/ciwganfiwgan-pytorch/temp/500/water_733049a5-ec87-40c3-af46-66085a06449e.wav", model, timit_words)
print(test)

In [102]:
%%time
# should not be recognizable

fast_whisper_recognize_wav("/home/stephan/notebooks/ciwganfiwgan-pytorch/temp/500/water_733049a5-ec87-40c3-af46-66085a06449e.wav", faster_whisper_model, timit_words, vocab)

CPU times: user 1.65 s, sys: 4.14 s, total: 5.79 s
Wall time: 532 ms


Unnamed: 0,candidates,simplified,prob,decoding_prob,no_speech_prob,best_guess_of_string,filename,unk_prob,timeout
0,,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,,,/home/stephan/notebooks/ciwganfiwgan-pytorch/t...,,True


In [None]:
test = whisper_recognize_wav("/home/stephan/notebooks/ciwganfiwgan-pytorch/temp/500/water_52a51577-7ef1-4651-a47d-730e9f4c5474.wav", model, timit_words)
print(test)


In [103]:
%%time
test = whisper_recognize_wavs('q2_dev_data', faster_whisper_model, reverse_remapping, vocab, timit_words, n = 20)

Score:
0.28522615152731634
Score by word
remapped_word
all       0.246392
dark      0.496618
greasy    0.791777
had       0.000000
in        0.000000
she       0.000000
suit      0.104378
wash      0.236777
water     0.694357
year      0.432365
your      0.134823
Name: p_correct, dtype: float64
CPU times: user 5min 49s, sys: 14min 47s, total: 20min 36s
Wall time: 1min 41s


In [104]:
df = test['df']
selected_df = df.loc[(df.decoding_prob > .1) & (df.no_speech_prob < .1) & (df.unk_prob <.15) & ~df.timeout]
print(np.mean(selected_df.p_correct))
print(selected_df.shape)

0.8564859570880302
(53, 12)


In [None]:
.85
(53/220 in 101 seconds)

.82
(54 in 106 seconds

# beat 
#0.8419184657183111
#(54, 11) in 120 seconds

Beat 
0.8800743171340705
(45, 11) in 168 seconds




In [57]:
220 / 168

1.3095238095238095

In [81]:
220 / 124

1.7741935483870968

In [64]:
.4* 220

88.0

In [66]:
selected_df.loc[df.remapped_word == 'had']

Unnamed: 0,filename,word,remapped_word,candidates,simplified,prob,decoding_prob,no_speech_prob,best_guess_of_string,unk_prob,p_correct
80,q2_dev_data/in_28b8473e-9c59-4e80-ba4b-fedefa8...,in,had,hypothesis prob 60 she 0.806...,hypothesis prob 0 she 0.80641...,"[0.806414031381023, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",0.527169,0.002693,she,0.187596,0.0
81,q2_dev_data/in_cb5c374e-aa2e-4ddf-8baf-2a703a9...,in,had,hypothesis prob 60 she 0.806...,hypothesis prob 0 she 0.80641...,"[0.806414031381023, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",0.547741,0.060974,she,0.187596,0.0
82,q2_dev_data/in_3738b936-8a44-4798-995b-2a71657...,in,had,hypothesis prob 60 she 0.806...,hypothesis prob 0 she 0.80641...,"[0.806414031381023, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",0.512149,0.012817,she,0.187596,0.0
83,q2_dev_data/in_db0a37fd-a955-4b8f-9940-1adc4ec...,in,had,hypothesis prob 60 she 0.806...,hypothesis prob 0 she 0.80641...,"[0.806414031381023, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",0.555281,0.003538,she,0.187596,0.0
86,q2_dev_data/in_777e3e8b-aab5-4a2c-9a8a-98c5519...,in,had,hypothesis prob 1634 suit 0.5...,hypothesis prob 0 she 0.00000...,"[0.0, 0.0, 0.0, 0.5218881956128997, 0.0, 0.0, ...",0.570005,0.002935,suit,0.429889,0.0
88,q2_dev_data/in_2d4e33b6-b638-450e-98de-44833d0...,in,had,hypothesis prob 94 say 0.2...,hypothesis prob 0 she 0.00000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.426081,0.102295,gay,0.755745,0.0
89,q2_dev_data/in_7eb430c1-30a7-4a2b-97f0-29f823d...,in,had,hypothesis prob 60 she 0.806...,hypothesis prob 0 she 0.80641...,"[0.806414031381023, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",0.465407,0.006851,she,0.187596,0.0
90,q2_dev_data/in_aa59d685-b8fe-4ce4-9ae1-5ff71b7...,in,had,hypothesis prob 60 she 0.806...,hypothesis prob 0 she 0.80641...,"[0.806414031381023, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",0.584892,0.011597,she,0.187596,0.0
92,q2_dev_data/in_7a691732-286d-4714-a89b-76fd5b9...,in,had,hypothesis prob 60 she 0.806...,hypothesis prob 0 she 0.80641...,"[0.806414031381023, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",0.584892,0.025955,she,0.187596,0.0
93,q2_dev_data/in_058fee18-e5db-484b-87c9-cd83274...,in,had,hypothesis prob 60 she 0.806...,hypothesis prob 0 she 0.80641...,"[0.806414031381023, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",0.563585,0.002453,she,0.187596,0.0


In [None]:
import librosa
from IPython.display import Audio, display

test_audio_filename = selected_df.loc[df.remapped_word == 'she'].iloc[0].filename
signal, sample_rate = librosa.load(test_audio_filename, sr=None)

# display audio player for the signal
display(Audio(data=signal, rate=sample_rate))

In [None]:
# build out the rest of the Bayesian speech recognizer, and map it to UNK
if decoding prob is too low, not a word
if not_speech, not a work
if UNK is high, outside of this set of words

In [None]:
# todo
# [X] Fix the mapping
# [X] make it easy to output candidates here for inspection
# [X] train a new LM that upweights these words
# [ ] grid search alpha and beta -- how to get the most
# [ ] how to handle UNK -- what if we don't? QQ network tries to imitate 

# [ ] get higher perfomrance on "greasy" -- what sorts of errors do we see here?
# 1. greasy_16b6aca2-e108-44b7-8b55-7f23a6a60eec_candidates.csv is pretty high
# inspect 
# [ ] dumb heuristic Whisper




