In [1]:
import numpy as np
import pickle
from copy import copy
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

In [2]:
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

In [3]:
try:
    import librosa
    speech, rate = librosa.load("video_samples/aortic.wav",sr=16000)
except:
    with open('video_samples/aortic.pkl','rb') as f:
        speech = pickle.load(f)

In [4]:
def make_transcript(speech,chunk_length=30,overlap_length=5):
    # splits speech into chunks (with some overlap) and creates transcript for each
    transcript=[]
    end = speech.shape[0]
    points = list(np.arange(0,end,16000*chunk_length))+[end]
    for k in range(len(points)-1):
        input_values = tokenizer(speech[points[k]:points[k+1]+16000*overlap_length],
                                 return_tensors = 'pt').input_values
        logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim =-1)
        transcriptions = tokenizer.decode(predicted_ids[0])
        transcript.append(transcriptions.lower())
    return transcript

def deoverlapper(t1,t2):
    # deletes overlap between two adjacent text snippets
    for k in range(150):
        try:
            a = t1.index(t2[k:k+30])
            return t1[:a],t2[k:]
        except:
            continue
    return t1,t2
        

def splice(transcript):
    # deoverlaps full transcript
    transcript2 = copy(transcript)

    for k in range(len(transcript2)-1):
        transcript2[k],transcript2[k+1] = deoverlapper(transcript2[k],transcript2[k+1])
    return transcript2

In [5]:
%%time
transcript = make_transcript(speech)

CPU times: user 7min 33s, sys: 2min 29s, total: 10min 2s
Wall time: 45.7 s


In [6]:
transcript2 = splice(transcript)
print('|  30  |'.join(transcript2))

when we take a history of a patient with theortic dissection we want a look for features that are gono allow us to um either include or exclude the disease in our differential diagnosis and probably the single most important question to ask is how bad is the pain thi sections are very very painful and the handful of cases i've seen throughout my career the patients have been in significant distress often very agitated clutching their chest moaning writhing |  30  |it's not subtle pain so you want to get a sense of how severe the pain is and how much distress the patient is in and when their distress levels really high think about the section dessection is also a sudden catostraphic event you're going about your business doing everything normally and then suddenly your intimate tairs and you develope this u accumulation of blood in the aortic wall this isn't something that gradually ramps up getting worse over time this is a sudden event and patience will report |  30  |that they experi

In [9]:
import speech_recognition as sr
audiofile = 'video_samples/aortic.wav'
audio = sr.AudioFile(audiofile)


In [219]:
input_values = tokenizer(speech[:20*16000], return_tensors = 'pt').input_values
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim =-1)
t1 = tokenizer.decode(predicted_ids[0]).lower()
print(t1)

when we take a history of a patient with theortic dossection we want a look for features that are goin to allow us to um either include or exclude the disease in our differential diagnosis and probably the single most important question to ask is how bad is the pain the sections are very very painful and


In [220]:
input_values = tokenizer(speech[10*16000:30*16000], return_tensors = 'pt').input_values
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim =-1)
t2 = tokenizer.decode(predicted_ids[0]).lower()
print(t2)

exclude the disease in our differential diagnosis and probably the single most important question to ask is how bad is the pain the sections are very very painful and the handful of cases i've seen throughout my career the patients have been in significant distress often very agitated clutching their chest moaning writhing


In [224]:
seq=SequenceMatcher(t1.split(),t2.split())
list(seq.get_matching_blocks())

[Match(a=53, b=0, size=0)]

In [36]:
from difflib import SequenceMatcher
import itertools
seq = SequenceMatcher(None,t2,t1)
matches = list(seq.get_matching_blocks())
matches = [m for m in matches if m.size>1]
print(matches)
[t1[m.a:m.a+m.size] for m in matches]
mas = list(itertools.chain(*[(m.a,m.a+m.size) for m in matches]))
mbs = list(itertools.chain(*[(m.b,m.b+m.size) for m in matches]))
[(t1[:mbs[0]],'')]+[(t1[mbs[k]:mbs[k+1]],t2[mas[k]:mas[k+1]]) 
                for k in range(len(mbs)-1)]+[('',t2[mas[-1]:])]

[Match(a=9, b=1250, size=3), Match(a=18, b=1655, size=3), Match(a=99, b=1797, size=2), Match(a=251, b=2282, size=3), Match(a=657, b=3326, size=23), Match(a=1019, b=3724, size=3), Match(a=4239, b=4839, size=64), Match(a=4303, b=4905, size=55), Match(a=4436, b=5040, size=13), Match(a=4450, b=5053, size=31), Match(a=4656, b=5334, size=32), Match(a=4736, b=5410, size=61), Match(a=4934, b=6189, size=3), Match(a=5037, b=7086, size=3), Match(a=5269, b=8925, size=4), Match(a=5785, b=9071, size=2), Match(a=6199, b=9146, size=2), Match(a=6555, b=9289, size=2)]


[("when we take a history of a patient with theortic dissection we want a look for features that are gono allow us to um either include or exclude the disease in our differential diagnosis and probably the single most important question to ask is how bad is the pain thi sections are very very painful and the handful of cases i've seen throughout my career the patients have been in significant distress often very agitated clutching their chest moaning writhing it's not subtle pain so you want to get a sense of how severe the pain is and how it's not subtle pain so you want to get a sense of how severe the pain is and how much distress the patient is in and when their distress levels really high think about the section dessection is also a sudden catostraphic event you're going about your business doing everything normally and then suddenly your intimate tairs and you develope this u accumulation of blood in the aortic wall this isn't something that gradually ramps up getting worse over 

In [9]:
from termcolor import colored
def colorize(S,L,c='blue'):
    if type(L)==list:
        for l in L:
            S = S.replace(l,colored(l,c))
    elif type(L)==str:
        S = S.replace(L,colored(L,c))
    return S

def stitch(t1,t2):
    seq = SequenceMatcher(None,t2,t1)
    matches = list(seq.get_matching_blocks())
    matches = [m for m in matches if m.size>10]
    try:
        b = matches[0].b
    except:
        try:
            b = t1.index(t2[:10])
        except:
            print(t1[:10])
            b=-1
    return t1[:b],t2[matches[0].a:]
r1,r2=stitch(t1,t2)
print(colorize(t1+' && '+t2,[r1,r2]))

NameError: name 't1' is not defined

In [267]:
for t1,t2 in zip(transcript,transcript2):
    print(colorize(t1,t2))

[34mwhen we take a history of a patient with theortic dissection we want a look for features that are gono allow us to um either include or exclude the disease in our differential diagnosis and probably the single most important question to ask is how bad is the pain thi sections are very very painful and the handful of cases i've seen throughout my career the patients have been in significant distress often very agitated clutching their chest moaning writhing [0mit's not subtle pain so you want to get a sense of how severe the pain is and how
[34mit's not subtle pain so you want to get a sense of how severe the pain is and how much distress the patient is in and when their distress levels really high think about the section dessection is also a sudden catostraphic event you're going about your business doing everything normally and then suddenly your intimate tairs and you develope this u accumulation of blood in the aortic wall this isn't something that gradually ramps up getting 

In [200]:
seq = SequenceMatcher(None,transcript[1],transcript[0])
list(seq.get_matching_blocks())


[Match(a=2, b=325, size=1),
 Match(a=11, b=373, size=1),
 Match(a=57, b=408, size=3),
 Match(a=81, b=443, size=2),
 Match(a=139, b=517, size=3),
 Match(a=572, b=542, size=0)]

In [77]:
tokenizer.decode(np.argsort(np.array(logits.detach())[0],axis=-1)[:,-1])

"WHEN WE TAKE A HISTORY OF A PATIENT WITH THEORTIC DISSECTION WE WANT A LOOK FOR FEATURES THAT ARE GONO ALLOW US TO UM EITHER INCLUDE OR EXCLUDE THE DISEASE IN OUR DIFFERENTIAL DIAGNOSIS AND PROBABLY THE SINGLE MOST IMPORTANT QUESTION TO ASK IS HOW BAD IS THE PAIN THI SECTIONS ARE VERY VERY PAINFUL AND THE HANDFUL OF CASES I'VE SEEN THROUGHOUT MY CAREER THE PATIENTS HAVE BEEN IN SIGNIFICANT DISTRESS OFTEN VERY AGITATED CLUTCHING THEIR CHEST MOANING WRITHING IT'S NOT SUBTLE PAI"

In [34]:
with open('transcripts/9780323798785_0015.clean.txt') as f:
    real_text = f.read()

In [39]:
t2,t1=real_text.replace('>> Doctor Julianna Jung: ','').lower(),' '.join(transcript)
for k in  '.,!?\'\"”;:-=_~#^&%$*@/|()[]{}<>':
    t2 = t2.replace(k,'')

seq = SequenceMatcher(None,t2,t1)
matches = list(seq.get_matching_blocks())
matches = [m for m in matches if m.size>1]
#print(matches)
[t1[m.a:m.a+m.size] for m in matches]
mas = list(itertools.chain(*[(m.a,m.a+m.size) for m in matches]))
mbs = list(itertools.chain(*[(m.b,m.b+m.size) for m in matches]))
[(t1[:mbs[0]],t2[:mas[0]])]+[(t1[mbs[k]:mbs[k+1]],t2[mas[k]:mas[k+1]]) 
                for k in range(len(mbs)-1)]

[('', ''),
 ('when we take a history of a patient with ',
  'when we take a history of a patient with '),
 ('theortic dissection we want a', 'aortic dissection we want to'),
 (' look for features that are go', ' look for features that are go'),
 ('no allow us to um', 'ing to allow us to'),
 (' either include or exclude the disease in our differential diagnosis and probably the single most important question to ask is how bad is the pain ',
  ' either include or exclude the disease in our differential diagnosis and probably the single most important question to ask is how bad is the pain '),
 ("thi sections are very very painful and the handful of cases i've seen throughout my career the patients have been in significant distress often very agitated clutching their chest moaning writhing it's not subtle pain so you want to get a sense of how severe the pain is and how it's not subtle pain so you want to get a sense of how severe the pain is and how much distress the patient is in and wh

In [40]:
seq.ratio()

0.18316915282859556

In [272]:
np.mean([r in ' '.join(transcript2) for r in real_text.lower().split('.')])

0.11578947368421053

In [41]:
transcript2[-3:]

["s a three a an abdominal is the three be it's not so much important that you memorize these specific classification schemes as it is that you recognize that the anatomic location of aortic dessection is really really important because that's going to determine your management and it's also going to help you predict what complications the pationt might experience based on what vessels are going to be involved  let's have a closer look at the eortic dissection guide lines at first of course you have to identify patients who are at risk for ",
 'acute disection next you have to assess those at ris based on risk factors such as pain patterns exam findings and imaging if you find the patient has a low risk then continue with diagnostic evaluation as necessary if there is an intermediate risk you have to evaluate first for stemi and then for acute disection with a high risk patient an immediate surgery consultation and c t a are necessary if you have a clinically unstable patient order a '