## Load needed modules and define path to data

In [None]:
#for speech to text
import speech_recognition as sr

In [None]:
#for pyannote-audio's diarisation
import torch
from huggingface_hub import HfApi
# Query the Hugging Face Hub for models carrying the "pyannote-audio-pipeline"
# filter and keep just their model ids; the list is displayed as the cell output.
available_pipelines = [
    hub_model.modelId
    for hub_model in HfApi().list_models(filter="pyannote-audio-pipeline")
]
available_pipelines

In [None]:
from pyannote.audio import Pipeline
# Load the pretrained "pyannote/speaker-diarization" pipeline; `pipeline` is
# called on the audio file further down to produce the diarisation.
# NOTE(review): presumably this downloads the model on first use — confirm.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")

## List Files in Tarxya Bucket

In [None]:
# List what is stored under the purchased/ prefix of the voicetarxya bucket.
!aws s3 ls voicetarxya/purchased/

## Specify the file and desired time window

In [None]:
# Copy the recording to analyse from S3 to a fixed local file name.
!aws s3 cp s3://voicetarxya/purchased/in-443303309465-07449988008-20190624-110909-1561370949.138008.wav test_audio_from_s3.wav
# Alternative recording, kept commented out (it would be copied to the same local name):
#!aws s3 cp s3://voicetarxya/purchased/q-801-1562945353.213997.wav test_audio_from_s3.wav
!ls *wav

# `audio` is the local wav path read by the later cells; `base` must match the
# copy target used in the `aws s3 cp` command above.
base = "test_audio_from_s3" 
audio = base + ".wav" 

In [None]:
# Time window of the recording to work on, in seconds. Later cells pass these
# values as offset/duration when recording audio and compare them against the
# diarisation turn start/end times.
# The trailing commented numbers are presumably an alternative window tried earlier.
GLOBAL_START = 0. #480.
GLOBAL_END = 60. #540.

## Speech to text on specified time segment

In [None]:
# Recogniser object used for the record / recognize_google calls below.
r = sr.Recognizer()
# Audio source for the local wav file; later cells open it with
# `with audio_sr as source:` each time they record a slice.
audio_sr = sr.AudioFile(audio)



In [None]:
# Transcribe the whole [GLOBAL_START, GLOBAL_END] window with a single request
# and print the recognised text.
with audio_sr as source:
    audiodata = r.record(source, offset=GLOBAL_START, duration=GLOBAL_END - GLOBAL_START)
try:
    print(r.recognize_google(audiodata, language="en-GB"))
except sr.UnknownValueError:
    # Raised when the speech is unintelligible. It carries no message, so the
    # generic handler below would print a bare "Error : " for it.
    print("Error : speech could not be understood")
except Exception as e:
    # Best effort: report any other failure (e.g. sr.RequestError when the
    # recognition request itself fails) without stopping the notebook.
    print("Error : " + str(e))

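# DON'T RE-RUN THIS NEXT CELL UNLESS YOU NEED TO!
## It performs the diarisation on the entire audio file and takes a while to run. The cells below use its result (`dia`), so it must have been run once per kernel session.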

In [None]:
# Run the diarisation pipeline on the audio file and display the result.
# `dia` is read by the later cells that loop over `dia.itertracks(...)`, so it
# has to be defined before those cells are executed.
dia = pipeline(audio)
dia

## Print out resulting time buckets of diarisation within desired time section
## Visualise the output 

In [None]:
# Print every diarisation turn that overlaps the [GLOBAL_START, GLOBAL_END]
# window. A turn overlaps when it ends after the window starts and starts
# before the window ends; testing only `turn.start` would drop a turn that
# begins exactly at (or just before) GLOBAL_START.
for turn, _track, speaker in dia.itertracks(yield_label=True):
    if turn.end > GLOBAL_START and turn.start < GLOBAL_END:
        print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")

In [None]:
# we visualize the [GLOBAL_START, GLOBAL_END] time range
from pyannote.core import notebook, Segment
notebook.crop = Segment(GLOBAL_START, GLOBAL_END)
dia

## Speech recognition on the above diarisation time bins and output with labelled speakers

In [None]:
# NOTE(review): this repeats the import and the Recognizer / AudioFile setup
# from the earlier speech-to-text section, rebinding `r` and `audio_sr` to
# fresh objects for the same `audio` file.
import speech_recognition as sr

r = sr.Recognizer()
audio_sr = sr.AudioFile(audio)
# Show the wav files present in the working directory.
!ls *wav

In [None]:
# Transcribe each diarisation turn separately and print it with its speaker label.
for turn, _track, speaker in dia.itertracks(yield_label=True):
    # Keep turns that overlap the [GLOBAL_START, GLOBAL_END] window: they end
    # after the window starts and start before it ends.
    if not (turn.end > GLOBAL_START and turn.start < GLOBAL_END):
        continue

    # Start 0.1 s before the turn so the first word is not clipped, but never
    # before the beginning of the file. The recording stops at the turn's end.
    clip_start = max(0.0, turn.start - 0.1)
    with audio_sr as source:
        audiodata = r.record(source, offset=clip_start, duration=turn.end - clip_start)

    try:
        words = r.recognize_google(audiodata, language="en-GB")
    except sr.UnknownValueError:
        # The recogniser could not make out any speech in this turn.
        words = "???"
    except Exception as e:
        # Best effort: keep going, but show why this turn failed (e.g. a
        # failed request) instead of making it look like unintelligible speech.
        words = f"??? ({type(e).__name__}: {e})"

    print(f"{speaker} : {words}")
    print("")
        

##  Speech to text on specified time segment

In [None]:
# Transcribe the whole [GLOBAL_START, GLOBAL_END] window in one go, for
# comparison with the per-speaker output.
with audio_sr as source:
    audiodata = r.record(source, offset=GLOBAL_START, duration=GLOBAL_END - GLOBAL_START)
try:
    print(r.recognize_google(audiodata, language="en-GB"))
except sr.UnknownValueError:
    # Raised when the speech is unintelligible. It carries no message, so the
    # generic handler below would print a bare "Error : " for it.
    print("Error : speech could not be understood")
except Exception as e:
    # Best effort: report any other failure (e.g. sr.RequestError when the
    # recognition request itself fails) without stopping the notebook.
    print("Error : " + str(e))

## My attempt at stitching audio pieces together

##### The idea here is that continuous speech by one speaker is sometimes broken into multiple chunks by the diarisation; one long chunk is better for speech to text.

In [None]:
# Collect every diarisation turn that overlaps the [GLOBAL_START, GLOBAL_END]
# window as a mutable record [speaker, start, end, status]. Every record starts
# out with status "unique".
speech_fragments = [
    [label, segment.start, segment.end, "unique"]
    for segment, _track, label in dia.itertracks(yield_label=True)
    if segment.end > GLOBAL_START and segment.start < GLOBAL_END
]

# Show the collected turns in order.
for label, begin, finish, _status in speech_fragments:
    print(f"start={begin:.1f}s stop={finish:.1f}s speaker_{label}")

In [None]:
# Merge runs of consecutive fragments spoken by the same speaker.
# Each fragment is [speaker, start, end, status]. Walking from the last pair to
# the first matters: the latest end time of a run is carried back, pair by
# pair, onto the first fragment of that run. The later fragments of the run are
# marked "repeat", so only the first one stays "unique".
for i in reversed(range(len(speech_fragments) - 1)):
    frag = speech_fragments[i]
    nextfrag = speech_fragments[i + 1]
    if frag[0] == nextfrag[0]:
        # Take the later of the two end times so that a following fragment
        # which ends earlier cannot shrink the merged span.
        frag[2] = max(frag[2], nextfrag[2])
        nextfrag[1] = frag[1]
        nextfrag[3] = "repeat"

for frag in speech_fragments:
    print(frag)
        


In [None]:
# Keep only the fragments still marked "unique", i.e. drop the ones flagged
# "repeat" by the merge step.
fixed_speech = [piece for piece in speech_fragments if piece[3] == "unique"]

# Show the resulting stitched time bins.
for who, begin, finish, _status in fixed_speech:
    print(f"start={begin:.1f}s stop={finish:.1f}s speaker_{who}")
        

In [None]:
# Transcribe each stitched fragment and print it between START/END banners
# carrying the fragment's start and end time.
for frag in fixed_speech:
    speaker, start_time, end_time = frag[0], frag[1], frag[2]

    # Start 0.1 s before the fragment so the first word is not clipped, but
    # never before the beginning of the file. Recording stops at the fragment's end.
    clip_start = max(0.0, start_time - 0.1)
    with audio_sr as source:
        audiodata = r.record(source, offset=clip_start, duration=end_time - clip_start)

    try:
        words = r.recognize_google(audiodata, language="en-GB")
        transcript = f"{speaker} : {words}"
    except sr.UnknownValueError:
        # The recogniser could not make out any speech in this fragment.
        transcript = "??? "
    except Exception as e:
        # Best effort: keep going, but show why this fragment failed (e.g. a
        # failed request) instead of making it look like unintelligible speech.
        transcript = f"??? ({type(e).__name__}: {e})"

    print(f"*------------------------START---------t={start_time:.1f}s--------*")
    print(transcript)
    print(f"*-------------------------END----------t={end_time:.1f}s--------*")
    

## hand transcribed

SPEAKER_00 : yeah 

SPEAKER_01 : but none of it comes back out to me it's a limited company and it just says there the third one is umm there are four four of us it's a mental health trust 

SPEAKER_00 : yeah 

SPEAKER_01 : and the money just goes in to pay the employees salary I don't take anything from it 

SPEAKER_00 : errrr

SPEAKER_01 : and nor could I 

SPEAKER_00 : no no sure erm ok 

SPEAKER_01 : it's that I don't want to take it and then gum(?) someone goes ah well that wasn't our

SPEAKER_00 : no 

??? 

SPEAKER_00 : no no no what I'm going to do is I'll based on what you said is i'm going to ring up april(?) UK and clarify that with them cos I think it's important to so

SPEAKER_01 : yea

SPEAKER_00 : first one is it's open but there's nothing going in and 

SPEAKER_01 : or out it's not trading 

SPEAKER_00 : or out not trading ok the second one is cash in but it goes automatically so that's holiday lettings and it automatically just 

SPEAKER_01 : well yea it sits in the business it sits in the business it's a limited company

## plain speech to text

cheers yeah none of it comes back out to me it's a limited company and it just says the third one is there are four of us it's a mental health trust yeah and the money just goes into pay the employees salary I don't take anything from it no could I know no sure ok so that I don't want to take it and then got someone goes all that wasn't are no woman no no no what I'm going to do is I'll basically what you said he's going to ring up 84 UK and clarify that with them so I think it's important to so first one is it's open but there's nothing going in and it's not trading or non-trading ok the second one is cash in but it goes automatically so that's holiday lettings and it automatically just get it it's in the business it's a limited

## automatic transcript

In [None]:
# Produce the speaker-labelled transcript: one "<speaker> : <text>" line per
# stitched fragment, each followed by a blank line.
for frag in fixed_speech:
    speaker, start_time, end_time = frag[0], frag[1], frag[2]

    # Start 0.1 s before the fragment so the first word is not clipped, but
    # never before the beginning of the file. Recording stops at the fragment's end.
    clip_start = max(0.0, start_time - 0.1)
    with audio_sr as source:
        audiodata = r.record(source, offset=clip_start, duration=end_time - clip_start)

    try:
        words = r.recognize_google(audiodata, language="en-GB")
    except sr.UnknownValueError:
        # The recogniser could not make out any speech in this fragment.
        words = "??? "
    except Exception as e:
        # Best effort: keep going, but show why this fragment failed (e.g. a
        # failed request) instead of making it look like unintelligible speech.
        words = f"??? ({type(e).__name__}: {e})"

    print(f"{speaker} : {words}")
    print("")

# To do

#### -> try ignoring bins of too small width or merge them with bigger bins
#### -> try messing with yield_label=True in dia.itertracks
#### -> look into the _ argument too?
#### -> try extending bins to regions of no speech, maybe improves accuracy?