## Load needed modules and define path to data

In [1]:
# Toggle: True when running on AWS (the audio is fetched from S3), False for a local run.
on_aws = False
import time # only needed to time different algorithms

In [2]:
#for editing wav files to text
from pydub import AudioSegment 
from pydub.playback import play
from pydub.utils import mediainfo

In [3]:
#for speech to text
import speech_recognition as sr

In [4]:
#for pyannote-audio's diarisation
import torch
from huggingface_hub import HfApi
# Collect the model id of every Hugging Face Hub entry tagged as a pyannote-audio pipeline.
hub_api = HfApi()
available_pipelines = [
    model_info.modelId
    for model_info in hub_api.list_models(filter="pyannote-audio-pipeline")
]
available_pipelines

['pyannote/speaker-segmentation',
 'pyannote/speaker-diarization',
 'pyannote/voice-activity-detection',
 'pyannote/overlapped-speech-detection']

In [5]:
from pyannote.audio import Pipeline
# Load the pretrained speaker-diarisation pipeline by its Hugging Face model id.
# NOTE(review): presumably this downloads the model on first use — confirm network access.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")

In [6]:
# List the contents of the voicetarxya/purchased/ S3 prefix (only when running on AWS).
if on_aws:
    !aws s3 ls voicetarxya/purchased/

In [7]:
# specify the audio file and the desired window
if on_aws:
    print("ON AWS!")
    !aws s3 cp s3://voicetarxya/purchased/in-443303309465-07449988008-20190624-110909-1561370949.138008.wav test_audio_from_s3.wav
    #!aws s3 cp s3://voicetarxya/purchased/q-801-1562945353.213997.wav test_audio_from_s3.wav
    !ls *wav
    base = "test_audio_from_s3" 
    audio = base + ".wav" 
    GLOBAL_START = 0. 
    GLOBAL_END = 60. 
if not on_aws:
    print("NOT ON AWS!")
    !ls *wav
    base = "audio" 
    audio = base + ".wav"
    GLOBAL_START = 0.
    GLOBAL_END = 632.

NOT ON AWS!
ls: cannot access '*wav': No such file or directory


## use pydub to edit the audio file 

In [None]:
# Load the recording selected in the configuration cell. The path comes from `audio`
# rather than a hard-coded "audio.wav", so that on AWS the downloaded
# test_audio_from_s3.wav is the file that actually gets analysed.
wav_file = AudioSegment.from_file(file = audio, format = "wav")
info = mediainfo(audio)

# Convert the window bounds from seconds into AudioSegment slice indices.
# ratio       = slice-index units per unit of info["duration_ts"]
# sample_rate = slice-index units per second (despite its name)
# NOTE(review): pydub indexes an AudioSegment in milliseconds and duration_ts
# presumably counts frames, so sample_rate should come out at ~1000 — confirm.
ratio = float(len(wav_file))/float(info["duration_ts"])
sample_rate = wav_file.frame_rate*ratio
start_index = int(GLOBAL_START*sample_rate)
end_index = int(GLOBAL_END*sample_rate)

In [None]:
# Cut the requested window out of the recording and write it to segment.wav,
# the file that the speech-to-text and diarisation cells read.
modified_wav_file = wav_file[start_index:end_index]
modified_wav_file.export(out_f = "segment.wav" , format = "wav")

In [None]:
# Audio file: show the cropped segment as this cell's output
modified_wav_file

## Speech to text on specified time segment

In [None]:
# Recogniser used for the speech-to-text requests, and a handle on the cropped
# segment.wav that it will read from.
r = sr.Recognizer()
audio_sr = sr.AudioFile("segment.wav")



In [None]:
# Transcribe the whole cropped segment in a single request and report the time taken.
start_time = time.time()

with audio_sr as source:
    # segment.wav is already cropped to the window, so no offset/duration is passed.
    audiodata = r.record(source)

try:
    transcript = r.recognize_google(audiodata, language="en-GB")
    print(transcript)
except Exception as e:
    # Show the failure instead of raising, so the timing line below is still printed.
    print("Error : " + str(e))

end_time = time.time()
print("\nTime taken : ", end_time - start_time)

# WARNING: THE NEXT CELL IS SLOW TO RUN, DON'T RUN IT UNLESS NEEDED!
## This performs the diarisation on the whole cropped segment (segment.wav); it takes a while to run

In [None]:
# Diarisation: run the pretrained pipeline on the cropped segment and time it.
start_time = time.time()
dia = pipeline("segment.wav")
# Take the end timestamp once (the original cell repeated this statement).
end_time = time.time()
print("\nTime taken : ",end_time-start_time)

In [None]:
# Plot the whole diarisation: the notebook displays the cell's last expression
dia

In [None]:
# This can be used to crop the diarisation if you don't do that beforehand:
# set the crop time interval and replot
#from pyannote.core import notebook, Segment
#notebook.crop = Segment(GLOBAL_START, GLOBAL_END)
#dia

## Print out resulting time buckets of diarisation within desired time section
## Visualise the output 

In [None]:
# Print every diarised turn with its start/stop time and speaker label.
# No window filter is applied here (the disabled check compared turn.start with
# GLOBAL_START/GLOBAL_END); the diarisation was run on the already cropped segment.wav.
for turn, _, speaker in dia.itertracks(yield_label=True):
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")

## Speech recognition on the above diarisation time bins and output with labelled speakers

In [None]:
# Fresh recogniser and audio handle for the per-turn transcription that follows.
r = sr.Recognizer()
audio_sr = sr.AudioFile("segment.wav")
# Sanity check: list the wav files present in the working directory.
!ls *wav

In [None]:
# Transcribe each diarised turn separately and print it with its speaker label.
for turn, _, speaker in dia.itertracks(yield_label=True):
    start_time, end_time = turn.start, turn.end

    with audio_sr as source:
        # Start reading 0.1 s before the turn; the duration grows by the same 0.1 s,
        # so the recording still stops at the turn's end.
        audiodata = r.record(
            source,
            offset=start_time - 0.1,
            duration=end_time - start_time + 0.1,
        )

    try:
        words = r.recognize_google(audiodata, language="en-GB")
        print(f"*------------------------START---------t={turn.start:.1f}s--------*")
        print(f"{speaker} : {words}", end="\n\n")
        print(f"*-------------------------END----------t={turn.end:.1f}s--------*")
    except Exception:
        # Any recognition failure is shown as an unknown utterance for this speaker.
        print(f"{speaker} : ???", end="\n\n")
        

## My attempt at stitching audio pieces together

##### The idea here is that continuous speech by one speaker is sometimes broken into multiple chunks; one long chunk is better for speech to text

In [None]:
# List each bin output by the diarisation; some adjacent bins have the same speaker.
# Every fragment is stored as [speaker, start, end, status], with status "new" for now.

speech_fragments = []
for turn, track, speaker in dia.itertracks(yield_label=True):
    speech_fragments.append([speaker, turn.start, turn.end, "new"])
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")

In [None]:
# Mark each fragment as a new speaker or a repeat of the previous one, and stretch
# every "new" fragment so that it absorbs the same-speaker fragments that follow it.
#
# The list is walked backwards on purpose: the latest finish time of a run is then
# carried, pair by pair, all the way to the first fragment of that run.

for i in range(len(speech_fragments) - 2, -1, -1):
    current, following = speech_fragments[i], speech_fragments[i + 1]
    if current[0] != following[0]:
        continue  # speaker changes here, nothing to merge
    following[1] = current[1]   # the repeat takes over the earlier start time
    current[2] = following[2]   # the earlier fragment takes over the later end time
    following[3] = "repeat"

for frag in speech_fragments:
    print(frag)
        


In [None]:
# Output the modified/grouped buckets so that no two neighbouring buckets share a speaker:
# only the fragments still marked "new" are kept.

fixed_speech = [frag for frag in speech_fragments if frag[3] == "new"]

for frag in fixed_speech:
    print(f"start={frag[1]:.1f}s stop={frag[2]:.1f}s speaker_{frag[0]}")
        

## Here print the STT on the grouped speech buckets

In [None]:
# Transcribe each merged fragment, print it with its speaker label, and time the whole pass.
start_time = time.time()

for frag in fixed_speech:
    with audio_sr as source:
        # Start reading 0.1 s before the fragment; the duration grows by the same 0.1 s.
        audiodata = r.record(
            source,
            offset=frag[1] - 0.1,
            duration=frag[2] - frag[1] + 0.1,
        )

    try:
        words = r.recognize_google(audiodata, language="en-GB")
        print(f"{frag[0]} : {words}", end="\n\n")
    except Exception:
        # Any recognition failure is shown as an unknown utterance for this speaker.
        print(f"{frag[0]} : ??? ", end="\n\n")

end_time = time.time()
print(f"Transcript time : {end_time-start_time}s")
    

## hand transcribed

SPEAKER_00 : yeah 

SPEAKER_01 : but none of it comes back out to me it's a limited company and it just says there the third one is umm there are four four of us it's a mental health trust 

SPEAKER_00 : yeah 

SPEAKER_01 : and the money just goes in to pay the employees salary I don't take anything from it 

SPEAKER_00 : errrr

SPEAKER_01 : and nor could I 

SPEAKER_00 : no no sure erm ok 

SPEAKER_01 : it's that I don't want to take it and then gum(?) someone goes ah well that wasn't our

SPEAKER_00 : no 

??? 

SPEAKER_00 : no no no what I'm going to do is I'll based on what you said is i'm going to ring up april(?) UK and clarify that with them cos I think it's important to so

SPEAKER_01 : yea

SPEAKER_00 : first one is it's open but there's nothing going in and 

SPEAKER_01 : or out it's not trading 

SPEAKER_00 : or out not trading ok the second one is cash in but it goes automatically so that's holiday lettings and it automatically just 

SPEAKER_01 : well yea it sits in the business it sits in the business it's a limited company

# To do

#### -> try ignoring bins that are too narrow, or merge them with bigger bins
#### -> try messing with yield_label=True in dia.itertracks
#### -> look into the _ argument too?
#### -> try extending bins into regions of no speech, maybe that improves accuracy?
#### -> identify customer vs seller from the use of words like I and me vs we or you? other indicators
#### -> write a wrapper function around deepspeech as it's a bit cumbersome.
#### -> maybe think about binning start and end buckets if they are too small, or finding the start/finish of buckets if they are slightly outside the diarization - might need to diarize t_0<->t_1 and t_0-delta <-> t_1+delta

## wordcounting

In [None]:
from collections import defaultdict
import re

# Per-word tallies for each role, plus a signed difference per word
# (seller occurrences minus customer occurrences).
start_time = time.time()
customer_counts = defaultdict(int)
seller_counts = defaultdict(int)
diff_counts = defaultdict(int)

# Speaker label -> (that role's counter, step applied to diff_counts).
# SPEAKER_00 is the seller and SPEAKER_01 the customer, the same assignment the
# talking-time cell at the end of the notebook uses; keep the two in step.
role_by_speaker = {
    'SPEAKER_00': (seller_counts, +1),
    'SPEAKER_01': (customer_counts, -1),
}
# Raw string: '\w' inside a plain literal is an invalid escape sequence.
word_pattern = re.compile(r'\w+')

for frag in fixed_speech:
    if frag[0] not in role_by_speaker:
        continue  # not one of the two labelled speakers: nothing would be tallied
    role_counts, diff_step = role_by_speaker[frag[0]]

    with audio_sr as source:
        audiodata = r.record(source, offset=frag[1]-0.1, duration = frag[2]-frag[1]+0.1)
    try:
        words = r.recognize_google(audiodata,language="en-GB")
    except sr.UnknownValueError:
        continue  # nothing intelligible in this fragment, so there are no words to count
    except Exception as e:
        # Report any other failure instead of hiding it; the fragment is skipped so
        # that the remaining fragments are still counted.
        print(f"Skipping {frag[0]} fragment {frag[1]:.1f}s-{frag[2]:.1f}s : {e}")
        continue

    # Apostrophes become spaces first, so e.g. "don't" is tallied as "don" and "t".
    for word in word_pattern.findall(words.replace("'"," ")):
        diff_counts[word] += diff_step
        role_counts[word] += 1

end_time = time.time()
print(f"Wordcount time : {end_time-start_time}s")



In [None]:
# Word differences ordered by ascending value (most negative first).
marked_list = sorted(diff_counts.items(), key=lambda entry: entry[1])
sorted_diff_counts = {word: count for word, count in marked_list}
sorted_diff_counts

In [None]:
# Re-order the customer tally by ascending count.
# Note: this rebinds customer_counts from a defaultdict to a plain dict.
marked_list = sorted(customer_counts.items(), key=lambda entry: entry[1])
customer_counts = {word: count for word, count in marked_list}
customer_counts

In [None]:
# Re-order the seller tally by ascending count.
# Note: this rebinds seller_counts from a defaultdict to a plain dict.
marked_list = sorted(seller_counts.items(), key=lambda entry: entry[1])
seller_counts = {word: count for word, count in marked_list}
seller_counts

In [None]:
# Scratch example: break a sentence into its whitespace-separated words.
string_thing = "hello hello here is a listy poo of words yes yes hello"
print(string_thing)

splitted = str.split(string_thing)
splitted

In [None]:
# Total talking time per role, accumulated over the merged fragments.
# SPEAKER_01 is counted as the customer and SPEAKER_00 as the seller.
customer_time = 0.
seller_time = 0.

for speaker, begin, finish, _status in fixed_speech:
    spoken = finish - begin
    if speaker == 'SPEAKER_01':
        customer_time += spoken
    elif speaker == 'SPEAKER_00':
        seller_time += spoken

print(f"Sellers talking time : {round(seller_time)}s\nCustomers talking time : {round(customer_time)}s")