In [2]:
from google.cloud import speech_v1p1beta1 as speech
from google.cloud import storage
from pydub import AudioSegment

import json
client = speech.SpeechClient()

In [1]:
gcs_uri = "gs://ami_corpus/meeting_files/interruptions1.wav"

In [7]:
audio = speech.RecognitionAudio(uri=gcs_uri)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=44100,
    language_code="en-US",
    enable_speaker_diarization=True,
    diarization_speaker_count=4,
    audio_channel_count=2,
)

In [8]:
print("Waiting for operation to complete...")
response = client.long_running_recognize(config=config, audio=audio)

Waiting for operation to complete...


In [9]:
result = response.result().results

In [10]:
words_info = result[-1].alternatives[0].words

In [11]:
words_info 

[start_time {
}
end_time {
  nanos: 300000000
}
word: "I\'m"
speaker_tag: 2
, start_time {
  nanos: 300000000
}
end_time {
  nanos: 400000000
}
word: "really"
speaker_tag: 2
, start_time {
  nanos: 400000000
}
end_time {
  nanos: 700000000
}
word: "happy"
speaker_tag: 2
, start_time {
  nanos: 700000000
}
end_time {
  nanos: 900000000
}
word: "for"
speaker_tag: 2
, start_time {
  nanos: 900000000
}
end_time {
  seconds: 1
  nanos: 100000000
}
word: "you"
speaker_tag: 2
, start_time {
  seconds: 1
  nanos: 100000000
}
end_time {
  seconds: 1
  nanos: 400000000
}
word: "imma"
speaker_tag: 2
, start_time {
  seconds: 1
  nanos: 400000000
}
end_time {
  seconds: 1
  nanos: 500000000
}
word: "let"
speaker_tag: 2
, start_time {
  seconds: 1
  nanos: 500000000
}
end_time {
  seconds: 1
  nanos: 600000000
}
word: "you"
speaker_tag: 2
, start_time {
  seconds: 1
  nanos: 600000000
}
end_time {
  seconds: 1
  nanos: 800000000
}
word: "finish"
speaker_tag: 2
, start_time {
  seconds: 1
  nanos: 8

In [25]:
for word in words_info: 
    if (word.speaker_tag == 1): 
        priint(word, word.start_time, word.end_time)

In [31]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print(
        "File {} uploaded to {}.".format(
            source_file_name, destination_blob_name
        )
    )

In [38]:
def create_speaker_clips(sound_file_name, speaker_count, words):
    speaker_clips = [None] * speaker_count

    second_counter = 0
    prev_speaker_tag = None
    for word in words: 
        
        if((prev_speaker_tag == None or prev_speaker_tag == word.speaker_tag) and not speaker_clips[word.speaker_tag-1]):
            prev_speaker_tag = word.speaker_tag
            second_counter += (word.end_time - word.start_time).total_seconds() 
            
            if (second_counter >= 1): 
                speaker_clips[prev_speaker_tag-1] = (word.end_time.total_seconds() - second_counter, word.end_time.total_seconds())
                prev_speaker_tag = None
                second_counter = 0
        
        else:
            prev_speaker_tag = word.speaker_tag
            second_counter = 0
    
    for idx, clip in enumerate(speaker_clips):
        if(clip != None): 
            audio_chunk = sound[int(clip[0]*1000):int(clip[1]*1000)]
            audio_chunk_name = "speaker_{}_clip.wav".format(idx)
            audio_chunk.export(audio_chunk_name, format="wav")
            upload_blob("ami_corpus", audio_chunk_name, "meeting_files/{}/speaker_clips/{}".format(sound_file_name, audio_chunk_name))
    

    return True

print("run")

run


In [57]:
import numpy as np

def get_speaker_stats(speaker_count, words=words_info):

    stats = {}
    # Get Total Speaker Minutes 
    timing = [0] * speaker_count
    for word in words: 
        timing[word.speaker_tag-1] += (word.end_time - word.start_time).total_seconds() 
    timing =  [time / 60 for time in timing]

    # Speaker Average Spoken Time 
    average = np.average(timing)

    # Speaker Variance
    variance = np.var(timing)

    # Speaker Standard Deviation 
    stdev = np.std(timing)

    stats.update({"speaker_minutes" : timing})
    stats.update({"average_speaker_time" : average})
    stats.update({"variance_speaker_time" : variance})
    stats.update({"standard_deviation_speaker_time" : stdev})

    return stats

In [58]:
get_speaker_stats(4)

{'speaker_minutes': [0.0,
  0.17499999999999996,
  0.11166666666666665,
  0.3116666666666666],
 'average_speaker_time': 0.1495833333333333,
 'variance_speaker_time': 0.012682465277777772,
 'standard_deviation_speaker_time': 0.11261645207418751}

In [49]:
def get_split_centers(seconds):
    return [sec for sec in range(500, int(seconds*100)-1, 500)]

def process_recording(sound_files):
    counter = 0
    
    for sound_file in sound_files:
        sound = AudioSegment.from_wav(sound_file)
        for time in get_split_centers(sound.duration_seconds): 
            audio_chunk=sound[time-500:time+500]
            sound_file_name = sound_file.split(".wav")[0]
            file_name = "{}_{}_{}.wav".format(sound_file, time-500, time+500)
            audio_chunk.export(file_name, format="wav")
            upload_blob("ami_corpus", file_name, "meeting_files/" + sound_file_name + "/audio_chunks/" + file_name)
            counter += 1
        
         
        upload_blob("ami_corpus", sound_file, "meeting_files/" + sound_file_name + "/" + sound_file)



In [50]:
process_recording(["interruptions1.wav"])

File interruptions1.wav_0_1000.wav uploaded to meeting_files/interruptions1/interruptions1.wav_0_1000.wav.
File interruptions1.wav_500_1500.wav uploaded to meeting_files/interruptions1/interruptions1.wav_500_1500.wav.
File interruptions1.wav_1000_2000.wav uploaded to meeting_files/interruptions1/interruptions1.wav_1000_2000.wav.
File interruptions1.wav_1500_2500.wav uploaded to meeting_files/interruptions1/interruptions1.wav_1500_2500.wav.
File interruptions1.wav_2000_3000.wav uploaded to meeting_files/interruptions1/interruptions1.wav_2000_3000.wav.
File interruptions1.wav_2500_3500.wav uploaded to meeting_files/interruptions1/interruptions1.wav_2500_3500.wav.
File interruptions1.wav_3000_4000.wav uploaded to meeting_files/interruptions1/interruptions1.wav_3000_4000.wav.
File interruptions1.wav_3500_4500.wav uploaded to meeting_files/interruptions1/interruptions1.wav_3500_4500.wav.
File interruptions1.wav_4000_5000.wav uploaded to meeting_files/interruptions1/interruptions1.wav_4000_5

In [42]:
create_speaker_clips("interruptions1", 4, words_info)

File speaker_1_clip.wav uploaded to meeting_files/interruptions1/speaker_clips/speaker_1_clip.wav.
File speaker_2_clip.wav uploaded to meeting_files/interruptions1/speaker_clips/speaker_2_clip.wav.
File speaker_3_clip.wav uploaded to meeting_files/interruptions1/speaker_clips/speaker_3_clip.wav.


'Success'

In [27]:
storage_client = storage.Client()
bucket = storage_client.get_bucket("ami_corpus")
blob = bucket.blob("meeting_files/interruptions1.wav")
blob.download_to_filename("/tmp/headset.wav")

#Convert downloaded object and save the export to a tmp file
sound = AudioSegment.from_wav("/tmp/headset.wav")


In [30]:
audio_chunk = sound[0:1100]
audio_chunk.export("mofo.wav", format="wav")

<_io.BufferedRandom name='mofo.wav'>

In [14]:
def get_speaker_timings(words, speaker_count):
    timing = [0] * speaker_count
    for word in words: 
        timing[word.speaker_tag-1] += (word.end_time - word.start_time).total_seconds() 
    return [time / 60 for time in timing]

def get_speaker_stats(timing):
    return np.mean(timing), np.std(timing), np.var(timing)

def get_benchmark_communication():
    pass
timings = get_speaker_timings(words_info, 4)
get_speaker_stats(timings)

[4.941666666666662,
 2.0066666666666655,
 5.440000000000003,
 0.013333333333333334]

In [77]:
 
old_speaker = ""
old_end = datetime.timedelta(0)
start = 0
analysis_data = []
transcript = []

for word in words_info:
    tag = word.speaker_tag
    #print(word.word, word.speaker_tag)
    if(tag != old_speaker):
        analysis_data.append((old_speaker, tag, old_end.total_seconds()))
        start = word.start_time
        transcript = []
    else: 
        transcript.append(word.word)
    old_end = word.end_time
    old_speaker = tag

analysis_data

went
went
went
went
went
went
went
went
went
went


[('', 2, 0.0),
 (2, 4, 1.8),
 (4, 2, 4.5),
 (2, 4, 12.5),
 (4, 2, 26.3),
 (2, 4, 29.0),
 (4, 3, 38.1),
 (3, 2, 41.0),
 (2, 3, 45.3),
 (3, 2, 49.1)]

In [21]:
print(analysis_data)

[[1, 0, 0, []], [3, datetime.timedelta(seconds=3, microseconds=700000), datetime.timedelta(seconds=6, microseconds=800000), ['God', 'she', 'produced', 'a']], [4, datetime.timedelta(seconds=6, microseconds=800000), datetime.timedelta(seconds=9, microseconds=400000), ['presentation', 'I', 'think', "it's", 'already', 'on', 'actually']], [3, datetime.timedelta(seconds=14, microseconds=900000), datetime.timedelta(seconds=34, microseconds=900000), ['kind', 'of', 'medicine', 'work', 'a', 'place', 'in', 'the', 'back', 'but']], [4, datetime.timedelta(seconds=37, microseconds=900000), datetime.timedelta(seconds=40, microseconds=800000), ['right']], [3, datetime.timedelta(seconds=50, microseconds=200000), datetime.timedelta(seconds=50, microseconds=800000), []], [1, datetime.timedelta(seconds=55, microseconds=800000), datetime.timedelta(seconds=74, microseconds=500000), ["it's", 'the', 'kickoff', 'meeting', 'for', 'our', 'project', 'this', 'is', 'just', 'what', "we're", 'going', 'to', 'be', 'doin

In [29]:
response.result()

TypeError: 'LongRunningRecognizeResponse' object is not subscriptable

In [14]:
speaker_change_times = [x[1].seconds * 1000 + x[1].microseconds  / 1000 for x in analysis_data[1:]]

NameError: name 'analysis_data' is not defined

In [24]:
sound = AudioSegment.from_wav("interruptions1.wav")

In [12]:
sound[1000:2000].export('newSong.wav', format="wav")

<_io.BufferedRandom name='newSong.wav'>

In [25]:
for time in range(1000, 55000, 1000): #speaker_change_times:
    audio_chunk=sound[time-500:time+500]
    audio_chunk.export( "audio_chunk_{}.wav".format(time/1000), format="wav")


In [5]:
sound.duration_seconds

1272.64

In [12]:


split_centers = get_split_centers(sound.duration_seconds)

In [13]:
len(split_centers)

255

In [5]:
import jxmlease

In [34]:
xml_files = ['ES2002a.A.words.xml', 'ES2002a.B.words.xml', 'ES2002a.C.words.xml', 'ES2002a.D.words.xml']

# THIS SHIT IS IMPORTANT 

In [65]:
def get_speaker_change_times(xml_files):
    speaker_change_times = []
    
    for xml_file in xml_files:
        with open(xml_file) as topic:
            topic_content = topic.read()

        root = jxmlease.parse(topic_content)
        words = root['nite:root']['w']

        for i in range(1, len(words)):
            if(words[i-1].get_xml_attr('endtime') != words[i].get_xml_attr('starttime')):
                speaker_change_times.append(words[i].get_xml_attr('starttime'))
    
    speaker_change_times = set(speaker_change_times)
        
    return [int(float(x)*1000) for x in speaker_change_times]
        
            

In [66]:
speaker_change_times = get_speaker_change_times(xml_files)
get_sound_chunks(["/tmp/headset.wav"], speaker_change_times)