In [1]:
import pandas as pd
from collections import Counter, namedtuple
import os
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from pyannote.metrics.errors.identification import IdentificationErrorAnalysis as IEA

from pyannote.core import Segment, Timeline, Annotation
from pyannote.metrics.diarization import DiarizationErrorRate
from pydub import AudioSegment
from pydub.playback import play
from pyannote.metrics.identification import IdentificationPrecision
from librosa import display as DS
import librosa
from sphfile import SPHFile
import wave
import pickle
import glob
from pydub import AudioSegment


In [2]:
file_name_header = "~/DEMO_DATA/"


In [3]:
def speaker_designation_df(
                 sph_path ,
                 text_data_path            
                
                ):
    
    """
    create speaker designation
    path : path to sph file
    text_data_path : path to transcriptions
    """
    # print(sph_path,text_data_path)
    data = open( text_data_path,'r').read().splitlines()
    data_2 = [item.split(' ')[:3]+[' '.join(item.split(' ')[3:])]  for item in data ]
    df = pd.DataFrame(data_2  , columns=['start','stop','speaker','spoken'])
    df['spoken_clean'] = [striphtml(item)for item in df['spoken']]
    df["start"] = pd.to_numeric(df["start"])
    df["stop"] = pd.to_numeric(df["stop"])
    min_val = min(df.start)
    df['start_reset']=df.start -min_val
    df['stop_reset']=df.stop -min_val
    df['duration'] = df.stop - df.start 

    
    
    return df

#### This what the text file looks like
start | stop | speaker | text

In [4]:
open(glob.glob(file_name_header+'original_data/*.txt')[0],'r').readlines()[:3]

['426.278 427.293 spkr1: Well. \n',
 '426.289 426.879 spkr4: Mhm.\n',
 '427.576 431.446 spkr1: This is what highlights the broad -- broadly the costs that we associated with. \n']

In [5]:

import re
## text processing
def striphtml(data):
    p = re.compile(r'<.*=>.*?>')
    return p.sub('', data)
def overlap(start1, end1, start2, end2):
    """Does the range (start1, end1) overlap with (start2, end2)?"""
    return end1 >= start2 and end2 >= start1

In [6]:
speaker_designation_df(sph_path= glob.glob(file_name_header+'original_data/*.sph')[0],text_data_path = glob.glob(file_name_header+'original_data/*.txt')[0]).head(5)


Unnamed: 0,start,stop,speaker,spoken,spoken_clean,start_reset,stop_reset,duration
0,426.278,427.293,spkr1:,Well.,Well.,0.0,1.015,1.015
1,426.289,426.879,spkr4:,Mhm.,Mhm.,0.011,0.601,0.59
2,427.576,431.446,spkr1:,This is what highlights the broad -- broadly t...,This is what highlights the broad -- broadly t...,1.298,5.168,3.87
3,431.27,431.941,spkr4:,Mhm.,Mhm.,4.992,5.663,0.671
4,431.446,432.712,spkr1:,"First of all, we have this","First of all, we have this",5.168,6.434,1.266


In [7]:
def create_non_overlap_df(data_df):
    """
    data_df = pandas dataframe, output of the function speaker_designation_df
    return non_overlap_df : dataframe with non ovelapping speakers
    """
    df = data_df
    time_ranges = list(enumerate((zip(df.start_reset, df.stop_reset))))
    
    overlap_data = []
    for i in range(df.shape[0]):
        #print(i, end = ' ')
        idx, start_stop = time_ranges[i]
        start, stop = start_stop
        for j in range(df.shape[0]):
            if i!=j:
                jdx, start_stop__compare = time_ranges[j]
                start_c, stop_c = start_stop__compare

                if overlap(start, stop, start_c, stop_c):
                   # print(start, stop, start_c, stop_c)
                    overlap_data.append((i,j))
                    
                    
    non_overlap_idx = [i for i in range(df.shape[0]) if i not in list(set([item[0] for item in overlap_data]))]
    non_overlap_df = df.ix[non_overlap_idx]


    return non_overlap_df

In [8]:
create_non_overlap_df(speaker_designation_df(sph_path= glob.glob(file_name_header+'original_data/*.sph')[0],text_data_path = glob.glob(file_name_header+'original_data/*.txt')[0])).head(5)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Unnamed: 0,start,stop,speaker,spoken,spoken_clean,start_reset,stop_reset,duration
5,432.812,434.017,spkr1:,kinetic battery.,kinetic battery.,6.534,7.739,1.205
6,434.307,436.552,spkr1:,And this costs us three euros.,And this costs us three euros.,8.029,10.274,2.245
10,439.034,440.489,spkr1:,And %uh,And %uh,12.756,14.211,1.455
11,440.919,443.283,spkr1:,then if we give %uh --,then if we give %uh --,14.641,17.005,2.364
15,447.367,449.421,spkr1:,%uh %uh that would cost us four.,%uh %uh that would cost us four.,21.089,23.143,2.054


In [9]:
def get_speaker_attribution(non_overlap_df):
    """
    label utterence type
    "start" : this and the next segement is spoken by the same speaker
    "cont": the previous and  the next segement is spoken by the same speaker
    "end": the previous is spoken by this speaker, and  the next segement is spoken by another speaker
    """
    idx_spkr_duartion = list(zip(non_overlap_df.index,non_overlap_df.speaker, non_overlap_df.duration))
    speaker_attribution = []
    for i in range(len(idx_spkr_duartion)):
        idx_c, speaker_c, duration_c = idx_spkr_duartion[i]
        if i not in [0,len(idx_spkr_duartion)-1]:
            #print(i)


            idx_b, speaker_b, duration_b = idx_spkr_duartion[i+1]
            idx_a, speaker_a, duration_a = idx_spkr_duartion[i-1]
            same_b = speaker_c == speaker_b
            same_a = speaker_c == speaker_a


            if same_a == False and same_b == True:
                status = 'start'
            if same_a == True and same_b == False:
                status = 'end'
            if same_a == True and same_b == True:
                status = 'cont'
            if same_a == False and same_b == False:
                status = 'intj'




        if i == 0:
            status = 'start'

        if i == len(idx_spkr_duartion)-1:
            status = 'end'

        speaker_attribution.append((idx_c,speaker_c, duration_c,status))
        
        
    return speaker_attribution



    
    


In [10]:
get_speaker_attribution(create_non_overlap_df(speaker_designation_df(sph_path= glob.glob(file_name_header+'original_data/*.sph')[0],text_data_path = glob.glob(file_name_header+'original_data/*.txt')[0])))[:10]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


[(5, 'spkr1:', 1.205000000000041, 'start'),
 (6, 'spkr1:', 2.2450000000000045, 'cont'),
 (10, 'spkr1:', 1.455000000000041, 'cont'),
 (11, 'spkr1:', 2.363999999999976, 'cont'),
 (15, 'spkr1:', 2.0540000000000305, 'end'),
 (19, 'spkr2:', 0.3229999999999791, 'intj'),
 (20, 'spkr1:', 1.0319999999999823, 'start'),
 (23, 'spkr1:', 0.9569999999999936, 'cont'),
 (31, 'spkr1:', 0.4009999999999536, 'cont'),
 (32, 'spkr1:', 1.3020000000000778, 'cont')]

In [11]:
def cut_audio_files(non_overlap_df,sph_filepath,audio_outfile_dir):
    """
    cut and make audio files with single phrases
    non_overlap_df : pandas dataframe, out out of the funtion create_non_overlap_df
    sph_filepath: path to sph file
    audio_outfile_dir : path where the single phrases will be saved as wav files in as filder called 'single_phrases'
    
    returns list of tuples [(dataframe index, start time in sph file, stop time in sph file), ()]
    
    """
    cut_time_ranges = list(zip(non_overlap_df.index,non_overlap_df.start, non_overlap_df.stop))
    for idx_start_stop in cut_time_ranges:
        idx,start,stop = idx_start_stop
        #audiofilename = 'non_overlap_data_{}/single_phrases/{}_{}_{}_.wav'.format(file_info,str(idx), str(start), str(stop))
        #print(audiofilename)
        sph =SPHFile(sph_filepath)
        try:
            os.listdir(audio_outfile_dir+'single_phrases/')
        except:
            os.makedirs(audio_outfile_dir+'single_phrases/')
        audio_outfilename_mod = audio_outfile_dir+'single_phrases/{}_{}_{}_.wav'.format(str(idx),str(start)[:-2],str(stop)[:-2])
        
        #print(audio_outfilename_mod)
        #print('....-')
        sph.write_wav( audio_outfilename_mod, start,stop )

    return cut_time_ranges

In [12]:
nodf = create_non_overlap_df(speaker_designation_df(sph_path= glob.glob(file_name_header+'original_data/*.sph')[0],text_data_path = glob.glob(file_name_header+'original_data/*.txt')[0]))
cut_audio_files(nodf,glob.glob(file_name_header+'original_data/*.sph')[0],  file_name_header)[:10]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


[(5, 432.81199999999995, 434.017),
 (6, 434.30699999999996, 436.55199999999996),
 (10, 439.034, 440.48900000000003),
 (11, 440.91900000000004, 443.283),
 (15, 447.36699999999996, 449.421),
 (19, 454.103, 454.426),
 (20, 454.685, 455.717),
 (23, 463.745, 464.702),
 (31, 475.095, 475.496),
 (32, 475.87699999999995, 477.17900000000003)]

In [13]:
def multi_speaker_segments(speaker_attribution):
    
    """
    speaker_attribution: output of the function get_speaker_attribution()
    
    Creates segments with mutiple speakers
    returns : list with dataframe index number, spekaer label, duration and utterence type
    
    """
    
    
    
    multi_speaker_segment_data = []
    mid_list = []
    count = 0
    duration = 0
    for i in range(len(speaker_attribution)):


        #print(mid_list)

        idx,lab,dur,attrib = speaker_attribution[i]
        if attrib == 'end' or  attrib == 'start':
            count+=1
            #print(count,attrib,lab, end = '--->')
        if attrib in ['end', 'start', 'cont']:
            mid_list.append(speaker_attribution[i])
        if attrib in ['intj'] and dur > 5:
            #print('long_interjection',count,attrib,lab, end = '--->')
            mid_list.append(speaker_attribution[i])

        if attrib in ['intj'] and dur < 5 :
            idx_1m,lab_1m,dur_1m,attrib_1m = speaker_attribution[i-1]
            idx_1p,lab_1p,dur_1p,attrib_1p = speaker_attribution[i+1]

            if lab_1m == lab_1p:
                #print('!!!!!!',count,attrib,lab,dur, end = '<----')
                count =-2




        if count ==4:
            multi_speaker_segment_data.append(mid_list)
            mid_list = []
            count = 0
            #print(duration)
            #print('||')
            duration = 0
            
    return multi_speaker_segment_data
        

        


        
    

In [14]:
sprk_attr = get_speaker_attribution(create_non_overlap_df(speaker_designation_df(sph_path= glob.glob(file_name_header+'original_data/*.sph')[0],text_data_path = glob.glob(file_name_header+'original_data/*.txt')[0])))
multi_speaker_segments(sprk_attr)[:3]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


[[(5, 'spkr1:', 1.205000000000041, 'start'),
  (6, 'spkr1:', 2.2450000000000045, 'cont'),
  (10, 'spkr1:', 1.455000000000041, 'cont'),
  (11, 'spkr1:', 2.363999999999976, 'cont'),
  (15, 'spkr1:', 2.0540000000000305, 'end'),
  (20, 'spkr1:', 1.0319999999999823, 'start'),
  (23, 'spkr1:', 0.9569999999999936, 'cont'),
  (31, 'spkr1:', 0.4009999999999536, 'cont'),
  (32, 'spkr1:', 1.3020000000000778, 'cont'),
  (46, 'spkr1:', 5.286000000000001, 'end'),
  (63, 'spkr1:', 4.63799999999992, 'start'),
  (64, 'spkr1:', 3.830000000000041, 'end'),
  (79, 'spkr2:', 1.9440000000000737, 'start'),
  (80, 'spkr2:', 0.63799999999992, 'cont'),
  (81, 'spkr2:', 1.1779999999998836, 'end')],
 [(99, 'spkr2:', 0.8199999999999363, 'start'),
  (110, 'spkr2:', 0.8920000000000528, 'cont'),
  (111, 'spkr2:', 1.4149999999999636, 'cont'),
  (112, 'spkr2:', 1.5919999999998709, 'cont'),
  (113, 'spkr2:', 2.09599999999989, 'end'),
  (217, 'spkr4:', 2.0920000000000982, 'start'),
  (218, 'spkr4:', 0.8660000000000991, 'c

In [15]:



def join_files(list_file_names, out_file_name):
    """
    join multiple wav files
    """
    infiles = list_file_names
    outfile = out_file_name
    outsound = None
    for infile in infiles:
        sound = AudioSegment.from_wav(infile)
        if outsound == None:
            outsound = sound
        else:
            outsound = outsound+sound
            
    outsound.export(out_file_name, format="wav")
            
            


In [16]:
def create_artificial_segments(file_name_header,multi_speaker_segment_data):
    
    """
    cut and make audio files with mulpliple non overlapping phrases and speaker,
    and save them to a folder called 'artificial_segments'

    file_name_header: path where preprocessed data is stored
    
    multi_speaker_segment_data : output from the funtion multi_speaker_segments
    
       
    
    returns list of file paths where the artificial segments are saved on disk

    
    
    
    """
    
    
    
    relevant_file_names = [file_name_header+'single_phrases/'+item for item in os.listdir(file_name_header+'single_phrases/')]



    artificial_filenames = []
    segment_num = 0
    try:
        os.listdir(file_name_header+'artificial_segments/')
    except:
        os.makedirs(file_name_header+'artificial_segments/')

    for segment in multi_speaker_segment_data:
        list_file_names = []

        for ind_seg in segment:
            idx_info = ind_seg[0]
            for fname in relevant_file_names:
                if fname.startswith(file_name_header+'single_phrases/'+str(idx_info)+'_'):
                    #print(fname)
                    list_file_names.append(fname)
        artificial_filename =file_name_header+'artificial_segments/segment_'+str(segment_num)+'_.wav'
        join_files(list_file_names, artificial_filename)
        artificial_filenames.append(artificial_filename)
        #print(list_file_names)

        #print('-----')
        segment_num +=1
        
        
    return artificial_filenames

In [17]:
ot_dt = multi_speaker_segments(sprk_attr)
create_artificial_segments(file_name_header,ot_dt)

['/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_0_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_1_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_2_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_3_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_4_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_5_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_6_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_7_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_8_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_9_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_10_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segme

In [18]:

def prepare_audio_data(file_name_header,
                       sph_path,
                       text_data_path, 
                       cut_files =True,
                       save_df = False,
                       file_label = 'x'
                      ):
    """
    file_name_header: path where preprocessed data should be stored
    sph_path: path to sph file
    text_data_path: path to txt file with annotation 
    
    cut_files : if true, then files are cute, otherwise skipped
    
    returns: 
    df : dataframe with all speaker data
    non_overlap_df: dataframe with non overlapping speaker data
    speaker_attribution: list with dataframe index number, spekaer label, duration and utterence type
    cut_ranges: list of tuples of non overlapping speakers [(df index, start time in sph file, 
                stop time in sph file), ()]
    multi_speaker_segment_data: list with dataframe index number, spekaer label, duration and utterence type
                                of segments with multiple speakers
    
    artificial_filenames: list of file paths where the artificial multispeaker segments are saved on disk
    
    
    
    """

    df = speaker_designation_df(
                      sph_path=sph_path,
                      text_data_path=text_data_path
                     )
    non_overlap_df = create_non_overlap_df(df)
    speaker_attribution= get_speaker_attribution(non_overlap_df)
    
    if cut_files:

        cut_ranges = cut_audio_files(non_overlap_df =non_overlap_df,
                    sph_filepath =sph_path, audio_outfile_dir=file_name_header)
    else:
        cut_ranges = None

    multi_speaker_segment_data = multi_speaker_segments(speaker_attribution)
    artificial_filenames = create_artificial_segments(file_name_header,multi_speaker_segment_data)
    
    save_output = df, non_overlap_df,speaker_attribution,cut_ranges,multi_speaker_segment_data,artificial_filenames
    if save_df:
        try:
            os.listdir(file_name_header+'df_data/')
        except:
            os.makedirs(file_name_header+'df_data/')

        
        
        pickle.dump(save_output, open(file_name_header+'df_data/'+file_label+'.pkl','wb' ))
    
    return df, non_overlap_df,speaker_attribution,cut_ranges,multi_speaker_segment_data,artificial_filenames





In [20]:
df, non_overlap_df,speaker_attribution,cut_ranges,multi_speaker_segment_data,artificial_filenames = prepare_audio_data(file_name_header,sph_path= glob.glob(file_name_header+'original_data/*.sph')[0],text_data_path = glob.glob(file_name_header+'original_data/*.txt')[0], cut_files =True,save_df = True)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [21]:
df.head(5)

Unnamed: 0,start,stop,speaker,spoken,spoken_clean,start_reset,stop_reset,duration
0,426.278,427.293,spkr1:,Well.,Well.,0.0,1.015,1.015
1,426.289,426.879,spkr4:,Mhm.,Mhm.,0.011,0.601,0.59
2,427.576,431.446,spkr1:,This is what highlights the broad -- broadly t...,This is what highlights the broad -- broadly t...,1.298,5.168,3.87
3,431.27,431.941,spkr4:,Mhm.,Mhm.,4.992,5.663,0.671
4,431.446,432.712,spkr1:,"First of all, we have this","First of all, we have this",5.168,6.434,1.266


In [22]:
non_overlap_df.head(5)

Unnamed: 0,start,stop,speaker,spoken,spoken_clean,start_reset,stop_reset,duration
5,432.812,434.017,spkr1:,kinetic battery.,kinetic battery.,6.534,7.739,1.205
6,434.307,436.552,spkr1:,And this costs us three euros.,And this costs us three euros.,8.029,10.274,2.245
10,439.034,440.489,spkr1:,And %uh,And %uh,12.756,14.211,1.455
11,440.919,443.283,spkr1:,then if we give %uh --,then if we give %uh --,14.641,17.005,2.364
15,447.367,449.421,spkr1:,%uh %uh that would cost us four.,%uh %uh that would cost us four.,21.089,23.143,2.054


In [23]:
speaker_attribution[:5]

[(5, 'spkr1:', 1.205000000000041, 'start'),
 (6, 'spkr1:', 2.2450000000000045, 'cont'),
 (10, 'spkr1:', 1.455000000000041, 'cont'),
 (11, 'spkr1:', 2.363999999999976, 'cont'),
 (15, 'spkr1:', 2.0540000000000305, 'end')]

In [24]:
cut_ranges[:5]

[(5, 432.81199999999995, 434.017),
 (6, 434.30699999999996, 436.55199999999996),
 (10, 439.034, 440.48900000000003),
 (11, 440.91900000000004, 443.283),
 (15, 447.36699999999996, 449.421)]

In [25]:
multi_speaker_segment_data[:5]

[[(5, 'spkr1:', 1.205000000000041, 'start'),
  (6, 'spkr1:', 2.2450000000000045, 'cont'),
  (10, 'spkr1:', 1.455000000000041, 'cont'),
  (11, 'spkr1:', 2.363999999999976, 'cont'),
  (15, 'spkr1:', 2.0540000000000305, 'end'),
  (20, 'spkr1:', 1.0319999999999823, 'start'),
  (23, 'spkr1:', 0.9569999999999936, 'cont'),
  (31, 'spkr1:', 0.4009999999999536, 'cont'),
  (32, 'spkr1:', 1.3020000000000778, 'cont'),
  (46, 'spkr1:', 5.286000000000001, 'end'),
  (63, 'spkr1:', 4.63799999999992, 'start'),
  (64, 'spkr1:', 3.830000000000041, 'end'),
  (79, 'spkr2:', 1.9440000000000737, 'start'),
  (80, 'spkr2:', 0.63799999999992, 'cont'),
  (81, 'spkr2:', 1.1779999999998836, 'end')],
 [(99, 'spkr2:', 0.8199999999999363, 'start'),
  (110, 'spkr2:', 0.8920000000000528, 'cont'),
  (111, 'spkr2:', 1.4149999999999636, 'cont'),
  (112, 'spkr2:', 1.5919999999998709, 'cont'),
  (113, 'spkr2:', 2.09599999999989, 'end'),
  (217, 'spkr4:', 2.0920000000000982, 'start'),
  (218, 'spkr4:', 0.8660000000000991, 'c

In [26]:
artificial_filenames[:5]

['/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_0_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_1_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_2_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_3_.wav',
 '/Users/pari/University/DIAR_Project/DEMO_DATA/artificial_segments/segment_4_.wav']

# Audio file preprocessing done!
Start STEP_2