## Materials 
At the start, we have audio and annotated textgrids of **regilaul** songs, annotated for ictus/off-ictus and phrase text, then force-aligned using Praat's built in eSpeak forced aligner for Estonian to word and then segment. Then, we use the estnltk vabamorf package to syllabify the words so that we can annotate the textgrid further with syllable quantity (Estonian has 3) and whether or not it is accented at the word level. We end up with a dataframe containing the data from three of the(Interval) tiers of the textgrid, acquiring duration data for words, individual segments, and (eventually) syllables. 

In [59]:
import pandas as pd
import parselmouth
from estnltk.vabamorf.morf import syllabify_word
import tgt
import string
import unicodedata

#test method on a single TextGrid:
gridDir2 = "songs/txtgrids/09.TextGrid"




def syllShape(syll,quant):
    syll = unicodedata.normalize('NFC',syll)
    prenorms = ['e','o','õ','ö', 'i', 'a', 'u', 'ä', 'ü']
    vowels = []
    for v in prenorms: 
        vowels.append(unicodedata.normalize('NFC', v))

    shortC = ['b','g','d', 'j','l','r', 's', 'n','m', 'h']
    syllables = ['or','jal','ood','mull', 'muks', 'tan']
    onset = " "
    nucleus = " "
    coda = " "
    codapos = (len(syll) -1 )
    shape = " "
    #all onsets are either null or singleton
    if syll[0] in vowels: 
        onset = "V"
    else: 
        onset = "C"
    if quant == 1: 
        if syll[codapos] not in vowels: 
            coda = "C"
            nucleus = "V"
        else: 
            nucleus = "V"
            coda = ""
    if quant == 2: 
        if syll[codapos] in vowels:
            nucleus = "VV"
            coda = ""
        elif syll[codapos] not in shortC:
            nucleus = "V"
            coda = "CC"
        else:
            nucleus = "VV"
            coda = "C"
   
    if quant == 3: 
        if syll[codapos] in vowels:
            nucleus = "VV"
            coda = "VV"
        elif syll[codapos-1] not in vowels: 
                nucleus = "VV"
                coda = "CCC"
        elif syll[codapos] in shortC:
                nucleus = "VVV"
                coda = "C"
        else: 
            nucleus = "VV"
            coda = "VV"
    shape = onset + nucleus + coda


    return shape


def get_duration_labels(textgrid, wordTier,s1,s2,ictusTier):
    #tmp = codecs.open(textgrid,'r','utf-8')
    tmp = tgt.io.read_textgrid(textgrid)
    words = tmp.get_tier_by_name(wordTier)
    firstSyll = tmp.get_tier_by_name(s1)
    secSyll = tmp.get_tier_by_name(s2)
    ictus = tmp.get_tier_by_name(ictusTier)
    segments = []
    wordlist = words.intervals
    
    for interval in wordlist:
        onset = interval.start_time
        offset = interval.end_time
       # wordms = offset-onset
        word = interval.text
        syllablist = syllabify_word(word,as_dict=True)
        i = 0
        
        while i < len(syllablist): 
            if i >= 2: break
            tmpsy = syllablist[i] 
            ortho = tmpsy.get('syllable')
            ortho = ortho.strip(string.punctuation)
            q = tmpsy.get('quantity')
            a = tmpsy.get('accent')
            shape = syllShape(ortho,q)
            
            if i == 0: 
                tmpinterval = firstSyll.get_annotations_between_timepoints(onset,offset)
            elif i == 1: 
                tmpinterval = secSyll.get_annotations_between_timepoints(onset,offset)
           
            #skip syllables with no annotations in the analysis tiers
            if len(tmpinterval)==0: 
                i+= 1 
                break
            for vowel in tmpinterval:
              
                segment = vowel.text
                vOnset = vowel.start_time
                vOffset = vowel.end_time
                dur = vOffset-vOnset
                vMidpoint = vOffset - (dur/2)
                tmpick = ictus.get_annotations_by_time(vMidpoint)
                if len(tmpick) > 0 :
                    ick = tmpick[0].text
                else: ick = "off"
                row = (word,ortho,shape,i,segment,q,a,ick,dur,vMidpoint) 
                segments.append(row)
                
            i+= 1  
            
    nu_df = pd.DataFrame(segments,columns=["word","syll","shape","index","segment","quantity","stressed","ictus","duration","midpoint"])
    return nu_df
    

# syl_dur_df = get_duration_labels(gridDir2,"word","word/phon","ictus")
# syl_dur_df.head()

onetwo_df = get_duration_labels(gridDir2,"word","s1","s2","ictus")
onetwo_df



Unnamed: 0,word,syll,shape,index,segment,quantity,stressed,ictus,duration,midpoint
0,"Lõpe,",lõ,CVV,0,ɵ,2,1,ictus,0.221401,0.398469
1,"lõpe,",lõ,CVV,0,ɵ,2,1,ictus,0.208552,1.227423
2,"lõpe,",pe,CVV,1,e,2,0,x,0.293176,1.714267
3,"linakene,",li,CV,0,i,1,1,ictus,0.222958,2.04527
4,"linakene,",na,CVV,1,ɑ,2,0,x,0.312941,2.379412
5,kui,kui,CVVV,0,uiː,3,1,ictus,0.382281,5.463635
6,sa,sa,CVVV,0,a,3,1,x,0.165045,5.846221
7,"lõpe,",lõ,CVV,0,ɵ,2,1,ictus,0.216827,6.230085
8,siia,sii,CVV,0,iː,2,1,ictus,0.287818,7.058594
9,siia,a,V,1,ja,1,0,x,0.373816,7.389411


## Adding Spectral data
now that we have the duration data from the textgrid, we can query specific timepoints for information about the acoustic signal. The following function uses the midpoint (which we snagged while we were making the dataframe above) and get the first three formants(Hz) for each segment. 

In [60]:

import parselmouth

test = "songs/wavs_aligned/65.wav"

def get_formants(syl_dur_df, wave):
    song = parselmouth.Sound(wave)
    formant = song.to_formant_burg()
    f1 = []
    f2 = []
    for float in syl_dur_df.midpoint:
        time = float
        first = formant.get_value_at_time(1,time)
        f1.append(first)
        second = formant.get_value_at_time(2, time)
        f2.append(second)
    syl_dur_df["f1"] = f1
    syl_dur_df["f2"] = f2
    return syl_dur_df
nu_df = get_formants(onetwo_df,test)
nu_df.head()


Unnamed: 0,word,syll,shape,index,segment,quantity,stressed,ictus,duration,midpoint,f1,f2
0,"Lõpe,",lõ,CVV,0,ɵ,2,1,ictus,0.221401,0.398469,537.95586,2636.111467
1,"lõpe,",lõ,CVV,0,ɵ,2,1,ictus,0.208552,1.227423,376.720589,1061.124425
2,"lõpe,",pe,CVV,1,e,2,0,x,0.293176,1.714267,511.874473,1462.904048
3,"linakene,",li,CV,0,i,1,1,ictus,0.222958,2.04527,477.258475,1491.004187
4,"linakene,",na,CVV,1,ɑ,2,0,x,0.312941,2.379412,793.983102,1348.612283


In [22]:
nu_df['index'].unique()

array([0, 1])

In [61]:
from os.path import join
#runs a for loop over a directory using the above-specified functions

test = "songs/txtgrids"
songs = "songs/wavs_aligned"

for fn in os.listdir(test):
    if '.TextGrid' not in fn: 
        continue 
    n = fn.strip('.TextGrid') 
    wave = join(songs, n + '.wav')
    data_file = open(  n +"_nono.csv",'w')
    #make a dataframe with the interval tiers of the textgrid
    tmp = pd.DataFrame(get_duration_labels(join(test,fn), "word","s1","s2","ictus"))
    #add the formant data to the dataframe
    nu_df = get_formants(tmp,wave)
    #print(nu_df.head())
    nu_df.to_csv(data_file)
    data_file.close()

# Now we put it into a big pile!

Here we concatenate all the data we have so far into one large pandas dataframe. At this point, we can keep annotating songs for the corpus, and as textgrids are finished we can run the scripts above to add them into the larger dataset. We're also gonna take the opportunity to add some metadata to the dataframes: fileid(song) and performer initials as potential grouping factors. 

In [62]:

import os 
import pandas as pd 
import statsmodels.formula.api as smf
folder = "datum"
meta =  pd.read_csv("songs/song_metadata.csv")


songs_dfs = []
for fn in os.listdir(folder):
    if '.csv' not in fn: continue
    whole_name = os.path.join(folder,fn)
    song_df = pd.read_csv(whole_name)
    fileid1 = fn.strip('_nono.csv')
    fileid = int(fileid1)
    row = meta.index[meta['track'] == fileid].tolist()
    if len(row) !=0 :
        performer = meta.performer[row[0]]
    else: performer = "couldn't get match"
    for index in song_df:
        song_df['fileid'] = fileid
        song_df['performer'] = performer

        
    songs_dfs.append(song_df)

big_frame = pd.concat(songs_dfs, ignore_index=True)
big_frame.describe()


#move ictus-off replacement to here! 
##r_df.ictus = r_df.ictus.replace("x","off")
##r_df


# big_frame
#clean_frame = pd.DataFrame(big_frame[['quantity','stress','segment','seg_duration','ictus','euc','fileid','performer']])
#clean_frame.head()

big_frame

Unnamed: 0.1,Unnamed: 0,word,syll,shape,index,segment,quantity,stressed,ictus,duration,midpoint,f1,f2,fileid,performer
0,0,Kelle,kel,CVVC,0,e,2,1,ictus,0.159107,0.104965,469.306221,1315.902306,77,LO
1,1,Kelle,le,CV,1,e,1,0,off,0.280280,0.408230,603.945715,931.988824,77,LO
2,2,Meie,mei,CVV,0,ei,2,1,ictus,0.394162,2.673726,507.355863,1682.260595,77,LO
3,3,Metu,me,CVV,0,e,2,1,ictus,0.151071,5.727567,741.727764,1804.359021,77,LO
4,4,olid,o,V,0,o,1,1,off,0.087952,8.575493,509.045388,1044.143609,77,LO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753,223,"näl´lasandi,",la,CV,1,a,1,0,x,0.155594,214.189454,962.969957,1754.035377,18,MH
754,224,pal´lalt,pal´,CVCC,0,a,2,1,ictus,0.145970,217.878852,897.374205,1575.819287,18,MH
755,225,pal´lalt,lalt,CVCC,1,a,2,0,x,0.120242,218.173949,852.870930,1377.477503,18,MH
756,226,lasandi.,las,CVVVC,0,a,3,1,x,0.177864,218.751862,1036.242848,1643.483780,18,MH


In [63]:
big_frame['index'] = big_frame['index'].astype(object)
big_frame['quantity'] = big_frame['quantity'].astype(object)
big_frame['stressed'] = big_frame['stressed'].astype(object)
big_frame['shape'] = big_frame['shape'].astype(object)

big_frame['fileid'] = big_frame['fileid'].astype(object)

corpus_data = open('regilaul_vowels.csv','w')
big_frame.to_csv(corpus_data)
corpus_data.close()
big_frame.dtypes

Unnamed: 0      int64
word           object
syll           object
shape          object
index          object
segment        object
quantity       object
stressed       object
ictus          object
duration      float64
midpoint      float64
f1            float64
f2            float64
fileid         object
performer      object
dtype: object