## Materials 
At the start, we have audio and annotated textgrids of **regilaul** songs, annotated for ictus/off-ictus and phrase text, then force-aligned using Praat's built in eSpeak forced aligner for Estonian to word and then segment. Then, we use the estnltk vabamorf package to syllabify the words so that we can annotate the textgrid further with syllable quantity (Estonian has 3) and whether or not it is accented at the word level. We end up with a dataframe containing the data from three of the(Interval) tiers of the textgrid, acquiring duration data for words, individual segments, and (eventually) syllables. 

In [9]:
import pandas as pd
import parselmouth
from estnltk.vabamorf.morf import syllabify_word
import tgt
import string
import unicodedata

#test method on a single TextGrid:
gridDir2 = "/Users/sarah/Git/regilaul_project/songs/txtgrids/09.TextGrid"



def syllShape(syll):
    syll1 = unicodedata.normalize('NFC',syll)
    #remove punctuation for this method to avoid false CCs
    syll = syll1.strip(string.punctuation)
    prenorms = ['e','o','õ','ö', 'i', 'a', 'u', 'ä', 'ü']
    vowels = []
    for v in prenorms: 
        vowels.append(unicodedata.normalize('NFC', v))

    geminid = ['p','t','k']
    shape = []
    
  
    for index, item in enumerate(syll):
    #all onsets are either null or singleton
        if index == 0:
            if item not in vowels:
                shape.append("C")
            
            else: 
                shape.append("V")
     #after that, 'p,t,k' in orthography are geminate C,
     # others either vowels or short C 
        elif item in vowels:
            shape.append("V")
        elif item in geminid:
            shape.append("Cː")
        else:
            if item != '´':
                shape.append("C")
        
      
   

    shaped = "".join(shape)
    return shaped


def get_duration_labels(textgrid, wordTier,s1,s2,ictusTier):
    #tmp = codecs.open(textgrid,'r','utf-8')
    tmp = tgt.io.read_textgrid(textgrid)
    words = tmp.get_tier_by_name(wordTier)
    firstSyll = tmp.get_tier_by_name(s1)
    secSyll = tmp.get_tier_by_name(s2)
    ictus = tmp.get_tier_by_name(ictusTier)
    segments = []
    wordlist = words.intervals
    
    for interval in wordlist:
        onset = interval.start_time
        offset = interval.end_time
       # wordms = offset-onset
        word = interval.text
        syllablist = syllabify_word(word,as_dict=True)
        i = 0
        
        while i < len(syllablist): 
            if i >= 2: break
            tmpsy = syllablist[i] 
            ortho = tmpsy.get('syllable')
            ortho = ortho.strip(string.punctuation)
            q = tmpsy.get('quantity')
            a = tmpsy.get('accent')
            shape = syllShape(ortho)
            
            if i == 0: 
                tmpinterval = firstSyll.get_annotations_between_timepoints(onset,offset)
            elif i == 1: 
                tmpinterval = secSyll.get_annotations_between_timepoints(onset,offset)
           
            #skip syllables with no annotations in the analysis tiers
            if len(tmpinterval)==0: 
                i+= 1 
                break
            for vowel in tmpinterval:
              
                segment = vowel.text
                vOnset = vowel.start_time
                vOffset = vowel.end_time
                dur = vOffset-vOnset
                vMidpoint = vOffset - (dur/2)
                tmpick = ictus.get_annotations_by_time(vMidpoint)
                if len(tmpick) > 0 :
                    ick = tmpick[0].text
                else: ick = "off"
                row = (word,ortho,shape,i,segment,q,a,ick,dur,vMidpoint) 
                segments.append(row)
                
            i+= 1  
            
    nu_df = pd.DataFrame(segments,columns=["word","syll","shape","index","segment","quantity","stressed","ictus","duration","midpoint"])
    return nu_df
    








In [10]:
#test duration label method:

onetwo_df = get_duration_labels(gridDir2,"word","s1","s2","ictus")
onetwo_df


Unnamed: 0,word,syll,shape,index,segment,quantity,stressed,ictus,duration,midpoint
0,"Lõpe,",lõ,CV,0,ɵ,2,1,ictus,0.221401,0.398469
1,"lõpe,",lõ,CV,0,ɵ,2,1,ictus,0.208552,1.227423
2,"lõpe,",pe,CV,1,e,2,0,x,0.293176,1.714267
3,"linakene,",li,CV,0,i,1,1,ictus,0.222958,2.04527
4,"linakene,",na,CV,1,ɑ,2,0,x,0.312941,2.379412
5,kui,kui,CVV,0,uiː,3,1,ictus,0.382281,5.463635
6,sa,sa,CV,0,a,3,1,x,0.165045,5.846221
7,"lõpe,",lõ,CV,0,ɵ,2,1,ictus,0.216827,6.230085
8,siia,sii,CVV,0,iː,2,1,ictus,0.287818,7.058594
9,siia,a,V,1,ja,1,0,x,0.373816,7.389411


In [11]:
#test syllable shape method: 
syll = "kook"
syllShape(syll)

'CVVCː'

## Adding Spectral data
now that we have the duration data from the textgrid, we can query specific timepoints for information about the acoustic signal. The following function uses the midpoint (which we snagged while we were making the dataframe above) and get the first three formants(Hz) for each segment. 

In [12]:

import parselmouth

test = "/Users/sarah/Git/regilaul_project/songs/wavs_aligned/65.wav"

def get_formants(syl_dur_df, wave):
    song = parselmouth.Sound(wave)
    formant = song.to_formant_burg()
    f1 = []
    f2 = []
    for float in syl_dur_df.midpoint:
        time = float
        first = formant.get_value_at_time(1,time)
        f1.append(first)
        second = formant.get_value_at_time(2, time)
        f2.append(second)
    syl_dur_df["f1"] = f1
    syl_dur_df["f2"] = f2
    return syl_dur_df
nu_df = get_formants(onetwo_df,test)
nu_df.head()


Unnamed: 0,word,syll,shape,index,segment,quantity,stressed,ictus,duration,midpoint,f1,f2
0,"Lõpe,",lõ,CV,0,ɵ,2,1,ictus,0.221401,0.398469,537.95586,2636.111467
1,"lõpe,",lõ,CV,0,ɵ,2,1,ictus,0.208552,1.227423,376.720589,1061.124425
2,"lõpe,",pe,CV,1,e,2,0,x,0.293176,1.714267,511.874473,1462.904048
3,"linakene,",li,CV,0,i,1,1,ictus,0.222958,2.04527,477.258475,1491.004187
4,"linakene,",na,CV,1,ɑ,2,0,x,0.312941,2.379412,793.983102,1348.612283


In [13]:
def idealCoda(shape):
    onset = False
    open = False
    closed = False
    gemini = False 
    cplex = False 
    if shape[0] == "C":
        onset = True
    tail = len(shape - 1)
    if shape[tail] != "V": 
        closed = True
        
    
    return [ onset, open, closed, gemini, cplex]


def nucleus(segment):
    gemini = False
    diphthong = False 


In [14]:
from os.path import join
#runs a for loop over a directory using the above-specified functions

test = "/Users/sarah/Git/regilaul_project/songs/txtgrids"
songs = "/Users/sarah/Git/regilaul_project/songs/wavs_aligned"
datum = "/Users/sarah/Git/regilaul_project/songs/datum/"
for fn in os.listdir(test):
    if '.TextGrid' not in fn: 
        continue 
    n = fn.strip('.TextGrid') 
    wave = join(songs, n + '.wav')
    data = join(datum, n)
    data_file = open(data +".csv",'w')
    #make a dataframe with the interval tiers of the textgrid
    tmp = pd.DataFrame(get_duration_labels(join(test,fn), "word","s1","s2","ictus"))
    #add the formant data to the dataframe
    nu_df = get_formants(tmp,wave)
    #print(nu_df.head())
    nu_df.to_csv(data_file)
    data_file.close()

# Now we put it into a big pile!

Here we concatenate all the data we have so far into one large pandas dataframe. At this point, we can keep annotating songs for the corpus, and as textgrids are finished we can run the scripts above to add them into the larger dataset. We're also gonna take the opportunity to add some metadata to the dataframes: fileid(song) and performer initials as potential grouping factors. 

In [15]:

import os 
import pandas as pd 
import statsmodels.formula.api as smf
folder = "/Users/sarah/Git/regilaul_project/songs/datum"
meta =  pd.read_csv("/Users/sarah/Git/regilaul_project/songs/song_metadata.csv")


songs_dfs = []
for fn in os.listdir(folder):
    if '.csv' not in fn: continue
    whole_name = os.path.join(folder,fn)
    song_df = pd.read_csv(whole_name)
    fileid1 = fn.strip('.csv')
    fileid = int(fileid1)
    row = meta.index[meta['track'] == fileid].tolist()
    if len(row) !=0 :
        performer = meta.performer[row[0]]
    else: performer = "couldn't get match"
    for index in song_df:
        song_df['fileid'] = fileid
        song_df['performer'] = performer

        
    songs_dfs.append(song_df)

big_frame = pd.concat(songs_dfs, ignore_index=True)
#move ictus-off replacement to here! 
big_frame.ictus = big_frame.ictus.replace("x","off")

# big_frame.describe()





# big_frame
#clean_frame = pd.DataFrame(big_frame[['quantity','stress','segment','seg_duration','ictus','euc','fileid','performer']])
#clean_frame.head()

big_frame

Unnamed: 0.1,Unnamed: 0,word,syll,shape,index,segment,quantity,stressed,ictus,duration,midpoint,f1,f2,fileid,performer
0,0,sain,sain,CVVC,0,ai,3,1,ictus,0.217500,4.263002,714.071536,1129.944778,41,LK
1,1,"mal´lika,",mal´,CVC,0,a(i),2,1,off,0.179055,4.614738,946.021821,1448.348662,41,LK
2,2,"mal´lika,",li,CV,1,i,2,0,ictus,0.130760,4.936603,574.096119,1864.237985,41,LK
3,3,mal´likast,mal´,CVC,0,a(i),2,1,ictus,0.193353,5.891856,1075.429489,1570.329209,41,LK
4,4,mal´likast,li,CV,1,i,2,0,off,0.122714,6.186455,589.298998,1688.844658,41,LK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753,73,südant,dant,CVCCː,1,ɑ,3,1,off,0.172132,94.811813,654.258343,1112.070873,65,LK
754,74,sülle,sül,CVC,0,yl,2,1,ictus,0.336953,95.331866,640.080936,1706.830954,65,LK
755,75,sülle,le,CV,1,e,1,0,off,0.308579,95.744141,721.464844,1631.467177,65,LK
756,76,rabadaie.,ra,CV,0,a,1,0,ictus,0.333783,96.176488,935.850426,1478.448942,65,LK


In [16]:
big_frame['shape'].unique()

array(['CVVC', 'CVC', 'CV', 'CVCː', 'CVV', 'CVCːC', 'VCCC', 'V', 'CVVCː',
       'CVCC', 'VV', 'VC', 'VVC', 'CVCːCC', 'CVCCː', 'VCː', 'CCVC'],
      dtype=object)

In [25]:
codas_ct = pd.crosstab(big_frame['shape'],big_frame.quantity)
codas_ct

quantity,1,2,3
shape,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CCVC,0,1,0
CV,267,87,6
CVC,0,192,3
CVCC,0,3,0
CVCCː,0,7,5
CVCː,0,13,3
CVCːC,0,1,2
CVCːCC,0,1,0
CVV,0,81,5
CVVC,0,4,15


In [26]:
quant_ct = pd.crosstab(big_frame.quantity,big_frame.ictus )
quant_ct

ictus,ictus,off
quantity,Unnamed: 1_level_1,Unnamed: 2_level_1
1,90,192
2,268,160
3,28,20


In [29]:
stress_ct = pd.crosstab([big_frame.quantity, big_frame.stressed],big_frame.ictus)
stress_ct

Unnamed: 0_level_0,ictus,ictus,off
quantity,stressed,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,22,176
1,1,68,16
2,0,13,149
2,1,255,11
3,1,28,20


In [18]:
big_frame['index'] = big_frame['index'].astype(object)
big_frame['quantity'] = big_frame['quantity'].astype(object)
big_frame['stressed'] = big_frame['stressed'].astype(object)
big_frame['shape'] = big_frame['shape'].astype(object)

big_frame['fileid'] = big_frame['fileid'].astype(object)

corpus_data = open('regilaul_vowels.csv','w')
big_frame.to_csv(corpus_data)
corpus_data.close()
big_frame.dtypes

Unnamed: 0      int64
word           object
syll           object
shape          object
index          object
segment        object
quantity       object
stressed       object
ictus          object
duration      float64
midpoint      float64
f1            float64
f2            float64
fileid         object
performer      object
dtype: object