# Processing Yonaguni transcriptions for MFA forced aligner
- This script processes Yonaguni transcriptions from ELAN in the following ways:
    
    - Modifies the ELAN transcriptions .csv file into a dataframe with start and end times and MFA transcriptions for each annotation

    - Moves the transcriptions that have the target phonemes to a folder for alignment

    - Fills in the textgrids automatically with MFA friendly transcriptions

    - Builds a dictionary mapping the transcriptions the MFA can process
     
- Before starting this script, you should

    1) Have your ELAN file fully annotated. If you have empty annotations segmented out, fill them in with "xxx" for now
    
    2) In ELAN, export your .eaf file as a Praat .TextGrid and as a .csv file (Tab Delimited is the option under File)
    
    3) Load the whole .wav file as a LongSound File and the .TextGrid file you just exported into Praat 
    
    4) Make a clips folder and use the save_intervals_to_wav_sound_files.praat script (skip empty annotations) to cut the .wav file into the annotated chunks
    
    5) This should produce a folder with the clipped .wav files numbered 1 through however many annotations you have + a .txt file with all the annotations

- Now name the following folders

    - working_folder : the overall folder you are working with
    - clipped_folder : folder where you have the original clipped files from the script
    - target_folder : folder for the files with the target segments to be moved to
    
## You can scroll down to the bottom to see a screenshot of what the folders could look like

In [20]:
working_folder = 'YO0008c_yoneshiro_sosensuuhai_20130410'
clipped_folder = working_folder + '/clips'
target_folder = working_folder + '/targets-stereo'

In [21]:
from __future__ import nested_scopes

import pandas as pd
import glob, os
import numpy as np
import audiolabel as al
import re

from pathlib import Path

In [22]:
def multiple_replace(adict, text):
    '''Takes a dictionary and a string and replaces keys with values'''
    # Create a regular expression from all of the dictionary keys
    regex = re.compile("|".join(map(re.escape, adict.keys(  ))))
    
    # For each match, look up the corresponding value in the dictionary
    return regex.sub(lambda match: adict[match.group(0)], text)

# Read in file that we are managing

In [23]:
f = working_folder + '/' + working_folder + '.csv'

df = pd.read_csv(f, keep_default_na = False)

# Drop unnamed columns
df = df.drop([c for c in df.columns if 'Unnamed' in c], axis = 1)

df['fileno'] = df.index + 1
df['fileno'] = df['fileno'].apply(str)

# Add column for number corresponding to file number
df.head()

Unnamed: 0,Begin Time - msec,End Time - msec,Yonaguni,fileno
0,120865,127366,hata=nki m dama=nkí iti=ti (n tti) timunu sai ...,1
1,127800,133940,ndi=nki maru munu ni hamiru=ndí nta=ba unu ang...,2
2,134285,138166,da=gara tundiru munu dama=nki=ja hiranu=ti umi,3
3,139380,147611,aci-taburu=ndi taburu=du a=nga ata=ba umi udi ...,4
4,148651,157866,ja ubuda=nga nda=ja ba-nta=ja=ju ttui=ja h-i=d...,5


In [24]:
if 'YS_Transcription-txt-rys' in df.columns:
    df = df.rename(columns = {'YS_Transcription-txt-rys':'Yonaguni'})

# Only want files that begin with these phonemes

In [25]:
targets = ['p', 't', 'k', 'c', 'n', 'm', 'ts']

In [26]:
def target_first(sentence):
    '''Breaks up the sentence by spaces and looks at the first letter for each word'''
    # Get rid of Japanese code switching or uncertainties (parentheses)
    sentence_nojp = re.sub('<[^(.+>.+<)]+>', '', sentence)
    sentence_nojp = re.sub('\([^(.+\).+\()]+\)', '', sentence_nojp)
    
    # Look at the first letter in each word. Ignore n
    first_letters = [w[0] for w in sentence_nojp.split() if len(w) > 1]
    # And first 2, in case ts
    first_digraph = [w[0:2] for w in sentence_nojp.split() if len(w) > 1]
    
    # Only take those that are overlapped
    inter = set.intersection(set(first_letters), set(targets))
    
    return(len(inter) > 0)

### Apply function to subset dataframe to just those tokens that contain our targets

In [27]:
# Subset by taking only those that don't end up being zero
target_df = df[df['Yonaguni'].apply(lambda s: target_first(s))].reset_index(drop = True)

print(len(target_df))

target_df.head()

28


Unnamed: 0,Begin Time - msec,End Time - msec,Yonaguni,fileno
0,120865,127366,hata=nki m dama=nkí iti=ti (n tti) timunu sai ...,1
1,127800,133940,ndi=nki maru munu ni hamiru=ndí nta=ba unu ang...,2
2,134285,138166,da=gara tundiru munu dama=nki=ja hiranu=ti umi,3
3,139380,147611,aci-taburu=ndi taburu=du a=nga ata=ba umi udi ...,4
4,148651,157866,ja ubuda=nga nda=ja ba-nta=ja=ju ttui=ja h-i=d...,5


### Move files to target folder

In [28]:
try: os.mkdir(target_folder)
except FileExistsError: pass

for f in sorted(glob.glob(clipped_folder + '/*.wav')):
    stem = Path(f).stem
    if stem in target_df['fileno'].values:
        os.rename(f, target_folder + '/' + stem + '.wav')

### Make Yonaguni transcription MFA friendly

In [29]:
to_mfa = {
    '1': '',
    '=': '',
    '-': '',
    '\(': '',
    '\)': '',
    '\?': '',
    '\.': ''
}

target_df['mfa'] = target_df['Yonaguni'].replace(to_mfa, regex = True)

We need to mark Japanese words so they won't be analyzed incorrectly as Yonaguni
- Use regex to search for everything between < >
- We will mark each Japanese word as beginning with "q" (MFA doesn't like symbols like < >) so that we can later filter them out

In [30]:
def mark_jp(sentence):
    '''Takes sentence and looks for words between <> and adds q in front of each word'''
    
    # Find all instances of codeswitching
    # [^(.+>.+<)]+ lets us get all instances if there is multiple codeswitching
    matches = [m for m in re.finditer('<[^(.+>.+<)]+>', sentence)]
    
    # Then for parentheses
    matches.extend([m for m in re.finditer('\([^(.+\).+\()]+\)', sentence)])
    
    if len(matches) == 0:
        
        return(sentence)
    
    else:
    
        # Initialize variable to keep track of how far into the word we are and our new sentence
        i = 0
        new_s = ''
        
        # Replace matches with q at the beginning of each word, so we know what the Japanese words are
        for m in matches:
            
            # Shows what the code-switched segments are
            print(m.group())

            # Break up code-switched sentence into words
            code_switch = m.group()[1:-1].split()

            # Add q to beginning of every word
            jp_marked = ' '.join(['q' + w for w in code_switch])
            
            # Conjoin to form new sentence
            new_s += sentence[i:m.span()[0]] + jp_marked

            # Update index
            i = m.span()[1]

        new_s += sentence[i:]
        
        return(new_s)

In [31]:
# Sanity check
target_df['mfa'].apply(lambda s: mark_jp(s))

<ttara nibanmeedakara>
<genki>
<kaza>
<butsudan>
<daitai>
<niban>
<iciban>
<niban>
<kaza>


0       hatanki m damankí ititi n tti timunu sai sutaja
1     ndinki maru munu ni hamirundí ntaba unu angami...
2            dagara tundiru munu damankija hiranuti umi
3     acitaburundi taburudu anga ataba umi udi anbir...
4     ja ubudanga ndaja bantajaju ttuija hidu macind...
5     utu nniti busa busa ndutasi nagudangadu naguda...
6                        nagudanga magitaba nagudabagai
7     damanki tui suta mata unu nuguru ttunu utudant...
8     nagudanga tui sutaba timunu sai sutaba danki siti
9     unindungadu nagudaja timunu ndu namakidu citi ...
10    ntaba ijanga ja ija abutanga n o abuta abuta m...
11    haatu maihuna angaminta atattumuti usi marumun...
12    timunu mun n mujaninuti inamunundi umuiti na u...
13    badija bungadu unu ki uni hadimiti damagara tu...
14    abutanga tuiti kkurunu naganki icin iriti buta...
15                   abutaja ibi matiti hananki naitima
16    turitaba angamiti buuru dusi isiti abutaja ma ...
17          qgenki minungara ndi maihuna maririj

In [32]:
target_df['mfa'] = target_df['mfa'].apply(lambda s: mark_jp(s))

target_df.to_csv(working_folder + '/' + working_folder + '-targets.csv', index = False)

target_df.head()

<ttara nibanmeedakara>
<genki>
<kaza>
<butsudan>
<daitai>
<niban>
<iciban>
<niban>
<kaza>


Unnamed: 0,Begin Time - msec,End Time - msec,Yonaguni,fileno,mfa
0,120865,127366,hata=nki m dama=nkí iti=ti (n tti) timunu sai ...,1,hatanki m damankí ititi n tti timunu sai sutaja
1,127800,133940,ndi=nki maru munu ni hamiru=ndí nta=ba unu ang...,2,ndinki maru munu ni hamirundí ntaba unu angami...
2,134285,138166,da=gara tundiru munu dama=nki=ja hiranu=ti umi,3,dagara tundiru munu damankija hiranuti umi
3,139380,147611,aci-taburu=ndi taburu=du a=nga ata=ba umi udi ...,4,acitaburundi taburudu anga ataba umi udi anbir...
4,148651,157866,ja ubuda=nga nda=ja ba-nta=ja=ju ttui=ja h-i=d...,5,ja ubudanga ndaja bantajaju ttuija hidu macind...


# Use audiolabel to fill TextGrids with MFA transcription
### First create TextGrids for all the files with the TextGridMaker Praat script
- Use the TextGridMaker.praat script on the target folder
- If the annotations are right after running this cell, replace the empty TextGrids

In [33]:
# Make directory for filled TextGrids if it does not exist
try: os.mkdir(target_folder + '/filled')
except FileExistsError: pass
    
for f in sorted(glob.glob(target_folder + '/*.TextGrid')):
    stem = Path(f).stem
    
    # Get sentence from targets dataframe
    sentence = target_df.loc[target_df['fileno'] == stem, 'mfa'].values[0]
    
    # Read TextGrid and replace text
    with open(target_folder + '/' + stem + '.TextGrid', 'r') as f:
        replace = re.sub('text = ".*"', 'text = "' + sentence + '"', f.read())
    
    # Print to new file
    with open(target_folder + '/filled/' + stem + '.TextGrid', 'w') as w:
        w.write(replace)

### Make dictionary for MFA to phonemes

In [34]:
mfa_digraph = {
    'si': 'SH IY1',
    'sj': 'SH Y',
    'zi': 'JH Y',
    'zj': 'JH Y',
}

mfa_C = {
    'q': '',
    'ng': 'NG',
    'nk': 'NG K',
    'nm': 'M',
    'nn': 'N',
    'np': 'M P',
    'nb': 'M B',
    'j': 'Y',
    'c': 'CH',
    'kk': 'K',
    'tt': 'T',
    'h': 'HH'
}

mfa_VV = {
    'aa': 'AA1',
    'ee': 'EY1',
    'ii': 'IY1',
    'oo': 'OW1',
    'uu': 'UW1'
}

mfa_V = {
    'a': 'AA1',
    'e': 'EY1',
    'i': 'IY1',
    'o': 'OW1',
    'u': 'UW1'
}

# Add spaces on either end
for d in [mfa_digraph, mfa_C, mfa_VV, mfa_V]:
    for k, v in d.items():
        d[k] = ' ' + v + ' '

### Function for turning words to MFA pronunciation

In [35]:
def to_mfa(word):
    '''Turns word to MFA pronunciation'''

    for d in [mfa_digraph, mfa_C, mfa_VV, mfa_V]:
        word = (multiple_replace(d, word))
    
    mfa = ''
    
    for l in word:
        if l.islower():
            mfa += ' ' + l.upper() + ' '
        else:
            mfa += l
    
    return(' '.join(mfa.split()))

### Collect the unique words that show up in the data

In [36]:
# # IF WE NEED TO REREAD THE TARGET FILES

# for folder in sorted(next(os.walk('.'))[1]):
#     if folder != '.ipynb_checkpoints':
#         target_df = pd.read_csv(folder + '/' + folder + '-targets.csv', keep_default_na = False)
        
#         words = []

#         for s in target_df['mfa']:
#             for w in s.split():
#                 if w not in words: words.append(w)

#         #print(sorted(words))

#         with open(folder + '/dictionary.txt', 'w') as f:
#         #with open(working_folder + '/dictionary.txt', 'w') as f:

#             for w in sorted(words): 
#                 f.write(w + '  ' + to_mfa(w) + '\n')

In [37]:
words = []

for s in target_df['mfa']:
    for w in s.split():
        if w not in words: words.append(w)
            
print(sorted(words))

['abuta', 'abutaja', 'abutajama', 'abutandi', 'abutanga', 'abutanki', 'acitaburundi', 'ai', 'anbi', 'anbiru', 'anga', 'angaminta', 'angamintaja', 'angamintangadu', 'angamiti', 'angamitintaja', 'angamitintanga', 'arungara', 'ataba', 'atara', 'atattumuti', 'atingai', 'atingaidu', 'badija', 'banta', 'bantaja', 'bantajaju', 'banu', 'budu', 'bungadu', 'buru', 'burundi', 'burungarandi', 'buruta', 'burutaatingai', 'burutasi', 'busa', 'butaru', 'butaruatingai', 'buuru', 'buurusi', 'cidi', 'citi', 'da', 'daa', 'dagara', 'damagara', 'damanki', 'damankija', 'damankí', 'dani', 'danidu', 'danki', 'din', 'dugui', 'dunannu', 'dusabinki', 'dusi', 'haatu', 'hadimiti', 'hai', 'hamirundí', 'hamirungarajo', 'hananki', 'hatanki', 'hataratidu', 'hataratiti', 'hidu', 'hiranuti', 'hjuru', 'ibi', 'icin', 'ija', 'ijanga', 'inamunundi', 'iriti', 'irumunujandi', 'isiti', 'ititi', 'ja', 'kaisi', 'ki', 'kidu', 'kinu', 'kirarirundi', 'kiti', 'kkuiti', 'kkuru', 'kkurunu', 'kunutaba', 'm', 'ma', 'maa', 'maasiku', 'mab

### Create dictionary

In [38]:
with open(working_folder + '/dictionary.txt', 'w') as f:

    for w in sorted(words): 
        f.write(w + '  ' + to_mfa(w) + '\n')

# Next steps
- The last prep work to be done for MFA alignment is isolating one channel (MFA doesn't deal well with two-channel files)
- You can run the shell script extractchannel.sh in the command line to extract one channel (extract the channel where the speaker's voice is louder). The usage is in the script, but is repeated here

        bash extractchannel.sh folder channel
        
        
- Make sure the correct channel was extracted, then move the original target .wav files back to the clipped_folder. Then move these mono files to the target_folder
- You can now run the MFA! See usage in mfa_usage.txt
- After alignment, you can move the unaligned files into a folder called "unaligned"

# Layout

Below is a screenshot of how my folder layout looks after MFA realignment.

- aligned : folder with target .wav files and MFA force aligned .TextGrids
- clips : folder with original clipped files
- unaligned : folder with target .TextGrids before forced alignment

- dictionary.txt : dictionary for MFA forced alignment

- [FILENAME].csv : .csv file containing metadata
- [FILENAME].TextGrid : Praat .TextGrid extracted from ELAN .eaf file

<img src='folderlayout.png'>

## Import read_label from audiolabel to get TextGrid time info

In [279]:
from audiolabel import read_label

### Get preceding and following words for each file

In [280]:
def get_contexts(word_data, phone_data):

    '''Takes a word and phone dataframe and updates it for context'''
    
    # Word context
    
    prev_words = ['sentence_start']
    prev_word_starts = ['sentence_start']
    prev_word_ends = ['sentence_start']
    
    next_words = []
    next_word_starts = []
    next_word_ends = []
    
    # Times for previous

    for i in range(0, len(word_data)):
        
        if i != 0:
            prev_words.append(word_data.loc[i - 1, 'label'])
            prev_word_starts.append(word_data.loc[i - 1, 't1'])
            prev_word_ends.append(word_data.loc[i - 1, 't2'])
            
        if i != len(word_data) - 1:
            next_words.append(word_data.loc[i + 1, 'label'])
            next_word_starts.append(word_data.loc[i + 1, 't1'])
            next_word_ends.append(word_data.loc[i + 1, 't2'])

    next_words.append('sentence_end')
    next_word_starts.append('sentence_end')
    next_word_ends.append('sentence_end')

    # Now add lists to datasets
    word_data['prev_word'] = prev_words
    word_data['prev_word_start'] = prev_word_starts
    word_data['prev_word_end'] = prev_word_ends
    
    word_data['next_word'] = next_words
    word_data['next_word_start'] = next_word_starts
    word_data['next_word_end'] = next_word_ends
    
    # Get sentence and get rid of extra whitespaces
    sentence = ' '.join((' '.join(word_data['label'].values)).split())
    
    word_data['sentence'] = sentence
    
    ###
    
    phone_data['word'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['label'].item(), axis = 1)
    
    phone_data['prev_word'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['prev_word'].item(), axis = 1)
    phone_data['prev_word_start'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['prev_word_start'].item(), axis = 1)
    phone_data['prev_word_end'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['prev_word_end'].item(), axis = 1)
    
    phone_data['next_word'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['next_word'].item(), axis = 1)
    phone_data['next_word_start'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['next_word_start'].item(), axis = 1)
    phone_data['next_word_end'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['next_word_end'].item(), axis = 1)
    
    phone_data['word_start'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['t1'].item(), axis = 1)
    phone_data['word_end'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['t2'].item(), axis = 1)
    
    phone_data['sentence'] = sentence
    
    # Phone context

    prev_phons = ['sentence_start']
    prev_phon_starts = ['sentence_start']
    prev_phon_ends = ['sentence_start']
    
    next_phons = []
    next_phon_starts = []
    next_phon_ends = []

    for i in range(0, len(phone_data)):
        
        if i != 0:
            prev_phons.append(phone_data.loc[i - 1, 'label'])
            prev_phon_starts.append(phone_data.loc[i - 1, 't1'])
            prev_phon_ends.append(phone_data.loc[i - 1, 't2'])
            
        if i != len(phone_data) - 1:
            next_phons.append(phone_data.loc[i + 1, 'label'])
            next_phon_starts.append(phone_data.loc[i + 1, 't1'])
            next_phon_ends.append(phone_data.loc[i + 1, 't2'])
            
    next_phons.append('sentence_end')
    next_phon_starts.append('sentence_end')
    next_phon_ends.append('sentence_end')

    phone_data['prev_phon'] = prev_phons
    phone_data['prev_phon_start'] = prev_phon_starts
    phone_data['prev_phon_end'] = prev_phon_ends
    
    phone_data['next_phon'] = next_phons
    phone_data['next_phon_start'] = next_phon_starts
    phone_data['next_phon_end'] = next_phon_ends
    
    return(word_data, phone_data)

In [281]:
word_df = pd.DataFrame()
phone_df = pd.DataFrame()

for f in glob.glob('*/aligned/*.TextGrid'):
    
    [sub_wdf, sub_pdf] = read_label(f, ftype = 'praat')
    
    [updated_wdf, updated_pdf] = get_contexts(sub_wdf, sub_pdf)
    
    word_df = pd.concat([word_df, updated_wdf])
    phone_df = pd.concat([phone_df, updated_pdf])

In [282]:
# Add phoneme and first syllable info

# Define syllable
def get_firstsyll(word):

    vowels = ['a', 'i', 'u', 'e', 'o']

    syll = ''

    # Flags for onset, nucleus and coda
    onset = 'incomplete'
    nucleus = 'incomplete'

    # If word is 2 for fewer letters long, just take it
    if len(word) <= 2:

        return(word)

    # Keep moving i until we find a vowel
    i = 0

    # If first sound is vowel, no longer in onset
    if word[0] in vowels:
        onset = 'complete'

    while onset != 'complete':

        i += 1

        syll += word[i-1]

        if word[i] in vowels: 

            onset = 'complete'

    # Look for end of word or coda
    while nucleus != 'complete':

        i += 1

        syll += word[i-1]

        if i == len(word):

            return(syll)

        if word[i] not in vowels:

            nucleus = 'complete'

    # If sound is n
    if word[i] == 'n':

        i += 1

        # If whole word, then return
        if i == len(word):

            return(word)

        # If next sound is consonant, add
        if word[i] not in vowels:

            syll += word[i-1]

    return(syll)

###

def findonset(word):
    
    nuclei = ['a', 'e', 'i', 'o', 'u', 'w', 'j']
    
    onset = ''
    
    for phone in word:
        
        if phone not in nuclei:
            
            onset += phone
        
        else: return(onset)

In [283]:
phone_df['onset'] = phone_df['word'].apply(lambda w: findonset(w))
phone_df['first_syll'] = phone_df['word'].apply(lambda w: get_firstsyll(w))

In [284]:
len(phone_df)

36169

In [285]:
phone_df.head()

Unnamed: 0,t1,t2,label,fname,word,prev_word,prev_word_start,prev_word_end,next_word,next_word_start,...,word_end,sentence,prev_phon,prev_phon_start,prev_phon_end,next_phon,next_phon_start,next_phon_end,onset,first_syll
0,0.0,0.06,sil,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,,sentence_start,sentence_start,sentence_start,unu,0.06,...,0.06,unu ttuja nnidu bunsuja,sentence_start,sentence_start,sentence_start,UW1,0.06,0.18,,
1,0.06,0.18,UW1,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,unu,,0,0.06,ttuja,0.35,...,0.35,unu ttuja nnidu bunsuja,sil,0,0.06,N,0.18,0.22,,u
2,0.18,0.22,N,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,unu,,0,0.06,ttuja,0.35,...,0.35,unu ttuja nnidu bunsuja,UW1,0.06,0.18,UW1,0.22,0.35,,u
3,0.22,0.35,UW1,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,unu,,0,0.06,ttuja,0.35,...,0.35,unu ttuja nnidu bunsuja,N,0.18,0.22,T,0.35,0.45,,u
4,0.35,0.45,T,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,ttuja,unu,0.06,0.35,,0.86,...,0.86,unu ttuja nnidu bunsuja,UW1,0.22,0.35,UW1,0.45,0.49,tt,ttu


In [286]:
phone_df['onset'].value_counts()

k      4051
m      3819
       3549
b      3200
n      1976
h      1741
t      1679
c      1227
d      1188
tt      983
q       963
s       778
nn      652
nd      414
nt      317
kk      309
qk      309
nm      297
ts      233
g       200
ngg     169
qt      161
p       154
qs      141
nb      116
ns      114
qn       99
ss       60
nk       56
qm       56
qh       43
qc       41
qts      34
qb       30
ng       20
qz       18
qr       12
qy       11
mb       11
qtt      10
nts      10
pp        9
cc        3
qnn       3
qd        2
Name: onset, dtype: int64

In [287]:
word_df.to_csv('words.csv', index = False)
phone_df.to_csv('phones.csv', index = False)

# Get target words

In [288]:
target_words = word_df[(word_df['label'].str.len() > 1) & (word_df['label'].str.contains('^[tkcpmn(ts)]'))]

target_phones_all = phone_df[(phone_df['word'].str.len() > 1) & (phone_df['word'].str.contains('^[tkcpmn(ts)]')) & (phone_df['label'].isin(['T', 'K', 'CH', 'P', 'M', 'N'])) & (phone_df['word_start'] == phone_df['t1'])]

target_phones_all['onset'] = target_phones_all['word'].apply(lambda w: findonset(w))
target_phones_all['first_syll'] = target_phones_all['word'].apply(lambda w: get_firstsyll(w))

# Reorder columns
col_order = ['fname', 'onset', 'label', 't1', 't2',
             'word', 'word_start', 'word_end',
             'first_syll', 'sentence',
             'prev_phon', 'prev_phon_start', 'prev_phon_end', 'next_phon', 'next_phon_start', 'next_phon_end', 
             'prev_word', 'prev_word_start', 'prev_word_end', 'next_word', 'next_word_start', 'next_word_end'
             ]

target_phones_all = target_phones_all[col_order]

# Rename t1 and t2 to more useful names
target_phones_all = target_phones_all.rename(columns = {'t1':'phon_start', 't2':'phon_end'})

# Isolate phones that are preceded by vowels
target_phones = target_phones_all[target_phones_all['prev_phon'].isin(['AA1', 'IY1', 'UW1', 'EY1', 'OW1', 'N'])].reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [289]:
print(len(target_phones))

target_phones.head()

886


Unnamed: 0,fname,onset,label,phon_start,phon_end,word,word_start,word_end,first_syll,sentence,...,prev_phon_end,next_phon,next_phon_start,next_phon_end,prev_word,prev_word_start,prev_word_end,next_word,next_word_start,next_word_end
0,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,tt,T,0.35,0.45,ttuja,0.35,0.86,ttu,unu ttuja nnidu bunsuja,...,0.35,UW1,0.45,0.49,unu,0.06,0.35,,0.86,2.55
1,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,nn,N,0.97,1.21,nniburu,0.97,1.62,nni,qterebidu nniburu,...,0.97,IY1,1.21,1.3,qterebidu,0.38,0.97,,1.62,1.905
2,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,tt,T,0.69,0.76,ttutu,0.69,1.06,ttu,unu ttutu kunu ttutuja qttangadangara nni bunsuja,...,0.69,UW1,0.76,0.85,unu,0.38,0.69,kunu,1.06,1.37
3,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,k,K,1.06,1.16,kunu,1.06,1.37,ku,unu ttutu kunu ttutuja qttangadangara nni bunsuja,...,1.06,UW1,1.16,1.23,ttutu,0.69,1.06,ttutuja,1.37,2.03
4,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,tt,T,1.37,1.45,ttutuja,1.37,2.03,ttu,unu ttutu kunu ttutuja qttangadangara nni bunsuja,...,1.37,UW1,1.45,1.53,kunu,1.06,1.37,,2.03,2.32


In [290]:
print(len(target_phones_all))

target_phones_all.head()

2863


Unnamed: 0,fname,onset,label,phon_start,phon_end,word,word_start,word_end,first_syll,sentence,...,prev_phon_end,next_phon,next_phon_start,next_phon_end,prev_word,prev_word_start,prev_word_end,next_word,next_word_start,next_word_end
4,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,tt,T,0.35,0.45,ttuja,0.35,0.86,ttu,unu ttuja nnidu bunsuja,...,0.35,UW1,0.45,0.49,unu,0.06,0.35,,0.86,2.55
9,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,nn,N,2.55,2.79,nnidu,2.55,3.03,nni,unu ttuja nnidu bunsuja,...,2.55,IY1,2.79,2.9,,0.86,2.55,bunsuja,3.03,3.86
9,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,nn,N,0.97,1.21,nniburu,0.97,1.62,nni,qterebidu nniburu,...,0.97,IY1,1.21,1.3,qterebidu,0.38,0.97,,1.62,1.905
4,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,tt,T,0.69,0.76,ttutu,0.69,1.06,ttu,unu ttutu kunu ttutuja qttangadangara nni bunsuja,...,0.69,UW1,0.76,0.85,unu,0.38,0.69,kunu,1.06,1.37
8,YO0020c_die_see_seem-yoneshiro-20140709/aligne...,k,K,1.06,1.16,kunu,1.06,1.37,ku,unu ttutu kunu ttutuja qttangadangara nni bunsuja,...,1.06,UW1,1.16,1.23,ttutu,0.69,1.06,ttutuja,1.37,2.03


In [163]:
target_phones.to_csv('target_phones.csv', index = False)

In [291]:
target_phones_all.to_csv('target_phones_all.csv', index = False)

In [292]:
target_phones['onset'].value_counts()

m     277
k     188
n     122
t      97
tt     56
nn     43
nd     20
c      19
nt     16
nm     16
ns     10
kk      8
ts      5
nb      4
p       3
mb      1
pp      1
Name: onset, dtype: int64

In [293]:
target_phones_all['onset'].value_counts()

k      653
m      597
n      345
t      288
c      254
tt     241
nn     142
nd      81
nm      62
kk      55
nt      45
ts      35
nb      22
p       22
ns      17
mb       1
cc       1
pp       1
nts      1
Name: onset, dtype: int64

# Check counts

In [294]:
targets_all.groupby(['onset'])['word'].value_counts().reset_index(name = 'count').to_csv('word_count_all.csv', index = False)

In [296]:
target_phones.groupby(['onset'])['word'].value_counts().reset_index(name = 'count').to_csv('word_count.csv', index = False)

In [298]:
target_phones.groupby(['onset'])['first_syll'].value_counts().reset_index(name = 'count').to_csv('first_syll_count.csv', index = False)

In [299]:
targets_all.groupby(['onset'])['first_syll'].value_counts().reset_index(name = 'count').to_csv('first_syll_count_all.csv', index = False)

# Matlab

In [169]:
import numpy as np
from scipy.io import loadmat

In [171]:
mat = loadmat('YoneshiroTable.mat')

In [193]:
mdata = mat['None']
mdtype = mdata.dtype

mdtype

[n for n in mdtype.names]

['s0', 's1', 's2', 'arr']

In [213]:
mdata['s0']

MatlabOpaque([b'tbl'], dtype=object)

In [214]:

# * SciPy reads in structures as structured NumPy arrays of dtype object
# * The size of the array is the size of the structure array, not the number
#   elements in any particular field. The shape defaults to 2-dimensional.
# * For convenience make a dictionary of the data using the names from dtypes
# * Since the structure has only one element, but is 2-D, index it at [0, 0]
ndata = {n: mdata[n][0] for n in mdtype.names}

In [220]:
# Reconstruct the columns of the data table from just the time series
# Use the number of intervals to test if a field is a column or metadata
columns = [n for n, v in ndata.items()] #if v.size == ndata['numIntervals']]


In [221]:
# now make a data frame, setting the time stamps as the index
df = pd.DataFrame(np.concatenate([ndata[c] for c in columns], axis=1),
                  index=[datetime(*ts) for ts in ndata['timestamps']],
                  columns=columns)

ValueError: zero-dimensional arrays cannot be concatenated