# Processing Yonaguni transcriptions for MFA forced aligner
- This script processes Yonaguni transcriptions from ELAN in the following ways:
    
    - Modifies the ELAN transcriptions .csv file into a dataframe with start and end times and MFA transcriptions for each annotation

    - Moves the transcriptions that have the target phonemes to a folder for alignment

    - Fills in the textgrids automatically with MFA friendly transcriptions

    - Builds a dictionary mapping the transcriptions the MFA can process
     
- Before starting this script, you should

    1) Have your ELAN file fully annotated. If you have empty annotations segmented out, fill them in with "xxx" for now
    
    2) In ELAN, export your .eaf file as a Praat .TextGrid and as a .csv file (Tab Delimited is the option under File)
    
    3) Load the whole .wav file as a LongSound File and the .TextGrid file you just exported into Praat 
    
    4) Make a clips folder and use the save_intervals_to_wav_sound_files.praat script (skip empty annotations) to cut the .wav file into the annotated chunks
    
    5) This should produce a folder with the clipped .wav files numbered 1 through however many annotations you have + a .txt file with all the annotations

- Now name the following folders

    - working_folder : the overall folder you are working with
    - clipped_folder : folder where you have the original clipped files from the script
    - target_folder : folder for the files with the target segments to be moved to
    
## You can scroll down to the bottom to see a screenshot of what the folders could look like

In [20]:
# CHOOSE THE FILE
file = 'YO0004_yoneshiro_imperative_conjunction_20130220'

working_folder = '../files/' + file
clipped_folder = working_folder + '/clips'
target_folder = working_folder + '/targets-stereo'
aligned_folder = working_folder + '/aligned'

In [6]:
from __future__ import nested_scopes

import pandas as pd
import glob, os
import numpy as np
import audiolabel as al
import re

from pathlib import Path

In [7]:
def multiple_replace(adict, text):
    '''Takes a dictionary and a string and replaces keys with values'''
    # Create a regular expression from all of the dictionary keys
    regex = re.compile("|".join(map(re.escape, adict.keys(  ))))
    
    # For each match, look up the corresponding value in the dictionary
    return regex.sub(lambda match: adict[match.group(0)], text)

# Read in file that we are managing

In [8]:
f = working_folder + '/' + file + '.csv'

df = pd.read_csv(f, keep_default_na = False)

# Drop unnamed columns
df = df.drop([c for c in df.columns if 'Unnamed' in c], axis = 1)

df['fileno'] = df.index + 1
df['fileno'] = df['fileno'].apply(str)

# Add column for number corresponding to file number
df.head()

Unnamed: 0,Begin Time - msec,End Time - msec,Yonaguni,fileno
0,21047,21447,k-u-n,1
1,60661,61696,agai=du bur-u,2
2,68433,68943,da,3
3,83276,83851,hir-u-n,4
4,98664,99986,tabi=nki hir-u-n,5


In [9]:
if 'YS_Transcription-txt-rys' in df.columns:
    df = df.rename(columns = {'YS_Transcription-txt-rys':'Yonaguni'})

# Only want files that begin with these phonemes

In [10]:
targets = ['p', 't', 'k', 'c', 'n', 'm', 'ts']

In [11]:
def target_first(sentence):
    '''Breaks up the sentence by spaces and looks at the first letter for each word'''
    # Get rid of Japanese code switching or uncertainties (parentheses)
    sentence_nojp = re.sub('<[^(.+>.+<)]+>', '', sentence)
    sentence_nojp = re.sub('\([^(.+\).+\()]+\)', '', sentence_nojp)
    
    # Look at the first letter in each word. Ignore n
    first_letters = [w[0] for w in sentence_nojp.split() if len(w) > 1]
    # And first 2, in case ts
    first_digraph = [w[0:2] for w in sentence_nojp.split() if len(w) > 1]
    
    # Only take those that are overlapped
    inter = set.intersection(set(first_letters), set(targets))
    
    return(len(inter) > 0)

### Apply function to subset dataframe to just those tokens that contain our targets

In [12]:
# Subset by taking only those that don't end up being zero
target_df = df[df['Yonaguni'].apply(lambda s: target_first(s))].reset_index(drop = True)

print(len(target_df))

target_df.head()

223


Unnamed: 0,Begin Time - msec,End Time - msec,Yonaguni,fileno
0,21047,21447,k-u-n,1
1,98664,99986,tabi=nki hir-u-n,5
2,111279,112254,kkar-ir-u-n,6
3,123518,124213,kka-ni-nu-n,7
4,191060,191960,kum-i bur-a-nu,9


### Move files to target folder

In [13]:
try: os.mkdir(target_folder)
except FileExistsError: pass

for f in sorted(glob.glob(clipped_folder + '/*.wav')):
    stem = Path(f).stem
    if stem in target_df['fileno'].values:
        os.rename(f, target_folder + '/' + stem + '.wav')

### Make Yonaguni transcription MFA friendly

In [14]:
to_mfa = {
    '1': '',
    '=': '',
    '-': '',
    '\(': '',
    '\)': '',
    '\?': '',
    '\.': ''
}

target_df['mfa'] = target_df['Yonaguni'].replace(to_mfa, regex = True)

We need to mark Japanese words so they won't be analyzed incorrectly as Yonaguni
- Use regex to search for everything between < >
- We will mark each Japanese word as beginning with "q" (MFA doesn't like symbols like < >) so that we can later filter them out

In [15]:
def mark_jp(sentence):
    '''Takes sentence and looks for words between <> and adds q in front of each word'''
    
    # Find all instances of codeswitching
    # [^(.+>.+<)]+ lets us get all instances if there is multiple codeswitching
    matches = [m for m in re.finditer('<[^(.+>.+<)]+>', sentence)]
    
    # Then for parentheses
    matches.extend([m for m in re.finditer('\([^(.+\).+\()]+\)', sentence)])
    
    if len(matches) == 0:
        
        return(sentence)
    
    else:
    
        # Initialize variable to keep track of how far into the word we are and our new sentence
        i = 0
        new_s = ''
        
        # Replace matches with q at the beginning of each word, so we know what the Japanese words are
        for m in matches:
            
            # Shows what the code-switched segments are
            print(m.group())

            # Break up code-switched sentence into words
            code_switch = m.group()[1:-1].split()

            # Add q to beginning of every word
            jp_marked = ' '.join(['q' + w for w in code_switch])
            
            # Conjoin to form new sentence
            new_s += sentence[i:m.span()[0]] + jp_marked

            # Update index
            i = m.span()[1]

        new_s += sentence[i:]
        
        return(new_s)

In [16]:
# Sanity check
target_df['mfa'].apply(lambda s: mark_jp(s))

<terebi>
<terebi>
<terebi>
<terebi>
<jasai>
<urawa>
<tokajo>
<jeega>
<jeega>
<jeega>
<m sono tokimo ieru>
<zjuuichizi>
<zjuuichizi>
<zjuuichizi>
<aujo>
<toga ittandessjo>
<to iukara>
<katazuke>


0                           kun
1                 tabinki hirun
2                      kkarirun
3                      kkaninun
4                   kumi buranu
                 ...           
218         tagabiti maaminutan
219                ki hindagijo
220    maaminubiti tagadu ataru
221                tagadu ataru
222                tagadu ataru
Name: mfa, Length: 223, dtype: object

In [17]:
target_df['mfa'] = target_df['mfa'].apply(lambda s: mark_jp(s))

target_df.to_csv(working_folder + '/' + file + '-targets.csv', index = False)

target_df.head()

<terebi>
<terebi>
<terebi>
<terebi>
<jasai>
<urawa>
<tokajo>
<jeega>
<jeega>
<jeega>
<m sono tokimo ieru>
<zjuuichizi>
<zjuuichizi>
<zjuuichizi>
<aujo>
<toga ittandessjo>
<to iukara>
<katazuke>


Unnamed: 0,Begin Time - msec,End Time - msec,Yonaguni,fileno,mfa
0,21047,21447,k-u-n,1,kun
1,98664,99986,tabi=nki hir-u-n,5,tabinki hirun
2,111279,112254,kkar-ir-u-n,6,kkarirun
3,123518,124213,kka-ni-nu-n,7,kkaninun
4,191060,191960,kum-i bur-a-nu,9,kumi buranu


# Use audiolabel to fill TextGrids with MFA transcription
### First create TextGrids for all the files with the TextGridMaker Praat script
- Use the TextGridMaker.praat script on the target folder
- If the annotations are right after running this cell, replace the empty TextGrids

In [21]:
# Make directory for filled TextGrids if it does not exist
try: os.mkdir(aligned_folder + '/filled')
except FileExistsError: pass
    
for f in sorted(glob.glob(aligned_folder + '/*.TextGrid')):
    stem = Path(f).stem
    
    # Get sentence from targets dataframe
    sentence = target_df.loc[target_df['fileno'] == stem, 'mfa'].values[0]
    
    # Read TextGrid and replace text
    with open(aligned_folder + '/' + stem + '.TextGrid', 'r') as f:
        replace = re.sub('text = ".*"', 'text = "' + sentence + '"', f.read())
    
    # Print to new file
    with open(aligned_folder + '/filled/' + stem + '.TextGrid', 'w') as w:
        w.write(replace)

### Make dictionary for MFA to phonemes

In [22]:
mfa_digraph = {
    'si': 'SH IY1',
    'sj': 'SH Y',
    'zi': 'JH Y',
    'zj': 'JH Y',
}

mfa_C = {
    'q': '',
    'ng': 'NG',
    'nk': 'NG K',
    'nm': 'M',
    'nn': 'N',
    'np': 'M P',
    'nb': 'M B',
    'j': 'Y',
    'c': 'CH',
    'kk': 'K',
    'tt': 'T',
    'h': 'HH'
}

mfa_VV = {
    'aa': 'AA1',
    'ee': 'EY1',
    'ii': 'IY1',
    'oo': 'OW1',
    'uu': 'UW1'
}

mfa_V = {
    'a': 'AA1',
    'e': 'EY1',
    'i': 'IY1',
    'o': 'OW1',
    'u': 'UW1'
}

# Add spaces on either end
for d in [mfa_digraph, mfa_C, mfa_VV, mfa_V]:
    for k, v in d.items():
        d[k] = ' ' + v + ' '

### Function for turning words to MFA pronunciation

In [23]:
def to_mfa(word):
    '''Turns word to MFA pronunciation'''

    for d in [mfa_digraph, mfa_C, mfa_VV, mfa_V]:
        word = (multiple_replace(d, word))
    
    mfa = ''
    
    for l in word:
        if l.islower():
            mfa += ' ' + l.upper() + ' '
        else:
            mfa += l
    
    return(' '.join(mfa.split()))

### Collect the unique words that show up in the data

In [91]:
# # IF WE NEED TO REREAD THE TARGET FILES

# for folder in sorted(next(os.walk('.'))[1]):
#     if folder != '.ipynb_checkpoints':
#         target_df = pd.read_csv(folder + '/' + folder + '-targets.csv', keep_default_na = False)
        
#         words = []

#         for s in target_df['mfa']:
#             for w in s.split():
#                 if w not in words: words.append(w)

#         #print(sorted(words))

#         with open(folder + '/dictionary.txt', 'w') as f:
#         #with open(working_folder + '/dictionary.txt', 'w') as f:

#             for w in sorted(words): 
#                 f.write(w + '  ' + to_mfa(w) + '\n')

In [24]:
words = []

for s in target_df['mfa']:
    for w in s.split():
        if w not in words: words.append(w)
            
print(sorted(words))

['abatidu', 'abatinnajo', 'abatiti', 'abjaminunggara', 'abunki', 'abunkidu', 'abunkija', 'agami', 'aigundo', 'aitando', 'aitarasi', 'aitarasidu', 'amidu', 'amiri', 'amirijo', 'amiti', 'amitidu', 'angami', 'angamidu', 'ansuja', 'arai', 'araiti', 'araitigara', 'arando', 'aranuna', 'arataru', 'aru', 'arunggara', 'atadu', 'ataru', 'atsadarunggara', 'atsaru', 'bagaruta', 'barasadu', 'bata', 'buna', 'bunga', 'bungara', 'buranu', 'buru', 'burungara', 'burunggara', 'butaru', 'butarugai', 'ci', 'cikaran', 'cira', 'ciradu', 'dagara', 'danki', 'dannaidu', 'din', 'dinadu', 'dinnga', 'ducitu', 'dutuni', 'gakunki', 'habagin', 'habain', 'habainja', 'haitigara', 'hajagu', 'hangara', 'hanuti', 'hataba', 'hi', 'hima', 'hindagijo', 'hinnajo', 'hiri', 'hirijo', 'hirudo', 'hirun', 'hirundo', 'hjundo', 'huganunga', 'hui', 'i', 'ibitati', 'isi', 'kaci', 'kacidu', 'kacija', 'kadin', 'kaidu', 'kaindi', 'kaisi', 'kanundo', 'ki', 'kidu', 'kinnajo', 'kiranunga', 'kirunga', 'kitando', 'kitanga', 'kjando', 'kkaninu

### Create dictionary

In [25]:
with open(working_folder + '/dictionary.txt', 'w') as f:

    for w in sorted(words): 
        f.write(w + '  ' + to_mfa(w) + '\n')

# Next steps
- The last prep work to be done for MFA alignment is isolating one channel (MFA doesn't deal well with two-channel files)
- You can run the shell script extractchannel.sh in the command line to extract one channel (extract the channel where the speaker's voice is louder). The usage is in the script, but is repeated here

        bash extractchannel.sh folder channel
        
        
- Make sure the correct channel was extracted, then move the original target .wav files back to the clipped_folder. Then move these mono files to the target_folder
- You can now run the MFA! See usage in mfa_usage.txt
- After alignment, you can move the unaligned files into a folder called "unaligned"

# Layout

Below is a screenshot of how my folder layout looks after MFA realignment.

- aligned : folder with target .wav files and MFA force aligned .TextGrids
- clips : folder with original clipped files
- unaligned : folder with target .TextGrids before forced alignment

- dictionary.txt : dictionary for MFA forced alignment

- [FILENAME].csv : .csv file containing metadata
- [FILENAME].TextGrid : Praat .TextGrid extracted from ELAN .eaf file

<img src='folderlayout.png'>

## Import read_label from audiolabel to get TextGrid time info

In [2]:
from audiolabel import read_label

### Get preceding and following words for each file

In [3]:
def get_contexts(word_data, phone_data):

    '''Takes a word and phone dataframe and updates it for context'''
    
    # Word context
    
    prev_words = ['sentence_start']
    prev_word_starts = ['sentence_start']
    prev_word_ends = ['sentence_start']
    
    next_words = []
    next_word_starts = []
    next_word_ends = []
    
    # Times for previous

    for i in range(0, len(word_data)):
        
        if i != 0:
            prev_words.append(word_data.loc[i - 1, 'label'])
            prev_word_starts.append(word_data.loc[i - 1, 't1'])
            prev_word_ends.append(word_data.loc[i - 1, 't2'])
            
        if i != len(word_data) - 1:
            next_words.append(word_data.loc[i + 1, 'label'])
            next_word_starts.append(word_data.loc[i + 1, 't1'])
            next_word_ends.append(word_data.loc[i + 1, 't2'])

    next_words.append('sentence_end')
    next_word_starts.append('sentence_end')
    next_word_ends.append('sentence_end')

    # Now add lists to datasets
    word_data['prev_word'] = prev_words
    word_data['prev_word_start'] = prev_word_starts
    word_data['prev_word_end'] = prev_word_ends
    
    word_data['next_word'] = next_words
    word_data['next_word_start'] = next_word_starts
    word_data['next_word_end'] = next_word_ends
    
    # Get sentence and get rid of extra whitespaces
    sentence = ' '.join((' '.join(word_data['label'].values)).split())
    
    word_data['sentence'] = sentence
    
    ###
    
    phone_data['word'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['label'].item(), axis = 1)
    
    phone_data['prev_word'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['prev_word'].item(), axis = 1)
    phone_data['prev_word_start'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['prev_word_start'].item(), axis = 1)
    phone_data['prev_word_end'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['prev_word_end'].item(), axis = 1)
    
    phone_data['next_word'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['next_word'].item(), axis = 1)
    phone_data['next_word_start'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['next_word_start'].item(), axis = 1)
    phone_data['next_word_end'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['next_word_end'].item(), axis = 1)
    
    phone_data['word_start'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['t1'].item(), axis = 1)
    phone_data['word_end'] = phone_data.apply(lambda row : word_data[(word_data['t1'] <= row['t1']) & (word_data['t2'] >= row['t2'])]['t2'].item(), axis = 1)
    
    phone_data['sentence'] = sentence
    
    # Phone context

    prev_phons = ['sentence_start']
    prev_phon_starts = ['sentence_start']
    prev_phon_ends = ['sentence_start']
    
    next_phons = []
    next_phon_starts = []
    next_phon_ends = []

    for i in range(0, len(phone_data)):
        
        if i != 0:
            prev_phons.append(phone_data.loc[i - 1, 'label'])
            prev_phon_starts.append(phone_data.loc[i - 1, 't1'])
            prev_phon_ends.append(phone_data.loc[i - 1, 't2'])
            
        if i != len(phone_data) - 1:
            next_phons.append(phone_data.loc[i + 1, 'label'])
            next_phon_starts.append(phone_data.loc[i + 1, 't1'])
            next_phon_ends.append(phone_data.loc[i + 1, 't2'])
            
    next_phons.append('sentence_end')
    next_phon_starts.append('sentence_end')
    next_phon_ends.append('sentence_end')

    phone_data['prev_phon'] = prev_phons
    phone_data['prev_phon_start'] = prev_phon_starts
    phone_data['prev_phon_end'] = prev_phon_ends
    
    phone_data['next_phon'] = next_phons
    phone_data['next_phon_start'] = next_phon_starts
    phone_data['next_phon_end'] = next_phon_ends
    
    return(word_data, phone_data)

In [4]:
word_df = pd.DataFrame()
phone_df = pd.DataFrame()

for f in glob.glob('*/aligned/*.TextGrid'):
    
    [sub_wdf, sub_pdf] = read_label(f, ftype = 'praat')
    
    [updated_wdf, updated_pdf] = get_contexts(sub_wdf, sub_pdf)
    
    word_df = pd.concat([word_df, updated_wdf])
    phone_df = pd.concat([phone_df, updated_pdf])

In [5]:
# Add phoneme and first syllable info

# Define syllable
def get_firstsyll(word):

    vowels = ['a', 'i', 'u', 'e', 'o']

    syll = ''

    # Flags for onset, nucleus and coda
    onset = 'incomplete'
    nucleus = 'incomplete'

    # If word is 2 for fewer letters long, just take it
    if len(word) <= 2:

        return(word)

    # Keep moving i until we find a vowel
    i = 0

    # If first sound is vowel, no longer in onset
    if word[0] in vowels:
        onset = 'complete'

    while onset != 'complete':

        i += 1

        syll += word[i-1]

        if word[i] in vowels: 

            onset = 'complete'

    # Look for end of word or coda
    while nucleus != 'complete':

        i += 1

        syll += word[i-1]

        if i == len(word):

            return(syll)

        if word[i] not in vowels:

            nucleus = 'complete'

    # If sound is n
    if word[i] == 'n':

        i += 1

        # If whole word, then return
        if i == len(word):

            return(word)

        # If next sound is consonant, add
        if word[i] not in vowels:

            syll += word[i-1]

    return(syll)

###

def findonset(word):
    
    nuclei = ['a', 'e', 'i', 'o', 'u', 'w', 'j']
    
    onset = ''
    
    for phone in word:
        
        if phone not in nuclei:
            
            onset += phone
        
        else: return(onset)

In [6]:
phone_df['onset'] = phone_df['word'].apply(lambda w: findonset(w))
phone_df['first_syll'] = phone_df['word'].apply(lambda w: get_firstsyll(w))

In [7]:
len(phone_df)

86834

In [8]:
phone_df.head()

Unnamed: 0,t1,t2,label,fname,word,prev_word,prev_word_start,prev_word_end,next_word,next_word_start,...,word_end,sentence,prev_phon,prev_phon_start,prev_phon_end,next_phon,next_phon_start,next_phon_end,onset,first_syll
0,0.0,0.09,sil,YO0010c_yoneshiro_europestory_20130424/aligned...,,sentence_start,sentence_start,sentence_start,cigaija,0.09,...,0.09,cigaija kija kirunga qjappari unu munu qbakuda...,sentence_start,sentence_start,sentence_start,CH,0.09,0.17,,
1,0.09,0.17,CH,YO0010c_yoneshiro_europestory_20130424/aligned...,cigaija,,0.0,0.09,,0.92,...,0.92,cigaija kija kirunga qjappari unu munu qbakuda...,sil,0.0,0.09,IY1,0.17,0.26,c,ci
2,0.17,0.26,IY1,YO0010c_yoneshiro_europestory_20130424/aligned...,cigaija,,0.0,0.09,,0.92,...,0.92,cigaija kija kirunga qjappari unu munu qbakuda...,CH,0.09,0.17,G,0.26,0.32,c,ci
3,0.26,0.32,G,YO0010c_yoneshiro_europestory_20130424/aligned...,cigaija,,0.0,0.09,,0.92,...,0.92,cigaija kija kirunga qjappari unu munu qbakuda...,IY1,0.17,0.26,AA1,0.32,0.39,c,ci
4,0.32,0.39,AA1,YO0010c_yoneshiro_europestory_20130424/aligned...,cigaija,,0.0,0.09,,0.92,...,0.92,cigaija kija kirunga qjappari unu munu qbakuda...,G,0.26,0.32,IY1,0.39,0.49,c,ci


In [9]:
phone_df['onset'].value_counts()

       11066
k       8330
m       6940
b       6498
n       5961
h       4217
t       4054
d       3221
q       2577
tt      2441
c       2135
s       2108
nn      1863
nd      1127
ns       863
qk       840
kk       738
ts       723
nt       529
qs       515
qt       484
nm       477
ngg      437
qh       433
g        376
qn       308
p        304
qz       284
qm       254
nb       230
qb       211
qd       192
nk       186
qts      182
qg       152
qc       106
ng        74
ss        60
qr        54
mb        11
qy        11
qtt       10
nts       10
pp         8
qf         4
cc         3
qnn        3
ttt        3
Name: onset, dtype: int64

In [10]:
word_df.to_csv('../data/words.csv', index = False)
phone_df.to_csv('../data/phones.csv', index = False)

# Get target words

In [11]:
target_words = word_df[(word_df['label'].str.len() > 1) & (word_df['label'].str.contains('^[tkcpmn(ts)]'))]

target_phones_all = phone_df[(phone_df['word'].str.len() > 1) & (phone_df['word'].str.contains('^[tkcpmn(ts)]')) & (phone_df['label'].isin(['T', 'K', 'CH', 'P', 'M', 'N'])) & (phone_df['word_start'] == phone_df['t1'])]

target_phones_all['onset'] = target_phones_all['word'].apply(lambda w: findonset(w))
target_phones_all['first_syll'] = target_phones_all['word'].apply(lambda w: get_firstsyll(w))

# Reorder columns
col_order = ['fname', 'onset', 'label', 't1', 't2',
             'word', 'word_start', 'word_end',
             'first_syll', 'sentence',
             'prev_phon', 'prev_phon_start', 'prev_phon_end', 'next_phon', 'next_phon_start', 'next_phon_end', 
             'prev_word', 'prev_word_start', 'prev_word_end', 'next_word', 'next_word_start', 'next_word_end'
             ]

target_phones_all = target_phones_all[col_order]

# Rename t1 and t2 to more useful names
target_phones_all = target_phones_all.rename(columns = {'t1':'phon_start', 't2':'phon_end'})

# Isolate phones that are preceded by vowels
target_phones = target_phones_all[target_phones_all['prev_phon'].isin(['AA1', 'IY1', 'UW1', 'EY1', 'OW1', 'N'])].reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
set(target_phones.fname.apply(lambda x: x.split('_')[0]))

{'YO0001',
 'YO0002',
 'YO0004',
 'YO0005',
 'YO0006',
 'YO0008c',
 'YO0008d',
 'YO0008e',
 'YO0009b',
 'YO0010a',
 'YO0010b',
 'YO0010c',
 'YO0011a',
 'YO0011b',
 'YO0012b',
 'YO0014b',
 'YO0015',
 'YO0016',
 'YO0017a',
 'YO0017b',
 'YO0019a',
 'YO0019b',
 'YO0020a',
 'YO0020b',
 'YO0020c',
 'YO0021',
 'YO0024b',
 'YO0025a'}

In [15]:
target_phones.to_csv('../data/target_phones.csv', index = False)
target_phones_all.to_csv('../data/target_phones_all.csv', index = False)

# Check counts

In [19]:
target_phones_all.groupby(['onset'])['word'].value_counts().reset_index(name = 'count').to_csv('../data/word_count_all.csv', index = False)

In [20]:
target_phones.groupby(['onset'])['word'].value_counts().reset_index(name = 'count').to_csv('../data/word_count.csv', index = False)

In [21]:
target_phones.groupby(['onset'])['first_syll'].value_counts().reset_index(name = 'count').to_csv('../data/first_syll_count.csv', index = False)

In [22]:
target_phones_all.groupby(['onset'])['first_syll'].value_counts().reset_index(name = 'count').to_csv('../data/first_syll_count_all.csv', index = False)