# Script to go through Karuk files for forced alignment
- This script builds the metadata for the Karuk files, cleans up the transcriptions for the Montreal Forced Aligner (MFA), and creates a pronunciation dictionary for the MFA.
- The Karuk audio files and XML data were obtained from http://linguistics.berkeley.edu/~karuk/

In [1]:
# For parsing xml
from bs4 import BeautifulSoup

# For navigating folders and creating things
import sys
import os
import glob
import re
import fnmatch

# For reading and handling dataframe
import pandas as pd

# For dealing with TextGrids
import audiolabel as al

## Make csv file with file names, speakers, and sentences

In [79]:
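### Do not re-run if the draft csv has already been generated!!!
### TODO: next time, write the csv with pandas so the delimiter and quoting are handled consistently
### TODO: also tag each sentence's language so the English sentences can be filtered out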

# # Audio
# files = [f for f in sorted(glob.glob('../corpus/audio-sentences/*/*.wav', recursive = True))]

# # Metadata
# f = open('../corpus/karuk-texts.xml', 'r')
# contents = f.read()
# soup = BeautifulSoup(contents,'xml')

# # csv file write to
# with open('../data/audiodata_draft.csv', 'w') as w:
    
#     w.write(','.join(['audio', 'transcription']))
    
#     # Search in metadata for transcription of audio
#     for f in files:
#         f_ext = os.path.basename(f)
#         f_noext = f_ext[:-4]

#         # Search by the tag "s" for speaker and attribute "audio" that matches the filename
#         # Then find the text and remove all the extra whitespaces by splitting
#         try:
#             tran = soup.find("s", {"audio" : f_noext}).find('text-plain').string.split()
#             clean_tran = ' '.join(tran)
#         except AttributeError:
#             clean_tran = 'NA'
        
#         w.write('\n' + ';'.join([f_noext, clean_tran]))

In [289]:
# # FIX SOME
# df = pd.read_csv('../data/audiodata.csv', sep = ',')

# # 1782 is messed up
# to_fix = df.iloc[1782, :]['transcription'].split('\n')

# # Get rid of first one for now
# split_by_comma = [i.split(',') for i in to_fix][1:]

# # Rejoin the last ones
# rejoin = [[i[0], ','.join(i[1:])] for i in split_by_comma]

# # Make into df to attach
# sub_df = pd.DataFrame(rejoin, columns = ['audio', 'transcription'])

# # Add the ending to the last one
# sub_df.iloc[-1,-1] = sub_df.iloc[-1,-1] + " kiri xuus kun'uum."

# # Fix the messy row
# messy_row = pd.DataFrame({'audio':'LA78.1-002_013', 'transcription':' aakich ivaakirihti.'}, index =[0]) 

# sub_df = pd.concat([messy_row, sub_df]).reset_index(drop = True)

# # Now insert the sub_df in
# df_new = pd.concat([df.iloc[:1782, :], sub_df, df.iloc[1783:, :]]).reset_index(drop = True)

# df_new.to_csv('../data/audio_new.csv')

## Read in the file we just wrote and fix commas

In [472]:
# Load the audio/transcription table.
df = pd.read_csv('../data/audiodata.csv', sep=',')

# Show NA's (the result is not stored anywhere)
df.loc[df.transcription.isnull()]

# Every column after the first is treated as a piece of the transcription:
# the non-null pieces of each row are glued back together with commas.
transcription_parts = df[df.columns[1:]]
df['realtran'] = transcription_parts.apply(
    lambda row: ','.join(row.dropna().astype(str)),
    axis=1,
)

# Get rid of English sentences (hard-coded row positions); 'realtran' was
# just appended, so it is the last column.
english_rows = [107, 123, 137, 139, 1108]
df.iloc[english_rows, -1] = ''

# Drop all except the first column and the new combined transcription,
# then give the combined column the canonical name.
leftover_columns = df.columns[1:-1]
df = df.drop(columns=leftover_columns)
df = df.rename(columns={'realtran': 'transcription'})

df.to_csv('../data/audiodata_fixed.csv', index=False)

## Now read in the file and clean it up for the Montreal Forced Aligner

In [473]:
# Substitution table used to rewrite transcriptions with MFA friendly
# characters.  Keys are the original characters, values their replacements;
# the insertion order of the entries is significant only in that it is the
# order in which the substitutions are applied.
to_MFA = {
    # Other: punctuation and hyphens are replaced by nothing
    **dict.fromkeys(['.', ',', '?', '!', '"', '-', '’'], ''),
    # Tone
    '́': '1',
    '̂': '2',
    # Glottal
    "'": 'q',
    # Other unicode
    'â': 'a2',
    'á': 'a1',
    'í': 'i1',
    'î': 'i2',
    'ú': 'u1',
    'é': 'e1',
    'ó': 'o1',
    'ê': 'e2',
}

# Innermost "( ... )" / "[ ... ]" groups.  They are removed repeatedly so
# that several groups in one sentence (or nested groups) are each deleted
# without swallowing the words that stand between them.
_PAREN_GROUP = re.compile(r"\([^()]*\)")
_BRACKET_GROUP = re.compile(r"\[[^\[\]]*\]")


def _strip_groups(text, pattern):
    """Delete every match of *pattern* from *text* until none remain."""
    previous = None
    while previous != text:
        previous = text
        text = pattern.sub("", text)
    return text


def to_mfa_text(tran):
    """Return *tran* rewritten for the Montreal Forced Aligner.

    Every key of ``to_MFA`` is replaced literally by its value, anything
    inside ( ) or [ ] is removed together with the delimiters, and leading,
    trailing and repeated whitespace is collapsed to single spaces.

    Values that are not strings (e.g. NaN for a missing transcription) are
    returned unchanged.
    """
    if not isinstance(tran, str):
        return tran

    # Plain str.replace: characters such as '.' and '?' must be taken
    # literally, independent of the pandas version's regex default.
    for orig, new in to_MFA.items():
        tran = tran.replace(orig, new)

    # Get rid of things inside ( ) and [ ]
    tran = _strip_groups(tran, _PAREN_GROUP)
    tran = _strip_groups(tran, _BRACKET_GROUP)

    # Remove all whitespace at side and in between
    return ' '.join(tran.split())


# Read the file from the same place the comma-fixing step writes it to.
df = pd.read_csv('../data/audiodata_fixed.csv', sep=',')

df['transcription_mfa'] = df['transcription'].apply(to_mfa_text)

df.to_csv('../data/audio_metadata.csv', index=False)

## Now modify empty textgrids by filling in with sentences
Use the `make_tg.sh` shell script to create the empty TextGrids first. Praat does not handle folder names that contain spaces, so make sure the path has none.

In [474]:
df = pd.read_csv('../data/audio_metadata.csv')

# audio id -> MFA transcription, built once instead of scanning the whole
# dataframe for every wav file.  If an id occurs more than once the last
# row wins.
transcriptions = dict(zip(df['audio'], df['transcription_mfa']))

# The pattern contains no '**', so a recursive glob is not needed.
files = sorted(glob.glob('../corpus/audio-sentences/*/*.wav'))

for f in files:
    # Split off only the final extension; a plain str.replace('wav', ...)
    # would also rewrite any 'wav' occurring elsewhere in the path.
    stem, _extension = os.path.splitext(f)

    # Find what the transcription should be
    audio = os.path.basename(stem)

    if audio not in transcriptions:
        print('No metadata row for', audio, '- leaving its TextGrid untouched')
        continue

    tran = transcriptions[audio]

    # Missing transcriptions are read back from the csv as NaN (a float)
    if not isinstance(tran, str):
        tran = ''

    # Read in textgrid
    tg_file = stem + '.TextGrid'
    tg = al.LabelManager(from_file=tg_file, from_type='praat')

    # Add in transcription
    tg.tier('transcription')[0].text = tran

    # Open TextGrid and replace with newly annotated one:
    with open(tg_file, 'w', encoding='utf-8') as new_tg:
        new_tg.write(tg.as_string('praat_long'))

## Create MFA dictionary

In [477]:
# MFA dictionary
# Lookup tables used to spell each word out as MFA phones.

# Tone digits are replaced by nothing
tones = dict.fromkeys(('1', '2'), '')

# Two-letter spellings
digraphs = dict(
    # Consonants
    sh='SH',
    ch='CH',
    th='TH',
    # Vowels
    ii='IY1',
    uu='UW1',
    aa='AA1',
    ee='EY1',
    oo='OW1',
)

# Single-letter spellings
monographs = dict(
    # Vowels
    i='IH0',
    u='UH0',
    a='AH0',
    e='EH0',
    o='OW0',
    # Consonants
    p='P',
    t='T',
    k='K',
    q='T',
    m='M',
    n='N',
    f='F',
    s='SH',
    x='HH',
    h='HH',
    v='V',
    r='R',
    y='Y',
    # Extra from English
    b='B',
    d='D',
    g='G',
    l='L',
    w='W',
    c='K',
    j='JH',
)
# ':' is not a valid keyword name, so it is added separately (still last)
monographs[':'] = ''

# Populate dictionary
def _spell_out(word):
    """Return *word* with its letters replaced by space-separated phones.

    The tone digits are replaced first, then the digraphs, then the single
    letters.  The result may contain repeated spaces; callers split it.
    """
    pronunciation = word
    # Tone
    for tran, mfa in tones.items():
        pronunciation = pronunciation.replace(tran, mfa)
    # Digraphs
    for tran, mfa in digraphs.items():
        pronunciation = pronunciation.replace(tran, mfa + ' ')
    # Monographs
    for tran, mfa in monographs.items():
        pronunciation = pronunciation.replace(tran, ' ' + mfa + ' ')
    return pronunciation


# NOTE(review): the cleaned metadata is written to '../data/audio_metadata.csv'
# by the clean-up step; confirm that 'audiodata_mfa.csv' is the intended input.
df = pd.read_csv('../data/audiodata_mfa.csv')

array = df['transcription_mfa'].str.split()

# Rows without a transcription come back as NaN rather than a list
words = {x.lower() for l in array if isinstance(l, list) for x in l}

# Every phone the tables can produce, built once for fast membership tests
known_phones = set(digraphs.values()) | set(monographs.values())

with open('../corpus/dictionary.txt', 'w', encoding='utf-8') as dictionary:
    for w in sorted(words):
        spelled = _spell_out(w)
        phones = spelled.split()

        for x in phones:
            # Check if any transcriptions have not been translated
            if x not in known_phones:
                print(spelled, ' has the symbol ', x, ', Fix in the dictionary above')

        pronunciation = ' '.join(phones)

        print(w, pronunciation)

        dictionary.write('  '.join([w, pronunciation]) + '\n')

a AH0
a1ah AA1 HH
a1ahkuriheen AA1 HH K UH0 R IH0 HH EY1 N
a1athik AA1 TH IH0 K
a1hup AH0 HH UH0 P
a1hupqaasvuuti AH0 HH UH0 P T AA1 SH V UW1 T IH0
aachipvari AA1 CH IH0 P V AH0 R IH0
aah AA1 HH
aak AA1 K
aakich AA1 K IH0 CH
aama AA1 M AH0
aamtih AA1 M T IH0 HH
aapun AA1 P UH0 N
aas AA1 SH
aasak AA1 SH AH0 K
aasaqeech AA1 SH AH0 T EY1 CH
aasishrih AA1 SH IH0 SH R IH0 HH
aav AA1 V
aax AA1 HH
aaxkunishichas AA1 HH K UH0 N IH0 SH IH0 CH AH0 SH
aayas AA1 Y AH0 SH
achpuus AH0 CH P UW1 SH
achviiv AH0 CH V IY1 V
ah AH0 HH
ahapiimich AH0 HH AH0 P IY1 M IH0 CH
ahup AH0 HH UH0 P
ahupqasip AH0 HH UH0 P T AH0 SH IH0 P
ahupqasipak AH0 HH UH0 P T AH0 SH IH0 P AH0 K
akaay AH0 K AA1 Y
akee AH0 K EY1
aknap AH0 K N AH0 P
akvaat AH0 K V AA1 T
amayav AH0 M AH0 Y AH0 V
amkiravasih AH0 M K IH0 R AH0 V AH0 SH IH0 HH
anavikya2atich AH0 N AH0 V IH0 K Y AA1 T IH0 CH
andrew AH0 N D R EH0 W
apapiichyuupich AH0 P AH0 P IY1 CH Y UW1 P IH0 CH
apapkam AH0 P AH0 P K AH0 M
apim AH0 P IH0 M
apmaan AH0 P M AA1 N
apmaanak