In [1]:
import pandas as pd
import audiolabel as al

# Fix file to reflect hierarchy

In [115]:
folder = 'vowels'
segs = 20

# Tab-separated measurement table for this folder / number of timepoints
raw_path = 'VS/' + folder + '/output-' + str(segs) + '.txt'
df = pd.read_csv(raw_path, sep = '\t')

# Columns whose header starts with "Unnamed" are dropped
unnamed = df.columns.str.contains('^Unnamed')

# Drop them, add the segment duration (end minus start), then order the rows
# by file and, within a file, by segment start
df = (
    df.loc[:, ~unnamed]
      .assign(dur = lambda d: d['seg_End'] - d['seg_Start'])
      .sort_values(by = ['Filename', 'seg_Start'])
      .reset_index(drop = True)
)

# Get truth values for whether a row is a sequence name or not -- if it is, we
# need to change the hierarchy: a sequence name is an all-lowercase label, and
# every row after it (up to the next sequence name) is a segment of that sequence.
# A missing label counts as "not a sequence name".
is_seq = df.Label.str.islower().eq(True)

# Every segment must be preceded by its sequence name; otherwise there is
# nothing to attach the leading rows to
if len(is_seq) > 0 and not is_seq.iloc[0]:
    raise ValueError('First row (' + repr(df.Label.iloc[0]) + ') is not a sequence label, '
                     'so the rows before the first sequence cannot be assigned to one')

# Unique ID for each sequence: running count of the sequence names seen so far
seq_id = is_seq.cumsum()

# Sequence name for each row: the most recent sequence label at or above it
seq = df.Label.where(is_seq).ffill()

# Add to data frame
df['seq'] = seq
df['seq_id'] = seq_id

# Get rid of sequence labels (rows whose own label is the sequence name)
df = df[df['Label'] != df['seq']]

# Mark as V1, C, or V2

In [116]:
# Reset index so the row labels run 0..n-1 again after the sequence-label rows
# were filtered out; the code below looks rows up by these labels
df = df.reset_index(drop = True)

def map_seg(seg, folder):
    '''
    Maps segment to V1, C, or V2 (VCV folders) or to C or V (CV folders)

    seg: integer position of the segment; only its remainder modulo the
         number of parts in a sequence (3 for VCV, 2 for CV) is used
    folder: name of the data folder, which decides the sequence type

    Returns the part label as a string
    Raises ValueError if folder is not one of the known folders, instead of
    silently returning None
    '''
    # VCV sequences
    if folder in ['metathesis', 'vowels_double_u-i']:

        to_seg = {0: 'V1',
                  1: 'C',
                  2: 'V2'}

    # CV sequences
    elif folder in ['vowels']:

        to_seg = {0: 'C',
                  1: 'V'}

    else:
        raise ValueError('Unknown folder ' + repr(folder) + ': cannot tell which parts its sequences have')

    return(to_seg[seg % len(to_seg)])

# Label each segment by its position within its own sequence (0, 1, 2, ...)
# rather than by its position in the whole table, so that one sequence with a
# missing or extra segment cannot shift the labels of every sequence after it
df['part'] = df.groupby('seq_id').cumcount().apply(lambda pos: map_seg(pos, folder))

### Make before and after series
# For every segment row, look up in the file's TextGrid:
#   word     - label of the word interval containing the segment start
#   sentence - all word labels of the file joined with single spaces
#   before   - phone label preceding the phone at the segment start
#   after    - phone label following the phone at the segment start
before = []
after = []
word = []
sentence = []

# Parsed TextGrids and their sentence strings, keyed by file stem, so that each
# file is read and joined once instead of once per segment row
textgrids = {}
sentences = {}

for i in range(len(df)):
    stem = df['Filename'][i].split('.')[0]

    if stem not in textgrids:
        tg = al.LabelManager(from_file='./VS/' + folder + '/files_realigned/' + stem + '.TextGrid', from_type='praat')
        textgrids[stem] = tg
        # Join the word labels, then collapse runs of whitespace left by empty labels
        sentences[stem] = ' '.join(' '.join([label.text for label in tg.tier('transcription - words')]).split())

    tg = textgrids[stem]

    # Round and add .0001 to make sure you are IN the interval
    # (seg_Start is divided by 1000 -- presumably ms to s; confirm against the TextGrid times)
    seg_st = round(df.at[i, 'seg_Start']/1000, 4) + .0001

    word.append(tg.tier('transcription - words').label_at(seg_st).text)
    sentence.append(sentences[stem])

    # Phone interval containing the segment start; its neighbours give before/after
    phones = tg.tier('transcription - phones')
    phone = phones.label_at(seg_st)
    before.append(phones.prev(phone).text)
    after.append(phones.next(phone).text)

df['before'] = before
df['after'] = after
df['word'] = word
df['sentence'] = sentence

# Load this folder's issues.csv; get_issue() below looks rows up in it, and the
# result is used to adjust the durations
issues = pd.read_csv('VS/' + folder + '/issues.csv')

def get_issue(row):
    '''
    Return the issues listed for this row's file in the issues table

    The file is identified by Filename without its last four characters and
    matched against the 'audio' column; all matching entries are joined with
    single spaces (empty string when there is none)
    '''
    audio_name = row.Filename[:-4]
    matches = issues[issues['audio'] == audio_name]
    return ' '.join(matches['issues'].values)
    
df['issue'] = df.apply(get_issue, axis = 1)

def _dur_unless_absent(row):
    '''
    Duration of the row, or 0 when the file's issue says this part is absent
    '''
    absent_marks = (row.part + ' absent', row.part + ' absent?')
    if row.issue in absent_marks:
        return 0
    return row.dur

# Modified duration: zero for a part flagged "<part> absent" / "<part> absent?"
df['dur_mod'] = df.apply(_dur_unless_absent, axis = 1)

# Final df: the key columns first, every other column after them in its current order
cols = ['Filename', 'seq', 'seq_id', 'Label', 'part', 'word', 'sentence', 'before', 'after', 'dur_mod']
trailing = [c for c in df.columns if c not in cols]
df = df[cols + trailing]

# Save next to the input as output-<segs>_rev.csv, without the index
rev_path = 'VS/' + folder + '/output-' + str(segs) + '_rev.csv'
df.to_csv(rev_path, index = False)

## Fix tier names

In [117]:
# import glob
# import os

# files = [f for f in sorted(glob.glob('./VS/files_realigned/*.TextGrid', recursive = True))]

# for f in files:
#     # BASENAME: 
#     newf = './VS/files_fixedtiernames/' + os.path.basename(f)
    
#     # Read in the TextGrid to fix the tier names
#     with open(f, 'r') as r:
#         with open(newf, 'w') as w:
#             # Make counter
#             i = 1
#             for line in r:
#                 # If tier name doesn't exist
#                 if 'name = ""' in line:
#                     # First tier will be target seq
#                     if i == 1:

#                         line = line.replace('""', '"targetseq"')
#                         # Add to make sure we replace with target phone
#                         i += 1
#                     elif i == 2: 
#                         line = line.replace('""', '"targetphone"')
#                         i += 1
#                 w.write(line)

# Read in fixed file

In [102]:
# Load the revised table (output-<segs>_rev.csv) for the current folder
rev_csv = './VS/' + folder + '/output-' + str(segs) + '_rev.csv'
df = pd.read_csv(rev_csv)

## Turn from wide data to long
- `stubnames` are the timepoints we want
- `i` is the set of ID names we want to subdivide by (speaker)
- `j` is the timepoint, since it is the suffix of the stubnames

In [111]:
# Measure we are working with
# pF1_means = first formant
# pF2_means = second formant
premeasure = 'pF2'
measure = premeasure + '_means'

# Descriptive columns kept for every row
info_cols = ['Filename', 'seq', 'seq_id', 'Label', 'word', 'sentence', 'before', 'after', 'part', 'issue', 'dur_mod', 'dur']

# One column per timepoint: <measure>001 ... <measure><segs>
tpt_cols = [measure + "{:03}".format(tpt) for tpt in range(1, segs + 1)]

# Concatenate the two column groups side by side
cols_a = df[info_cols]
cols_b = df[tpt_cols]

wide_df = pd.concat([cols_a, cols_b], axis=1)

# Per-sequence summaries, repeated on every row of the sequence:
#   - modified duration of each part (C and the vowel(s))
#   - max / min of the measure over the timepoints of the target vowel
#     (V for CV sequences, V2 for VCV sequences), the timepoints at which they
#     occur, and max - min
# Each summary is computed once per sequence and then looked up by seq_id,
# instead of re-filtering the whole table for every row.

measure_cols = [measure + "{:03}".format(i+1) for i in range(segs)]

# Timepoint number carried by each measure column name
tpt_of_col = {col: i + 1 for i, col in enumerate(measure_cols)}


def _first_rows(part):
    '''
    First row of the given part in every sequence, indexed by seq_id

    Raises ValueError if some sequence has no row for that part
    '''
    rows = wide_df.loc[wide_df['part'] == part].drop_duplicates(subset = 'seq_id')
    lacking = sorted(set(wide_df['seq_id']) - set(rows['seq_id']))
    if lacking:
        raise ValueError('No ' + part + ' row for seq_id(s): ' + str(lacking))
    return rows.set_index('seq_id')


def _spread(per_seq):
    '''
    Repeat a per-sequence value (indexed by seq_id) on every row of its sequence
    '''
    return wide_df['seq_id'].map(per_seq)


wide_df['C_dur'] = _spread(_first_rows('C')['dur_mod'])

if folder == 'vowels':
    dur_parts = ['V']
    target = 'V'
else:
    dur_parts = ['V1', 'V2']
    target = 'V2'

for part in dur_parts:
    wide_df[part + '_dur'] = _spread(_first_rows(part)['dur_mod'])

# Measure values of the target vowel, one row per sequence
target_vals = _first_rows(target)[measure_cols]

wide_df[target + '_max' + premeasure] = _spread(target_vals.max(axis = 1))
wide_df[target + '_max' + premeasure + '_tpt'] = _spread(target_vals.idxmax(axis = 1).map(tpt_of_col))
wide_df[target + '_min' + premeasure] = _spread(target_vals.min(axis = 1))
wide_df[target + '_min' + premeasure + '_tpt'] = _spread(target_vals.idxmin(axis = 1).map(tpt_of_col))

wide_df[target + '_' + premeasure + 'diff'] = wide_df[target + '_max' + premeasure] - wide_df[target + '_min' + premeasure]

In [112]:
# Now transform into long dataset: one row per (segment, timepoint)

# Identifier columns shared by both folder types
shared_ids = ['Filename', 'seq', 'seq_id', 'part', 'issue', 'dur_mod', 'dur', 'Label', 'word', 'sentence', 'before', 'after']

# Per-sequence summary columns, which differ between CV and VCV folders
if folder == 'vowels':
    summary_ids = ['V_dur', 'C_dur',
                   'V_max' + premeasure, 'V_max' + premeasure + '_tpt',
                   'V_min' + premeasure, 'V_min' + premeasure + '_tpt',
                   'V_' + premeasure + 'diff']
else:
    summary_ids = ['V1_dur', 'C_dur', 'V2_dur',
                   'V2_max' + premeasure, 'V2_max' + premeasure + '_tpt',
                   'V2_min' + premeasure, 'V2_min' + premeasure + '_tpt',
                   'V2_' + premeasure + 'diff']

# Every column not listed as an identifier is melted into timepoint / measure
long_df = pd.melt(wide_df, id_vars=shared_ids + summary_ids, var_name='timepoint', value_name=measure)

# z-score normalized by speaker
#zscore = lambda x: (x - x.mean()) / x.std()
#long_df.insert(len(long_df.columns), 'z-score', long_df.groupby(['sp'])[measure].transform(zscore))

# The melted column names end in the 3-digit timepoint; keep it as a number
tpt_suffix = long_df["timepoint"].str[-3:]
long_df['timepoint'] = pd.to_numeric(tpt_suffix)

In [113]:
# Consonants that can separate the vowels of a sequence name
cons = ['p', 'f', 'x', 'k']

if folder == 'vowels':

    # CV sequence names: the first character is the consonant, the rest is the
    # vowel, and the vowel quality is the vowel's first character
    long_df['C'] = long_df.seq.str[0]
    long_df['V'] = long_df.seq.str[1:]
    long_df['V_qual'] = long_df.V.str[0]

else:

    def get_vc(seq):
        '''
        Split a VCV sequence name around its consonant

        The consonants in cons are tried in order; the first one that occurs in
        seq is used. Returns the tuple (V1, C, V2), where V1 and V2 are the
        first and second pieces of seq split on that consonant.

        Raises ValueError if none of the consonants occurs in seq (instead of
        returning None, which only fails later when the tuple is unpacked)
        '''
        for c in cons:
            if c in seq:
                surr_v = seq.split(c)

                return(surr_v[0], c, surr_v[1])

        raise ValueError('No consonant from ' + str(cons) + ' found in sequence ' + repr(seq))

    # Add V1, C, and V2
    vcv = long_df.seq.apply(get_vc)

    new_col_list = ['V1', 'C', 'V2']
    for n, col in enumerate(new_col_list):
        long_df[col] = vcv.apply(lambda x: x[n])

    # Vowel quality is the first character of each vowel
    long_df['V1_qual'] = long_df.V1.str[0]
    long_df['V2_qual'] = long_df.V2.str[0]

In [114]:
# Save the long table as output-<segs>_long_<premeasure>.csv, without the index
long_path = 'VS/' + folder + '/output-' + str(segs) + '_long_' + premeasure + '.csv'
long_df.to_csv(long_path, index = False)