# Calculate Durations of Segments

## Import Libraries

In [52]:
import re, os, sys
import pandas as pd

from config import INPUT_PATH, PHONE_TIER, OUTPUT_PATH

## Classes

In [3]:
# to do: may need to add info about previous and following segment, 
# ------ location of segment within word, syllable structure, etc

class Interval:
    """
    Class to store start point, end point, duration, and label of an interval
    """
    def __init__(self, start_point = 0.0, end_point = 0.0, label = ""):
        self.start_point = float(start_point)
        self.end_point = float(end_point)
        self.duration = -1
        self.label = str(label)
    def calculate_duration(self):
        return (self.end_point - self.start_point)
    def __str__(self):
        return f'start_point: {self.start_point}\nend_point: {self.end_point}\nlabel: {self.label}\nduration: {self.duration}'



## Helper Functions

In [4]:
def extract_info(interval, line):
    """
    Extract relevant information from line and add it to Interval object
    """
    if 'xmin = ' in line:
        start_point = re.sub(r'.*xmin = (\d*\.*\d*)', '\\1', line)
        interval.start_point = float(start_point)
    elif 'xmax = ' in line:
        end_point = re.sub(r'.*xmax = (\d*\.*\d*)', '\\1', line)
        interval.end_point = float(end_point)
    elif 'text = ' in line:
        label = re.sub(r'.*text = "(.*)"', '\\1', line)
        label = label.strip()
        if label:
            interval.label = label
    return interval


In [5]:
def read_input(in_folder_name = INPUT_PATH):
    """
    read in all textgrid files from folder as list of Interval objects
    """
    intervals = []
    for file_name in os.listdir(in_folder_name):
        if not file_name.endswith('.TextGrid'):
            continue

        with open(os.path.join(in_folder_name, file_name), 'r', encoding='utf8') as in_file:
            is_phone_tier = False
            current_interval = Interval()
            for line in in_file:
                # entered tier with phone segments
                if PHONE_TIER in line:
                    is_phone_tier = True
                # no longer in phone tier
                elif 'name = ' in line:
                    is_phone_tier = False
                
                # extract info if in phone tier
                if is_phone_tier:
                    current_interval = extract_info(current_interval, line)
                # label has been filled, this interval is complete
                if current_interval.label:
                    # calculate duration and store it in the object
                    current_interval.duration = current_interval.calculate_duration()
                    # add to list of interval objects and clear current interval
                    intervals.append(current_interval)
                    current_interval = Interval()
    return intervals


### Main Function

In [6]:

# read in list of interval objects
segment_intervals = read_input()

# convert list of interval objects to dataframe
master_df = pd.DataFrame([interval.__dict__ for interval in segment_intervals])

# create dictionary of dataframes based on values in 'label' column
all_segments_df = {elem : pd.DataFrame() for elem in master_df.label.unique()}

# fill values based on key
for key in all_segments_df.keys():
    all_segments_df[key] = master_df[:][master_df.label == key]



### Calculate Stats

In [7]:
def df_greater_than_mean(df, column_name = 'duration'):
    """
    return dataframe greater than the mean of the column
    """
    return df[df[column_name] > df[column_name].mean()]

In [8]:
# example output - access dataframe by label as key
all_segments_df['r']

master_df.label.unique()

array(['<p:>', 'd', 'i:', '?', 'o:', 'm', 'a', 'h', 't', '@', 'b', 'S',
       'l', 'O', 's', 'z', 'C', 'p', 'f', 'E', 'aU', '6', 'g', 'u:', 'x',
       'v', 'r', 'aI', 'I', 'n', 'a:', 'e:', 'U', 'N', 'ts', 'OY', 'k',
       'o', 'u', 'j', 'e', 'dZ'], dtype=object)

In [9]:
# plosives

# extract plosives as dictionary of dataframes
plosive_frames_dict = {k: all_segments_df.get(k, None) for k in ('d', 't', 'b', 'p', 'g', 'k')}

# concatenate the dataframes
plosive_df = pd.concat(plosive_frames_dict)

# extract rows that have duration larger than the mean
# todo: this calculates mean across all plosives, will need to separate these by each plosive I think
df_greater_than_mean(plosive_df)



Unnamed: 0,Unnamed: 1,start_point,end_point,duration,label
d,17,1.77712,1.82712,0.05,d
d,53,3.83712,3.92712,0.09,d
d,77,0.71712,0.77712,0.06,d
d,171,2.04712,2.10712,0.06,d
d,225,0.23712,0.34712,0.11,d
...,...,...,...,...,...
k,17504,1.70712,1.81712,0.11,k
k,17527,2.86712,2.95712,0.09,k
k,17554,1.67712,1.83712,0.16,k
k,18261,4.42712,4.48712,0.06,k


In [49]:
def save_output(output_dir, output_filename, df):
    """
    Save output
    """
    if not os.path.exists(output_dir):
        print(output_dir)
        os.makedirs(output_dir)
    outputFilePath = f'{output_dir}/{output_filename}.csv'
    df.to_csv(outputFilePath)

In [53]:
average_df = pd.read_csv('kiel_average_durations.csv')

for key in all_segments_df:
    if key == "<p:>":
        continue
    average_segment_value = average_df[average_df["segment"].str.contains(key)].iloc[0]['duration']
    filepath = f'{key}_greater_than_average'
    greater_than_average_df = all_segments_df[key][all_segments_df[key]['duration'] > average_segment_value]
    save_output(OUTPUT_PATH, filepath, greater_than_average_df)
    

error: nothing to repeat at position 0

In [47]:
average_df.loc[average_df['segment'] == '$z']
average_df[average_df["segment"].str.contains("z")]
all_segments_df[key]['duration']

1        0.04
17       0.05
45       0.03
53       0.09
77       0.06
         ... 
18452    0.05
18470    0.03
18486    0.06
18501    0.03
18508    0.05
Name: duration, Length: 918, dtype: float64