# Calculate Durations of Segments

## Import Libraries

In [13]:
import re, os, sys
import pandas as pd

from config import INPUT_PATH, PHONE_TIER

## Classes

In [30]:
# TODO: may need to add info about the previous and following segments,
# the location of the segment within the word, syllable structure, etc.

class Interval:
    """
    One labelled stretch of time.

    Holds a start point and an end point (both coerced to float), a label
    (coerced to str) and a duration slot. The duration slot starts at -1 and
    is never updated by this class itself; callers assign it, typically from
    calculate_duration().
    """
    def __init__(self, start_point = 0.0, end_point = 0.0, label = ""):
        # attribute creation order is kept as-is: code that reads __dict__
        # sees start_point, end_point, duration, label in that order
        self.start_point = float(start_point)
        self.end_point = float(end_point)
        self.duration = -1
        self.label = str(label)

    def calculate_duration(self):
        """
        Return end_point minus start_point without storing the result
        """
        span = self.end_point - self.start_point
        return span

    def __str__(self):
        # one "name: value" pair per line
        parts = [
            f'start_point: {self.start_point}',
            f'end_point: {self.end_point}',
            f'label: {self.label}',
            f'duration: {self.duration}',
        ]
        return '\n'.join(parts)



## Helper Functions

In [15]:
def extract_info(interval, line):
    """
    Extract relevant information from line and add it to Interval object

    Looks at one line of a TextGrid file and, depending on which marker it
    contains, updates a single attribute of ``interval``:

    * ``xmin = <number>``  -> ``interval.start_point``
    * ``xmax = <number>``  -> ``interval.end_point``
    * ``text = "<label>"`` -> ``interval.label`` (only if the label is
      non-empty after stripping whitespace)

    Lines containing none of the markers leave ``interval`` untouched.

    The number is whatever follows the last occurrence of the marker, parsed
    with ``float``. This accepts integer times such as ``xmin = 0`` as well
    as decimals; the previous pattern required a decimal point and raised
    ValueError on integer times.

    Parameters:
        interval: object whose start_point / end_point / label are updated
        line: one line of text from the file

    Returns:
        the same ``interval`` object, for convenience

    Raises:
        ValueError: if the text after ``xmin = `` / ``xmax = `` is not a
            valid number
    """
    if 'xmin = ' in line:
        # rpartition picks the last marker on the line; float() ignores the
        # surrounding whitespace and trailing newline
        interval.start_point = float(line.rpartition('xmin = ')[2])
    elif 'xmax = ' in line:
        interval.end_point = float(line.rpartition('xmax = ')[2])
    elif 'text = ' in line:
        label = re.sub(r'.*text = "(.*)"', '\\1', line)
        label = label.strip()
        # empty labels are skipped so the caller can tell that this
        # interval has not been completed
        if label:
            interval.label = label
    return interval


In [34]:
def df_greater_than_mean(df, column_name = 'duration'):
    """
    Keep only the rows of df whose value in column_name is strictly
    greater than the mean of that column, and return them as a dataframe
    """
    column = df[column_name]
    above_mean = column > column.mean()
    return df[above_mean]

In [16]:
def read_input(in_folder_name = INPUT_PATH):
    """
    read in all textgrid files from folder as list of Interval objects

    Every file in ``in_folder_name`` whose name ends with ``.TextGrid`` is
    scanned line by line. Lines belonging to the tier identified by
    PHONE_TIER are passed to extract_info; each time an interval receives a
    non-empty label it is considered complete, its duration is stored, and
    it is appended to the result.

    Intervals from all files are collected into one list (files are visited
    in sorted name order). Previously the function returned from inside the
    file loop, so only the first TextGrid file was ever read, and it returned
    None when the folder contained no TextGrid file.

    Parameters:
        in_folder_name: path of the folder to scan (not recursive)

    Returns:
        list of Interval objects, empty if no labelled interval was found
    """
    intervals = []
    # sorted: os.listdir order is arbitrary, and the result order should not
    # depend on the file system
    for file_name in sorted(os.listdir(in_folder_name)):
        if not file_name.endswith('.TextGrid'):
            continue

        with open(os.path.join(in_folder_name, file_name), 'r', encoding='utf8') as in_file:
            # both pieces of state are per file: a new file starts outside
            # any tier and with a blank interval
            is_phone_tier = False
            current_interval = Interval()
            for line in in_file:
                # entered tier with phone segments
                if PHONE_TIER in line:
                    is_phone_tier = True
                # a different tier starts here: no longer in phone tier
                elif 'name = ' in line:
                    is_phone_tier = False

                # extract info if in phone tier
                if is_phone_tier:
                    current_interval = extract_info(current_interval, line)
                # label has been filled, this interval is complete
                if current_interval.label:
                    # calculate duration and store it in the object
                    current_interval.duration = current_interval.calculate_duration()
                    # add to list of interval objects and clear current interval
                    intervals.append(current_interval)
                    current_interval = Interval()

    # returned only after every file has been processed
    return intervals


## Main Function

In [17]:

# parse the TextGrid input into Interval objects
segment_intervals = read_input()

# one dataframe row per interval; the columns are the object's attributes
master_df = pd.DataFrame([vars(interval) for interval in segment_intervals])

# dictionary of dataframes: each distinct value of the 'label' column maps
# to the rows of master_df that carry that label
all_segments_df = {
    label: master_df[master_df.label == label]
    for label in master_df.label.unique()
}



In [18]:
# example output - access dataframe by label as key
all_segments_df['w']

# list the distinct segment labels present in master_df
master_df.label.unique()

array(['w', 'ɐ', 'n', 'θ', 'ɹ', 'iː', 'ɛ', 'ð', 'ə', 'cʰ', 'ej', 'k', 'z',
       'd', 'ɒ', 'p', 't', 'f', 'l', 'æ', 'ɑː', 's', 'tʰ', 'ʉː', 'ɪ',
       'aj'], dtype=object)

In [35]:
# plosives

# extract plosives as dictionary of dataframes
# (a label that is absent from all_segments_df maps to None)
plosive_frames_dict = {k: all_segments_df.get(k, None) for k in ('p', 't', 'tʰ', 'k', 'cʰ')}

# concatenate the dataframes; the dictionary keys become the outer index level.
# The original passed the undefined name `plosive_frames`, which raises
# NameError on a fresh run.
plosive_df = pd.concat(plosive_frames_dict)

# extract rows that have duration larger than the mean
df_greater_than_mean(plosive_df)


Unnamed: 0,Unnamed: 1,start_point,end_point,duration,label
p,23,5.32,5.45,0.13,p
p,74,13.04,13.17,0.13,p
t,28,5.96,6.12,0.16,t
t,79,13.71,13.83,0.12,t
k,16,4.74,4.91,0.17,k
k,38,6.98,7.12,0.14,k
k,67,12.52,12.65,0.13,k
k,89,14.74,14.88,0.14,k
cʰ,14,4.48,4.63,0.15,cʰ
cʰ,65,12.27,12.41,0.14,cʰ
