In [1]:
import pandas as pd
import re
from pathlib import Path
import numpy as np
import seaborn as sns

In [2]:
def extract_sbs_info(file_name,vs_asr=True):
    '''
    Derive comparison metadata from the path of an sbs file.

    Expected path name format (if vs_asr): ResultSbs/coraal10_openai-async/*.txt/ATL_se0_ag1_f_01_1.txt
    Else: /Users/aheuser/Documents/CORAAL/Condensed/ResultSbs/aa-rev_amber10/DCB_se3_ag4_m_02_5.sbs.txt

    Parameters:
        file_name: path (str or Path) of the sbs file
        vs_asr: True when the path contains the extra "*.txt" directory level
                used for comparisons against an ASR transcript

    Returns:
        [ref, hyp, interview_name, vs_asr] where ref/hyp are the transcript
        origins with any trailing digits removed and interview_name is the
        file name up to its first "."

    Raises:
        ValueError if the comparison directory does not split into two
        (or three) "_"-separated parts.
    '''
    path_list = Path(file_name).parts
    # ASR comparisons have one more directory level (the literal "*.txt" part)
    combo_dir = path_list[-3] if vs_asr else path_list[-2]
    hyp_spl = combo_dir.split('_')
    if len(hyp_spl) == 3:
        # hypothesis names such as "revai_v2-async" contain an underscore themselves
        ref = hyp_spl[0]
        hyp = hyp_spl[1]+"_"+hyp_spl[2]
    else:
        ref,hyp = hyp_spl
    # Strip the numeric suffix whatever its length (the old [:-2] slice
    # silently assumed exactly two digits)
    ref = ref.rstrip("0123456789")
    hyp = hyp.rstrip("0123456789")
    # Everything before the first "."; a name without any "." is kept whole
    # (str.find returned -1 there and chopped off the last character)
    interview_name = path_list[-1].split(".",1)[0]
    return [ref,hyp,interview_name,vs_asr]

def load_sbs(sbs):
    '''
    Parse an sbs txt file into its four tab-separated sections.

    Sections are delimited by lines starting with "-". Returns the tuple
    (ref_hyp, errors, unigrams, bigrams) of pandas dataframes with
    whitespace-stripped column names. errors is the main df used in the rest
    of this notebook, so its Group column is additionally split on "<->" into
    the extra columns Left and Right for easier downstream processing.
    '''
    with open(sbs) as handle:
        sep_inds = [idx for idx,text in enumerate(handle.readlines()) if text[0] == '-']

    ref_hyp = pd.read_csv(sbs,delimiter='\t',nrows=sep_inds[0]-1)
    #because the line value in the errors is consistently off by 2
    ref_hyp.index = ref_hyp.index+2

    # Each later section starts on the line after its separator and stops
    # right before the next separator
    errors = pd.read_csv(sbs,delimiter='\t',skiprows=sep_inds[0]+1,nrows=sep_inds[1]-sep_inds[0]-2)
    unigrams = pd.read_csv(sbs,delimiter='\t',skiprows=sep_inds[1]+1,nrows=sep_inds[2]-sep_inds[1]-2)
    bigrams = pd.read_csv(sbs,delimiter='\t',skiprows=sep_inds[2]+1)

    sections = (ref_hyp,errors,unigrams,bigrams)
    for section in sections:
        section.rename(columns=str.strip,inplace=True)

    halves = errors.Group.str.split("<->",expand=True)
    errors["Left"],errors["Right"] = halves[0],halves[1]
    return sections

In [3]:
def get_remaining_errors(sbs,accounted_dict,vs_asr):
    '''
    Load an sbs file and drop the errors that were already accounted for.

    Parameters:
        sbs: path of the sbs file (also the key used in accounted_dict)
        accounted_dict: maps sbs paths to the collection of error Line values
                        that an earlier test already claimed
        vs_asr: unused here; kept so existing callers keep working

    Returns:
        (errors, ref_hyp): the errors dataframe without the accounted-for
        lines, and the ref/hyp alignment dataframe from load_sbs.
    '''
    ref_hyp,errors,_unigrams,_bigrams = load_sbs(sbs)
    if sbs not in accounted_dict:
        return errors,ref_hyp
    # .copy() so callers can add columns without pandas' SettingWithCopyWarning
    remaining = errors.loc[~errors.Line.isin(accounted_dict[sbs])].copy()
    return remaining,ref_hyp

def compile_remaining_errors(sbs,accounted_dict,vs_asr):
    '''
    Return the not-yet-accounted-for errors of an sbs file, labelled with the
    metadata parsed from its path.

    Adds the columns ref_transcript, hyp_transcript, interview_name and
    asr_hyp (in that order) to the dataframe from get_remaining_errors.
    '''
    ref,hyp,interview_name,asr_hyp = extract_sbs_info(sbs,vs_asr)
    remaining,_ = get_remaining_errors(sbs,accounted_dict,vs_asr)
    # assign() builds a new frame, so a filtered slice is never written to in place
    return remaining.assign(ref_transcript=ref,
                            hyp_transcript=hyp,
                            interview_name=interview_name,
                            asr_hyp=asr_hyp)

In [4]:
def add_to_accounted_dict(sbs,new_lines,accounted_dict):
    '''
    Record new_lines as accounted-for error lines of the file sbs.

    Parameters:
        sbs: key of the sbs file
        new_lines: iterable of line values to add (left unmodified)
        accounted_dict: dict mapping sbs keys to sets of lines; updated in place

    Returns:
        accounted_dict (the same object that was passed in).
    '''
    # setdefault + update keeps a set owned by the dict; storing new_lines
    # itself would make later |= updates mutate the caller's set as well
    accounted_dict.setdefault(sbs,set()).update(new_lines)
    return accounted_dict
    
def new_results_df(sbs,accounted_dict,test_funcs,diff_type,vs_asr):
    '''
     Labels and collects different types of errors based on test_funcs
     (a dict mapping a test name to a function called as func(row, ref_hyp) that returns a bool)
     An error can only count as one type because once a test catches it, it is removed
     from the rows the following tests are applied to, and its Line is added to the accounted_dict
     Each caught row is labelled with the columns ref_transcript, hyp_transcript, asr_hyp,
     interview_name, diff_name (the test name) and diff_typ (the diff_type argument)
     Returns the accounted for errors (errors_df) and indices (accounted_dict, organized by the sbs file names)
     NOTE(review): with an empty test_funcs, errors_df is never assigned before it is read - confirm callers never pass one
    '''
    #no summary df - we'll compile this later
    first = True
    info = extract_sbs_info(sbs,vs_asr)
    #only the errors no earlier call has claimed for this file
    errors,ref_hyp = get_remaining_errors(sbs,accounted_dict,vs_asr)
    for func in test_funcs:
        #func is the test's name (dict key); test_funcs[func] is the callable
        if first: 
            subsec = errors.loc[errors.apply(lambda x: test_funcs[func](x,ref_hyp),axis=1)].copy() 
        else:
            #later tests only see the rows that no earlier test in this call matched
            subsec = unaccounted_for_errors.loc[unaccounted_for_errors.apply(lambda x: test_funcs[func](x,ref_hyp),axis=1)].copy() 
        subsec["ref_transcript"] = info[0]
        subsec["hyp_transcript"] = info[1]
        subsec["asr_hyp"] = info[3]
        subsec["interview_name"] = info[2]
        subsec["diff_name"] = func
        subsec["diff_typ"] = diff_type
        if not first:
            errors_df = pd.concat([errors_df,subsec]).reset_index(drop=True)
        else:
            errors_df = subsec.reset_index(drop=True)
            first=False
        #anti-join: rows of errors with no match in errors_df on the four key columns are still unaccounted for
        errors_merge = pd.merge(errors, errors_df, how='outer', on=['Line','Left','Right','Group'],indicator=True)
        #keeps only the first four columns of the merge result
        #NOTE(review): presumably these are exactly Line/Group/Left/Right - confirm against the sbs errors section
        unaccounted_for_errors = errors_merge.loc[errors_merge["_merge"] == "left_only"].iloc[:,:4]
    accounted_dict = add_to_accounted_dict(sbs,set(errors_df.Line.values),accounted_dict)
    return errors_df,accounted_dict

In [5]:
def build_summary_df(errors_df,total_errors_dict):
    '''
    Compiles summary statistics per ref/hyp transcript combination
    and per test groups (e.g. morpho-syntactic, reductions, etc.)

    Parameters:
        errors_df: labelled errors with the columns ref_transcript,
                   hyp_transcript, diff_name and diff_typ
        total_errors_dict: maps "ref|hyp" to the total number of errors of
                           that transcript combination

    Returns:
        A dataframe with the columns ref_transcript, hyp_transcript,
        diff_name, diff_type, frequency and percentage (frequency divided by
        the combination's total). For every combination there is one row per
        diff_name plus one "all_combined" row per diff type occurring in
        that combination. Combinations appear in order of first occurrence.

    Raises:
        KeyError if a ref/hyp combination is missing from total_errors_dict.
    '''
    columns = ['ref_transcript','hyp_transcript','diff_name','diff_type','frequency','percentage']
    # Rows are gathered in a list and turned into a frame once; appending
    # through summary_df.loc[len(summary_df)] re-allocates on every row
    records = []
    # drop_duplicates keeps first-occurrence order (a set gave an arbitrary order)
    combos = errors_df[['ref_transcript','hyp_transcript']].drop_duplicates()
    for ref,hyp in combos.itertuples(index=False,name=None):
        just_that_combo = errors_df.loc[(errors_df['ref_transcript'] == ref) & (errors_df['hyp_transcript'] == hyp)]
        total_errors = total_errors_dict[f"{ref}|{hyp}"]
        for diff_name in just_that_combo['diff_name'].unique():
            subsec = just_that_combo.loc[just_that_combo['diff_name'] == diff_name]
            freq = len(subsec)
            diff_type = subsec['diff_typ'].iloc[0]
            records.append([ref,hyp,diff_name,diff_type,freq,freq/total_errors])
        # Only the diff types present in THIS combination: iterating the
        # types of the whole frame produced empty subsections and an
        # IndexError on unique()[0]
        for diff_type in just_that_combo['diff_typ'].unique():
            freq = int((just_that_combo['diff_typ'] == diff_type).sum())
            records.append([ref,hyp,"all_combined",diff_type,freq,freq/total_errors])
    return pd.DataFrame(records,columns=columns)

In [6]:
#Based on the user manual
#Only added some alternatives, though these are never looked for with the functions, only added them for potential future reference

#CORAAL filler words
fillers = {"uh-huh":["uh huh"], "uh-uh":["uh uh","nuh-uh"],"nuh-uh":["nuh uh","nuh-huh","uh-uh"], "mm-hm":["mm hm","mm hmm"], "mm":["hm"], 
           "mm-mm":["mm mm"], "okay":["ok", "kay"],"mkay":["mm kay"], "yep":["yup"], "yup":["yep"], "nah":["naw"],"naw":["nah"],"ooh":["oo"]}
filler_no_alts = ["oh","ayo","hoo","uh","um","ah"]

#added forms for "going to" "have to" "used to" "until" "about" "around" "talking about" "got you" "remember"
red_forms = {"must have": ["musta"], "should have": ["shoulda"], "would have": ["woulda"], "could have": ["coulda"], "might have": ["mighta"],
            "going to": ["gonna","I'm'a", "imma"], "have to": ["halfta", "havta"], "trying to": ["tryna"], "supposed to":["sposta"], 
             "fixing to":["finna"],"got to":["gotta"], "want to":["wanna"],"ought to":["oughta"], "because":["cause","cuz"],
            "until":["til","'til"], "them": ["'em"], "let me":["lemme"], "what do you": ["watchu","watcha", "watchya"], 
             "what are you": ["whatchu","whatcha","watchya"], "got you": ["gotcha","gotchya"], "around":["'round"], "about":["'bout"],
            "talking about": ["talkin' about", "talking 'bout", "talkin' 'bout", "talkin 'bout", "talkin' bout", "talkin bout"],
            "remember":["'member"]}

#Need a list of what CORAAL allows - should be easy to make with the info there
red_forms_coraal = ["must have","musta","should have","shoulda","would have","woulda","could have","coulda","might have","mighta",
                    "going to","gonna","I'm'a", "have to","halfta","trying to","tryna","supposed to", "sposta", "fixing to", "finna", 
                    "got to", "gotta", "want to", "wanna", "ought to", "oughta", "because", "cause", "until", "til", "about", "remember",
                    "around", "talking about", "them", "'em", "let me", "lemme", "what do you", "whatchu", "what are you", "whatcha",
                    "got you", "gotcha"]

coraal_lex = {"aight":"alright","aks":"ask", "'bacca":"tobacco", "bih":"bitch", "brazy":"crazy", "bruh":"bro", "cuz":"cousin","'em":"them",
              "fella":"fellow", "hisself":"himself", "mama":"momma", "turnt":"turned", "and them": ["nem","dem"], "youngin": "youngen"} 
#More for "and them" as shown in function
coraal_lex_no_alts = ["ay", "bougie", "go go", "jai", "effed up", "mumbo sauce", "murk", "racthet", "shorty", "wilding", 
                      "wont"]

In [7]:
def check_row(row):
    #Return the whitespace-stripped (Left, Right) sides of an error row
    return row["Left"].strip(),row["Right"].strip()

def no_chars(phrase):
    #Helper: True when every character is a digit or one of the
    #number-relevant punctuation marks , . - (so also True for an empty string)
    return all(str.isdigit(char) or char in ",.-" for char in phrase)

def written_numbers(row):
    #True when either side of the error consists only of digits and , . -
    left,right = check_row(row)
    return no_chars(left) or no_chars(right)

def is_coraal_filler(row):
    #True when either side of the error is a CORAAL-defined filler
    #Both sides are checked for cross comparison - even though CORAAL will likely always be on the left
    sides = check_row(row)
    return any(side in fillers or side in filler_no_alts for side in sides)

#Covered by is_coraal_filler
def del_filler_coraal(row):
    #Check if a word defined as a filler by CORAAL is deleted in the other transcript
    #i.e. one side is the deletion marker "***" and the other side is a CORAAL filler
    left,right = check_row(row)
    if left != "***" and right != "***":
        return False
    #is_coraal_filler takes the ROW (it calls check_row itself); handing it the
    #stripped string, as before, raised a TypeError. "***" is not a filler, so
    #a match can only come from the side that was not deleted.
    return is_coraal_filler(row)

def del_restart(row,ref_hyp):
    #Check if a line ending with a restart (trailing "-") was deleted in either transcript
    #ref_hyp is unused; it is part of the shared test-function signature
    left,right = check_row(row)
    #endswith instead of [-1]: an empty side no longer raises IndexError
    if right == "***" and left.endswith("-"):
        return True
    if left == "***" and right.endswith("-"):
        return True
    return False

def is_coraal_red_form(row):
    #True when either side is a form (full or reduced) from red_forms_coraal
    #Note that the error may not have to do with transcription of the reduced vs full form, the error may be unrelated
    sides = check_row(row)
    return any(side in red_forms_coraal for side in sides)

def is_restart(row,ref_hyp):
    #restarts - "[word]-" vs "word"
    #True when the two sides differ only by a trailing dash (restart marker) on one of them
    left,right = check_row(row)
    return right == left+"-" or left == right+"-"

def is_coraal_lex(row):
    #True when either side is a key of coraal_lex or an entry of coraal_lex_no_alts
    sides = check_row(row)
    return any(side in coraal_lex or side in coraal_lex_no_alts for side in sides)

In [8]:
#Flat list of every filler spelling: each canonical form followed by its
#alternatives, then the fillers without alternatives
all_fillers = [form for key in fillers for form in [key]+fillers[key]]
all_fillers+=filler_no_alts

def filler_sub(row,ref_hype):
    #True when one filler was substituted for another (both sides are in all_fillers)
    left,right = check_row(row)
    return left in all_fillers and right in all_fillers

def filler_del(row,ref_hype):
    #True when a filler on one side corresponds to a deletion ("***") on the other
    left,right = check_row(row)
    left_deleted = left == "***" and right in all_fillers
    right_deleted = left in all_fillers and right == "***"
    return right_deleted or left_deleted

In [9]:
def del_rep(row,ref_hyp):
    #True when a deleted word ("***" on one side) repeats an adjacent token:
    #the line directly before or after has the kept word as both ref_token and hyp_token
    left,right = check_row(row)
    line_no = row.Line

    def neighbour_matches(offset,word):
        neighbour = ref_hyp.loc[line_no+offset]
        return neighbour["ref_token"] == word and neighbour["hyp_token"] == word

    for deleted_side,kept_word in ((left,right),(right,left)):
        if deleted_side != "***":
            continue
        #lines at or below 2 are not looked up backwards
        if line_no > 2 and neighbour_matches(-1,kept_word):
            return True
        #same guard for the forward lookup at the end of the alignment
        if line_no < len(ref_hyp)+1 and neighbour_matches(1,kept_word):
            return True
    return False

In [10]:
#Flat list of every full form followed by its reduced spellings
all_red_forms = [form for key in red_forms for form in [key]+red_forms[key]]

def is_red_form(row,ref_hyp):
    #True when either side is a full or reduced form listed in all_red_forms
    sides = check_row(row)
    return any(side in all_red_forms for side in sides)

def x_reduction(left_words,right_words,x,modals):
    '''
    Check whether one side spells out a word that the other side contracts.

    Parameters:
        left_words, right_words: the two sides of an error as lists of words
        x: the contraction without its apostrophe, e.g. 's' or 'll'
        modals: full words that may be contracted to 'x

    Returns:
        True when the side with more words ends in one of modals and the
        shorter side's last word ends in "'" + x; False otherwise (including
        when both sides have the same number of words or a side is empty).
    '''
    if len(left_words) == len(right_words):
        return False
    if len(left_words) > len(right_words):
        longer,shorter = left_words,right_words
    else:
        longer,shorter = right_words,left_words
    # An empty shorter side has no last word to inspect (used to raise IndexError)
    if not shorter:
        return False
    return longer[-1] in modals and shorter[-1].endswith(f"'{x}")

#One checker per contraction; each forwards to x_reduction with the suffix and
#the full words that suffix may stand for
def d_reduction(left,right):
    return x_reduction(left,right,'d',["did","had","would"])

def s_reduction(left,right):
    return x_reduction(left,right,'s',["is","has","us"])

def ve_reduction(left,right):
    return x_reduction(left,right,'ve',["have"])

def ll_reduction(left,right):
    return x_reduction(left,right,'ll',["will","shall"])

def re_reduction(left,right):
    return x_reduction(left,right,'re',["are"])

def t_reduction(left,right):
    return x_reduction(left,right,'t',["not"])

def is_contraction(row,ref_hyp):
    #True when any of the contraction checkers matches the word lists of the two sides
    left,right = check_row(row)
    left_words,right_words = left.split(),right.split()
    checkers = (d_reduction,s_reduction,ve_reduction,ll_reduction,re_reduction,t_reduction)
    return any(checker(left_words,right_words) for checker in checkers)

#Flat list of every lexical item: each coraal_lex key with its counterpart(s),
#then the items without counterparts
all_lex_forms = []
for key in coraal_lex:
    all_lex_forms.append(key)
    counterparts = coraal_lex[key]
    #Most coraal_lex values are plain strings; += on a string would spread it
    #into single characters ("alright" -> "a","l","r",...), so tokens like "a"
    #or "i" were treated as lexical items. Wrap single strings in a list first.
    if isinstance(counterparts,str):
        counterparts = [counterparts]
    all_lex_forms+=counterparts
all_lex_forms+=coraal_lex_no_alts

def is_lexical(row,ref_hyp):
    #True when either side is an entry of all_lex_forms
    sides = check_row(row)
    return any(side in all_lex_forms for side in sides)

In [11]:
#Test buckets: test name -> test function; tests run in the order listed
verbatim = {
    "filler_sub":filler_sub,
    "filler_del":filler_del,
    "is_restart":is_restart,
    "del_restart":del_restart,
    "del_rep":del_rep,
}
reductions = {
    "is_red_form":is_red_form,
    "is_contraction":is_contraction,
}

In [12]:
#Morphosyntactic difference functions
def dropped_apos_s(row):
    #True when the two sides differ only by a trailing 's on one of them
    left,right = check_row(row)
    return right == left+"'s" or left == right+"'s"

#Reformulation based on Rickford 1999
def drop_final_cons_1_2_4(row): #should check based on IPA transcription because no and now and a and an are not good examples
    #True when one side equals the other minus its final character (the final
    #character not being "s"), or when one side ends in an apostrophe that
    #stands in for the other side's final character
    left,right = check_row(row)
    #(full, reduced) is tried in both directions; endswith instead of [-1]
    #so an empty side no longer raises IndexError
    for full,reduced in ((left,right),(right,left)):
        if full.endswith('s'):
            continue
        if len(full) > len(reduced) and full[:-1] == reduced:
            return True
        if reduced.endswith("'") and full[:-1] == reduced[:-1]:
            return True
    return False
    
#Probably also matching possessive s dropping
def cop_aux_del_19a(row,ref_hyp):
    #Copula/auxiliary deletion: a dropped 's, or "is"/"are" present on one
    #side and deleted ("***") on the other (will also cover some deletions)
    if dropped_apos_s(row):
        return True
    left,right = check_row(row)
    copulas = ["is","are"]
    #both directions are checked, i.e. neither transcript is treated as the gold standard
    deleted_on_left = left == "***" and right in copulas
    deleted_on_right = right == "***" and left in copulas
    return deleted_on_left or deleted_on_right

#One "do" deletion doesn't seem to be invariant "be": "you come home and you got your homework do what you have-"
#vs "you come home and you got your homework what y'all"
#Can't check for "she'd be" vs "she be" because t
def invariant_be_19b(row,ref_hyp):
    #E.g. "She don't be sick, do she?" vs "She isn't sick, is she?" depends on alignment
    #"don't be" for "isn't"
    #True when an invariant-be form is substituted for a conjugated form (either
    #direction), when an invariant-be form is deleted, or when the next line's
    #ref_token is "be" and the difference is a 'd suffix or a deleted "would"
    bes = ["be", "do", "don't", "don't be", "do be"]
    se_conj = ["is","are", "isn't","aren't"]
    left,right = check_row(row)
    if (left in bes and right in se_conj) or (right in bes and left in se_conj):
        return True
    if (left in bes and right == "***") or (right in bes and left == "***"):
        return True
    #19.c3 from Spears 2017
    try:
        next_token = ref_hyp.loc[row.Line+1]["ref_token"]
    except (KeyError,TypeError):
        #no following line to look at (or a non-numeric Line): best effort,
        #but no longer a bare except that also hides unrelated failures
        return False
    if next_token == "be":
        if left == right+"'d" or right == left+"'d":
            return True
        if (left == "would" and right == "***") or (left == "***" and right == "would"):
            return True
    return False

def future_ll_del_19c(row,ref_hyp):
    #True when the sides differ only by a trailing 'll, or when "will" is
    #deleted ("***") on one side
    left,right = check_row(row)
    contracted = left+"'ll" == right or right+"'ll" == left
    #Also covering deletion of will
    will_deleted = {left,right} == {"will","***"}
    return contracted or will_deleted

#No reason a transcription would delete the word steady so won't look for 19d
#Stress isn't orthographically transcribed so skipping 19e too

#Got 0 examples which might be right but need to check that there really are none
def pre_been_insertion_19ef(row,ref_hyp): #need to check this, will correct ref_hyp be available? Does line need to be converted to int?
    #True when "has" is present on one side and deleted ("***") on the other,
    #and the next line's ref_token is "been"
    line = row["Line"]
    left,right = check_row(row)
    try:
        next_token = ref_hyp.loc[line+1]["ref_token"] #hyp_token will be the same
    except (KeyError,TypeError):
        #no following line (or a non-numeric Line): best effort, but no
        #longer a bare except that also hides unrelated failures
        return False
    if next_token != "been":
        return False
    return (left == "***" and right == "has") or (left == "has" and right == "***")

def done_del_19g(row,ref_hyp):
    #True when "done" on one side corresponds to a deletion ("***") on the other
    left,right = check_row(row)
    return {left,right} == {"done","***"}

#"She be done had her baby" for "She will have had her baby" - probably would not go so far from the actual signal
#Possible corrections: 
#"She had her baby" - deletion of "be done" need to check for this, "She's had her baby" seems further from the signal and unlikely
#"She done had her baby" or "She's done had her baby" - check for these with 19b function
#"She be had her baby" - weird and doubt anyone would transcribe it like this but caught with 19g function

#Got 0 examples so need to search to make sure that's accurate
#Added to this to cover Spears 2017 examples for 19.h3
def be_done_del_19h(row,ref_hyp):
    #True when "be done" is deleted outright, or when the side under
    #inspection starts or ends with the words "be done"
    left,right = check_row(row)
    if {left,right} == {"be done","***"}:
        return True
    #check number of words in the substitutions: the left side is inspected
    #only when it has more words, otherwise the right side is
    #this might catch errors beyond this, but not easily
    be_done = ["be", "done"]
    left_words,right_words = left.split(),right.split()
    inspected = left_words if len(left_words) > len(right_words) else right_words
    return inspected[-2:] == be_done or inspected[:2] == be_done

#"He finna go" -> "He's gonna go" most likely?
#Deletion of finna seems more likely in plural case e.g. "They finna go" vs "They go" but "They're gonna go" seems more likely

#Got 0 examples - this is accurate based on CORAAL search, need to check with a sbs file that has it
def finna_19i(row,ref_hyp): #don't need to worry about how it was corrected for
    #True when the left side contains "finna" and the right side does not
    #NOTE(review): only the left side is checked, unlike the symmetric tests - confirm this is intended
    left,right = check_row(row)
    return "finna" in left and "finna" not in right

#Got 0 examples - this is accurate based on CORAAL search, need to check with a sbs file that has it
def double_modals_19l(row,ref_hyp):
    #True when the error involves a double modal (e.g. "might could"):
    # - the two sides together contain an odd number of double modals, i.e. a
    #   double modal on one side that the other side lacks, or
    # - one half of a double modal is deleted ("***") next to a neighbouring
    #   line whose ref_token is the other half
    left,right = check_row(row)
    modals1 = ["may","might","must"]
    modals2 = ["can","could","don't"]

    def count_double_modals(words):
        #adjacent pairs: a first-position modal directly followed by a second-position one
        return sum(1 for prev,cur in zip(words,words[1:]) if prev in modals1 and cur in modals2)

    #The original toggled a flag per double modal with "!flag", which is not
    #Python (SyntaxError); the parity of the count is what the toggle computed
    double_modal_total = count_double_modals(left.split())+count_double_modals(right.split())
    if double_modal_total % 2 == 1:
        return True

    def neighbour_token(offset):
        #ref_token of the line at the given offset, or None when there is none
        try:
            return ref_hyp.loc[row.Line+offset]["ref_token"]
        except (KeyError,TypeError):
            return None

    def deleted_one_of(options):
        return (left == "***" and right in options) or (left in options and right == "***")

    if neighbour_token(-1) in modals1 and deleted_one_of(modals2):
        return True
    if neighbour_token(1) in modals2 and deleted_one_of(modals1):
        return True
    return False

#Simple substitution e.g. run vs runs, no way to verify the word's a verb without POS tagging so it'll catch more than verbs
def final_s_del_20a_21b(row,ref_hyp):
    #True when the sides differ only by a final "s" on one of them
    left,right = check_row(row)
    return right == left+"s" or left == right+"s"

#Look specifically for "have" and "don't"??
def switch_sin_plu_20b(row,ref_hyp):
    #True when the error swaps "is"/"are" or "was"/"were" (either direction)
    left,right = check_row(row)
    swapped_pairs = ({"is","are"},{"was","were"})
    return {left,right} in swapped_pairs

#Only counting regular verbs, even though irregular verbs are very frequent
#This will catch 20f too and be unable to distinguish them
def stem_for_past_20e(row,ref_hyp):
    #True when the sides differ only by a final "ed" on one of them
    left,right = check_row(row)
    return right == left+"ed" or left == right+"ed"

#def lack_plural_noun_21b(row):
    #return dropped_s(row)

#21c, many possible spelling variations, e.g. "an' 'em," "(')?nem," "an' them," "and them" 
#Don't expect to see this much if at all - found 0 examples in the file with the CORAAL search as well
def assos_plural_21c(row,ref_hyp):
    #True when both sides are spelling variants of associative plural "and them"
    left,right = check_row(row)
    #"nem" and "'nem" are separate entries: the comma between them was
    #missing, so Python fused them into the single string "nem'nem" and
    #neither spelling could ever match
    variations = ["an' 'em", "an em", "an' em", "an 'em", "nem", "'nem", "and them", "and 'em", "an' them"]
    return left in variations and right in variations

#May not be actual examples of this - not sure what the right surrounding sentence structure should be/how to search for it
def add_pronoun_21d(row,ref_hyp):
    #True when a third person pronoun on one side corresponds to a deletion ("***") on the other
    left,right = check_row(row)
    pronouns = ["she","he","it", "they"] #all third person because others can't have other referents
    if right == "***":
        return left in pronouns
    if left == "***":
        return right in pronouns
    return False

def yall_they_poss_21e(row,ref_hyp):
    #True when a "y'all" variant is swapped with "your", or "they" with "their" (either direction)
    left,right = check_row(row)
    yall_vers = ["y'all", "yall", "you all"]
    for first,second in ((left,right),(right,left)):
        if first in yall_vers and second == "your":
            return True
    return {left,right} == {"they","their"}

def drop_rel_pro_21g(row,ref_hyp):
    #True when a relative pronoun on one side corresponds to a deletion ("***") on the other
    left,right = check_row(row)
    rel_pros = ["who","which","what","that"]
    if right == "***":
        return left in rel_pros
    if left == "***":
        return right in rel_pros
    return False

def aint_sub_22a(row,ref_hyp):
    #True when an "ain't" variant is swapped with a negated auxiliary (either direction)
    left,right = check_row(row)
    neg_mod = ["am not", "isn't", "is not", "aren't", "are not", "hasn't", "has not", "haven't", "have not", "did not", "didn't"]
    aint_vars = ["ain't", "ain'", "aint", "ain"]
    for first,second in ((left,right),(right,left)):
        if first in neg_mod and second in aint_vars:
            return True
    return False

def mult_neg_22b(row,ref_hyp):
    #True when a negative indefinite is swapped with an "any-" indefinite (either direction)
    left,right = check_row(row)
    neg_indef = ["nothing", "nobody", "no one", "nowhere"]
    alt_indef = ["anything", "anybody", "any one", "anywhere"]
    for first,second in ((left,right),(right,left)):
        if first in neg_indef and second in alt_indef:
            return True
    return False

#Found 0 examples of this - need to check with an sbs file that has this/create an artificial example
def if_whether_del_23b(row,ref_hyp):
    #True when "if"/"whether" on one side corresponds to a deletion ("***") on the other
    left,right = check_row(row)
    if_whether = ["if", "whether"]
    if right == "***":
        return left in if_whether
    if left == "***":
        return right in if_whether
    return False

def it_there_sub_24a(row,ref_hyp):
    #True when an existential "it" form is swapped with a "there" form (either direction)
    left,right = check_row(row)
    it_var = ["it's", "it is", "it was", "ain't", "ain'", "ain", "isn't", "is not"]
    #"there was" and "there isn't" are separate entries: the comma between
    #them was missing, so Python fused them into "there wasthere isn't" and
    #neither form could ever match
    there_var = ["there's", "there is", "there was", "there isn't", "there ain't", "there is not", "there ain'", "there aint"]
    for first,second in ((left,right),(right,left)):
        if first in it_var and second in there_var:
            return True
    return False

#Found 0 examples of this - need to check with an sbs file that has this/create an artificial example
def they_got_there_are_sub_24b(row,ref_hyp):
    #True when "they got" is swapped with "there are" (either direction)
    left,right = check_row(row)
    return {left,right} == {"they got","there are"}

#No longer captures generic go deletion - gets 0 matches which is consistent with CORAAL search 
#Need to check on other sbs file or make an artificial example
def here_go_24c(row,ref_hyp):
    #True when presentational "go" is swapped with "is"/"are", or "here go" with "here's" (either direction)
    left,right = check_row(row)
    mods = ["is", "are"] #always assuming positive for presentational
    for first,second in ((left,right),(right,left)):
        if first == "go" and second in mods:
            return True
    return {left,right} == {"here's","here go"}

#need to add nominative for possessive substitution corresponding to part of 8
def they_their_sub_8ish(row,ref_hyp):
    #True when "they" is swapped with "their" (either direction)
    left,right = check_row(row)
    return {left,right} == {"they","their"}

def gone_go_19n(row,ref_hyp):
    #True when "gone"/"go" on one side faces "gonna" or a deletion ("***") on the other
    left,right = check_row(row)
    gone_forms = ["gone", "go"]
    counterparts = ["gonna","***"]
    for first,second in ((left,right),(right,left)):
        if first in gone_forms and second in counterparts:
            return True
    return False

def gone_come_19o(row,ref_hyp):
    #True when the error involves "gone come":
    # - either side ends with the words "gone come", or
    # - "come" is deleted ("***") right after a line whose ref_token is "gone", or
    # - "gone" is deleted ("***") right before a line whose ref_token is "come"
    left,right = check_row(row)
    gone_come = ["gone","come"]
    if left.split()[-2:] == gone_come or right.split()[-2:] == gone_come:
        return True

    def neighbour_token(offset):
        #ref_token of the line at the given offset, or None when there is
        #none (replaces a bare except that also hid unrelated failures)
        try:
            return ref_hyp.loc[row.Line+offset]["ref_token"]
        except (KeyError,TypeError):
            return None

    if neighbour_token(-1) == "gone" and {left,right} == {"come","***"}:
        return True
    if neighbour_token(1) == "come" and {left,right} == {"gone","***"}:
        return True
    return False

In [13]:
#Morphosyntactic bucket: test name -> test function; tests run in the order listed
#NOTE(review): is_red_form is also listed in the reductions bucket, and
#yall_they_poss_21e, drop_rel_pro_21g and mult_neg_22b are not listed here - confirm both are intended
morpho_syntax = {
    "cop_aux_del_19a":cop_aux_del_19a,
    "invariant_be_19b":invariant_be_19b,
    "future_ll_del_19c":future_ll_del_19c,
    "pre_been_insertion_19ef":pre_been_insertion_19ef,
    "done_del_19g":done_del_19g,
    "be_done_del_19h": be_done_del_19h,
    "finna_19i": finna_19i,
    "double_modals_19l":double_modals_19l,
    "final_s_del_20a_21b": final_s_del_20a_21b,
    "switch_sin_plu_20b": switch_sin_plu_20b,
    "stem_for_past_20e": stem_for_past_20e,
    "assos_plural_21c": assos_plural_21c,
    "add_pronoun_21d": add_pronoun_21d,
    "aint_sub_22a":aint_sub_22a,
    "if_whether_del_23b": if_whether_del_23b,
    "it_there_sub_24a": it_there_sub_24a,
    "they_got_there_are_sub_24b": they_got_there_are_sub_24b,
    "here_go_24c": here_go_24c,
    "they_their_sub_8ish": they_their_sub_8ish,
    "gone_go_19n":gone_go_19n,
    "gone_come_19o":gone_come_19o,
    "is_red_form":is_red_form,
    "is_lexical":is_lexical,
}

In [14]:
#Bucket name -> dict of tests; buckets are applied in the order listed, so an
#error caught by an earlier bucket is never seen by a later one
all_buckets = {
    "morpho_syntax":morpho_syntax,
    "reductions":reductions,
    "verbatim":verbatim,
}
just_verbatim = {
    "verbatim":verbatim,
}

In [15]:
#Count the total number of errors per ref/hyp transcript combination
sbs_dir = "/Users/aheuser/Documents/CORAAL/Condensed/ResultSbs"
human_dirs = ["aa-rev_amber10", "aa-rev_coraal10", "aa-rev_rev10", "coraal_amber10", "rev_amber10", "rev_coraal10"]
asr_dirs = ["aa-rev10_", "coraal10_", "rev10_", "amber_","openai-async_"]
asrs = ["openai-async","revai_v2-async"]

#(directory, vs_asr) for every comparison: human-vs-human directories first,
#then every reference prefix combined with every ASR. The ASR results sit one
#level deeper, inside a directory literally named "*.txt".
total_error_sources = [(Path(sbs_dir)/human_dir,False) for human_dir in human_dirs]
total_error_sources += [(Path(sbs_dir)/(asr_dir+asr)/"*.txt",True) for asr_dir in asr_dirs for asr in asrs]

total_errors_dict = {}
#one loop for both kinds of comparison (the two copies only differed in
#vs_asr), and no loop variable named "dir" shadowing the builtin
for source_dir,source_vs_asr in total_error_sources:
    for sbs_file in source_dir.glob('*.*'):
        if not sbs_file.is_file():
            continue
        sbs_ref,sbs_hyp = extract_sbs_info(sbs_file,source_vs_asr)[:2]
        sbs_errors = load_sbs(sbs_file)[1]
        ref_hyp_string = f"{sbs_ref}|{sbs_hyp}"
        total_errors_dict[ref_hyp_string] = total_errors_dict.get(ref_hyp_string,0)+len(sbs_errors)

In [16]:
#Display the total error count per "ref|hyp" transcript combination
total_errors_dict

{'aa-rev|amber': 11173,
 'aa-rev|coraal': 8226,
 'aa-rev|rev': 7448,
 'coraal|amber': 12035,
 'rev|amber': 11169,
 'rev|coraal': 8747,
 'aa-rev|openai-async': 9484,
 'aa-rev|revai_v2-async': 9484,
 'coraal|openai-async': 9994,
 'coraal|revai_v2-async': 7797,
 'rev|openai-async': 9454,
 'rev|revai_v2-async': 7049,
 'amber|openai-async': 10073,
 'amber|revai_v2-async': 9519,
 'openai-async|revai_v2-async': 8330}

In [17]:
#Run every bucket of tests over every sbs file and collect the labelled errors
accounted_dict = {}
#(directory, vs_asr) for every comparison: human-vs-human directories first,
#then every reference prefix combined with every ASR (inside the literal "*.txt" directory)
bucket_sources = [(Path(sbs_dir)/human_dir,False) for human_dir in human_dirs]
bucket_sources += [(Path(sbs_dir)/(asr_dir+asr)/"*.txt",True) for asr_dir in asr_dirs for asr in asrs]

#Frames are collected and concatenated once at the end instead of growing
#errors_df with pd.concat inside the loop (quadratic copying); this also
#removes the "first" flag that left errors_df undefined when no
#human-vs-human file was found
bucket_frames = []
for source_dir,source_vs_asr in bucket_sources:
    for sbs_file in source_dir.glob('*.*'):
        if not sbs_file.is_file():
            continue
        for bucket in all_buckets:
            bucket_errors,accounted_dict = new_results_df(sbs_file,accounted_dict,all_buckets[bucket],bucket,source_vs_asr)
            bucket_frames.append(bucket_errors)
errors_df = pd.concat(bucket_frames).reset_index(drop=True)

#Commented out the code with the original final name, current code is for testing 
#errors_df.to_csv("/Users/aheuser/Documents/CORAAL/Condensed/ResultSbs/bucket_errors_new.csv")
errors_df.to_csv("/Users/aheuser/Documents/CORAAL/Condensed/ResultSbs/bucket_errors_new_test.csv")

In [18]:
#Summarise the labelled errors per transcript combination and write them out
#(the commented-out line holds the original final file name; the test name is used for now)
summary_df = build_summary_df(errors_df,total_errors_dict)
#summary_df.to_csv("/Users/aheuser/Documents/CORAAL/Condensed/ResultSbs/bucket_summary_new.csv")
summary_out_path = "/Users/aheuser/Documents/CORAAL/Condensed/ResultSbs/bucket_summary_new_test.csv"
summary_df.to_csv(summary_out_path)