In [None]:
import os
import glob

import pandas as pd
import numpy as np

import re
import inflect
from text2digits import text2digits

from jiwer import wer

from nltk import ngrams
import math

import collections

In [None]:
# Work from the repository root (the parent of the notebook's directory).
# The guard makes this cell idempotent: without it, every re-run of the cell
# climbs one directory higher and silently changes `base_folder`.
if 'base_folder' not in globals():
    os.chdir('..')
    base_folder = os.getcwd() + '/'  # e.g. '~/fair-speech/release/'

# Import snippet transcripts and demographic data

## VOC data

In [None]:
# Load the human-transcribed VOC snippets (tab-separated file, default integer index)

voc_snippets_path = base_folder + 'inputs/VOC/voc_snippets.tsv'
voc_snippets = pd.read_csv(voc_snippets_path, sep='\t', index_col=None)

In [None]:
# Load VOC speaker metadata and correct mis-spelled speaker codes
voc_metadata = pd.read_csv(base_folder + 'inputs/VOC/CSVOC_demo_info.csv', index_col=None)

# Spelling found in the metadata file -> corrected speaker code.
speaker_code_corrections = {
    'HUM_McLoughlin_Rachel': 'HUM_McLaughlin_Rachel',
    'SAC_Vrilakas_Ron': 'SAC_Vrikalis_Ron',
    'SAC_Arghittu_Allen': 'SAC_Argittu_Allen',
    'SAC_Werner_Savannah': 'SAC_WernerRitchie_Savannah',
    'HUM_Yoho_Meadow ': 'HUM_Yoho_Meadow',  # the metadata entry carries a trailing space
}

# Codes without a correction are carried over unchanged.
voc_metadata['update_speaker_code'] = voc_metadata['speaker_code'].replace(speaker_code_corrections)

In [None]:
# Attach demographic attributes to every VOC snippet (left join keeps all snippets)

voc_keep_columns = ['basefile', 'start_time', 'end_time', 'content', 'age_interview', 'gender',
                    'race_ethnicity', 'recording_quality', 'duration', 'segment_filename']
voc_df = (
    voc_snippets
    .merge(voc_metadata, how='left', left_on=['basefile'], right_on=['update_speaker_code'])
    .loc[:, voc_keep_columns]
    .rename(columns={'age_interview': 'age'})
)
# The first three characters of the segment file name are used as the source label.
voc_df['source'] = voc_df['segment_filename'].str[:3]

# One dataframe per interview location
is_humboldt = voc_df['source'] == 'HUM'
is_sacramento = voc_df['source'] == 'SAC'
humboldt_df = voc_df[is_humboldt]
sacramento_df = voc_df[is_sacramento]

## CORAAL data

In [None]:
# Load the human-transcribed CORAAL snippets (tab-separated file)

coraal_snippets_path = base_folder + 'inputs/CORAAL/coraal_snippets.tsv'
coraal_snippets = pd.read_csv(coraal_snippets_path, sep='\t')

In [None]:
# Attach the per-file audio quality label to every CORAAL snippet

coraal_audio_quality = pd.read_csv(base_folder + 'inputs/CORAAL/AAVE_audio_quality.csv', index_col=None)
coraal_df = coraal_snippets.merge(
    coraal_audio_quality,
    how='left',
    left_on=['basefile'],
    right_on=['file_name'],
)
# The first three characters of the segment file name are used as the source label.
coraal_df['source'] = coraal_df['segment_filename'].str[:3]

# Every ROC snippet is labelled usable, whatever the join produced for it.
is_roc = coraal_df['source'] == 'ROC'
coraal_df['quality'] = np.where(is_roc, 'usable', coraal_df['quality'])

In [None]:
# Load the speaker metadata of the three CORAAL sites and join it onto the snippets

coraal_sites = ['DCB', 'PRV', 'ROC']
filenames = [base_folder + 'inputs/CORAAL/' + site + '_metadata_2018.10.06.txt'
             for site in coraal_sites]

coraal_metadata = pd.concat([pd.read_csv(path, sep='\t') for path in filenames], sort=False)

# Keep only metadata rows whose speaker code equals the file name minus its last two characters.
speaker_from_file = coraal_metadata['CORAAL.File'].str[:-2]
coraal_sub_meta = coraal_metadata[coraal_metadata['CORAAL.Spkr'] == speaker_from_file]

coraal_df = coraal_df.merge(coraal_sub_meta, how='left', left_on=['basefile'], right_on=['CORAAL.File'])

In [None]:
# Keep the analysis columns, align their names with the VOC frame, and add the racial attribute
coraal_keep_columns = ['basefile', 'start_time', 'end_time', 'content', 'Age', 'Gender',
                       'quality', 'duration', 'segment_filename', 'source']
coraal_df = (
    coraal_df
    .loc[:, coraal_keep_columns]
    .rename(columns={'Age': 'age', 'Gender': 'gender', 'quality': 'recording_quality'})
    .assign(race_ethnicity='Black')  # every CORAAL row is labelled 'Black'
)

# One dataframe per interview location
is_dcb = coraal_df['source'] == 'DCB'
is_prv = coraal_df['source'] == 'PRV'
is_roc_site = coraal_df['source'] == 'ROC'
dcb_df = coraal_df[is_dcb]
prv_df = coraal_df[is_prv]
roc_df = coraal_df[is_roc_site]

# Import ASR transcriptions

In [None]:
# Import Microsoft, Google, IBM, Amazon, and Apple transcriptions

def import_asr_transcripts(asr):
    """Load one ASR vendor's transcriptions for all five interview sites.

    Parameters
    ----------
    asr : str
        Vendor key that appears in the input file names; it also prefixes the
        transcription column of the result (``<asr>_transcription``).

    Returns
    -------
    pandas.DataFrame
        The five per-site files stacked row-wise (DCB, PRV, ROC, Humboldt,
        Sacramento, in that order) with the columns ``source``,
        ``segment_filename`` and ``<asr>_transcription``.

    Notes
    -----
    Reads ``base_folder`` from the notebook namespace and prints the number of
    rows of each file as it is loaded.
    """
    site_files = [('CORAAL', 'dcb'), ('CORAAL', 'prv'), ('CORAAL', 'roc'),
                  ('VOC', 'humboldt'), ('VOC', 'sacramento')]
    transcription_col = asr + '_transcription'

    site_frames = []
    for corpus, site in site_files:
        pathlink = base_folder + 'inputs/' + corpus + '/' + site + '_' + asr + '_transcribe_5.csv'
        site_frame = pd.read_csv(pathlink, names=['index', 'segment_filename', transcription_col])
        print(len(site_frame))
        # The first three characters of the segment file name are used as the source label.
        site_frame['source'] = site_frame['segment_filename'].str[:3]
        site_frames.append(site_frame[['source', 'segment_filename', transcription_col]])

    return pd.concat(site_frames, axis=0)

def _load_asr_with_banner(banner, asr):
    """Print the vendor banner, then load that vendor's transcriptions."""
    print(banner)
    return import_asr_transcripts(asr)

msft_transcripts = _load_asr_with_banner("---Microsoft----", 'msft')
google_transcripts = _load_asr_with_banner("---Google----", 'google')
ibm_transcripts = _load_asr_with_banner("---IBM----", 'ibm')
amazon_transcripts = _load_asr_with_banner("---Amazon----", 'amazon')
apple_transcripts = _load_asr_with_banner("---Apple----", 'ios')  # Apple files use the 'ios' key

In [None]:
# Merge human transcripts with ASR transcripts

merge_keys = ['source', 'segment_filename']
human_transcripts = pd.concat([coraal_df, voc_df], axis=0, sort=False)

# Left-join the other vendors onto the Microsoft frame, one vendor at a time.
asr_transcripts = msft_transcripts
for vendor_transcripts in (ibm_transcripts, amazon_transcripts, google_transcripts, apple_transcripts):
    asr_transcripts = asr_transcripts.merge(vendor_transcripts, on=merge_keys, how='left')

all_transcripts = human_transcripts.merge(asr_transcripts, on=merge_keys, how='left')

# Relabel columns (positional: relies on the column order produced by the merges above)
all_transcripts.columns = [
    'basefile', 'start_time', 'end_time', 'content', 'age', 'gender',
    'recording_quality', 'duration', 'segment_filename', 'source',
    'race_ethnicity', 'msft_transcription', 'ibm_transcription',
    'amazon_transcription', 'google_transcription', 'apple_transcription',
]

# Restrict snippets to relevant subsample

In [None]:
# Restrict snippets to only white and black ethnicity
is_white_or_black = all_transcripts['race_ethnicity'].isin(['White', 'Black'])
white_black_snippets = all_transcripts[is_white_or_black]

# Restrict snippets to usable recording quality
has_usable_audio = white_black_snippets['recording_quality'] == 'usable'
usable_snippets = white_black_snippets[has_usable_audio]

# Restrict snippets to speakers aged 18 or older
is_adult = usable_snippets['age'] >= 18
usable_snippets = usable_snippets[is_adult]
print(len(usable_snippets))

In [None]:
# Potential variant: no proper noun locations

location_words = ['princeville', 'tarboro', 'edgecombe', 'landover',
                  'humboldt', 'humbolt', 'wiyot', 'hydesville', 'loleta', 'redding', 'hoopa', 'weitchpec', 
                  'garberville', 'mendocino', 'mckinleyville', 'arcata', 'arcada', 'tolowa', 'orangevale', 
                  'yurok', 'eureka', 'klamath', 'obispo']
# A single leading (?i) makes the whole alternation case-insensitive. Repeating the
# flag in front of every alternative is rejected by Python >= 3.11
# ("global flags not at the start of the expression").
pattern = '(?i)' + '|'.join(location_words)
# na=True counts snippets with missing content as "mentions a location", so they are
# dropped -- the same outcome as the earlier `... == False` comparison on NaN.
mentions_location = usable_snippets['content'].str.contains(pattern, na=True)
no_location_snippets = usable_snippets[~mentions_location]
# using 'no_location_snippets' rather than 'usable_snippets' yields similar results, despite the fact that 
# the majority of difficult-to-spell locations are uttered by white speakers

# Clean human transcriptions for CORAAL and VOC

In [None]:
# Remove any text within markers, e.g. 'We(BR) went' -> 'We went'

def remove_markers(line, markers):
    """Strip every marker-delimited annotation from ``line``.

    Parameters
    ----------
    line : str
        Transcript text.
    markers : iterable of str
        Two-character strings such as ``'()'`` or ``'[]'``; the first character
        opens an annotation and the second closes it.

    Returns
    -------
    str
        ``line`` without the annotations. One space directly in front of the
        opening character is removed as well. An annotation needs at least one
        character between its delimiters to be removed.
    """
    cleaned = line
    for pair in markers:
        opener, closer = pair
        annotation = " ?\\" + opener + "[^" + closer + "]+\\" + closer
        cleaned = re.sub(annotation, "", cleaned)
    return cleaned

In [None]:
# Clean CORAAL human transcript

def clean_coraal(baseline_snippets):
    """Return the CORAAL rows of ``baseline_snippets`` with a ``clean_content`` column.

    CORAAL rows are selected with ``race_ethnicity == 'Black'``. For each row the
    ``content`` text has its square brackets turned into curly brackets, a few
    words re-spelled, the unintelligible/inaudible/redaction flags removed, and
    every ``<...>``, ``(...)`` and ``{...}`` annotation stripped via
    ``remove_markers``.

    Parameters
    ----------
    baseline_snippets : pandas.DataFrame
        Must contain the columns ``race_ethnicity`` and ``content``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows with ``clean_content`` added;
        the input frame is not modified.
    """
    # Explicit copy: the new column must not be written into a slice of the caller's frame.
    baseline_coraal = baseline_snippets[baseline_snippets['race_ethnicity']=='Black'].copy()

    # Square brackets become curly ones so that remove_markers can handle them as '{}' pairs.
    # regex=False makes this a literal replacement regardless of the pandas default for
    # `regex`, and avoids inserting a backslash in front of the curly bracket.
    bracketed = (
        baseline_coraal['content']
        .str.replace('[', '{', regex=False)
        .str.replace(']', '}', regex=False)
    )

    # Relabel CORAAL words. For consideration: aks -> ask?
    respellings = {'busses': 'buses', 'aks': 'ask', 'aksing': 'asking', 'aksed': 'asked'}

    def clean_within_coraal(text):
        text = ' '.join(respellings.get(word, word) for word in text.split())

        # remove CORAAL unintelligible flags
        # (case-insensitivity is passed as a flag: an inline (?i) in the middle of a
        # pattern is an error on Python >= 3.11)
        text = re.sub(r'/unintelligible/', '', text, flags=re.IGNORECASE)
        text = re.sub(r'/inaudible/', '', text, flags=re.IGNORECASE)
        text = re.sub(r'/RD(.*?)/', '', text)
        # one or more question marks between slashes: /?/, /??/, /???/ ...
        text = re.sub(r'/\?+/', '', text)

        # remove nonlinguistic markers
        return remove_markers(text, ['<>', '()', '{}'])

    baseline_coraal['clean_content'] = bracketed.apply(clean_within_coraal)

    return baseline_coraal

In [None]:
# Clean VOC human transcript

def clean_voc(baseline_snippets):
    """Return the VOC rows of ``baseline_snippets`` with a ``clean_content`` column.

    VOC rows are selected with ``race_ethnicity == 'White'``. For each row the
    ``content`` text has a fixed list of misspellings corrected (matched
    case-insensitively on whole whitespace-separated words) and every ``<...>``
    and ``{...}`` annotation stripped via ``remove_markers``.

    Parameters
    ----------
    baseline_snippets : pandas.DataFrame
        Must contain the columns ``race_ethnicity`` and ``content``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows with ``clean_content`` added;
        the input frame is not modified.
    """
    # Restrict to VOC rows; explicit copy so the new column is not written into a
    # slice of the caller's frame.
    baseline_voc = baseline_snippets[baseline_snippets['race_ethnicity']=='White'].copy()

    # misspelling (lower-case) -> corrected spelling
    respellings = {
        'thier': 'their',
        'humbolt': 'Humboldt',
        'arcada': 'Arcata',
        'ninteen': 'nineteen',
        'marajuana': 'marijuana',
        'theatre': 'theater',
        'portugeuse': 'portuguese',
        'majorca': 'mallorca',
    }

    def clean_within_voc(text):
        # Relabel misspellings
        text = ' '.join(respellings.get(word.lower(), word) for word in text.split())

        # remove nonlinguistic markers
        return remove_markers(text, ['<>', '{}'])

    baseline_voc['clean_content'] = baseline_voc['content'].apply(clean_within_voc)

    return baseline_voc

# Apply cleaning rules to both ASR and human-generated transcripts

In [None]:
# Standardize state abbreviations

# Full US state name (capitalised as produced by str.capitalize per word) -> postal abbreviation
states = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

def fix_state_abbrevs(text):
    """Replace US state names in ``text`` with their postal abbreviations.

    The text is split on whitespace and every word is normalised with
    ``.lower().capitalize()``. A word that is a state name on its own is
    replaced first; otherwise the word together with the following word is
    tried as a two-word state name. All other words are returned in their
    capitalised form, so the casing of the whole text changes.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The words joined by single spaces.
    """
    titled = [token.lower().capitalize() for token in text.split()]
    total = len(titled)
    pieces = []
    pos = 0
    while pos < total:
        current = titled[pos]
        replacement = current
        consumed = 1
        if current in states:
            replacement = states[current]
        elif pos + 1 < total:
            two_word_name = current + ' ' + titled[pos + 1]
            if two_word_name in states:
                replacement = states[two_word_name]
                consumed = 2  # the second word of the state name is used up as well
        pieces.append(replacement)
        pos += consumed
    return ' '.join(pieces)

In [None]:
# Standardize number parsing and dollars

# Shared inflect engine; fix_numbers uses p.number_to_words to spell numbers out.
p = inflect.engine()
# NOTE(review): t2d is not referenced in the visible part of this notebook -- confirm it is still needed.
t2d = text2digits.Text2Digits()

def fix_numbers(text):
    """Rewrite numeric tokens in ``text`` as words and normalise some spoken-number phrases.

    The text is split on whitespace and each token is handled by the first matching rule:

    * all digits: values in [1100, 2000), [2010, 2100) or exactly 5050 are rendered as two
      halves (first two digits, then the remaining digits); any other value is passed to
      ``p.number_to_words`` and every 'and' word is dropped from the result;
    * digits followed by word characters (ordinals, decades, units): looked up in a small
      ordinal table, or split into number + suffix and converted recursively;
    * a leading ``$`` or ``£``: the remainder is converted recursively and
      'dollar(s)' / 'pound(s)' is appended;
    * anything else is kept unchanged.

    After re-joining, every character that is not whitespace, a word character or ``$``
    (and every underscore) becomes a space, the text is lower-cased, written-out years
    ('two thousand [and] <10-99>') become 'twenty <10-99>', 'thousand and <digit>' loses
    its 'and', '<digit> hundred <digit>' becomes '<digit> oh <digit>', and runs of
    whitespace collapse to a single space.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The rewritten text; always lower-case.
    """
    split_words_num = text.split()
    new_list = []
    for i in range(len(split_words_num)):
        x = split_words_num[i]
        
        # deal with years
        if x.isdigit():
            # 5050 is included so that it is spoken as two halves as well
            if (1100 <= int(x) < 2000) or (2010 <= int(x) < 2100) or (int(x) == 5050):
                # deal with years as colloquially spoken
                # NOTE(review): the wording for halves such as '00' or '05' depends on
                # inflect's output for those strings -- confirm it is what is wanted
                new_word = p.number_to_words(x[:2]) + " " + p.number_to_words(x[2:])
            elif "and" in p.number_to_words(x):
                # remove 'and' from e.g. 'four hundred and ninety five'
                output = p.number_to_words(x)
                resultwords  = [word for word in output.split() if word not in ['and']]    
                new_word = ' '.join(resultwords)
            else:
                new_word = p.number_to_words(x)
            
        # deal with cases like 1st, 2nd, etc.
        elif re.match(r"(\d+)(\w+)", x, re.I):
            # ordinals whose spelling is irregular are looked up directly
            single_digits = ['1st', '2nd', '3rd', '5th', '8th', '9th']
            double_digits = ['12th']
            single_num = ['1', '2', '3', '5', '8', '9']
            double_num = ['12']
            single_digit_labels = ['first', 'second', 'third', 'fifth', 'eighth', 'ninth']
            double_digit_labels = ['twelfth']
            all_digits = single_digits + double_digits
            all_labels = single_digit_labels + double_digit_labels
            if x in all_digits:
                new_word = all_labels[all_digits.index(x)]
            else:
                # items[0] = leading digits, items[1] = trailing word characters
                items = re.match(r"(\d+)(\w+)", x, re.I).groups()
                if (items[1] not in ['s', 'th', 'st', 'nd', 'rd']):
                    # not an ordinal/plural suffix: spell the number, keep the suffix as its own word
                    new_word = fix_numbers(items[0]) + " " + items[1]
                elif (items[0][-2:] in double_num):
                    # number ending in 12: convert the hundreds part and the '12<suffix>' part separately
                    # NOTE(review): when items[0] is exactly '12' (e.g. '12s'), items[0][:-2] is ''
                    # and int('') raises ValueError, which is not caught on this branch
                    new_word = fix_numbers(str(100*int(items[0][:-2]))) + " " + fix_numbers(items[0][-2:]+items[1])
                elif ((items[0][-1:] in single_num) and items[0][-2:-1] != '1'):
                    # last digit has an irregular ordinal and the number is not in the teens:
                    # convert the tens part and the final '<digit><suffix>' part separately
                    try:
                        new_word = fix_numbers(str(10*int(items[0][:-1]))) + " " + fix_numbers(items[0][-1:]+items[1])
                    except:
                        # reached e.g. for a single-digit number, where int('') raises ValueError;
                        # NOTE(review): the bare except also hides any other error raised above
                        new_word = fix_numbers(items[0]) + items[1]
                # deal with case e.g. 80s
                elif (items[1] in ['s', 'th']) and (p.number_to_words(items[0])[-1] == 'y'):
                    # spelled number ends in 'y': swap the 'y' for 'ie' before the suffix
                    new_word = fix_numbers(items[0])[:-1] + "ie" + items[1]
                else:
                    new_word = fix_numbers(items[0]) + items[1]
                    
        # deal with dollars
        elif re.match(r"\$[^\]]+", x, re.I):
            # deal with $ to 'dollars'
            money = fix_numbers(x[1:])
            if x[1:] in ["1", "a"]:
                new_word = money + " dollar"
            else:
                new_word = money + " dollars"
                
        elif re.match(r"\£[^\]]+", x, re.I):
            # deal with £ to 'pounds'
            money = fix_numbers(x[1:])
            if x[1:] in ["1", "a"]:
                new_word = money + " pound"
            else:
                new_word = money + " pounds"
                
        else:
            new_word = x       
        
        new_list.append(new_word)
        
    text = ' '.join(new_list)
    # replace everything except whitespace, word characters and '$' (and every underscore) with a space
    text =re.sub(r'[^\s\w$]|_', ' ',text)
    
    # Deal with written out years (two thousand and ten -> twenty ten)
    # (text.lower() is what lower-cases the whole result of this function)
    for double_dig in range(10, 100):
        double_dig_str = p.number_to_words(double_dig)
        text = re.sub('two thousand and ' + double_dig_str, 'twenty ' + double_dig_str, text.lower())
        text = re.sub('two thousand ' + double_dig_str, 'twenty ' + double_dig_str, text.lower())

    # Change e.g. 101 to 'one oh one' -- good for area codes
    single_dig_list = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    for j in single_dig_list:
        text = re.sub('thousand and ' + j, 'thousand ' + j, text.lower())
        for k in single_dig_list:
            #print(j + ' hundred ' + k)
            # only when the second digit word is followed by a space or ends the text
            text = re.sub(j + ' hundred ' + k + ' ', j + ' oh ' + k + ' ', text.lower())
            text = re.sub(j + ' hundred ' + k + '$', j + ' oh ' + k, text.lower())
    
    text = re.sub("\s+"," ",''.join(text)) # standardize whitespace
    
    return text

In [None]:
# Function for text cleaning on all ASR transcriptions, as well as human transcriptions of VOC and CORAAL

def clean_all_transcripts(baseline_snippets):
    """Apply the shared text normalisation to the human and the five ASR transcripts.

    Parameters
    ----------
    baseline_snippets : pandas.DataFrame
        Must contain ``clean_content`` plus the raw ASR columns
        ``google_transcription``, ``ibm_transcription``, ``amazon_transcription``,
        ``msft_transcription`` and ``apple_transcription``.

    Returns
    -------
    pandas.DataFrame
        A copy of the input in which missing ASR transcriptions are empty strings,
        ``clean_content`` is normalised in place, and the normalised ASR text is
        stored in ``clean_google``, ``clean_ibm``, ``clean_amazon``, ``clean_msft``
        and ``clean_apple``.
    """
    new_baseline = baseline_snippets.copy()

    # cleaned column -> raw ASR column it is computed from
    asr_columns = {
        'clean_google': 'google_transcription',
        'clean_ibm': 'ibm_transcription',
        'clean_amazon': 'amazon_transcription',
        'clean_msft': 'msft_transcription',
        'clean_apple': 'apple_transcription',
    }
    # Missing ASR output becomes an empty string so the text pipeline can run on it.
    for raw_col in asr_columns.values():
        new_baseline[raw_col] = new_baseline[raw_col].replace(np.nan, '', regex=True)

    swear_words = ['nigga', 'niggas', 'shit', 'bitch', 'damn', 'fuck', 'fuckin', 'fucking', 'motherfuckin', 'motherfucking']
    filler_words = ['um', 'uh', 'mm', 'hm', 'ooh', 'woo', 'mhm', 'huh', 'ha']
    remove_words = set(swear_words + filler_words)

    # exact (case-sensitive) token -> cardinal direction
    cardinal_map = {
        'N': 'North', 'E': 'East', 'S': 'South', 'W': 'West',
        'NE': 'Northeast', 'NW': 'Northwest', 'SE': 'Southeast', 'SW': 'Southwest',
    }

    # lower-cased token -> standard spelling
    spelling_map = {
        'cuz': 'cause', 'ok': 'okay', 'o': 'oh', 'till': 'til', 'yup': 'yep',
        'imma': 'ima', 'mister': 'mr', 'doctor': 'dr',
        'gonna': 'going to', 'tryna': 'trying to',
        'carryout': 'carry out', 'sawmill': 'saw mill',
        'highschool': 'high school', 'worldclass': 'world class',
        'saint': 'st', 'street': 'st', 'state': 'st',
        'avenue': 'ave', 'road': 'rd', 'boulevard': 'blvd',
        'theatre': 'theater', 'neighbour': 'neighbor', 'neighbours': 'neighbors',
        'neighbourhood': 'neighborhood', 'programme': 'program',
    }

    # two-word spelling -> one-word spelling
    spacing_map = {
        'north east': 'northeast', 'north west': 'northwest',
        'south east': 'southeast', 'south west': 'southwest',
        'all right': 'alright',
    }

    bracket_chars = '<>(){}[]'

    def clean_within_all(text):

        # remove hesitation from IBM transcript
        text = re.sub('%HESITATION', ' ', text)

        # fix spacing in certain spellings
        text = re.sub('T V', 'TV', text)
        text = re.sub('D C', 'DC', text)

        # remove remaining floating non-linguistic words (any word containing a bracket character)
        text = ' '.join(word for word in text.split()
                        if not any(ch in word for ch in bracket_chars))

        # general string cleaning
        # count/flags are passed by keyword: passing them positionally is deprecated
        text = re.sub(r"([a-z])\-([a-z])", r"\1 \2", text, count=0, flags=re.IGNORECASE) # replace inter-word hyphen with space
        text = re.sub("'", '', text) # remove apostrophe
        text = re.sub(r'[^\s\w$]|_', ' ', text) # replace special characters with space, except $
        text = re.sub(r"\s+", " ", text) # standardize whitespace

        # update numeric numbers to strings and remove $
        text = re.sub("ft ²", "square feet", text)
        text = fix_numbers(text)
        text = re.sub(r"\$", 'dollars', text)
        text = re.sub(r"\£", 'pounds', text)

        # standardize spellings
        text = ' '.join(spelling_map.get(word.lower(), word) for word in text.split())

        # deal with cardinal directions
        # NOTE(review): fix_numbers lower-cases its result (it feeds text.lower() into
        # re.sub), so these upper-case keys cannot match at this point -- confirm
        # whether this step is meant to run before fix_numbers.
        text = ' '.join(cardinal_map.get(word, word) for word in text.split())

        # deal with state abbreviations
        text = fix_state_abbrevs(text)
        text = text.lower()

        # update spacing in certain spellings
        # \b keeps the replacement on whole words: without it 'small right' would
        # turn into 'smalright'.
        for spaced, joined in spacing_map.items():
            text = re.sub(r'\b' + spaced + r'\b', joined, text)

        # remove filler words and swear words
        return ' '.join(word for word in text.split() if word not in remove_words)

    # clean_content is normalised in place; each ASR column gets its clean_* counterpart.
    column_plan = [('clean_content', 'clean_content')] + list(asr_columns.items())
    for clean_col, source_col in column_plan:
        new_baseline[clean_col] = new_baseline.apply(
            lambda row, src=source_col: clean_within_all(row[src]), axis=1)

    return new_baseline

In [None]:
# Apply cleaning rules to CORAAL and VOC

def clean_everything(df):
    """Run the corpus-specific cleaning, then the shared cleaning, on ``df``.

    The CORAAL rows (``clean_coraal``) and the VOC rows (``clean_voc``) are
    cleaned separately, stacked in that order, and passed through
    ``clean_all_transcripts``.

    Parameters
    ----------
    df : pandas.DataFrame
        Snippet table holding both corpora.

    Returns
    -------
    pandas.DataFrame
        The result of ``clean_all_transcripts`` on the stacked frames.
    """
    corpus_frames = [clean_coraal(df), clean_voc(df)]
    stacked = pd.concat(corpus_frames, axis=0)
    return clean_all_transcripts(stacked)

clean_usable_snippets = clean_everything(usable_snippets)

In [None]:
# Post-cleaning restriction to snippets that have at least 5 words

clean_usable_snippets['wordcount'] = clean_usable_snippets['clean_content'].str.split().str.len()
has_enough_words = clean_usable_snippets['wordcount'] >= 5
clean_usable_snippets = clean_usable_snippets[has_enough_words]
print(len(clean_usable_snippets))

# Final restriction to rows without word list uttered as part of cleaned speech

word_list_snippets = ['SAC_Sindle_Rhea_3652225_3692307.wav', 'SAC_Wyley_Hannah_3361328_3405912.wav']
is_word_list = clean_usable_snippets['segment_filename'].isin(word_list_snippets)
clean_usable_snippets = clean_usable_snippets[~is_word_list]

In [None]:
# Check number of unintelligible snippets in each of CORAAL and VOC datasets

black_counts = clean_usable_snippets[clean_usable_snippets['race_ethnicity']=='Black']
white_counts = clean_usable_snippets[clean_usable_snippets['race_ethnicity']=='White']

# Raw strings are required here: in a normal string literal '\1' is the control
# character chr(1), not a regex back-reference, so runs such as '/???/' or
# '((   ))' would not be matched. The patterns use no capture groups because
# str.contains warns when a pattern has them.
black_unintel_regex = r'/unintelligible/|/inaudible/|/RD.*?/|/\?+/'
black_unintel = black_counts[black_counts['content'].str.contains(black_unintel_regex)]

# '(())' or double parentheses that enclose only whitespace
white_unintel_regex = r'\(\(\s*\)\)'
white_unintel = white_counts[white_counts['content'].str.contains(white_unintel_regex)]

# Percentages fall back to NaN when a group has no snippets (avoids ZeroDivisionError).
black_unintel_pct = 100*len(black_unintel)/len(black_counts) if len(black_counts) else float('nan')
white_unintel_pct = 100*len(white_unintel)/len(white_counts) if len(white_counts) else float('nan')

print("Black unintelligible snippets: ", len(black_unintel), " out of ", len(black_counts), "which is ", 
      black_unintel_pct, "%")
print("White unintelligible snippets: ", len(white_unintel), " out of ", len(white_counts), "which is ", 
      white_unintel_pct, "%")

# WER Calculation

In [None]:
# Calculate WER

def wer_calc(transcripts, human_clean_col, asr_clean_col):
    """Add one ``<column>_wer`` column per ASR column to a copy of ``transcripts``.

    Parameters
    ----------
    transcripts : pandas.DataFrame
        Table holding the cleaned human and ASR transcripts.
    human_clean_col : str
        Name of the column passed as the first argument to ``wer``.
    asr_clean_col : iterable of str
        Names of the columns passed as the second argument to ``wer``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``transcripts`` in which missing values of the ASR columns are
        replaced by empty strings and every ASR column ``c`` has a companion
        ``c + "_wer"`` holding the row-wise result of ``wer``.
    """
    scored = transcripts.copy()
    references = transcripts[human_clean_col].tolist()
    for asr_col in asr_clean_col:
        scored[asr_col] = scored[asr_col].replace(np.nan, '', regex=True)
        hypotheses = scored[asr_col].tolist()
        scored[asr_col + "_wer"] = [wer(reference, hypothesis)
                                    for reference, hypothesis in zip(references, hypotheses)]

    return scored

In [None]:
# Create ASR list for WER calculations

asr_vendors = ['google', 'ibm', 'amazon', 'msft', 'apple']
clean_asr_trans_list = ['clean_' + vendor for vendor in asr_vendors]

# Run WER calculations on all usable snippets, with cleaning
clean_transcripts_wer = wer_calc(
    transcripts=clean_usable_snippets,
    human_clean_col='clean_content',
    asr_clean_col=clean_asr_trans_list,
)

In [None]:
# Export WER calculations

wer_output_path = base_folder + 'output/transcribed_wer_usable.csv'
clean_transcripts_wer.to_csv(wer_output_path)

In [None]:
# Split WER dataset into AAVE and white speakers

def split_white_black(all_trans):
    """Split ``all_trans`` by the ``race_ethnicity`` column.

    Parameters
    ----------
    all_trans : pandas.DataFrame
        Must contain a ``race_ethnicity`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(black_rows, white_rows)``: the rows labelled ``'Black'`` and the rows
        labelled ``'White'``, each in the original row order.
    """
    race = all_trans['race_ethnicity']
    return tuple(all_trans[race == label] for label in ('Black', 'White'))

clean_black_stack, clean_white_stack = split_white_black(clean_transcripts_wer)
# Report the number of snippets per group: Black first, then White.
for race_stack in (clean_black_stack, clean_white_stack):
    print(len(race_stack))

# Include separate (less heavy) cleaning for LM perplexity calculation

In [None]:
# Clean CORAAL human transcript -- lighter cleaning that keeps case and punctuation

def apostrophe_clean_coraal(baseline_snippets):
    """Add an ``apostrophe_clean_content`` column to ``baseline_snippets``.

    For each row the ``content`` text has its square brackets turned into curly
    brackets, a few words re-spelled, the unintelligible/inaudible/redaction
    flags removed, and every ``<...>``, ``(...)`` and ``{...}`` annotation
    stripped via ``remove_markers``. No rows are filtered.

    Parameters
    ----------
    baseline_snippets : pandas.DataFrame
        Must contain a ``content`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new column added.
    """
    baseline_coraal = baseline_snippets  # same object: the caller's frame gains the column

    # Square brackets become curly ones so that remove_markers can handle them as '{}' pairs.
    # regex=False makes this a literal replacement regardless of the pandas default for
    # `regex`, and avoids inserting a backslash in front of the curly bracket.
    bracketed = (
        baseline_coraal['content']
        .str.replace('[', '{', regex=False)
        .str.replace(']', '}', regex=False)
    )

    # Relabel CORAAL words. For consideration: aks -> ask?
    respellings = {'busses': 'buses', 'aks': 'ask', 'aksing': 'asking', 'aksed': 'asked'}

    def apostrophe_clean_within_coraal(text):
        text = ' '.join(respellings.get(word, word) for word in text.split())

        # remove CORAAL unintelligible flags
        # (case-insensitivity is passed as a flag: an inline (?i) in the middle of a
        # pattern is an error on Python >= 3.11)
        text = re.sub(r'/unintelligible/', '', text, flags=re.IGNORECASE)
        text = re.sub(r'/inaudible/', '', text, flags=re.IGNORECASE)
        text = re.sub(r'/RD(.*?)/', '', text)
        # one or more question marks between slashes: /?/, /??/, /???/ ...
        text = re.sub(r'/\?+/', '', text)

        # remove nonlinguistic markers
        return remove_markers(text, ['<>', '()', '{}'])

    baseline_coraal['apostrophe_clean_content'] = bracketed.apply(apostrophe_clean_within_coraal)

    return baseline_coraal

In [None]:
# Clean VOC human transcript

def apostrophe_clean_voc(baseline_snippets):
    """Add an ``apostrophe_clean_content`` column to ``baseline_snippets``.

    For each row the ``content`` text has a fixed list of misspellings corrected
    (matched case-insensitively on whole whitespace-separated words) and every
    ``<...>`` and ``{...}`` annotation stripped via ``remove_markers``.
    No rows are filtered.

    Parameters
    ----------
    baseline_snippets : pandas.DataFrame
        Must contain a ``content`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new column added.
    """
    baseline_voc = baseline_snippets  # same object: the caller's frame gains the column

    # misspelling (lower-case) -> corrected spelling
    respellings = {
        'thier': 'their',
        'humbolt': 'Humboldt',
        'arcada': 'Arcata',
        'ninteen': 'nineteen',
        'marajuana': 'marijuana',
        'theatre': 'theater',
        'portugeuse': 'portuguese',
        'majorca': 'mallorca',
    }

    def apostrophe_clean_within_voc(text):
        # Relabel misspellings
        corrected = [respellings.get(word.lower(), word) for word in text.split()]

        # remove nonlinguistic markers
        return remove_markers(' '.join(corrected), ['<>', '{}'])

    baseline_voc['apostrophe_clean_content'] = baseline_voc.apply(
        lambda row: apostrophe_clean_within_voc(row['content']), axis=1)

    return baseline_voc

In [None]:
# Account for number parsing and dollars with punctuation and original uppercasing
# NOTE(review): p and t2d are re-created here although an earlier cell already defines them;
# t2d is not referenced in the visible part of this notebook.
p = inflect.engine()
t2d = text2digits.Text2Digits()

def apostrophe_fix_numbers(text):
    """Variant of ``fix_numbers`` that keeps the characters ``' . ? , !`` in the text.

    The text is split on whitespace and each token is handled by the first matching rule:

    * all digits: values in [1100, 2000), [2010, 2100) or exactly 5050 are rendered as two
      halves (first two digits, then the remaining digits); any other value is passed to
      ``p.number_to_words`` and every 'and' word is dropped from the result;
    * digits followed by word characters: looked up in a small ordinal table, or split
      into number + suffix and converted by calling ``fix_numbers``;
    * a leading ``$``: the remainder is converted with ``fix_numbers`` and
      'dollar(s)' is appended;
    * anything else is kept unchanged.

    After re-joining, every character that is not whitespace, a word character or one of
    ``$ ' . ? , !`` (and every underscore) becomes a space; written-out years
    ('two thousand [and] <10-99>') become 'twenty <10-99>'; 'thousand and <digit>' loses
    its 'and'; '<digit> hundred <digit>' becomes '<digit> oh <digit>'; runs of whitespace
    collapse to a single space.

    NOTE(review): the last group of substitutions is applied to ``text.lower()``, so the
    returned text is lower-case even though the cell comment speaks of keeping the original
    uppercasing -- confirm which behaviour is intended.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    split_words_num = text.split()
    new_list = []
    for i in range(len(split_words_num)):
        x = split_words_num[i]
        
        # deal with years
        # (a token with attached punctuation, e.g. '1995,', is not all digits and is
        # therefore handled by one of the later branches)
        if x.isdigit():
            if (1100 <= int(x) < 2000) or (2010 <= int(x) < 2100) or (int(x) == 5050):
                # deal with years as colloquially spoken
                new_word = p.number_to_words(x[:2]) + " " + p.number_to_words(x[2:])
            elif "and" in p.number_to_words(x):
                # remove 'and' from e.g. 'four hundred and ninety five'
                output = p.number_to_words(x)
                resultwords  = [word for word in output.split() if word not in ['and']]    
                new_word = ' '.join(resultwords)
            else:
                new_word = p.number_to_words(x)
            
        # deal with cases like 1st, 2nd, etc.
        elif re.match(r"(\d+)(\w+)", x, re.I):
            # ordinals whose spelling is irregular are looked up directly
            single_digits = ['1st', '2nd', '3rd', '5th', '8th', '9th']
            double_digits = ['12th']
            single_num = ['1', '2', '3', '5', '8', '9']
            double_num = ['12']
            single_digit_labels = ['first', 'second', 'third', 'fifth', 'eighth', 'ninth']
            double_digit_labels = ['twelfth']
            all_digits = single_digits + double_digits
            all_labels = single_digit_labels + double_digit_labels
            if x in all_digits:
                new_word = all_labels[all_digits.index(x)]
            else:
                # items[0] = leading digits, items[1] = trailing word characters;
                # re.match only anchors at the start, so anything after the match
                # (e.g. trailing punctuation) is not part of either group
                items = re.match(r"(\d+)(\w+)", x, re.I).groups()
                # NOTE(review): the recursive calls below go to fix_numbers, not to
                # apostrophe_fix_numbers, so those pieces get fix_numbers' cleaning
                if (items[1] not in ['s', 'th', 'st', 'nd', 'rd']):
                    new_word = fix_numbers(items[0]) + " " + items[1]
                elif (items[0][-2:] in double_num):
                    # NOTE(review): when items[0] is exactly '12' (e.g. '12s'), items[0][:-2] is ''
                    # and int('') raises ValueError, which is not caught on this branch
                    new_word = fix_numbers(str(100*int(items[0][:-2]))) + " " + fix_numbers(items[0][-2:]+items[1])
                elif ((items[0][-1:] in single_num) and items[0][-2:-1] != '1'):
                    try:
                        new_word = fix_numbers(str(10*int(items[0][:-1]))) + " " + fix_numbers(items[0][-1:]+items[1])
                    except:
                        # reached e.g. for a single-digit number, where int('') raises ValueError;
                        # NOTE(review): the bare except also hides any other error raised above
                        new_word = fix_numbers(items[0]) + items[1]
                # deal with case e.g. 80s
                elif (items[1] in ['s', 'th']) and (p.number_to_words(items[0])[-1] == 'y'):
                    # spelled number ends in 'y': swap the 'y' for 'ie' before the suffix
                    new_word = fix_numbers(items[0])[:-1] + "ie" + items[1]
                else:
                    new_word = fix_numbers(items[0]) + items[1]
                    
        # deal with dollars
        elif re.match(r"\$[^\]]+", x, re.I):
            # deal with $ to 'dollars'
            money = fix_numbers(x[1:])
            if x[1:] in ["1", "a"]:
                new_word = money + " dollar"
            else:
                new_word = money + " dollars"
        else:
            new_word = x       
        
        new_list.append(new_word)
        
    text = ' '.join(new_list)
    # keep whitespace, word characters, '$' and the punctuation ' . ? , ! -- everything
    # else (and every underscore) becomes a space
    text =re.sub(r'[^\s\w$\'\.\?\,\!]|_', ' ',text)
    
    # Deal with written out years (two thousand and ten -> twenty ten)
    # (case is preserved in this loop; both the lower-case and the capitalised form are handled)
    for double_dig in range(10, 100):
        double_dig_str = p.number_to_words(double_dig)
        text = re.sub('two thousand and ' + double_dig_str, 'twenty ' + double_dig_str, text)
        text = re.sub('two thousand ' + double_dig_str, 'twenty ' + double_dig_str, text)
        text = re.sub('Two thousand and ' + double_dig_str, 'Twenty ' + double_dig_str, text)
        text = re.sub('Two thousand ' + double_dig_str, 'Twenty ' + double_dig_str, text)

    # Change e.g. 101 to 'one oh one' -- good for area codes
    # NOTE(review): text.lower() below lower-cases the whole text, including the
    # capitalised 'Twenty ...' produced just above
    single_dig_list = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    for j in single_dig_list:
        text = re.sub('thousand and ' + j, 'thousand ' + j, text.lower())
        for k in single_dig_list:
            #print(j + ' hundred ' + k)
            # only when the second digit word is followed by a space or ends the text
            text = re.sub(j + ' hundred ' + k + ' ', j + ' oh ' + k + ' ', text.lower())
            text = re.sub(j + ' hundred ' + k + '$', j + ' oh ' + k, text.lower())
    
    text = re.sub("\s+"," ",''.join(text)) # standardize whitespace
    
    return text

In [None]:
# Account for state abbreviations with punctuation and original uppercasing

def apostrophe_fix_state_abbrevs(text):
    """Replace tokens found in the global ``states`` mapping by their mapped value.

    The text is split on whitespace. Each token is normalised to
    "Capitalised" form (first letter upper, rest lower) and looked up in
    ``states``; if that fails, the token joined with the following token
    (both normalised, separated by one space) is tried so two-word keys can
    match. A two-word match consumes both tokens. Tokens with no match are
    emitted with their original casing and punctuation.

    Parameters
    ----------
    text : str
        Whitespace-separated transcript text.

    Returns
    -------
    str
        The tokens, after substitution, joined by single spaces.
    """
    tokens = text.split()
    total = len(tokens)
    rebuilt = []
    pos = 0
    while pos < total:
        current = tokens[pos]
        key = current.lower().capitalize()
        replacement = current  # default: keep the token exactly as written
        step = 1
        if key in states:
            replacement = states[key]
        elif pos + 1 < total:
            # Try the two-word key formed with the next token.
            pair_key = key + ' ' + tokens[pos + 1].lower().capitalize()
            if pair_key in states:
                replacement = states[pair_key]
                step = 2  # the following token was consumed by the match
        rebuilt.append(replacement)
        pos += step
    return ' '.join(rebuilt)

In [None]:
# Clean all transcripts

def apostrophe_clean_all_transcripts(baseline_snippets):
    """Apply the shared, punctuation-preserving cleaning pass to every transcript.

    The input frame is copied; only the ``apostrophe_clean_content`` column of
    the copy is rewritten. Each transcript goes through, in order: removal of
    ``%HESITATION`` markers and of tokens containing brackets, special-character
    stripping (keeping ``$``, apostrophes and ``. ? , !``), spelling
    standardisation, cardinal-direction expansion, state-abbreviation handling
    (delegated to ``apostrophe_fix_state_abbrevs``), compound-word joining,
    filler/swear word removal, and final capitalisation / end punctuation.

    Parameters
    ----------
    baseline_snippets : pandas.DataFrame
        Must contain an ``apostrophe_clean_content`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``baseline_snippets`` with ``apostrophe_clean_content``
        cleaned. A transcript that is empty after cleaning (for example one
        that only contained filler words) comes back as ``''``.
    """
    new_baseline = baseline_snippets.copy()

    swear_words = ['nigga', 'niggas', 'shit', 'bitch', 'damn', 'fuck', 'fuckin', 'fucking', 'motherfuckin', 'motherfucking']
    filler_words = ['um', 'uh', 'mm', 'hm', 'ooh', 'woo', 'mhm', 'huh', 'ha']
    remove_words = set(swear_words + filler_words)

    pre_cardinal = ['N', 'E', 'S', 'W', 'NE', 'NW', 'SE', 'SW']
    post_cardinal = ['North', 'East', 'South', 'West', 'Northeast', 'Northwest', 'Southeast', 'Southwest']
    cardinal_map = dict(zip(pre_cardinal, post_cardinal))

    pre_list = ['cuz', 'ok', 'o', 'till', 'yup', 'imma', 'mister', 'doctor',
                'gonna', 'tryna',
                'carryout', 'sawmill', 'highschool', 'worldclass',
                'theatre', 'neighbour', 'neighbours', 'neighbourhood', 'programme']
    post_list = ['cause', 'okay', 'oh', 'til', 'yep', 'ima', 'mr', 'dr',
                 'going to', 'trying to',
                 'carry out', 'saw mill', 'high school', 'world class',
                 'theater', 'neighbor', 'neighbors', 'neighborhood', 'program']
    # No replacement value is itself a key, so one lookup per word gives the
    # same result as applying the substitutions one pair at a time.
    spelling_map = dict(zip(pre_list, post_list))

    # Compound spellings to join. A leading \b stops matches that start inside
    # a longer word ("small right" must not become "smalright"). Directions
    # stay open-ended so "north eastern" still becomes "northeastern";
    # "all right" is closed so "all rights" is left alone.
    spacing_fixes = [
        (r'\bnorth east', 'northeast'),
        (r'\bnorth west', 'northwest'),
        (r'\bsouth east', 'southeast'),
        (r'\bsouth west', 'southwest'),
        (r'\ball right\b', 'alright'),
    ]

    bracket_chars = '<>(){}[]'

    def apostrophe_clean_within_all(text):
        """Clean a single transcript string; see the enclosing function."""
        # Accept a plain string (no-op) or an iterable of string fragments.
        text = ''.join(text)

        # remove hesitation from IBM transcript
        text = text.replace('%HESITATION', ' ')

        # fix spacing in certain spellings; anchored on word boundaries so that
        # e.g. "ID Card" is not collapsed into "IDCard"
        text = re.sub(r'\bT V\b', 'TV', text)
        text = re.sub(r'\bD C\b', 'DC', text)

        # remove remaining floating non-linguistic words (any token that
        # contains a bracket character)
        text = ' '.join(
            word for word in text.split()
            if not any(ch in word for ch in bracket_chars)
        )

        # general string cleaning
        text = re.sub(r"([a-z])\-([a-z])", r"\1 \2", text, flags=re.IGNORECASE)  # replace inter-word hyphen with space
        text = re.sub(r'[^\s\w$\'\.\?\,\!]|_', ' ', text)  # replace special characters with space, except $, apostrophe and . ? , !
        text = re.sub(r"\s+", " ", text)  # standardize whitespace

        # spell out square feet and the dollar sign (numeric conversion is not
        # applied in this pass)
        text = text.replace("ft ²", "square feet")
        text = text.replace("$", "dollars")

        # standardize spellings (comparison ignores case and commas)
        text = ' '.join(
            spelling_map.get(word.lower().replace(',', ''), word)
            for word in text.split()
        )

        # deal with cardinal directions (case-sensitive, commas ignored)
        text = ' '.join(
            cardinal_map.get(word.replace(',', ''), word)
            for word in text.split()
        )

        # deal with state abbreviations
        text = apostrophe_fix_state_abbrevs(text)

        # update spacing in certain spellings
        for pattern, joined in spacing_fixes:
            text = re.sub(pattern, joined, text)

        # remove filler words and swear words
        result = ' '.join(
            word for word in text.split()
            if word.lower().replace(',', '') not in remove_words
        )

        # remove extra space before comma
        result = re.sub(r"\s+,", ",", result)

        # Nothing left to capitalise or punctuate; indexing result[0] would
        # raise IndexError, so hand back the empty string.
        if not result:
            return result

        # capitalize first word and make sure the text ends with . ? or !
        result = result[0].capitalize() + result[1:]
        if result[-1] == ',':
            result = result[:-1] + '.'
        elif result[-1] not in '.?!':
            result = result + '.'

        # remove whitespace in front of sentence punctuation
        result = re.sub(r"\s+([.!?])", r"\1", result)

        # the spelling map can only produce lower-case "cause"; normalise any
        # remaining mid-sentence " Cause"
        result = result.replace(" Cause", " cause")

        return result

    new_baseline['apostrophe_clean_content'] = (
        new_baseline['apostrophe_clean_content'].apply(apostrophe_clean_within_all)
    )

    return new_baseline

In [None]:
# Apply apostrophe-ignoring cleaning rules (for human transcriptions) to the same subset of snippets as determined above
# Each corpus goes through its own cleaner first; the two results are then stacked row-wise (axis=0).
# NOTE(review): clean_black_stack / clean_white_stack are presumably the CORAAL and VOC subsets
# built in an earlier cell -- confirm. The concat keeps each frame's original index labels,
# so labels may repeat in apostrophe_usable_input.

apostrophe_coraal = apostrophe_clean_coraal(clean_black_stack)
apostrophe_voc = apostrophe_clean_voc(clean_white_stack)
apostrophe_usable_input = pd.concat([apostrophe_coraal, apostrophe_voc], axis=0)

In [None]:
# Apply apostrophe-ignoring cleaning rules as done previously, on human transcriptions again
# apostrophe_clean_all_transcripts works on a copy and rewrites only the
# 'apostrophe_clean_content' column, so apostrophe_usable_input itself is left untouched.

apostrophe_clean_usable_snippets = apostrophe_clean_all_transcripts(apostrophe_usable_input)

In [None]:
# Export WER to csv for LM perplexity calculation
# Writes the cleaned snippets frame under <base_folder>/output/; the DataFrame index is
# written as the first (unnamed) column because to_csv is called with its default index=True.
# NOTE(review): the 'output' directory must already exist -- to_csv does not create it.

apostrophe_clean_usable_snippets.to_csv(base_folder + 'output/transcribed_wer_usable_punctuation.csv')

# Average WER calculations

In [None]:
# Sanity check only
# Get average WER for each ASR by race for overall (unmatched) set of samples

def calc_avg_wer(df, num, asr_list):
    """Average WER per ASR service, separately for Black and White speakers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``race_ethnicity`` column and one ``<asr>_wer`` column
        for every entry of ``asr_list``.
    num : str or int
        Suffix appended to the two output column names; pass ``''`` for no
        suffix. Non-string values are converted with ``str()``.
    asr_list : iterable of str
        ASR service names; ``<name>_wer`` is the column that gets averaged.

    Returns
    -------
    pandas.DataFrame
        One row per ASR service with columns ``ASR``, ``Avg_Black_WER<num>``
        and ``Avg_White_WER<num>``.
    """
    asr_names = list(asr_list)
    suffix = str(num)

    # The race split does not depend on the ASR service, so it is done once
    # rather than on every loop iteration.
    black_df = df[df['race_ethnicity'] == 'Black']
    white_df = df[df['race_ethnicity'] == 'White']

    wer_cols = [asr + "_wer" for asr in asr_names]

    return pd.DataFrame({
        'ASR': asr_names,
        'Avg_Black_WER' + suffix: [black_df[col].mean() for col in wer_cols],
        'Avg_White_WER' + suffix: [white_df[col].mean() for col in wer_cols],
    })

# Compare WER across different ASR services
# Empty suffix ('') leaves the output columns named Avg_Black_WER / Avg_White_WER.
# NOTE(review): clean_transcripts_wer and clean_asr_trans_list are defined in earlier cells;
# the frame is expected to hold one '<asr>_wer' column per entry of the list -- confirm.
clean_usable_avg_wer = calc_avg_wer(clean_transcripts_wer, '', clean_asr_trans_list)
clean_usable_avg_wer