# Calculate share of CORAAL/VOC text in ASR lexicon

In [None]:
import os
import pandas as pd
import numpy as np
import collections

In [None]:
# Step up one directory and use the resulting location as the project root.
# NOTE(review): this cell is not idempotent -- every re-run moves one level
# further up, so run it exactly once per kernel session.
os.chdir('..')
base_folder = f"{os.getcwd()}/"  # e.g. '~/fair-speech/release/'

In [None]:
# Load the previously generated per-transcript WER table.
# Missing cells are replaced with empty strings so that the text columns can
# be tokenised with str.split() further down.
wer_csv_path = base_folder + 'output/transcribed_wer_usable_matched_punctuation.csv'
clean_transcripts_wer = pd.read_csv(wer_csv_path).replace(np.nan, '', regex=True)

In [None]:
# Separate into AAVE and White samples

def split_white_black(all_trans):
    """Split a transcript table into two row subsets using ``black_flag``.

    Args:
        all_trans: DataFrame containing a ``black_flag`` column.

    Returns:
        A ``(black_stack, white_stack)`` tuple: the rows where
        ``black_flag == 1`` and the rows where ``black_flag == 0``.
        Rows holding any other flag value appear in neither subset.

    NOTE: selecting on ``race_ethnicity`` ('Black' / 'White') was sketched as
    an alternative criterion but is not used.
    """
    flag = all_trans['black_flag']
    return all_trans[flag == 1], all_trans[flag == 0]

# Partition the WER table into black_flag == 1 rows (used below as the CORAAL
# sample) and black_flag == 0 rows (used below as the VOC sample).
clean_black_stack, clean_white_stack = split_white_black(clean_transcripts_wer)

In [None]:
# Get total count of cleaned words (including repeated words) in CORAAL and VOC

def find_word_counts(transcript, column):
    """Count how often each whitespace-separated word occurs in a text column.

    Unlike the earlier implementation this does not add a ``words`` column to
    ``transcript`` (the caller's frame is left untouched, which also avoids
    pandas' SettingWithCopy warning on filtered slices), it accumulates the
    counts in linear time instead of concatenating per-row lists, and it
    returns a well-formed empty table when there are no words at all.

    Args:
        transcript: DataFrame holding the text to tokenise.
        column: Name of the column in ``transcript`` whose string values are
            split on whitespace.

    Returns:
        DataFrame with one row per distinct word, in order of first
        appearance, and two columns: ``'word'`` and ``column + '_count'``.

    Side effects:
        Prints ``column`` and the number of distinct words found.
    """
    tally = collections.Counter()
    for text in transcript[column]:
        tally.update(text.split())

    # Build both columns explicitly so an empty vocabulary still yields the
    # expected two-column frame (with an integer count column).
    df = pd.DataFrame({
        'word': pd.Series(list(tally.keys()), dtype='object'),
        column + '_count': pd.Series(list(tally.values()), dtype='int64'),
    })
    print(column, len(df))
    return df

# Word-frequency tables of the cleaned reference transcripts, one for the
# CORAAL (black_flag == 1) rows and one for the VOC (black_flag == 0) rows.
count_coraal_words = find_word_counts(clean_black_stack, 'clean_content')
count_voc_words = find_word_counts(clean_white_stack, 'clean_content')

# Word-frequency table of each ASR service's cleaned output over all
# transcripts; the 'word' column of each table lists that ASR's distinct words.
asr_transcript_columns = ['clean_google', 'clean_amazon', 'clean_msft', 'clean_ibm', 'clean_apple']
(unique_google_words,
 unique_amazon_words,
 unique_msft_words,
 unique_ibm_words,
 unique_apple_words) = [find_word_counts(clean_transcripts_wer, asr_column)
                        for asr_column in asr_transcript_columns]

In [None]:
# Left-join every ASR word table onto the CORAAL and VOC word tables.
# After the joins, a NaN in a 'clean_<asr>_count' column means that ASR never
# produced that word.
asr_lexicons = [
    unique_google_words,
    unique_amazon_words,
    unique_msft_words,
    unique_ibm_words,
    unique_apple_words,
]

coraal_merge_words, voc_merge_words = count_coraal_words, count_voc_words
for lexicon in asr_lexicons:
    coraal_merge_words = coraal_merge_words.merge(lexicon, how='left', on='word')
    voc_merge_words = voc_merge_words.merge(lexicon, how='left', on='word')

In [None]:
# Total number of word occurrences (repeats included) in each reference corpus.
total_coraal_words = coraal_merge_words['clean_content_count'].sum()
total_voc_words = voc_merge_words['clean_content_count'].sum()

asr_list = ['apple', 'ibm', 'google', 'amazon', 'msft']
asr_list_cap = ['Apple', 'IBM', 'Google', 'Amazon', 'Microsoft']  # display names, same order as asr_list


def _share_in_asr_output(merged_words, asr, total_words):
    # Fraction of the corpus' word occurrences whose word appears at least
    # once in the given ASR's output (i.e. has a non-missing ASR count).
    seen_by_asr = merged_words['clean_' + asr + '_count'].notna()
    return merged_words['clean_content_count'].where(seen_by_asr).sum() / total_words


coraal_list = [_share_in_asr_output(coraal_merge_words, asr, total_coraal_words) for asr in asr_list]
voc_list = [_share_in_asr_output(voc_merge_words, asr, total_voc_words) for asr in asr_list]

# Format the shares as percentages with two decimals.
coraal_list_round = [str(round(100 * share, 2)) + '%' for share in coraal_list]
voc_list_round = [str(round(100 * share, 2)) + '%' for share in voc_list]

# Last expression of the cell: the summary table shown as the cell output.
pd.DataFrame({
    'ASR': asr_list_cap,
    'CORAAL % Words in ASR Corpus': coraal_list_round,
    'VOC % Words in ASR Corpus': voc_list_round,
})

In [None]:
# Sanity check: compare each corpus' total word count with the part of it
# that is made up of words Google's ASR output contains.
google_has_coraal_word = coraal_merge_words['clean_google_count'].notna()
google_has_voc_word = voc_merge_words['clean_google_count'].notna()

# Total number of words uttered by black speakers in our sample (CORAAL)
print(total_coraal_words)
# CORAAL word occurrences whose word appears in Google's ASR output
print(coraal_merge_words['clean_content_count'].where(google_has_coraal_word).sum())

# Total number of words uttered by white speakers in our sample (VOC)
print(total_voc_words)
# VOC word occurrences whose word appears in Google's ASR output
print(voc_merge_words['clean_content_count'].where(google_has_voc_word).sum())

In [None]:
# Compare the CORAAL and VOC vocabularies: words unique to each corpus and
# words shared by both.
black_words = count_coraal_words['word']
white_words = count_voc_words['word']

black_also_in_white = black_words.isin(white_words)
white_also_in_black = white_words.isin(black_words)

black_not_in_white = black_words[~black_also_in_white]
white_not_in_black = white_words[~white_also_in_black]
white_black_intersection = white_words[white_also_in_black]

# Sizes of the three partitions, followed by the two vocabulary sizes.
print(len(black_not_in_white), len(white_not_in_black), len(white_black_intersection))
print(len(black_words), len(white_words))

In [None]:
# Attach occurrence counts to the CORAAL-only, VOC-only and shared word lists,
# and order each table from most to least frequent.
coraal_counts = count_coraal_words.rename(columns={'clean_content_count': 'coraal_count'})
voc_counts = count_voc_words.rename(columns={'clean_content_count': 'voc_count'})

black_not_in_white_counts = (
    black_not_in_white.to_frame()
    .merge(coraal_counts, on='word')
    .sort_values('coraal_count', ascending=False)
)

white_not_in_black_counts = (
    white_not_in_black.to_frame()
    .merge(voc_counts, on='word')
    .sort_values('voc_count', ascending=False)
)

# Shared words carry both corpus counts plus their sum, which is the sort key.
white_black_intersection_counts = (
    white_black_intersection.to_frame()
    .merge(coraal_counts, on='word')
    .merge(voc_counts, on='word')
    .assign(total_count=lambda shared: shared['coraal_count'] + shared['voc_count'])
    .sort_values('total_count', ascending=False)
)
# Bare expression kept from the original; it is not the last statement of the
# cell, so a default notebook configuration will not display it.
white_black_intersection_counts

print(len(black_not_in_white_counts), len(white_not_in_black_counts), len(white_black_intersection_counts))

In [None]:
# Write the three vocabulary tables as CSV files into the current working
# directory, without the row index.
# `index=False` is the documented way to omit the index; the previous
# `index=None` only worked because None happens to be falsy.
for csv_name, counts_table in [
    ('black_not_in_white_counts.csv', black_not_in_white_counts),
    ('white_not_in_black_counts.csv', white_not_in_black_counts),
    ('white_black_intersection_counts.csv', white_black_intersection_counts),
]:
    counts_table.to_csv(csv_name, index=False)