In [1]:
import pandas as pd
import numpy as np
import itertools
from collections import defaultdict
from nltk.tokenize import word_tokenize, sent_tokenize

from typing import Dict, List

# references: 
# https://github.com/stanford-crfm/helm/tree/main 
# https://arxiv.org/abs/2211.09110
# https://datacentricai.org/blog/the-hugging-face-data-measurements-tool/
# https://huggingface.co/spaces/huggingface/data-measurements-tool

# Word lists 

Store word lists in dictionaries

In [2]:
female_terms = [
    "she",
    "daughter",
    "hers",
    "her",
    "mother",
    "woman",
    "girl",
    "herself",
    "female",
    "sister",
    "daughters",
    "mothers",
    "women",
    "girls",
    "femen",
    "sisters",
    "aunt",
    "aunts",
    "niece",
    "nieces",
    "wife",
    "wives"
]

male_terms = [
    "he",
    "son",
    "his",
    "him",
    "father",
    "man",
    "boy",
    "himself",
    "male",
    "brother",
    "sons",
    "fathers",
    "men",
    "boys",
    "males",
    "brothers",
    "uncle",
    "uncles",
    "nephew",
    "nephews",
    "husband",
    "husbands"
]

adjectives = [
    "reactive",
    "caring",
    "gentle",
    "working"
]

professions = [
    "doctor",
    "nurse",
    "physician",
    "engineer", 
    "professor"
]

gender_dictionary = {
    "female": female_terms,
    "male": male_terms,
}

target_dictionary = {
    "adjectives": adjectives, 
    "professions": professions
    }

In [3]:
example_2 = [
"Once upon a time, there was a nurse named Emily.","She was known for her caring and gentle nature.", "Emily she is a nurse in a hospital", 
"her husband is working as an engineer in a manager position",
"In his free time,he plays football with his friends."
"One day, she met a doctor named David.", "He was a highly skilled physician.", 
"Emily and David worked together in the hospital and treated many patients.","This woman is a better nurse than doctor","She is a nurse.", "he was reactive"]

# Compute Demographic Statistics

Tries to answer:
- How many times do female/male terms appear in a given text?

In [4]:
tokens = []
for sentence in example_2:
    for word in (word_tokenize(sentence.lower())):
        tokens.append(word)

demographic_groups = list(gender_dictionary.keys()) 
demographic_groups # these are the demographic groups that we are interested in 

['female', 'male']

In [5]:
demographic_stats = defaultdict(int) # initialising with defaultdict() so that key error will not be raised

for group in demographic_groups:
    counts = []
    for token in tokens:
        if token in gender_dictionary[group]:
            counts.append(token)
    demographic_stats[group] = len(counts)

demographic_stats

defaultdict(int, {'female': 7, 'male': 6})

Convert to Dataframe to display

In [172]:
dataframe = pd.DataFrame.from_dict([demographic_stats]).T.rename(columns={0: 'values'})
dataframe.index.names=['demographic_group']
# dataframe.to_csv('demographic_stats.csv')
dataframe

Unnamed: 0_level_0,values
demographic_group,Unnamed: 1_level_1
female,7
male,6


# Compute Co-occurrence Matrix

Tries to answer:
- How many times do male / female terms appear with a given target word. 
- Target words can be professions, adjectives, etc

Limitations:
- Cases when a male **and** a female term appear with a target word 

eg: This woman is a better nurse than doctor --> both `(female, doctor)` and `(female, nurse)` are updated by a count of `1`

In [78]:
cooccurence_dict = defaultdict(int) # initialising with defaultdict() so that key error will not be raised

# target_words = target_dictionary["adjectives"] # initialise target terms
target_words = target_dictionary["professions"] # initialise target terms

demographic_groups = list(gender_dictionary.keys())  # initialise demographic groups

for sentence in example_2:
    tokens = word_tokenize(sentence.lower()) # tokenise the words in a given sentence in a list. eg: ["she", "was", "known", ..]
    for (target, demographic_group) in itertools.product(target_words, demographic_groups): # compute crossproducts between target and demographic terms
        demographic_group_word_count = [] 
        target_count = []
        demographic_group_words = gender_dictionary[demographic_group] # list of all terms in given demographic group, eg: for 'female' group, list would be ["she", "daughter", ..]
        for word in demographic_group_words:
            demographic_group_word_count.append(tokens.count(word)) # take count of each time any of the demographic terms appears in text
        target_count.append(tokens.count(target)) # take count of each time any of the target terms appears for the same text
        count = sum(demographic_group_word_count) * sum(target_count)
        cooccurence_dict[(target, demographic_group)] += count

# to print only the non-zero co-occuring pairs
# for index, value in enumerate((cooccurence_dict).values()):
#     if value != 0:
#         print(list((cooccurence_dict).keys())[index], cooccurence_dict[list((cooccurence_dict).keys())[index]])

cooccurence_dict

defaultdict(int,
            {('doctor', 'female'): 2,
             ('doctor', 'male'): 3,
             ('nurse', 'female'): 3,
             ('nurse', 'male'): 0,
             ('physician', 'female'): 0,
             ('physician', 'male'): 1,
             ('engineer', 'female'): 1,
             ('engineer', 'male'): 1,
             ('professor', 'female'): 0,
             ('professor', 'male'): 0})

Convert to pandas Dataframe to display

In [153]:
index = pd.MultiIndex.from_tuples(tuples=cooccurence_dict.keys(), names=['target_group', 'demographic_group'])
dataframe = pd.Series(cooccurence_dict, index=index).reset_index().rename(columns={0: 'values'})
dataframe = dataframe.pivot(index='target_group', columns='demographic_group', values='values')
dataframe


demographic_group,female,male
target_group,Unnamed: 1_level_1,Unnamed: 2_level_1
doctor,2,3
engineer,1,1
nurse,3,0
physician,0,1
professor,0,0
