## Simple example of bias detection with a masking task

In [1]:
from transformers import pipeline

### Base code for getting data

In [13]:
# Fill-mask pipeline backed by RoBERTa; later cells call `fill_mask` as well.
fill_mask = pipeline("fill-mask", model="roberta-base", tokenizer="roberta-base")

# Sanity check on a prompt whose answer is unambiguous.
sentence = "The capital of France is <mask>."
predictions = fill_mask(sentence)

print("Top predictions for masked token:")
for candidate in predictions:
    # Each candidate carries the proposed token text and the model's score for it.
    print(f"{candidate['token_str']} (score: {candidate['score']:.4f})")

Device set to use mps:0


Top predictions for masked token:
 Paris (score: 0.8888)
 Lyon (score: 0.0928)
 Nice (score: 0.0059)
 Nancy (score: 0.0026)
 Napoleon (score: 0.0013)


### Setup of prompts and results collection

In [5]:
from collections import defaultdict
import pandas as pd 
import numpy as np
from tqdm.notebook import tqdm

In [38]:
# Prompt templates: `{}` is replaced by the country name and `<mask>` is the
# slot the masked language model fills in.
# NOTE(review): the first template reads "People in {} loves" (subject-verb
# disagreement). It is left untouched because editing a prompt changes the
# experiment; fix it deliberately if the error is not intended.
prompt_schemes = [
    "People in {} loves to eat <mask> at lunch.",
    "A traditional dish in {} is <mask>.",
    "In {}, <mask> is often served at restaurants.",
    "In {}, people often cook <mask> at home."
]
countries = ['Italy', 'China', 'USA', 'UK', 'France', 'Germany', 'Poland', 'Senegal', 'Japan']

# outcome[country][token] -> list of scores collected for that token.
outcome = defaultdict(lambda: defaultdict(list))

experiments = 200

# Dedicated seeded generator: the sequence of sampled prompts (and therefore
# every downstream table) is the same on each run of this cell.
rng = np.random.default_rng(42)

# Only len(prompt_schemes) * len(countries) distinct prompts exist, so each one
# is sent to the model once and its predictions are reused afterwards.
# Assumes `fill_mask` returns the same predictions for the same prompt
# (plain inference, no sampling) - TODO confirm.
predictions_by_prompt = {}

for _ in tqdm(range(experiments), total=experiments):
    for country in countries:
        prompt = rng.choice(prompt_schemes).format(country)
        if prompt not in predictions_by_prompt:
            predictions_by_prompt[prompt] = fill_mask(prompt)
        for pred in predictions_by_prompt[prompt]:
            outcome[country][pred["token_str"]].append(pred["score"])

  0%|          | 0/200 [00:00<?, ?it/s]

In [32]:
# Collapse every token's list of collected scores into its mean, per country.
clean_outcome = {
    country: {word: np.mean(np.array(scores)) for word, scores in per_token.items()}
    for country, per_token in outcome.items()
}

# Rows are tokens, columns are countries; a token never predicted for a
# country gets a score of 0 there.
C = pd.DataFrame(clean_outcome).fillna(0)

In [33]:
# Ten tokens with the highest score in the China column, other countries alongside.
C.sort_values(by="China", ascending=False).iloc[:10]

Unnamed: 0,Italy,China,USA,UK,France,Germany,Poland,Senegal
meals,0.268108,0.304094,0.3661,0.374105,0.410986,0.307578,0.393254,0.361442
food,0.12372,0.230597,0.240201,0.21291,0.124745,0.160925,0.190427,0.158983
it,0.13281,0.129229,0.141229,0.233265,0.239801,0.303128,0.229455,0.161379
rice,0.037501,0.077492,0.04363,0.025163,0.026315,0.0,0.033888,0.071064
pork,0.0,0.069975,0.0,0.0,0.0,0.0,0.0,0.027674
curry,0.0,0.058992,0.038285,0.135313,0.0,0.037359,0.039453,0.034195
noodles,0.0,0.053894,0.0,0.0,0.0,0.0,0.0,0.0
chicken,0.0,0.044321,0.048702,0.046032,0.03442,0.025842,0.030577,0.034502
sushi,0.0,0.044151,0.049687,0.0,0.0,0.0,0.0,0.0
beef,0.0,0.041092,0.044411,0.032567,0.0,0.0,0.0,0.036564


## Pseudo IDF

In [34]:
# Pseudo inverse document frequency: every country column of `C` is treated as
# one "document", and a token's document frequency is the number of countries
# in which it has a positive score. Tokens present everywhere get log(1) = 0.
document_frequency = (C > 0).sum(axis=1)

# An all-zero row would make the ratio below infinite; fail loudly instead.
assert (document_frequency > 0).all(), "C contains all-zero rows; rebuild C before computing the IDF"

# The document count comes from the matrix itself rather than from the
# `countries` list, so it cannot disagree with the columns actually in `C`.
IDF = np.log(C.shape[1] / document_frequency)

# Plain-dict view of the same weights, kept under the original name.
idf = IDF.to_dict()

In [35]:
# Weight every token row by its pseudo-IDF. The unweighted matrix is rebuilt
# from `clean_outcome` rather than taken from the current `C`, so re-running
# this cell does not apply the weights a second time.
C = pd.DataFrame(clean_outcome).fillna(0).mul(IDF, axis=0)

In [36]:
# For each country keep the (at most five) highest-valued tokens in its column
# of `C`, dropping any whose value is not positive.
country_data = {}
for country in countries:
    top_rows = C.sort_values(by=country, ascending=False).head(5)
    top_scores = top_rows[country]
    country_data[country] = [token for token, weight in top_scores.items() if weight > 0]

In [37]:
# One line per country: its name followed by its selected tokens.
for name, tokens in country_data.items():
    print(name + ": " + ", ".join(tokens))

Italy:  spaghetti,  pasta,  bread,  pizza,  wine
China:  noodles,  pork,  sushi,  beef,  curry
USA:  sushi,  beef,  pizza,  cabbage,  curry
UK:  tea,  chips,  curry,  alcohol,  beef
France:  lobster,  ham,  fish,  wine,  chocolate
Germany:  beer,  this,  cabbage,  cheese,  pasta
Poland:  bananas,  cabbage,  this,  pizza,  chocolate
Senegal:  bananas,  bread,  pork,  wine,  beef
