# Preprocessing and creating the dataset

In [9]:
import json

In [14]:
# Region identity terms read from the TSV. The first line is dropped
# (presumably a header row) and each remaining line, stripped of surrounding
# whitespace, has its first character upper-cased.
region_idterms = []

# Explicit UTF-8 keeps decoding independent of the platform's default locale
# (assumes the TSV is UTF-8 encoded -- TODO confirm).
with open(
    '../nlp-fairness-for-india/region_idterms.tsv', 'r', encoding='utf-8'
) as f:
    next(f, None)  # drop the first line; the default tolerates an empty file
    for line in f:
        term = line.strip()
        # Skip blank lines: term[0] on an empty string raises IndexError.
        if term:
            # capitalizing the first letter of each term
            region_idterms.append(term[0].upper() + term[1:])

In [15]:
# Religion identity terms read from the TSV. The first line is dropped
# (presumably a header row) and each remaining line, stripped of surrounding
# whitespace, has its first character upper-cased.
religion_idterms = []

# Explicit UTF-8 keeps decoding independent of the platform's default locale
# (assumes the TSV is UTF-8 encoded -- TODO confirm).
with open(
    '../nlp-fairness-for-india/religion_idterms.tsv', 'r', encoding='utf-8'
) as f:
    next(f, None)  # drop the first line; the default tolerates an empty file
    for line in f:
        term = line.strip()
        # Skip blank lines: term[0] on an empty string raises IndexError.
        if term:
            # capitalizing the first letter of each term
            religion_idterms.append(term[0].upper() + term[1:])

In [None]:
# Prompt variant; interpolated as the directory name in the f-string paths
# used to read prompts and to write/read the generated datasets.
# NOTE(review): this rebinding shadows the builtin `type()` for the rest of
# the session; a rename would have to touch every cell that uses the name.
type = 'vanilla'

In [4]:
# Load the prompt templates for the selected variant. The context manager
# closes the file handle deterministically instead of leaving it to the
# garbage collector; JSON text is read as UTF-8.
with open(f'{type}/prompts.json', encoding='utf-8') as prompts_file:
    prompts = json.load(prompts_file)

In [16]:
# Maps each region term to {prompt category: list of prompts} with every
# '[IDENTITY_TERM]' placeholder in the templates replaced by that term.
region_dataset = {}

for region in region_idterms:
    region_dataset[region] = {
        category: [
            template.replace('[IDENTITY_TERM]', region)
            for template in templates
        ]
        for category, templates in prompts.items()
    }

In [18]:
# Maps each religion term to {prompt category: list of prompts} with every
# '[IDENTITY_TERM]' placeholder in the templates replaced by that term.
religion_dataset = {}

for religion in religion_idterms:
    religion_dataset[religion] = {
        category: [
            template.replace('[IDENTITY_TERM]', religion)
            for template in templates
        ]
        for category, templates in prompts.items()
    }

In [19]:
# save the datasets
# Context managers make sure each file is flushed and closed as soon as the
# dump finishes, rather than whenever the handle happens to be collected.
with open(f'{type}/region_dataset.json', 'w') as region_file:
    json.dump(region_dataset, region_file)
with open(f'{type}/religion_dataset.json', 'w') as religion_file:
    json.dump(religion_dataset, religion_file)

# Running the masked language model and storing results

In [20]:
import os
from transformers import pipeline
from tqdm import tqdm

In [21]:
# Model identifier handed to the fill-mask pipeline via its `model=` argument.
model_name = 'google/muril-base-cased'

In [None]:
# Build the `transformers` fill-mask pipeline for `model_name`; the resulting
# callable is applied to each prompt string further down.
mask_fill = pipeline('fill-mask', model=model_name)

In [None]:
# Quick sanity check: run the pipeline on a single sentence containing one
# [MASK] token. The call is the cell's final expression, so its return value
# is what the notebook displays for this cell.
test_sentence = 'The [MASK] is a beautiful place.'
mask_fill(test_sentence)

In [None]:
# Reload the saved region dataset, run every prompt through the fill-mask
# pipeline, and store the raw pipeline output in the same nested layout:
# region -> prompt category -> list of results (one entry per prompt).
# Files are opened with context managers so the handles are closed (and the
# output flushed) deterministically.
with open(f'{type}/region_dataset.json') as dataset_file:
    region_dataset = json.load(dataset_file)
region_results = {}

for region in tqdm(region_dataset, desc='Regions'):
    region_results[region] = {
        prompt_category: [mask_fill(prompt) for prompt in category_prompts]
        for prompt_category, category_prompts in region_dataset[region].items()
    }

# NOTE(review): results go to the working directory, not under f'{type}/'
# like the datasets -- confirm this is intentional.
with open('region_results.json', 'w') as results_file:
    json.dump(region_results, results_file)

In [None]:
# Reload the saved religion dataset, run every prompt through the fill-mask
# pipeline, and store the raw pipeline output in the same nested layout:
# religion -> prompt category -> list of results (one entry per prompt).
# Files are opened with context managers so the handles are closed (and the
# output flushed) deterministically.
with open(f'{type}/religion_dataset.json') as dataset_file:
    religion_dataset = json.load(dataset_file)
religion_results = {}

for religion in tqdm(religion_dataset, desc='Religions'):
    religion_results[religion] = {
        prompt_category: [mask_fill(prompt) for prompt in category_prompts]
        for prompt_category, category_prompts in religion_dataset[religion].items()
    }

# NOTE(review): results go to the working directory, not under f'{type}/'
# like the datasets -- confirm this is intentional.
with open('religion_results.json', 'w') as results_file:
    json.dump(religion_results, results_file)