# Preprocessing and creating the dataset

In [4]:
import json

In [5]:
# Read the region identity terms: one stripped entry per line of the TSV file.
# The encoding is pinned so the result does not depend on the platform default.
with open('../nlp-fairness-for-india/region_idterms.tsv', 'r', encoding='utf-8') as f:
    region_idterms = [line.strip() for line in f]

# Drop the first line (presumably the column header -- confirm against the file).
region_idterms = region_idterms[1:]

# Capitalize only the first letter of each term, leaving the rest untouched.
# Empty entries (e.g. from a blank trailing line) are skipped, because
# indexing term[0] on an empty string would raise IndexError.
region_idterms = [term[0].upper() + term[1:] for term in region_idterms if term]

In [6]:
# Read the religion identity terms: one stripped entry per line of the TSV file.
# The encoding is pinned so the result does not depend on the platform default.
with open('../nlp-fairness-for-india/religion_idterms.tsv', 'r', encoding='utf-8') as f:
    religion_idterms = [line.strip() for line in f]

# Drop the first line (presumably the column header -- confirm against the file).
religion_idterms = religion_idterms[1:]

# Capitalize only the first letter of each term, leaving the rest untouched.
# Empty entries (e.g. from a blank trailing line) are skipped, because
# indexing term[0] on an empty string would raise IndexError.
religion_idterms = [term[0].upper() + term[1:] for term in religion_idterms if term]

In [7]:
# Load the prompt templates (a mapping from prompt category to a list of
# template strings, as consumed by the dataset-building cell).
# The context manager closes the file as soon as parsing is done instead of
# leaving the handle open; JSON text is read as UTF-8.
with open('prompts.json', encoding='utf-8') as f:
    prompts = json.load(f)

In [9]:
# Build a nested mapping: region -> religion -> prompt category -> prompts.
# Every template has its '[IDENTITY_TERM]' placeholder replaced by the
# combined "<region> <religion>" term.
combined_dataset = {}

for region in region_idterms:
    per_religion = combined_dataset[region] = {}
    for religion in religion_idterms:
        identity_term = region + ' ' + religion
        per_religion[religion] = {
            category: [
                template.replace('[IDENTITY_TERM]', identity_term)
                for template in templates
            ]
            for category, templates in prompts.items()
        }

In [10]:
# Save the dataset. The context manager guarantees the file is flushed and
# closed once the dump finishes, rather than relying on garbage collection
# to release the write handle.
with open('combined_dataset.json', 'w') as f:
    json.dump(combined_dataset, f)

# Running the masked language model and storing results

In [20]:
import os
from transformers import pipeline
from tqdm import tqdm

In [21]:
# Identifier of the model that is handed to the fill-mask pipeline.
model_name = 'google/muril-base-cased'

In [None]:
# Create the fill-mask pipeline for the model named by `model_name`;
# `mask_fill` is then called with a sentence to obtain its predictions.
mask_fill = pipeline('fill-mask', model=model_name)

In [None]:
# Quick sanity check: run the pipeline on one sentence with a single [MASK].
# The bare call on the last line is what the notebook cell displays.
test_sentence = 'The [MASK] is a beautiful place.'
mask_fill(test_sentence)

In [None]:
# Reload the prompts saved earlier. The context manager closes the file
# right after parsing instead of leaving the handle open.
with open('combined_dataset.json') as f:
    combined_dataset = json.load(f)

# Mirrors the dataset layout: region -> religion -> prompt category -> list
# holding one pipeline output per prompt, in the same order as the prompts.
combined_results = {}

for region in combined_dataset:
    combined_results[region] = {}
    # One progress bar per region, advancing once per religion.
    for religion in tqdm(combined_dataset[region], desc=f'Religions in {region}'):
        combined_results[region][religion] = {
            # Prompts are sent one at a time, so each stored entry is exactly
            # what the pipeline returns for a single string input.
            prompt_category: [mask_fill(prompt) for prompt in category_prompts]
            for prompt_category, category_prompts in combined_dataset[region][religion].items()
        }

# Persist all results. The context manager ensures the output is flushed and
# the handle closed even if serialization raises.
with open('combined_results.json', 'w') as f:
    json.dump(combined_results, f)