In [1]:
import json
import os

from transformers import pipeline
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Curating the prompts for each analysis

First, I decided on the dimensions along which the analysis would be performed.

These were as follows 
- **Baseline:** Simple prompts, ideally without hints, to ensure that the model is not biased by the prompt
- **Temporal:** Prompts that involve time - for past, present and future
- **Quantifier:** Prompts that include a quantifier to indicate the portion of the population that is to be associated with the token
- **Framing:** Prompts that are framed with a positive, negative or neutral tone
- **Perspective:** Prompts that are framed from the perspective of what people think - aimed to highlight stereotypes and biases

Initially, a set of normal prompts was created by prompting Gemini 2.0. Separate prompts were created for each subcategory, and each prompt had a placeholder for the identity term and the token. For each prompt, the placeholder for the identity term was to be replaced with a specific term based on region or religion, while the token was to be used as the mask.

Following that, the prompts were modified to incorporate the dimensions of interest. For example, for the temporal dimension, the prompts were modified to include past, present and future tense. For the quantifier dimension, the prompts were modified to include quantifiers like 'most', 'some', 'all', etc. For the framing dimension, the prompts were modified to include positive, negative and neutral framing. For the perspective dimension, the prompts were reframed from the perspective of what people think. This process was also carried out by Gemini 2.0.

I did not use the prompts that were provided in the template folder in the original dataset, as I believed they might influence the results because they inherently contain some quantifiers and bias terms (e.g., prefer, always, most likely).

So, for all the dimensions, the initial set of prompts was created and stored in a `prompts.json` file in the respective folder.

### Creating final prompts by replacing the identity terms

In [2]:
# Folder from which the per-axis `<axis>_idterms.tsv` files are read.
dataset_folder = '../../nlp-fairness-for-india'
# Presumably the location of shared helper code; not referenced in the visible cells.
utils_folder = '../../utils'

# Parent folder scanned for experiment sub-folders, each expected to hold a prompts.json.
prompts_folder = '../data'

In [3]:
# Identity axes to process; each name selects `<axis>_idterms.tsv` as input and
# `<axis>_dataset.json` / `<axis>_results.json` as per-experiment outputs.
axes = ['region', 'religion']

In [4]:
# Load the identity terms for every axis from `<dataset_folder>/<axis>_idterms.tsv`.
# The first line of each file is dropped (presumably a header row) and the first
# character of every remaining term is upper-cased; the rest of the term is left as is.
id_terms = {}

for axis in axes:
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(f'{dataset_folder}/{axis}_idterms.tsv', 'r', encoding='utf-8') as f:
        next(f, None)  # drop the first line; the default avoids StopIteration on an empty file
        terms = [line.strip() for line in f]

    # Blank lines (e.g. a trailing newline at the end of the file) are skipped: they are
    # not identity terms, and indexing term[0] on an empty string raises IndexError.
    id_terms[axis] = [term[0].upper() + term[1:] for term in terms if term]

In [5]:
# Build one dataset per (experiment, axis): every prompt template in the experiment's
# prompts.json is instantiated once per identity term by replacing '[IDENTITY_TERM]'.
# Output layout: {id_term: {subcategory: [prompt, ...]}}, written to `<axis>_dataset.json`
# inside the experiment folder.
for experiment in os.listdir(prompts_folder):
    experiment_dir = f'{prompts_folder}/{experiment}'
    prompts_path = f'{experiment_dir}/prompts.json'
    # Skip plain files and folders without prompt templates (e.g. `.ipynb_checkpoints`)
    # instead of aborting the whole cell with FileNotFoundError.
    if not os.path.isfile(prompts_path):
        continue

    # Context managers close every handle deterministically, so the written JSON is
    # flushed to disk before the next cell reads it back.
    with open(prompts_path, 'r', encoding='utf-8') as f:
        prompts = json.load(f)

    for axis in axes:
        modified_prompts = {
            id_term: {
                subcategory: [
                    template.replace('[IDENTITY_TERM]', id_term)
                    for template in templates
                ]
                for subcategory, templates in prompts.items()
            }
            for id_term in id_terms[axis]
        }

        with open(f'{experiment_dir}/{axis}_dataset.json', 'w', encoding='utf-8') as f:
            json.dump(modified_prompts, f)

# Running the masked language model and storing results

In [21]:
# Model identifier handed to the fill-mask pipeline.
model_name = 'google/muril-base-cased'

In [None]:
# Build the fill-mask pipeline once; the same object is reused for the sanity check
# and for every prompt in the experiment loop.
mask_fill = pipeline('fill-mask', model=model_name)

In [None]:
# Quick sanity check: run the pipeline on a single masked sentence and display its output.
test_sentence = 'The [MASK] is a beautiful place.'
mask_fill(test_sentence)

In [None]:
# Run the fill-mask pipeline on every generated prompt and store the raw predictions.
# Output layout mirrors the dataset: {id_term: {subcategory: [pipeline_output, ...]}},
# written to `<axis>_results.json` next to the corresponding `<axis>_dataset.json`.
for experiment in os.listdir(prompts_folder):
    experiment_dir = f'{prompts_folder}/{experiment}'
    if not os.path.isdir(experiment_dir):
        continue

    for axis in axes:
        dataset_path = f'{experiment_dir}/{axis}_dataset.json'
        # Folders without a generated dataset (e.g. `.ipynb_checkpoints`) are skipped
        # instead of aborting the whole run with FileNotFoundError.
        if not os.path.isfile(dataset_path):
            continue

        # Context managers close the handles deterministically and flush the output.
        with open(dataset_path, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        results = {}
        # One progress-bar step per identity term; the label shows which run is active.
        for id_term, subcategories in tqdm(dataset.items(), desc=f'{experiment}/{axis}'):
            results[id_term] = {
                # Prompts are sent one at a time so each stored entry is exactly what
                # the pipeline returns for a single string input.
                subcategory: [mask_fill(prompt) for prompt in subcategory_prompts]
                for subcategory, subcategory_prompts in subcategories.items()
            }

        with open(f'{experiment_dir}/{axis}_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f)