### Imports

In [1]:
import csv
import json
import os

### Getting all the stereotype tokens in the dataset for region and religion

In [2]:
# Working directory for the token lists, the annotated CSVs and the compiled
# JSON files used by the cells below; created up front if it is missing.
utils_folder = '../../utils'
os.makedirs(utils_folder, exist_ok=True)

# Directory holding the `<axis>_annotations.tsv` files read by the cells below.
dataset_folder = '../../nlp-fairness-for-india'

In [3]:
# Stereotype axes covered by this notebook; each one has its own
# `<axis>_annotations.tsv` file in `dataset_folder`.
axes = ['region', 'religion']

for axis in axes:
    # Distinct tokens (second column of the TSV) seen for this axis.
    tokens_set = set()
    # Explicit UTF-8 keeps the read independent of the platform default
    # encoding; newline='' is what the csv module expects for reader input.
    with open(f'{dataset_folder}/{axis}_annotations.tsv', 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            # Skip blank lines and the header row.
            if not row or row[0] == 'identity_term':
                continue
            tokens_set.add(row[1])

    print(f'Number of {axis} tokens: {len(tokens_set)}')

    # Set iteration order changes between interpreter runs, so write the
    # tokens sorted: re-running the notebook then yields an identical file.
    with open(f'{utils_folder}/{axis}_tokens.txt', 'w', encoding='utf-8') as f:
        for token in sorted(tokens_set):
            f.write(f'{token}\n')

Number of region tokens: 139
Number of religion tokens: 216


### Annotating the tokens to reflect the subcategory they belong to 

This process was carried out with the help of Gemini 2.0, after which I manually checked the results to ensure that the tokens were correctly annotated.

It should be noted that this automatic annotation process caused some issues, such as a very low number of tokens being classified into the food or clothes categories.

The annotations were stored as CSV files in `region_annotated.csv` and `religion_annotated.csv`.


In [4]:
for axis in axes:
    # Tally of annotated tokens per category (first CSV column), kept in
    # first-seen order.
    category_count = {}
    with open(f'{utils_folder}/{axis}_annotated.csv', 'r') as csv_file:
        for record in csv.reader(csv_file):
            label = record[0]
            if label == 'category':
                # Header row, not data.
                continue
            category_count[label] = category_count.get(label, 0) + 1

    category_names = list(category_count)
    print(f'Categories for {axis}: {category_names}')
    # One "category: count" line per category.
    for label in category_names:
        print(f'{label}: {category_count[label]}')

Categories for region: ['adjectives', 'subjects', 'professions', 'verbs', 'offensive_terms', 'food']
adjectives: 35
subjects: 19
professions: 81
verbs: 2
offensive_terms: 1
food: 1
Categories for religion: ['professions', 'adjectives', 'subjects', 'verbs', 'offensive_terms', 'socio-economic_status']
professions: 141
adjectives: 46
subjects: 20
verbs: 6
offensive_terms: 2
socio-economic_status: 1


### Compiling the final dataset with annotated tokens for each identity term, along with their stereotype category and annotation confidence

In [5]:
# axis -> {token: category}, read from `<axis>_annotated.csv`.
token_annotations = {}

for axis in axes:
    axis_annotations = token_annotations[axis] = {}
    with open(f'{utils_folder}/{axis}_annotated.csv', 'r') as csv_file:
        # Column 0 holds the category and column 1 the token; the header row
        # is dropped. If a token appears more than once, the last row wins.
        axis_annotations.update(
            (record[1], record[0])
            for record in csv.reader(csv_file)
            if record[0] != 'category'
        )

In [7]:
for axis in axes:
    # identity term -> list of per-token records for this axis.
    compiled_data = {}
    # Explicit UTF-8 keeps the read independent of the platform default
    # encoding; newline='' is what the csv module expects for reader input.
    with open(f'{dataset_folder}/{axis}_annotations.tsv', 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            # Skip blank lines and the header row.
            if not row or row[0] == 'identity_term':
                continue

            # Upper-case only the first character. Slicing (instead of
            # indexing) means an empty term does not raise IndexError.
            identity_term = row[0]
            identity_term = identity_term[:1].upper() + identity_term[1:]

            token = row[1]
            stereotype_votes = int(row[2])
            non_stereotype_votes = int(row[3])
            total_votes = int(row[5])

            # Majority label; a tie is recorded as non-stereotype.
            stereotype = stereotype_votes > non_stereotype_votes
            winning_votes = stereotype_votes if stereotype else non_stereotype_votes
            # Share of all votes that agree with the chosen label
            # (0 when no votes were recorded).
            annotation_confidence = winning_votes / total_votes if total_votes > 0 else 0

            compiled_data.setdefault(identity_term, []).append({
                'token': token,
                'stereotype': stereotype,
                'annotation_confidence': annotation_confidence,
                # None when the token has no entry in the annotated CSV.
                'annotation': token_annotations[axis].get(token, None)
            })

    # Write through a context manager so the file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open(f'{utils_folder}/all_{axis}_data.json', 'w', encoding='utf-8') as f:
        json.dump(compiled_data, f)