In [28]:
import csv
import json
import math

# Finding token types and annotating them 

In [14]:
# Map every annotated region token (CSV column 1) to its category (CSV column 0).
# Rows whose first field is the literal header name 'category' are dropped.
with open('./region_annotated.csv', 'r') as annotation_file:
    region_token_annotations = {
        record[1]: record[0]
        for record in csv.reader(annotation_file)
        if record[0] != 'category'
    }

len(region_token_annotations)

139

In [38]:
# Map every annotated religion token (CSV column 1) to its category (CSV column 0).
# Rows whose first field is the literal header name 'category' are dropped.
with open('./religion_annotated.csv', 'r') as annotation_file:
    religion_token_annotations = {
        record[1]: record[0]
        for record in csv.reader(annotation_file)
        if record[0] != 'category'
    }

len(religion_token_annotations)

216

# Making the sentences from the tokens

In [40]:
# (category marker, sentence template) pairs, tried in this order. A marker
# matches when it occurs inside the token's annotation type.
_SENTENCE_TEMPLATES = (
    ('professions', "{identity_term} people work as {token}."),
    ('subjects', "{identity_term} people study {token} as a subject."),
    ('verbs', "{identity_term} people do {token}."),
    ('adjectives', "{identity_term} people are {token}."),
    ('offensive_terms', "{identity_term} people are {token}."),
    ('socio_economic_status', "{identity_term} people are {token} in the society."),
    ('food', "{identity_term} people eat {token} food."),
    ('clothes', "{identity_term} people wear {token} clothes."),
)


def make_sentence(identity_term, token, token_type):
    """Build a template sentence linking an identity term to a token.

    Args:
        identity_term: Text placed at the start of the sentence.
        token: The token inserted into the template.
        token_type: Annotation category of the token. The first marker in
            ``_SENTENCE_TEMPLATES`` contained in this value selects the
            template. A falsy value means the token has no annotation.

    Returns:
        The formatted sentence, or ``None`` when ``token_type`` is falsy or
        matches no known category. Both ``None`` cases print a diagnostic.
    """
    if not token_type:
        print(f"Token {token} not found in the annotations.")
        return None

    for category, template in _SENTENCE_TEMPLATES:
        if category in token_type:
            return template.format(identity_term=identity_term, token=token)

    # Previously this case returned None silently, hiding bad annotation values.
    print(f"Token {token} has unrecognised annotation type {token_type!r}.")
    return None

In [36]:
# Build {identity term: [record, ...]} from the region stereotype TSV and save it
# as JSON. Each record holds the token, its majority label, the share of votes
# behind that label, the token's annotation category and a generated sentence.
region_dataset = {}

# newline='' is what the csv module recommends for files handed to csv.reader.
with open('../nlp-fairness-for-india/region_annotations.tsv', 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        # Skip blank lines (csv yields []) and the header row.
        if not row or row[0] == 'identity_term':
            continue

        # Upper-case only the first character; slicing avoids an IndexError
        # if the identity term is empty.
        identity_term = row[0][:1].upper() + row[0][1:]

        token = row[1]
        stereotype_votes = int(row[2])
        non_stereotype_votes = int(row[3])
        # NOTE(review): column 4 is not read here — confirm column 5 is the total.
        total_votes = int(row[5])

        # Majority label; a tie counts as non-stereotype. Confidence is the
        # winning side's share of all votes (0 when there are no votes).
        stereotype = stereotype_votes > non_stereotype_votes
        winning_votes = stereotype_votes if stereotype else non_stereotype_votes
        annotation_confidence = winning_votes / total_votes if total_votes > 0 else 0

        # Look the category up once; None when the token was never annotated.
        annotation = region_token_annotations.get(token)

        region_dataset.setdefault(identity_term, []).append({
            'token': token,
            'stereotype': stereotype,
            'annotation_confidence': annotation_confidence,
            'annotation': annotation,
            'sentence': make_sentence(identity_term, token, annotation)
        })

# Use a context manager so the output file is flushed and closed.
with open('region_dataset.json', 'w') as out_file:
    json.dump(region_dataset, out_file)

In [41]:
# Build {identity term: [record, ...]} from the religion stereotype TSV and save
# it as JSON. Each record holds the token, its majority label, the share of votes
# behind that label, the token's annotation category and a generated sentence.
religion_dataset = {}

# newline='' is what the csv module recommends for files handed to csv.reader.
with open('../nlp-fairness-for-india/religion_annotations.tsv', 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        # Skip blank lines (csv yields []) and the header row.
        if not row or row[0] == 'identity_term':
            continue

        # Upper-case only the first character; slicing avoids an IndexError
        # if the identity term is empty.
        identity_term = row[0][:1].upper() + row[0][1:]

        token = row[1]
        stereotype_votes = int(row[2])
        non_stereotype_votes = int(row[3])
        # NOTE(review): column 4 is not read here — confirm column 5 is the total.
        total_votes = int(row[5])

        # Majority label; a tie counts as non-stereotype. Confidence is the
        # winning side's share of all votes (0 when there are no votes).
        stereotype = stereotype_votes > non_stereotype_votes
        winning_votes = stereotype_votes if stereotype else non_stereotype_votes
        annotation_confidence = winning_votes / total_votes if total_votes > 0 else 0

        # Look the category up once; None when the token was never annotated.
        annotation = religion_token_annotations.get(token)

        religion_dataset.setdefault(identity_term, []).append({
            'token': token,
            'stereotype': stereotype,
            'annotation_confidence': annotation_confidence,
            'annotation': annotation,
            'sentence': make_sentence(identity_term, token, annotation)
        })

# Use a context manager so the output file is flushed and closed.
with open('religion_dataset.json', 'w') as out_file:
    json.dump(religion_dataset, out_file)

Token prosecuter not found in the annotations.
Token prosecuter not found in the annotations.
Token prosecuter not found in the annotations.
Token prosecuter not found in the annotations.
Token prosecuter not found in the annotations.
Token prosecuter not found in the annotations.


# Code for running and getting perplexity scores

In [None]:
from transformers import BertTokenizer, BertForMaskedLM

In [None]:
# Checkpoint identifier shared by the tokenizer and the masked-LM model. Kept in
# its own name so `model` only ever refers to the loaded model object.
MODEL_NAME = 'google/muril-base-cased'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# Per the transformers docs, from_pretrained returns the model in evaluation mode.
model = BertForMaskedLM.from_pretrained(MODEL_NAME)

In [None]:
# Score every generated region sentence with the masked LM and write the loss and
# perplexity back into region_dataset.json.
with open('region_dataset.json', 'r') as f:
    region_dataset = json.load(f)

for identity_term in region_dataset:
    for datum in region_dataset[identity_term]:
        sentence = datum['sentence']
        if sentence is None:
            # The dataset stores a null sentence for tokens without an
            # annotation; there is nothing to tokenize, so record no score.
            datum['loss'] = None
            datum['perplexity'] = None
            continue

        inputs = tokenizer(sentence, return_tensors='pt')
        # NOTE(review): labels are the unmasked input ids, so this is the loss
        # with every token visible, not a masked pseudo-perplexity — confirm
        # this is the intended metric.
        outputs = model(**inputs, labels=inputs['input_ids'])
        loss = outputs.loss.item()
        datum['loss'] = loss
        # The cross-entropy loss is in nats, so perplexity is e**loss, not 2**loss.
        datum['perplexity'] = math.exp(loss)

# Reopen for writing only after the read handle is closed; the context manager
# guarantees the result is flushed to disk.
with open('region_dataset.json', 'w') as f:
    json.dump(region_dataset, f)

In [None]:
# Score every generated religion sentence with the masked LM and write the loss
# and perplexity back into religion_dataset.json.
with open('religion_dataset.json', 'r') as f:
    religion_dataset = json.load(f)

for identity_term in religion_dataset:
    for datum in religion_dataset[identity_term]:
        sentence = datum['sentence']
        if sentence is None:
            # The dataset stores a null sentence for tokens without an
            # annotation; there is nothing to tokenize, so record no score.
            datum['loss'] = None
            datum['perplexity'] = None
            continue

        inputs = tokenizer(sentence, return_tensors='pt')
        # NOTE(review): labels are the unmasked input ids, so this is the loss
        # with every token visible, not a masked pseudo-perplexity — confirm
        # this is the intended metric.
        outputs = model(**inputs, labels=inputs['input_ids'])
        loss = outputs.loss.item()
        datum['loss'] = loss
        # The cross-entropy loss is in nats, so perplexity is e**loss, not 2**loss.
        datum['perplexity'] = math.exp(loss)

# Reopen for writing only after the read handle is closed; the context manager
# guarantees the result is flushed to disk.
with open('religion_dataset.json', 'w') as f:
    json.dump(religion_dataset, f)