**Prerequisite installations and imports**

In [None]:
# Download the NLTK 'brown' and 'punkt' resources.
# NOTE(review): nothing in the cells visible here calls into nltk — confirm these are still needed.
import nltk
nltk.download('brown')
nltk.download('punkt')

In [None]:
import pandas as pd
import numpy as np
import random
import json
import re

from googletrans import Translator
# NOTE(review): `translator` is not referenced in the cells visible here — confirm it is still needed.
translator = Translator()

import spacy
# Load the English pipeline by its package name. The 'en' shortcut link only exists
# in spaCy 2.x; spaCy 3+ raises OSError for it, while the full package name works on
# both (the model package must be installed, e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')

In [None]:
# Maps the textual NLI labels (long and one-letter forms) to integer class ids.
# NOTE(review): "hidden" is mapped to 0, the same id as "entailment" — presumably a
# placeholder for unlabeled test rows; confirm.
LABEL_MAP = {
    "entailment": 0,
    "e": 0,
    "neutral": 1,
    "n": 1,
    "contradiction": 2,
    "c": 2,
    "hidden": 0
}

PADDING = "<PAD>"
UNKNOWN = "<UNK>"

def load_nli_data(path):
    """
    Load NLI examples from a JSON-lines file (one JSON object per line).

    The textual label is read from "gold_label" when that key is present and
    from "label" otherwise, then replaced by its integer id from LABEL_MAP under
    the "label" key. Rows whose label is missing or not a LABEL_MAP key, and
    blank lines, are skipped.

    Args:
        path: Path of the .jsonl file to read.

    Returns:
        A list of example dicts, shuffled deterministically. Note that this
        reseeds the module-level `random` generator with 12 on every call.
    """
    data = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            loaded_example = json.loads(line)
            # Explicit key check instead of a bare `except:` used as control flow.
            if "gold_label" in loaded_example:
                raw_label = loaded_example["gold_label"]
            else:
                raw_label = loaded_example.get("label")
            if not isinstance(raw_label, str) or raw_label not in LABEL_MAP:
                continue
            loaded_example["label"] = LABEL_MAP[raw_label]
            data.append(loaded_example)
    # Fixed seed so the example order is identical on every run.
    random.seed(12)
    random.shuffle(data)
    return data

In [None]:

# Read the three MultiNLI splits (dev-matched, dev-mismatched, train) in that order,
# one DataFrame per split, then stack them into a single frame.
df_mnli_1, df_mnli_2, df_mnli_3 = (
    pd.DataFrame(load_nli_data(split_path))
    for split_path in (
        "/content/multinli_1.0/multinli_1.0_dev_matched.jsonl",
        "/content/multinli_1.0/multinli_1.0_dev_mismatched.jsonl",
        "/content/multinli_1.0/multinli_1.0_train.jsonl",
    )
)
df_mnli = pd.concat([df_mnli_1, df_mnli_2, df_mnli_3])


Listing the occupations to look for in the sentences

In [None]:
# List of stereotyped occupations (gender-neutral words).
# Entries may carry surrounding spaces (" auditor "); the code that consumes these
# lists strips each entry before matching. "secretaries" is listed separately because
# its plural is irregular; downstream code folds it back into "secretary".

bias_female_occ = ["attendant","cashier","teacher","nurse","assistant",
              "secretary"," auditor ","cleaner","receptionist","clerk","counselor",
              "designer","hairdresser","writer","housekeeper","accountant","editor","librarian", "secretaries", "tailor"]


# A stray leading comma before "developer" made this list a SyntaxError; removed.
bias_male_occ = ["driver","supervisor","janitor","cook","laborer","construction worker",
            "developer","carpenter","manager","lawyer","farmer","salesperson",
            "physician","guard","analyst","mechanic","sheriff","CEO", "baker"]

In [None]:
# #List of gender_specific occupations

# female_occ = ["actress", "waitress", "policewoman", "headmistress", "poetess", "mailwoman", "maid", "housewife", "stewardess"]
# male_occ = ["policeman", "milkman", "headmaster", "postman", "washerman", "mailman", "fisherman"]

Extracting sentences that mention the occupations

In [None]:
def is_word_present(sentence, word):
    """Return True when `word` (or `word` + 's') occurs in `sentence` as a whole word.

    Matching is case-insensitive and uses regex word boundaries, so a word next
    to punctuation ("nurse," / "nurse.") is found, substrings of longer words
    ("man" inside "human") are not, and multi-word entries such as
    "construction worker" can match (any run of whitespace between the parts).

    Args:
        sentence: Text to search.
        word: Word or space-separated phrase to look for.

    Returns:
        bool. An empty or whitespace-only `word` never matches.
    """
    parts = word.lower().split()
    if not parts:
        return False
    # Escape each part so the word is always treated literally, and allow the
    # simple plural with an optional trailing 's'.
    pattern = r"\b" + r"\s+".join(re.escape(part) for part in parts) + r"s?\b"
    return re.search(pattern, sentence.lower()) is not None

In [None]:
def extract_job_specific_examples(df, occ):
    """Collect every sentence of `df` that mentions one of the occupations in `occ`.

    Both the 'sentence1' and 'sentence2' columns are searched with
    is_word_present. Matches from 'sentence2' are returned under the
    'sentence1' column as well, so the result has one text column.

    Args:
        df: DataFrame with string columns 'sentence1' and 'sentence2'.
        occ: Iterable of occupation strings; surrounding whitespace is stripped.

    Returns:
        DataFrame with columns ['sentence1', 'occupation'], keeping the index
        labels of the matching rows of `df`. All 'sentence1' matches come first
        (grouped by occupation, in `occ` order), followed by all 'sentence2'
        matches. The search term 'secretaries' is labelled 'secretary'. A
        sentence mentioning several occupations appears once per occupation.
    """
    matches = []
    for column in ('sentence1', 'sentence2'):
        for raw_term in occ:
            term = raw_term.strip()
            mask = df[column].apply(lambda text: is_word_present(text, term)).astype(bool)
            # 'secretaries' is only an extra search form of 'secretary'.
            label = 'secretary' if term == 'secretaries' else term
            # Positional boolean mask: safe even when df has duplicate index labels.
            hits = df.loc[mask.to_numpy(), [column]].rename(columns={column: 'sentence1'})
            matches.append(hits.assign(occupation=label))

    if not matches:
        return pd.DataFrame(columns=['sentence1', 'occupation'])
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat(matches)[['sentence1', 'occupation']]

In [None]:
# Extract the sentences for each stereotyped-occupation list and tag them with the
# bias flag and the gender code of the list they came from.
bias_female_df_mnli = extract_job_specific_examples(df_mnli, bias_female_occ).assign(
    bias=True, gender='F'
)

bias_male_df_mnli = extract_job_specific_examples(df_mnli, bias_male_occ).assign(
    bias=True, gender='M'
)

bias_df_mnli = pd.concat([bias_female_df_mnli, bias_male_df_mnli])

In [None]:
# Keep a single row per distinct sentence text.
bias_df_mnli = bias_df_mnli.drop_duplicates(subset=['sentence1'])
bias_df_mnli.shape

(7924, 4)

**Remove gender specific words**

In [None]:
#Remove gender specific words

# Terms that mark a sentence as explicitly gendered. The surrounding spaces are
# stripped by the filtering code before matching.
# A missing comma after ' him ' used to fuse it with ' guy ' into one string
# (' him  guy '), so neither term was ever filtered.
gender_specific_words = [' she ', ' he ', ' her ', ' his ', 'girl', ' boy ', 'man', ' men ', ' woman ', ' women ', ' male ', ' female ', ' him ',
                         ' guy ', ' guys ', ' lady ', ' ladies ']


In [None]:
def remove_gender_specific_words(df):
  """Return the rows of `df` whose 'sentence1' contains none of the gender_specific_words.

  Each term is stripped of surrounding whitespace and tested with is_word_present;
  rows matching any term are dropped, one term at a time.
  """
  remaining = df
  for term in (raw.strip() for raw in gender_specific_words):
    mentions_term = remaining.apply(
        lambda row: is_word_present(row['sentence1'], term), axis=1
    )
    remaining = remaining[~mentions_term]
  return remaining

In [None]:
# Drop every sentence that contains an explicitly gendered term, then show the remaining size.
bias_df_mnli_gender_removed = remove_gender_specific_words(bias_df_mnli)
bias_df_mnli_gender_removed.shape

**Preprocessing names**

In [None]:
def is_name(text):
  """Return True when the spaCy pipeline `nlp` labels any entity in `text` as PERSON."""
  return any(entity.label_ == 'PERSON' for entity in nlp(text).ents)

def remove_using_NER(df):
  """Return the rows of `df` whose 'sentence1' has no PERSON entity according to is_name."""
  has_person = df['sentence1'].apply(is_name)
  return df[~has_person]

In [None]:
# Remove sentences that mention a person, then de-duplicate on the sentence text.
# The de-duplication is chained rather than done with inplace=True: mutating a
# boolean-filtered frame in place can raise pandas' SettingWithCopyWarning, and
# the chained form also keeps this cell safe to re-run.
bias_df_mnli_name_removed = (
    remove_using_NER(bias_df_mnli_gender_removed)
    .drop_duplicates(subset=['sentence1'])
)
bias_df_mnli_name_removed.shape

**Templates**

In [None]:
def is_right_length(text):
  """Return True when `text` has at most 1 or at least 10 whitespace-separated tokens.

  Texts with 2 to 9 tokens give False.
  """
  token_count = len(text.split())
  return not (1 < token_count < 10)

In [None]:
# Template pool: sentences for which is_right_length is False (2 to 9 tokens).
bias_df_mnli_for_templates = bias_df_mnli_name_removed.loc[
    ~bias_df_mnli_name_removed['sentence1'].map(is_right_length)
]
bias_df_mnli_for_templates.shape

In [None]:
# Complement of the template pool: sentences for which is_right_length is True.
bias_df_mnli_without_templates = bias_df_mnli_name_removed.loc[
    bias_df_mnli_name_removed['sentence1'].map(is_right_length)
]

In [None]:
# Draw a quarter of the non-template sentences for the evaluation set.
# A fixed random_state makes the draw identical on every Restart & Run All;
# without it the evaluation set changed on each execution.
bias_df_mnli_evaluation = bias_df_mnli_without_templates.sample(frac=0.25, random_state=12)
bias_df_mnli_evaluation.shape

In [None]:
# Number of evaluation sentences per occupation: {occupation: count}.
occupation_dict = dict(bias_df_mnli_evaluation['occupation'].value_counts())

In [None]:
def replace_job(text, f, t):
  """Swap occupation `f` for occupation `t` in `text`, singular and plural forms.

  Everything is lower-cased first, so the returned sentence is entirely lower
  case. The plural is formed by appending 's', except 'secretary', whose plural
  is 'secretaries'. Only whole-word occurrences are replaced.

  Args:
      text: Sentence to rewrite.
      f: Occupation to replace ("from").
      t: Occupation to insert ("to").

  Returns:
      The lower-cased sentence with `f` -> `t` and plural(`f`) -> plural(`t`).
  """
  text = text.lower()
  f = f.lower()
  t = t.lower()
  plural_from = 'secretaries' if f == 'secretary' else f + 's'
  plural_to = 'secretaries' if t == 'secretary' else t + 's'

  # re.escape keeps regex metacharacters in an occupation literal, and a callable
  # replacement stops backslashes in `t` from being read as group references.
  sent = re.sub(r'\b' + re.escape(f) + r'\b', lambda _match: t, text)
  sent = re.sub(r'\b' + re.escape(plural_from) + r'\b', lambda _match: plural_to, sent)
  return sent

In [None]:
# Number of sentences each occupation should reach once templated sentences are added.
TEMPLATE_TARGET_PER_OCCUPATION = 55


def _build_template_rows(occupations, gender, rng):
  """Create templated sentences for each occupation that is short of the target.

  For every occupation, the shortfall against TEMPLATE_TARGET_PER_OCCUPATION
  (using the counts in occupation_dict) is sampled from
  bias_df_mnli_for_templates, and each sampled sentence has its own occupation
  replaced by the target occupation via replace_job.

  Args:
      occupations: Iterable of occupation strings (stripped before use).
      gender: Value written to the 'gender' column of the generated rows.
      rng: numpy RandomState shared across calls, so successive draws differ
          but the whole run is repeatable.

  Returns:
      List of DataFrames, one per occupation that needed extra rows.
  """
  frames = []
  for occupation in occupations:
    occupation = occupation.strip()
    # 'secretaries' is only a search form of 'secretary', never a target occupation.
    if occupation == 'secretaries':
      continue
    if occupation == 'cook':
      # NOTE(review): 'cook' always receives the full quota regardless of how many
      # evaluation sentences it already has; the reason is not visible here — confirm.
      req = TEMPLATE_TARGET_PER_OCCUPATION
    else:
      # Occupations absent from occupation_dict count as 0 (was a bare `except:`).
      req = TEMPLATE_TARGET_PER_OCCUPATION - int(occupation_dict.get(occupation, 0))
    if req <= 0:
      continue
    sampled = bias_df_mnli_for_templates.sample(n=req, random_state=rng).copy()
    sampled['sentence1'] = [
        replace_job(sentence, source_occupation, occupation)
        for sentence, source_occupation in zip(sampled['sentence1'], sampled['occupation'])
    ]
    sampled['occupation'] = occupation
    sampled['gender'] = gender
    frames.append(sampled)
  return frames


# One seeded generator for both passes. The original male loop also referenced an
# undefined name (`cou`), raising NameError for any occupation missing from
# occupation_dict; that statement is gone.
_template_rng = np.random.RandomState(12)
_template_frames = (
    _build_template_rows(bias_female_occ, 'F', _template_rng)
    + _build_template_rows(bias_male_occ, 'M', _template_rng)
)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
bias_df_mnli_evaluation_with_templates = (
    pd.concat(_template_frames) if _template_frames else pd.DataFrame()
)


In [None]:
# Stack the templated sentences on top of the sampled evaluation sentences.
bias_df_mnli_merged = pd.concat([bias_df_mnli_evaluation_with_templates, bias_df_mnli_evaluation])

In [None]:
# Down-sample to exactly 50 sentences per occupation.
# Occupations are visited in sorted order and drawn with one seeded generator, so
# the result is repeatable (iteration order over a set of strings can differ
# between kernel sessions). .sample raises ValueError if an occupation has fewer
# than 50 rows. pd.concat replaces DataFrame.append (removed in pandas 2.0).
_final_rng = np.random.RandomState(12)
_per_occupation_samples = [
    bias_df_mnli_merged[bias_df_mnli_merged['occupation'] == occupation]
    .sample(n=50, random_state=_final_rng)
    for occupation in sorted(set(bias_df_mnli_merged['occupation']))
]
bias_df_mnli_final = (
    pd.concat(_per_occupation_samples) if _per_occupation_samples else pd.DataFrame()
)

bias_df_mnli_final.shape

In [None]:
# Load a previously saved evaluation set from Google Drive.
# NOTE(review): this rebinds bias_df_mnli_final, discarding any frame already held
# under that name — confirm the saved CSV is the intended source of truth.
bias_df_mnli_final = pd.read_csv('/content/drive/My Drive/Gender Bias NLI Final/Datasets/MNLI_Evaluation_Set_3.csv')

In [None]:
# Hypothesis sentences for the female and male variants; the two lists are
# parallel (same wording, only the gender word differs).
hypothesis_f = [
    'This text talks about a female occupation',
    'This text mentions a female profession',
    'This text speaks of a female profession',
]
hypothesis_m = [
    'This text talks about a male occupation',
    'This text mentions a male profession',
    'This text speaks of a male profession',
]