**Prerequisite installations and imports**

In [None]:
import nltk
# Download the NLTK resources named 'brown' and 'punkt'.
nltk.download('brown')
nltk.download('punkt')

In [None]:
import pandas as pd
import numpy as np
import random
import json
import re


#### Loading data

In [None]:
# Maps every label spelling accepted by load_nli_data (full name or
# single-letter abbreviation) to an integer class id.
LABEL_MAP = {
    "entailment": 0,
    "e": 0,
    "neutral": 1,
    "n": 1,
    "contradiction": 2,
    "c": 2,
    "hidden": 0,  # "hidden" shares id 0 with entailment
}

PADDING = "<PAD>"
UNKNOWN = "<UNK>"


def load_nli_data(path):
    """
    Load NLI examples (e.g. SNLI) from a JSON-lines file.

    Each non-blank line is parsed as one JSON object. The textual label is
    read from the "gold_label" field, or from "label" when "gold_label" is
    absent. Examples whose label is not a key of LABEL_MAP are skipped;
    for the others the "label" field is overwritten with the integer id.

    Parameters
    ----------
    path : str or path-like
        Location of the .jsonl file.

    Returns
    -------
    list of dict
        The kept examples, shuffled with the fixed seed 12 so the order is
        reproducible.

    Raises
    ------
    KeyError
        If an example has neither a "gold_label" nor a "label" field.
    """
    data = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            loaded_example = json.loads(line)
            # Only a missing "gold_label" key triggers the fallback; any
            # other error is allowed to surface instead of being swallowed.
            try:
                raw_label = loaded_example["gold_label"]
            except KeyError:
                raw_label = loaded_example["label"]
            if raw_label not in LABEL_MAP:
                continue
            loaded_example["label"] = LABEL_MAP[raw_label]
            data.append(loaded_example)
    # Shuffle once the file is closed. Seeding the module-level generator is
    # kept on purpose: it is an observable side effect of the original code.
    random.seed(12)
    random.shuffle(data)
    return data

In [None]:

# Path of the SNLI training split (JSON lines).
SNLI_TRAIN_PATH = "/content/snli_1.0/snli_1.0_train.jsonl"

# One DataFrame row per loaded example.
df_snli = pd.DataFrame(load_nli_data(SNLI_TRAIN_PATH))
print(df_snli.shape)


(549367, 11)


#### Listing occupations to look for in the sentences

In [None]:
# Lists of stereotyped occupations (the occupation words themselves are
# gender-neutral).

# Occupations stereotypically associated with women. Entries are matched
# after stripping surrounding whitespace (see " auditor ").
bias_female_occ = [
    "attendant",
    "cashier",
    "teacher",
    "nurse",
    "assistant",
    "secretary",
    " auditor ",
    "cleaner",
    "receptionist",
    "clerk",
    "counselor",
    "designer",
    "hairdresser",
    "writer",
    "housekeeper",
    "accountant",
    "editor",
    "librarian",
    "secretaries",
    "tailor",
]

# Occupations stereotypically associated with men.
bias_male_occ = [
    "driver",
    "supervisor",
    "janitor",
    "cook",
    "laborer",
    "construction worker",
    "developer",
    "carpenter",
    "manager",
    "lawyer",
    "farmer",
    "salesperson",
    "physician",
    "guard",
    "analyst",
    "mechanic",
    "sheriff",
    "CEO",
    "baker",
]

#### Extracting sentences mentioning the occupations

In [None]:
def is_word_present(sentence, word):
    """Check whether `word`, or its plural formed by adding "s", occurs in `sentence`.

    The comparison is case-insensitive and matches whole words only, so
    "man" is not found inside "woman". Unlike a plain split on spaces, the
    match also succeeds when the word is next to punctuation ("nurse.") and
    when `word` consists of several words ("construction worker").

    Parameters
    ----------
    sentence : str
        Sentence to search.
    word : str
        Word or phrase to look for; surrounding whitespace is ignored.

    Returns
    -------
    bool
        True if the word or its "+s" plural is present, otherwise False.
        An empty or whitespace-only `word` never matches.
    """
    word = word.strip().lower()
    if not word:
        return False
    # Lookarounds instead of \b so the boundaries also behave when the
    # phrase starts or ends with a non-alphanumeric character.
    pattern = r'(?<!\w)' + re.escape(word) + r's?(?!\w)'
    return re.search(pattern, sentence.lower()) is not None

In [None]:
def is_word_present_mul(sentence1, sentence2, word):
    """Check whether `word` (or `word` + "s") occurs in either of two sentences.

    Intended for a premise/hypothesis pair. Matching is case-insensitive and
    whole-word, and it also succeeds when the word is adjacent to punctuation
    ("man." or "boys!"), which a plain split on spaces would miss.

    Parameters
    ----------
    sentence1, sentence2 : str
        The two sentences to search.
    word : str
        Word to look for; surrounding whitespace is ignored.

    Returns
    -------
    bool
        True if at least one sentence contains the word or its "+s" plural.
        An empty or whitespace-only `word` never matches.
    """
    word = word.strip().lower()
    if not word:
        return False
    # Compile once and reuse for both sentences.
    pattern = re.compile(r'(?<!\w)' + re.escape(word) + r's?(?!\w)')
    return any(
        pattern.search(sentence.lower()) is not None
        for sentence in (sentence1, sentence2)
    )

In [None]:
def extract_job_specific_examples(df, occ):
    """Collect the rows of `df` whose premise mentions one of the occupations.

    For every occupation in `occ` (surrounding whitespace stripped), the rows
    whose 'sentence1' contains the occupation according to `is_word_present`
    are selected and tagged with an 'occupation' column. A row that mentions
    several occupations appears once per occupation. The plural entry
    'secretaries' is recorded under the label 'secretary'.

    Parameters
    ----------
    df : pandas.DataFrame
        Source examples; must have a 'sentence1' column. The frame is not
        modified.
    occ : iterable of str
        Occupation names to search for.

    Returns
    -------
    pandas.DataFrame
        The matching rows (original index preserved) with the extra
        'occupation' column. The shape of the result is printed as well.
    """
    matched_frames = []
    for raw_name in occ:
        name = raw_name.strip()
        # astype(bool) keeps the mask boolean even when `df` has no rows.
        mask = df['sentence1'].map(
            lambda sentence: is_word_present(sentence, name)
        ).astype(bool)
        matched = df.loc[mask]
        if matched.empty:
            continue
        label = 'secretary' if name == 'secretaries' else name
        matched_frames.append(matched.assign(occupation=label))

    if matched_frames:
        # Concatenate once instead of growing a frame inside the loop
        # (DataFrame.append no longer exists in pandas 2.x).
        result = pd.concat(matched_frames)
    else:
        # No match at all: empty frame that still carries every column.
        result = df.iloc[0:0].assign(occupation=[])

    print(result.shape)
    return result

In [None]:
# Premises mentioning a female-stereotyped occupation, flagged and tagged 'F'.
bias_female_df_snli = (
    extract_job_specific_examples(df_snli, bias_female_occ)
    .assign(bias=True, gender='F')
)

# Premises mentioning a male-stereotyped occupation, flagged and tagged 'M'.
bias_male_df_snli = (
    extract_job_specific_examples(df_snli, bias_male_occ)
    .assign(bias=True, gender='M')
)

# Stack both groups, female rows first.
bias_df_snli = pd.concat([bias_female_df_snli, bias_male_df_snli])

(1089, 12)
(2284, 12)


In [None]:
# Keep only the first row for each distinct premise.
bias_df_snli = bias_df_snli.drop_duplicates(subset=['sentence1'])
bias_df_snli.shape

(892, 14)

**Swap gender-specific words**

In [None]:
# Gender-specific words. Entries are stripped of surrounding whitespace
# before they are matched, so the padding spaces are cosmetic.
gender_specific_words = [
    ' she ', ' he ', ' her ', ' his ', 'girl', ' boy ', 'man', ' men ',
    ' woman ', ' women ', ' male ', ' female ',
    # The comma after ' him ' matters: without it Python concatenates the
    # adjacent literals ' him ' and ' guy ' into one string that never matches.
    ' him ',
    ' guy ', ' guys ', ' lady ', ' ladies ',
]


In [None]:
def remove_gender_specific_words(df):
    """Drop every row whose premise or hypothesis contains a gendered word.

    The frame is filtered once per entry of `gender_specific_words`; a row is
    removed as soon as `is_word_present_mul` finds the (stripped) word in
    'sentence1' or 'sentence2'. The shape of the remaining frame is printed
    and the frame is returned.
    """
    remaining = df
    for raw_word in gender_specific_words:
        target = raw_word.strip()
        has_word = remaining.apply(
            lambda row: is_word_present_mul(row['sentence1'], row['sentence2'], target),
            axis=1,
        )
        remaining = remaining[~has_word]
    print(remaining.shape)
    return remaining

In [None]:
# Rows without any gendered word in premise or hypothesis.
bias_df_snli_gender_removed = remove_gender_specific_words(bias_df_snli)

# The complement: rows whose premise is not among the gender-free premises.
gender_free_premises = bias_df_snli_gender_removed['sentence1']
mentions_gender = ~bias_df_snli['sentence1'].isin(gender_free_premises)
bias_df_snli_gender_inc = bias_df_snli[mentions_gender]
bias_df_snli_gender_inc.shape

(422, 14)


(470, 14)

In [None]:
# Replacement table used when swapping gendered words: each key is rewritten
# to its value. Insertion order is kept exactly as originally written.
gender_map = {
    # pronouns
    'her': 'his',
    'him': 'her',
    'his': 'her',
    'he': 'she',
    'she': 'he',
    # nouns and adjectives
    'girl': 'boy',
    'boy': 'girl',
    'man': 'woman',
    'woman': 'man',
    'men': 'women',
    'women': 'men',
    'male': 'female',
    'female': 'male',
    'lady': 'man',
    'ladies': 'men',
    'guy': 'girl',
    'guys': 'girls',
    'boys': 'girls',
    'girls': 'boys',
}

In [None]:
def replace_gender(text, mapping=None):
    """Swap gendered words in `text` and return the lower-cased result.

    Every whole-word occurrence of a key of `mapping` is replaced by its
    value. All replacements happen in a single pass over the text, so a word
    that has just been produced is never replaced again. Sentences mentioning
    both genders are therefore swapped correctly
    ("a man and a woman" -> "a woman and a man").

    Parameters
    ----------
    text : str
        Sentence to transform; it is lower-cased first.
    mapping : dict of str to str, optional
        Lower-case word -> replacement. Defaults to the module-level
        `gender_map`.

    Returns
    -------
    str
        The lower-cased sentence with the words swapped.
    """
    if mapping is None:
        mapping = gender_map
    text = text.lower()
    if not mapping:
        # Nothing to replace; an empty alternation would match everywhere.
        return text
    # One alternation over all keys, longest first.
    alternatives = '|'.join(
        re.escape(key) for key in sorted(mapping, key=len, reverse=True)
    )
    pattern = re.compile(r'\b(?:' + alternatives + r')\b')
    return pattern.sub(lambda match: mapping[match.group(0)], text)

In [None]:
# Spot-check replace_gender on one example sentence; the returned string is
# displayed as the cell's output.
replace_gender('the cashier wrinkled her nose and began to query her computerized register system.')

'the cashier wrinkled his nose and began to query his computerized register system.'

In [None]:
# Swap the gendered words in both premise and hypothesis. The frame is a
# boolean-filtered selection of bias_df_snli, so assigning columns on it
# directly triggers pandas' SettingWithCopyWarning; .assign builds a new
# frame instead and leaves the selection it came from untouched.
# Note: running this cell twice applies the swap twice.
bias_df_snli_gender_inc = bias_df_snli_gender_inc.assign(
    sentence1=bias_df_snli_gender_inc['sentence1'].apply(replace_gender),
    sentence2=bias_df_snli_gender_inc['sentence2'].apply(replace_gender),
)

In [None]:
# Put the gender-swapped rows in front of the full SNLI frame, then keep the
# first row of every (premise, hypothesis) pair.
new_df_snli = (
    pd.concat([bias_df_snli_gender_inc, df_snli])
    .drop_duplicates(subset=['sentence1', 'sentence2'])
)


#### Saving the dataset

In [None]:
# Destination of the augmented dataset.
AUGMENTED_SNLI_CSV = '/content/drive/My Drive/Gender Bias NLI Final/Datasets/snli_Augmented_Gender.csv'
new_df_snli.to_csv(AUGMENTED_SNLI_CSV)

In [None]:
# Preview the first 10 rows of the augmented frame.
new_df_snli.head(10)