In [None]:
%%capture
# Install the notebook's runtime dependencies. %%capture (which has to stay on
# the first line of the cell) swallows pip's console output.
!pip install transformers
# googletrans is pinned to the 4.0.0-rc1 release candidate.
!pip install googletrans==4.0.0-rc1

In [None]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import googletrans
from googletrans import Translator
import json

# CrowS-Pairs

In [None]:
# Load the CrowS-Pairs CSV, then show its column names and first ten rows.
# NOTE(review): '/content/...' is an absolute path — presumably a Colab upload
# location; confirm before running in another environment.
filePath = '/content/crows_pairs_anonymized.csv'
df_crowsPair = pd.read_csv(filePath)
print(df_crowsPair.columns)
df_crowsPair.head(10)

In [None]:
# Pull the four columns used downstream out of the frame as plain Python lists.
sent_more, sent_less, bias_type, stereo_antistereo = (
    df_crowsPair[column].tolist()
    for column in ('sent_more', 'sent_less', 'bias_type', 'stereo_antistereo')
)

# Distinct bias categories present in the data (set iteration order is arbitrary).
unique_bias = list(set(bias_type))
print(unique_bias)

# Author's notes on category relevance:
#   ['age','socioeconomic','physical-appearance','gender']
#   age, physical-appearance - all sentences
#   socioeconomic, gender - names of people are not Indian; some sentences relevant
#   disability, sexual-orientation - can be included
#   sexual-orientation - names of people are not Indian

In [None]:
# Bucket every sentence pair under its bias category.
# Each entry is [sent_more, sent_less, stereo_antistereo, row position].
sentences_byBias = {}
for index, category in enumerate(bias_type):
  entry = [sent_more[index], sent_less[index], stereo_antistereo[index], index]
  sentences_byBias.setdefault(category, []).append(entry)

In [None]:
# Display every pair grouped under the 'disability' category.
sentences_byBias['disability']

In [None]:
# Bias categories kept when the sentence pairs are filtered.
relevant_bias =  ['age','socioeconomic','physical-appearance','gender']
# Wider alternative that also keeps 'disability' and 'sexual-orientation':
#relevant_bias = ['age','socioeconomic','physical-appearance','gender','disability','sexual-orientation']

In [None]:
# Keep only the rows whose bias category appears in relevant_bias.
# `indexing` records each kept row's position in the unfiltered lists.
indexing = [index for index in range(len(sent_more)) if bias_type[index] in relevant_bias]
sent_more_fil = [sent_more[index] for index in indexing]
sent_less_fil = [sent_less[index] for index in indexing]
bias_type_fil = [bias_type[index] for index in indexing]
stereo_antistereo_fil = [stereo_antistereo[index] for index in indexing]

# Sanity check: number of kept pairs, then the first kept pair.
print(len(sent_more_fil))
print(sent_more_fil[0])
print(sent_less_fil[0])

## Translation by NLLB

In [None]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Load the facebook/nllb-200-distilled-600M checkpoint and its tokenizer,
# then move the model onto the device chosen earlier.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = model.to(device=device)
# Translation pipeline from eng_Latn to hin_Deva, capped at 400 tokens.
# The detected `device` is passed instead of a hard-coded GPU index 0, so the
# pipeline matches where the model lives and also works on CPU-only runtimes.
translatorM = pipeline('translation', model=model, tokenizer=tokenizer, src_lang="eng_Latn", tgt_lang='hin_Deva', max_length = 400, device = device)

In [None]:
# Translate every filtered pair with the NLLB pipeline, one sentence per call.
sent_more_hinM, sent_less_hinM = [], []

for more_en, less_en in tqdm(zip(sent_more_fil, sent_less_fil), total=len(sent_more_fil)):
  more_out = translatorM(more_en)
  sent_more_hinM.append(more_out[0]['translation_text'])
  less_out = translatorM(less_en)
  sent_less_hinM.append(less_out[0]['translation_text'])

# Spot-check the first pair: each English sentence followed by its translation.
print(sent_more_fil[0], sent_more_hinM[0], sent_less_fil[0], sent_less_hinM[0], sep='\n')

## Using Google Translate

In [None]:
# Create the googletrans client and print the language table the library exposes.
translatorG = Translator()
print(googletrans.LANGUAGES)

In [None]:
# Smoke test: translate one English sentence to Hindi, print the returned
# object, and display just its translated text.
translated_text = translatorG.translate('The Earth is round', src='en', dest= 'hi')
print(translated_text)
translated_text.text

In [None]:
# Translate every filtered pair with googletrans, one request per sentence.
sent_more_hinG, sent_less_hinG = [], []

for more_en, less_en in tqdm(zip(sent_more_fil, sent_less_fil), total=len(sent_more_fil)):
  more_reply = translatorG.translate(more_en, src='en', dest='hi')
  sent_more_hinG.append(more_reply.text)
  less_reply = translatorG.translate(less_en, src='en', dest='hi')
  sent_less_hinG.append(less_reply.text)

# Spot-check the first pair: each English sentence followed by its translation.
print(sent_more_fil[0], sent_more_hinG[0], sent_less_fil[0], sent_less_hinG[0], sep='\n')

In [None]:
# Number of Google-translated sentences collected.
len(sent_more_hinG)

## Combine

In [None]:
# Column-name -> list mapping for the combined table: the original row index,
# each English sentence with its NLLB and Google Hindi translations, and the
# bias labels. Key order here becomes the DataFrame's column order.
data = {'index': indexing,
        'sent_more_eng': sent_more_fil,
        'sent_more_hin_NLLB': sent_more_hinM,
        'sent_more_hin_Google' : sent_more_hinG,
        'sent_less_eng': sent_less_fil,
        'sent_less_hin_NLLB' : sent_less_hinM,
        'sent_less_hin_Google': sent_less_hinG,
        'bias_type': bias_type_fil,
        'stereo_antistereo': stereo_antistereo_fil}

In [None]:
# Build the combined table from the column lists gathered in `data`.
df_HindiCrows_Filtered = pd.DataFrame(data)

In [None]:
# Shuffle all rows and renumber them from 0.
# A fixed random_state makes the shuffled order reproducible between runs;
# without it every execution produces a different ordering.
df_HindiCrows_Filtered = df_HindiCrows_Filtered.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Write the table to CSV in the working directory. With pandas' default
# index=True, the row index is written as an extra leading column.
df_HindiCrows_Filtered.to_csv('HindiCrows_Filtered2.csv')

# StereoSet

In [None]:
# Parse dev.json inside a context manager so the file handle is closed as soon
# as the JSON is loaded (the bare open() left it open for the whole session).
# The encoding is spelled out so the read does not depend on the platform default.
with open('dev.json', encoding='utf-8') as file1:
  stereosetData = json.load(file1)
# Show the top-level keys, the keys under 'data', and the number of
# intrasentence examples.
print(stereosetData.keys())
print(stereosetData['data'].keys())
print(len(stereosetData['data']['intrasentence']))

In [None]:
# Display the first intrasentence example to see its structure.
stereosetData['data']['intrasentence'][0]

In [None]:
# Bucket the intrasentence examples by bias type.
# Each entry is [stereotype sentence, anti-stereotype sentence, "stereo"];
# a sentence that is not found keeps the placeholder 0.
sentences_byBias = {}
for example in stereosetData['data']['intrasentence']:
  stereotype_sent, anti_stereotype_sent = 0, 0
  for candidate in example['sentences']:
    label = candidate['gold_label']
    if label == 'stereotype':
      stereotype_sent = candidate['sentence']
    elif label == 'anti-stereotype':
      anti_stereotype_sent = candidate['sentence']

  sentences_byBias.setdefault(example['bias_type'], []).append(
      [stereotype_sent, anti_stereotype_sent, "stereo"])

In [None]:
# List the bias types found while grouping.
sentences_byBias.keys()

In [None]:
# Display every pair grouped under the 'gender' bias type.
sentences_byBias['gender']

In [None]:
# Only the 'gender' bias type is kept for this dataset.
relevant_bias =  ['gender']

In [None]:
# Collect the stereotype / anti-stereotype sentence of every intrasentence
# example whose bias type is in relevant_bias. `indexing` holds the example ids.
indexing, sent_more_fil, sent_less_fil = [], [], []
bias_type_fil, stereo_antistereo_fil = [], []

for example in stereosetData['data']['intrasentence']:
  if example['bias_type'] not in relevant_bias:
    continue

  indexing.append(example['id'])
  bias_type_fil.append(example['bias_type'])

  # A sentence that is not found keeps the placeholder 0.
  stereotype_sent, anti_stereotype_sent = 0, 0
  for candidate in example['sentences']:
    label = candidate['gold_label']
    if label == 'stereotype':
      stereotype_sent = candidate['sentence']
    elif label == 'anti-stereotype':
      anti_stereotype_sent = candidate['sentence']

  sent_more_fil.append(stereotype_sent)
  sent_less_fil.append(anti_stereotype_sent)
  stereo_antistereo_fil.append('stereo')


In [None]:
# Sanity check: number of kept pairs, then the first kept pair.
print(len(sent_more_fil))
print(sent_more_fil[0])
print(sent_less_fil[0])

## Translation by NLLB

In [None]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Load the facebook/nllb-200-distilled-600M checkpoint and its tokenizer,
# then move the model onto the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = model.to(device=device)
# Translation pipeline from eng_Latn to hin_Deva, capped at 400 tokens.
# The detected `device` is passed instead of a hard-coded GPU index 0, so the
# pipeline matches where the model lives and also works on CPU-only runtimes.
translatorM = pipeline('translation', model=model, tokenizer=tokenizer, src_lang="eng_Latn", tgt_lang='hin_Deva', max_length = 400, device = device)

In [None]:
# Translate every filtered pair with the NLLB pipeline, one sentence per call.
sent_more_hinM, sent_less_hinM = [], []

for more_en, less_en in tqdm(zip(sent_more_fil, sent_less_fil), total=len(sent_more_fil)):
  more_out = translatorM(more_en)
  sent_more_hinM.append(more_out[0]['translation_text'])
  less_out = translatorM(less_en)
  sent_less_hinM.append(less_out[0]['translation_text'])

# Spot-check the first pair: each English sentence followed by its translation.
print(sent_more_fil[0], sent_more_hinM[0], sent_less_fil[0], sent_less_hinM[0], sep='\n')

## Using Google Translate

In [None]:
# Create the googletrans client and print the language table the library exposes.
translatorG = Translator()
print(googletrans.LANGUAGES)

In [None]:
# Smoke test: translate one short English phrase to Hindi, print the returned
# object, and display just its translated text.
translated_text = translatorG.translate('black are slaves', src='en', dest= 'hi')
print(translated_text)
translated_text.text

In [None]:
# Translate every filtered pair with googletrans, one request per sentence.
sent_more_hinG, sent_less_hinG = [], []

for more_en, less_en in tqdm(zip(sent_more_fil, sent_less_fil), total=len(sent_more_fil)):
  more_reply = translatorG.translate(more_en, src='en', dest='hi')
  sent_more_hinG.append(more_reply.text)
  less_reply = translatorG.translate(less_en, src='en', dest='hi')
  sent_less_hinG.append(less_reply.text)

# Spot-check the first pair: each English sentence followed by its translation.
print(sent_more_fil[0], sent_more_hinG[0], sent_less_fil[0], sent_less_hinG[0], sep='\n')

In [None]:
# Number of Google-translated sentences collected.
len(sent_more_hinG)

## Combine

In [None]:
# Column-name -> list mapping for the combined table: the example id, each
# English sentence with its NLLB and Google Hindi translations, and the bias
# labels. Key order here becomes the DataFrame's column order.
data = {'index': indexing,
        'sent_more_eng': sent_more_fil,
        'sent_more_hin_NLLB': sent_more_hinM,
        'sent_more_hin_Google' : sent_more_hinG,
        'sent_less_eng': sent_less_fil,
        'sent_less_hin_NLLB' : sent_less_hinM,
        'sent_less_hin_Google': sent_less_hinG,
        'bias_type': bias_type_fil,
        'stereo_antistereo': stereo_antistereo_fil}

In [None]:
# Build the combined table from the column lists gathered in `data`.
df_HindiSteroeSet_Filtered = pd.DataFrame(data)

In [None]:
# Shuffle all rows and renumber them from 0.
# A fixed random_state makes the shuffled order reproducible between runs;
# without it every execution produces a different ordering.
df_HindiSteroeSet_Filtered = df_HindiSteroeSet_Filtered.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Write the table to CSV in the working directory. With pandas' default
# index=True, the row index is written as an extra leading column.
df_HindiSteroeSet_Filtered.to_csv('HindiStereoSet_Filtered.csv')

# Winogender

In [None]:
# Load the tab-separated sentence file and preview its first 200 rows.
# NOTE(review): '/content/...' is an absolute path — presumably a Colab upload
# location; confirm before running in another environment.
data_all_sent = pd.read_csv('/content/all_sentences.tsv',sep='\t')
data_all_sent.head(200)

In [None]:
# Sentence ids and sentence text as parallel lists.
sentence_ids_complete = data_all_sent['sentid'].tolist()
sentences = data_all_sent['sentence'].tolist()
# Each id is split on '.'; its first three fields, re-joined, form the key
# shared by the variants of one sentence.
sentence_ids_cs = [x.split('.') for x in sentence_ids_complete]

# dict.fromkeys de-duplicates while keeping first-seen order. The previous
# list(set(...)) produced an order that can change between interpreter runs,
# which made the order of the derived dataset non-reproducible. The loop
# variable is also no longer named `id`, so the builtin is not shadowed here.
unique_ids = list(dict.fromkeys(
    id_parts[0] + '.' + id_parts[1] + '.' + id_parts[2]
    for id_parts in sentence_ids_cs
))

In [None]:
# One two-slot placeholder [0, 0] per unique id; every id gets its own list object.
dataset_dict = {template_id: [0, 0] for template_id in unique_ids}

In [None]:
# Store each sentence in its id's pair: slot 0 for 'male', slot 1 for 'female'.
# Sentences whose fourth id field is anything else are skipped.
slot_by_gender = {'male': 0, 'female': 1}
for id_parts, sentence in zip(sentence_ids_cs, sentences):
  slot = slot_by_gender.get(id_parts[3])
  if slot is not None:
    dataset_dict['.'.join(id_parts[:3])][slot] = sentence

In [None]:
# Number of distinct ids, i.e. sentence pairs.
len(dataset_dict)

In [None]:
# Flatten dataset_dict into parallel lists: ids, slot-0 sentences, slot-1
# sentences, and "NA" placeholders for the two label columns.
indexing = list(dataset_dict)
sent_more_fil = [pair[0] for pair in dataset_dict.values()]
sent_less_fil = [pair[1] for pair in dataset_dict.values()]
bias_type_fil = ["NA"] * len(indexing)
stereo_antistereo_fil = ["NA"] * len(indexing)

In [None]:
# Sanity check: number of pairs, then the first pair.
print(len(sent_more_fil))
print(sent_more_fil[0])
print(sent_less_fil[0])