## Detecting and Mitigating Gender Biases in Word Embeddings

In [None]:

# Clone the code repository from https://github.com/MohdGhazanfar/DebiasUrduNLP
# mkdir AI_genderBiases
# cd AI_genderBiases
# git clone https://github.com/MohdGhazanfar/DebiasUrduNLP

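# Download the word embeddings from: https://drive.google.com/uc?id=1K_4Fbdv9GJDNjR_avLbzKdJPEMVWdYBm&export=download
# Place the downloaded .bin file in the ./embeddings directory (it is loaded below as ./embeddings/urduvec_140M_100K_300d.bin)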

In [None]:
import sys
sys.path.append(r'C:\Users\Shayan\Documents\FYP\AI_genderbiases\debiaswe')
from __future__ import print_function, division
%matplotlib inline
from matplotlib import pyplot as plt
import json
import random
import numpy as np
import debiaswe as dwe
import debiaswe.we as we
from debiaswe.debias import debias
from debiaswe.we import WordEmbedding
from debiaswe.data import load_professions


In [None]:
# Read the pretrained Urdu vectors (urduvec_140M_100K_300d) from the local
# embeddings directory.
E = WordEmbedding('./embeddings/urduvec_140M_100K_300d.bin')

# Fetch the profession records and keep only the first element of each one.
professions = load_professions()
profession_words = [record[0] for record in professions]

In [None]:
# Gender direction: the vector E.diff returns for the word pair below.
# It is reused later for the analogy search and for scoring professions.
# NOTE(review): presumably the (normalized) difference of the two word
# vectors — confirm against debiaswe.we.WordEmbedding.diff.
v_gender = E.diff("لڑکا", "لڑکی")

In [None]:
# Gender analogies on the ORIGINAL (not yet debiased) embedding.
# The result is named `a_gender` rather than `a_gender_debiased` because no
# debiasing has happened at this point; the post-debias cell computes its own
# `a_gender_debiased`.
a_gender = E.best_analogies_dist_thresh(v_gender)

# Each entry is unpacked as (a, b, third); only the word pair is printed.
for (a, b, _) in a_gender:
    print(f"{a} - {b}")

In [None]:
# Profession analysis (before debiasing): project every profession word that
# is present in the embedding vocabulary onto the gender direction and sort
# the (score, word) pairs in ascending order of that projection.
sp = sorted([(E.v(w).dot(v_gender), w) for w in profession_words if w in E.index])

# Persist the ranking. The words are Urdu text, so the encoding is given
# explicitly: a non-UTF-8 platform default (e.g. cp1252 on Windows) cannot
# encode them and would raise UnicodeEncodeError.
with open("urdu_biased_professions.txt", "w", encoding="utf-8") as file:
    for item in sp:
        # Convert each (score, word) tuple to a string, one per line
        file.write(str(item) + "\n")

sp[0:20], sp[-20:] # Most feminine,  # Most masculine

In [None]:
# Let's load some gender-related word lists to help us with debiasing.
# The files are opened as UTF-8 explicitly so that Urdu text is decoded the
# same way on every platform instead of depending on the locale default.
with open('./data/urdu_definitional_pairs.json', "r", encoding="utf-8") as f:
    defs = json.load(f)
print("definitional", defs)

with open('./data/urdu_equalize_pairs.json', "r", encoding="utf-8") as f:
    equalize_pairs = json.load(f)

with open('./data/urdu_gender_specific_seed.json', "r", encoding="utf-8") as f:
    gender_specific_words = json.load(f)
# Show the size of the seed list and a sample of its first ten entries.
print("gender specific", len(gender_specific_words), gender_specific_words[:10])

In [None]:
# Run the debiasing routine on E with the gender-specific seed words, the
# definitional pairs and the equalize pairs loaded from the JSON files.
# NOTE(review): the return value is discarded and E is reused afterwards, so
# debias presumably modifies E in place — confirm in debiaswe.debias.
debias(E, gender_specific_words, defs, equalize_pairs)

In [None]:
# Profession analysis (after debiasing): recompute the projection of each
# profession word onto the gender direction, sorted ascending.
# Words missing from the embedding vocabulary are skipped, exactly as in the
# pre-debias ranking, so both rankings cover the same words and an
# out-of-vocabulary profession cannot break the lookup.
sp_debiased = sorted([(E.v(w).dot(v_gender), w) for w in profession_words if w in E.index])

# Display the 20 lowest-scoring and 20 highest-scoring professions.
sp_debiased[0:20], sp_debiased[-20:]

In [None]:
# Pick out the post-debias (score, word) entries whose word equals 'معلم'
# and print them.
results = [
    (value, term)
    for value, term in sp_debiased
    if term.lower() == 'معلم'
]
print(results)

In [None]:
# Re-run the analogy search along the gender direction, this time on the
# embedding after the debias step, and list the resulting word pairs.
a_gender_debiased = E.best_analogies_dist_thresh(v_gender)

# Only the two words of each entry are shown; the third element is ignored.
for a, b, _ in a_gender_debiased:
    print(f"{a} - {b}")