# Data Preparation for Repairing the Model Inference to Handle Fairness Issue

* Prepare the selected templates from the IMDB review data
* Take a random sample of names (use https://www.surveysystem.com/sscalc.htm to calculate how many names are needed for a representative sample size)
* Create mutant texts from the names
* Return the majority result from the predictions (Note: there is no need for a true label; the majority prediction itself represents the fair inference) -> **presented in another notebook**

In [1]:
import pandas as pd
import numpy as np
import math

#### Preparing Mutant Template

Please refer to `codes/mutant-generation.ipynb` for details on how the templates were obtained.

In [4]:
# Both mutant files share one layout: tab-separated, no header row, and the
# three columns named below.
read_opts = dict(header=None, sep="\t", names=["label", "mutant", "template"])
dfm = pd.read_csv("../data/imdb_mutant/male/test.csv", **read_opts)
dff = pd.read_csv("../data/imdb_mutant/female/test.csv", **read_opts)

In [5]:
# Display the male-mutant table (columns: label, mutant, template).
dfm

Unnamed: 0,label,mutant,template
0,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
1,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
2,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
3,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
4,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
...,...,...,...
138995,1,"First, I'm a huge Justin fan. I grew up knowin...","First, I'm a huge Buddy Holly fan. I grew up k..."
138996,1,"First, I'm a huge Terrence fan. I grew up know...","First, I'm a huge Buddy Holly fan. I grew up k..."
138997,1,"First, I'm a huge Roger fan. I grew up knowing...","First, I'm a huge Buddy Holly fan. I grew up k..."
138998,1,"First, I'm a huge Torrance fan. I grew up know...","First, I'm a huge Buddy Holly fan. I grew up k..."


In [6]:
# Stack the male and female mutant tables into one frame. `ignore_index` is
# not passed, so each input keeps its own row labels and the labels repeat.
df = pd.concat([dfm, dff])

In [7]:
# Display the combined male + female mutant table.
df

Unnamed: 0,label,mutant,template
0,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
1,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
2,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
3,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
4,1,"I have only see three episodes of Hack, starri...","I have only see three episodes of Hack, starri..."
...,...,...,...
138995,1,"First, I'm a huge Melanie fan. I grew up knowi...","First, I'm a huge Buddy Holly fan. I grew up k..."
138996,1,"First, I'm a huge Tanisha fan. I grew up knowi...","First, I'm a huge Buddy Holly fan. I grew up k..."
138997,1,"First, I'm a huge Nancy fan. I grew up knowing...","First, I'm a huge Buddy Holly fan. I grew up k..."
138998,1,"First, I'm a huge Tia fan. I grew up knowing w...","First, I'm a huge Buddy Holly fan. I grew up k..."


In [8]:
# Convert the template text to a categorical column and expose its integer
# category code as `template_id`, giving every distinct template one id.
template_cat = df["template"].astype("category")
df["template"] = template_cat
df["template_id"] = template_cat.cat.codes

In [9]:
# Group the mutant rows by the template they were generated from.
gb = df.groupby("template_id")

In [10]:
# Per-template count of non-null values in each column, i.e. how many mutants
# each template has.
gb.count()

Unnamed: 0_level_0,label,mutant,template
template_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,40,40,40
1,40,40,40
2,40,40,40
3,40,40,40
4,40,40,40
...,...,...,...
6893,40,40,40
6894,40,40,40
6895,40,40,40
6896,40,40,40


We have 6898 templates, with 40 mutants for each template.

#### Preparing Names from Gender Computer

In [11]:
# Load the Gender Computer name table; the displayed output shows the columns
# Name, Gender and Country.
# NOTE: the variable name `gc` would shadow the standard-library `gc` module
# if that module were ever imported in this notebook.
gc = pd.read_csv("../data/gc_name/data.csv")
gc

Unnamed: 0,Name,Gender,Country
0,Roen,male,UK
1,Jeet,male,UK
2,Hagen,male,UK
3,Willow,male,UK
4,Belal,male,UK
...,...,...,...
615,Virág,female,Hungary
616,Adél,female,Hungary
617,Olga,female,Hungary
618,Jolán,female,Hungary


In [18]:
# Split the name table by the value of its "Gender" column.
is_male_name = gc["Gender"] == "male"
is_female_name = gc["Gender"] == "female"
gcm = gc[is_male_name]
gcf = gc[is_female_name]

#### Mutant Generation

In [14]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

import spacy
import en_core_web_lg
import neuralcoref
# Load the large English spaCy model, build a NeuralCoref component on the
# model's vocabulary, and register it in the pipeline under the name
# 'neuralcoref'. The Coref class below reads `doc._.coref_clusters` and
# `doc._.coref_resolved` from documents produced by this `nlp` object.
nlp = en_core_web_lg.load()
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

In [19]:
# Candidate first names, taken from the Gender Computer (GC) table
mnames = gcm["Name"].values
fnames = gcf["Name"].values

# Alternative: names from the EEC paper
# mnames = ["Alonzo", "Adam", "Alphonse", "Alan", "Darnell", "Andrew", "Jamel", "Frank", "Jerome", "Harry", "Lamar", "Jack", "Leroy", "Josh", "Malik", "Justin", "Terrence", "Roger", "Torrance", "Ryan"]
# fnames = ["Ebony", "Amanda", "Jasmine", "Betsy", "Lakisha", "Courtney", "Latisha", "Ellen", "Latoya", "Heather", "Nichelle", "Katie", "Shaniqua", "Kristin", "Shereen", "Melanie", "Tanisha", "Nancy", "Tia", "Stephanie"]

# Alternative: a tiny name list for debugging
# mnames = ["Alonzo", "Adam"] 
# fnames = ["Ebony", "Amanda"]

# Masculine pronouns
mp = ["He", "he", "him", "his", "himself"]

# Feminine pronouns, position-aligned with `mp`. "her" appears twice because
# it is the counterpart of both "him" and "his".
fp = ["She", "she","her", "her", "herself"]

# Gender flippers built from the aligned lists:
#   fc: masculine pronoun -> feminine counterpart
#   mc: feminine pronoun  -> masculine counterpart
# Because "her" occurs twice in `fp`, the later pair wins and mc["her"] is
# "his" (never "him").
fc = dict(zip(mp, fp))
mc = dict(zip(fp, mp))

In [20]:
class Token:
    """A word together with its location inside the sentence.

    The location is given as a start character offset and an end character
    offset.
    """

    # Class-level defaults; every instance overrides them in __init__.
    word = ""
    start = -1
    end = -1

    def __init__(self, word, start, end):
        self.word, self.start, self.end = word, start, end

    def get_word(self):
        """Return the stored word."""
        return self.word

    def get_start(self):
        """Return the start character offset."""
        return self.start

    def get_end(self):
        """Return the end character offset."""
        return self.end

    def __str__(self):
        return self.word

    # The printable representation is the word itself, same as str().
    __repr__ = __str__


class Ref:
    """One coreference entry: a main name plus all mentions that refer to it.

    e.g. La Marquesa herself : [La Marquesa herself, her]

    Attributes:
        name: string form of the main mention.
        reference: list of Token objects, one per mention (text plus its
            start/end character offsets).
        reference_list: list with the plain text of every mention.
    """

    # Class-level defaults; __init__ replaces them with per-instance values.
    name = ""
    reference = []
    reference_list = []

    def __init__(self, name, reference):
        """Store the name and copy text/offsets out of every mention.

        Each item of `reference` must expose `text`, `start_char` and
        `end_char`.
        """
        self.name = str(name)
        mentions = list(reference)
        self.reference_list = [mention.text for mention in mentions]
        self.reference = [
            Token(mention.text, mention.start_char, mention.end_char)
            for mention in mentions
        ]

    def __str__(self):
        return self.name + ": " + str(self.reference_list)

    # Same rendering for repr() as for str().
    __repr__ = __str__

    def get_name(self):
        """Return the name of this entry."""
        return self.name

    def get_reference(self):
        """Return the list of Token objects for the mentions."""
        return self.reference

    def is_having_male_subject(self):
        """Return True when one of the mentions is exactly "He" or "he"."""
        return "He" in self.reference_list or "he" in self.reference_list

    def is_having_female_subject(self):
        """Return True when one of the mentions is exactly "She" or "she"."""
        return "She" in self.reference_list or "she" in self.reference_list

class Coref:
    """Coreference analysis of one text plus generation of gendered mutants.

    On construction the text is run through the module-level `nlp` pipeline
    and every coreference cluster is wrapped in a `Ref`. If exactly one
    gendered subject is detected (see `check_one_subject`), the text is split
    into the chunks lying between that subject's mentions, so mutants can be
    produced by re-joining the chunks with a new name / flipped pronouns.

    Mutant generation reads these module-level names: `mnames`, `fnames`
    (candidate names), `mp`, `fp` (pronoun lists) and `mc`, `fc` (pronoun
    flippers).

    Attributes:
        original: the input text as a string.
        resolved: the coreference-resolved text reported by the pipeline.
        refs: list of `Ref`, one per coreference cluster.
        one_subject: True when exactly one gendered subject was detected.
        is_male: True/False for the detected subject's gender, None when
            `one_subject` is False.
        subject_reference: the `Ref` of the detected subject, or None.
        chunk: pieces of `original` surrounding the subject's mentions
            (empty when `one_subject` is False).
    """

    # Class-level defaults; __init__ assigns per-instance values.
    original = ""
    resolved = ""
    refs = []
    one_subject = False
    is_male = False
    subject_reference = None
    chunk = []

    def __init__(self, text):
        self.original = str(text)
        doc = nlp(text)
        self.resolved = str(doc._.coref_resolved)
        self.refs = [Ref(cluster.main, cluster.mentions)
                     for cluster in doc._.coref_clusters]

        self.one_subject, self.subject_reference, self.is_male = self.check_one_subject()

        # Always bind a per-instance list so instances never share the
        # class-level default.
        self.chunk = []
        if self.one_subject:
            self.chunk = self.generate_chunk_from_coref()

    def get_original(self):
        """Return the input text."""
        return self.original

    def get_resolved(self):
        """Return the coreference-resolved text."""
        return self.resolved

    def get_refs(self):
        """Return the list of `Ref` objects found in the text."""
        return self.refs

    def is_one_subject(self):
        """Return True when exactly one gendered subject was detected."""
        return self.one_subject

    def check_one_subject(self):
        """Look for a single gendered subject among the coreference clusters.

        A cluster counts once if it has a male subject pronoun and once if it
        has a female subject pronoun (so a cluster with both counts twice).

        Returns:
            (True, ref, is_male) when the total count is exactly one, where
            `ref` is the matching `Ref` and `is_male` its gender;
            (False, None, None) otherwise.
        """
        hits = 0
        subject_reference = None
        is_male = None
        for ref in self.refs:
            if ref.is_having_male_subject():
                hits += 1
                subject_reference = ref
                is_male = True

            if ref.is_having_female_subject():
                hits += 1
                subject_reference = ref
                is_male = False

        if hits == 1:
            return True, subject_reference, is_male
        return False, None, None

    def get_subject_reference(self):
        """Return the `Ref` of the detected subject, or None."""
        return self.subject_reference

    def generate_chunk_from_coref(self):
        """Split the original text at the subject's mentions.

        Returns a list with one chunk before each mention plus the trailing
        remainder after the last mention, i.e. `len(mentions) + 1` items (an
        empty list when there are no mentions). An empty chunk before a
        mention is replaced by a single space; the trailing remainder is kept
        as is.

        The text is always taken from `self.original`, so the result does not
        depend on any variable outside this object.
        """
        source = self.original
        mentions = self.subject_reference.get_reference()
        chunk = []
        lower = 0  # end offset of the previous mention (0 before the first)
        for mention in mentions:
            piece = source[lower:mention.start]
            chunk.append(piece if piece != "" else " ")
            lower = mention.end

        if mentions:
            chunk.append(source[lower:])

        return chunk

    def _render_mutants(self, names, gender, source_pronouns, flipper=None):
        """Build one mutant per name by re-joining chunks and mentions.

        A mention whose word is in `source_pronouns` is kept unchanged when
        `flipper` is None, otherwise replaced by `flipper[word]`. Every other
        mention is replaced by the name.
        """
        mentions = self.subject_reference.get_reference()
        chunk = self.chunk
        mutants = []
        for name in names:
            parts = [chunk[0]]
            for position, mention in enumerate(mentions, start=1):
                word = mention.word
                if word in source_pronouns:
                    parts.append(word if flipper is None else flipper[word])
                else:
                    parts.append(name)
                parts.append(chunk[position])
            mutants.append({"gender": gender, "text": "".join(parts)})
        return mutants

    def generate_male_mutant_text(self):
        """Return one {"gender": "male", "text": ...} dict per name in `mnames`.

        Masculine pronouns are kept for a male subject; feminine pronouns are
        flipped through `mc` for a female subject. Returns an empty list when
        no single subject was detected.
        """
        if self.subject_reference is None:
            return []
        if self.is_male:
            return self._render_mutants(mnames, "male", mp)
        return self._render_mutants(mnames, "male", fp, mc)

    def generate_female_mutant_text(self):
        """Return one {"gender": "female", "text": ...} dict per name in `fnames`.

        Masculine pronouns are flipped through `fc` for a male subject;
        feminine pronouns are kept for a female subject. Returns an empty
        list when no single subject was detected.
        """
        if self.subject_reference is None:
            return []
        if self.is_male:
            return self._render_mutants(fnames, "female", mp, fc)
        return self._render_mutants(fnames, "female", fp)

    def generate_mutant_text(self):
        """Return all male mutants followed by all female mutants."""
        return self.generate_male_mutant_text() + self.generate_female_mutant_text()

In [24]:
# Keep only the label and the template text; the mutants are regenerated below.
dt = df[["label", "template"]]

In [30]:
# Collapse repeated (label, template) rows to a single row each and renumber
# the index from 0, then display the result.
dt = dt.drop_duplicates().reset_index(drop=True)
dt

Unnamed: 0,label,template
0,1,"I have only see three episodes of Hack, starri..."
1,1,In the groovy mid 70's a scruffy bunch of bras...
2,1,This must have been one of Chaplin's most ambi...
3,1,The debut that plucked from obscurity one of t...
4,1,There is really no way to compare this motion ...
...,...,...
6893,1,I heard they were going to remake this French ...
6894,1,"Well, the movie did turn out a lot better than..."
6895,0,"In this film, there is a loose plot of a man (..."
6896,1,The French Babbette appears at the modest hous...


In [48]:
import time
start = time.time()

# Parallel output columns: one entry per generated mutant.
original_arr = []
mutant_arr = []
gender_arr = []
label_arr = []

for index, row in dt.iterrows():
    label = row['label']
    text = row['template']
    c = Coref(text)

    # Templates without exactly one gendered subject produce no mutants.
    if not c.is_one_subject():
        continue

    mtext = c.generate_mutant_text()
    original_arr.extend([text] * len(mtext))
    mutant_arr.extend(m["text"] for m in mtext)
    gender_arr.extend(m["gender"] for m in mtext)
    label_arr.extend([label] * len(mtext))


end = time.time()
print("Execution Time: ", end-start)

Execution Time:  1116.200192451477


In [49]:
# Assemble the generated mutants into one table; the column order is
# label, mutant, gender, original.
mutant_columns = {
    "label": label_arr,
    "mutant": mutant_arr,
    "gender": gender_arr,
    "original": original_arr,
}
dmutant = pd.DataFrame(data=mutant_columns)
dmutant

Unnamed: 0,label,mutant,gender,original
0,1,"I have only see three episodes of Hack, starri...",male,"I have only see three episodes of Hack, starri..."
1,1,"I have only see three episodes of Hack, starri...",male,"I have only see three episodes of Hack, starri..."
2,1,"I have only see three episodes of Hack, starri...",male,"I have only see three episodes of Hack, starri..."
3,1,"I have only see three episodes of Hack, starri...",male,"I have only see three episodes of Hack, starri..."
4,1,"I have only see three episodes of Hack, starri...",male,"I have only see three episodes of Hack, starri..."
...,...,...,...,...
4276755,1,"First, I'm a huge Virág fan. I grew up knowing...",female,"First, I'm a huge Buddy Holly fan. I grew up k..."
4276756,1,"First, I'm a huge Adél fan. I grew up knowing ...",female,"First, I'm a huge Buddy Holly fan. I grew up k..."
4276757,1,"First, I'm a huge Olga fan. I grew up knowing ...",female,"First, I'm a huge Buddy Holly fan. I grew up k..."
4276758,1,"First, I'm a huge Jolán fan. I grew up knowing...",female,"First, I'm a huge Buddy Holly fan. I grew up k..."


In [50]:
import os

dirname = "../data/gc_imdb/"

# Create the output directory if needed. exist_ok=True replaces the separate
# exists() check, so there is no window between checking and creating.
os.makedirs(dirname, exist_ok=True)

# Write the mutants as a tab-separated file without a header row and without
# the index column; the columns are label, mutant, gender, original.
dmutant.to_csv(dirname + "test.csv", index=False, header=False, sep="\t")