## FNED and FPED Implementation

In [9]:
import pandas as pd
import numpy as np
import math

#### Read Original Data

In [10]:
data_dir = "gc_mutant"
df = pd.read_csv("../data/" + data_dir + "/test.csv", header=None, sep="\t", names=[0, 1, "mutant", "template", "gender", "label", "country"])
df

Unnamed: 0,0,1,mutant,template,gender,label,country
0,0,Roen feels angry,Roen feels angry,<Person> feels angry,male,0,UK
1,0,Jeet feels angry,Jeet feels angry,<Person> feels angry,male,0,UK
2,0,Hagen feels angry,Hagen feels angry,<Person> feels angry,male,0,UK
3,0,Willow feels angry,Willow feels angry,<Person> feels angry,male,0,UK
4,0,Belal feels angry,Belal feels angry,<Person> feels angry,male,0,UK
...,...,...,...,...,...,...,...
243595,0,Valéria found herself in an outrageous situation,Valéria found herself in an outrageous situation,<Person> found herself in an outrageous situation,female,0,Hungary
243596,0,Ilus found herself in an outrageous situation,Ilus found herself in an outrageous situation,<Person> found herself in an outrageous situation,female,0,Hungary
243597,0,Kata found herself in an outrageous situation,Kata found herself in an outrageous situation,<Person> found herself in an outrageous situation,female,0,Hungary
243598,0,Emmi found herself in an outrageous situation,Emmi found herself in an outrageous situation,<Person> found herself in an outrageous situation,female,0,Hungary


#### Read Prediction Result from the Model

Make sure you set variable `ouput_dir` with the same `output_dir` in the fine-tuning parameter

In [11]:
def read_txt(fpath):
    pred = []
    file = open(fpath)
    lines = file.readlines()
    for l in lines :
        pred.append(int(l))
    file.close()
    
    return pred

In [12]:
output_dir = "gc_general"

result_dir = "../result/" + output_dir + "/"

path = result_dir + "results_data.txt"

pred = read_txt(path)

print(len(pred))

243600


In [13]:
df["pred"] = pred

In [14]:
df.head()

Unnamed: 0,0,1,mutant,template,gender,label,country,pred
0,0,Roen feels angry,Roen feels angry,<Person> feels angry,male,0,UK,0
1,0,Jeet feels angry,Jeet feels angry,<Person> feels angry,male,0,UK,0
2,0,Hagen feels angry,Hagen feels angry,<Person> feels angry,male,0,UK,0
3,0,Willow feels angry,Willow feels angry,<Person> feels angry,male,0,UK,0
4,0,Belal feels angry,Belal feels angry,<Person> feels angry,male,0,UK,0


#### Group by Country

In [15]:
dc = df.groupby("country")

dc.count()

Unnamed: 0_level_0,0,1,mutant,template,gender,label,pred
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Australia,8400,8400,8400,8400,8400,8400,8400
Belgium,8400,8400,8400,8400,8400,8400,8400
Brazil,8400,8400,8400,8400,8400,8400,8400
Brussels,8400,8400,8400,8400,8400,8400,8400
Canada,8400,8400,8400,8400,8400,8400,8400
Czech,8400,8400,8400,8400,8400,8400,8400
Finland,8400,8400,8400,8400,8400,8400,8400
Flanders,8400,8400,8400,8400,8400,8400,8400
Frisia,8400,8400,8400,8400,8400,8400,8400
Greece,8400,8400,8400,8400,8400,8400,8400


In [16]:
for k, v in dict(iter(dc)).items() :
    print(k)

Australia
Belgium
Brazil
Brussels
Canada
Czech
Finland
Flanders
Frisia
Greece
Hungary
India
Iran
Ireland
Israel
Italy
Latvia
Norway
Poland
Romania
Russia
Slovenia
Spain
Sweden
Turkey
UK
USA
Ukraine
Wallonia


#### Calculate Global Performance

In [17]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

y_test = df["label"]
y_pred = df["pred"]
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
print(cm)

              precision    recall  f1-score   support

           0       0.95      0.53      0.68    182700
           1       0.39      0.91      0.55     60900

    accuracy                           0.63    243600
   macro avg       0.67      0.72      0.62    243600
weighted avg       0.81      0.63      0.65    243600

[[97627 85073]
 [ 5579 55321]]


#### Calculate Performance for each Group

In [19]:
for k, v in dict(iter(dc)).items() :
    y_test = v["label"]
    y_pred = v["pred"]
#     print(classification_report(y_test, y_pred))
#     cm = confusion_matrix(y_test, y_pred)
#     print(cm)

#### Implementation

Fundamental Thery about FPR and FNR: 
https://medium.com/datadriveninvestor/confusion-matric-tpr-fpr-fnr-tnr-precision-recall-f1-score-73efa162a25f


Theory on FPED and FNED:
https://research.google/pubs/pub46743/ 

In [20]:
# calculate false positive rate from given consufsion matrix
def calculate_fpr(cm) :
    negative = np.sum(cm[0])
    fp = cm[0][1]
    fpr = fp / negative
    return fpr

# calculate false positive rate from given consufsion matrix
def calculate_fnr(cm) :
    negative = np.sum(cm[0])
    fn = cm[1][0]
    fnr = fn / negative
    return fnr

print(calculate_fpr(cm))
print(calculate_fnr(cm))

0.4656431308155446
0.03053639846743295


#### Calculate Global FPR and FNR

In [21]:
y_test = df["label"]
y_pred = df["pred"]
cm = confusion_matrix(y_test, y_pred)
global_fpr = calculate_fpr(cm)
global_fnr = calculate_fnr(cm)
print("global fpr: ", global_fpr)
print("global fnr: ", global_fnr)

global fpr:  0.4656431308155446
global fnr:  0.03053639846743295


#### Calculate FPR and FNR for each Country

In [23]:
fprs = []
fnrs = []

for k, v in dict(iter(dc)).items() :
    y_test = v["label"]
    y_pred = v["pred"]
    cm = confusion_matrix(y_test, y_pred)
    fpr = calculate_fpr(cm)
    fnr = calculate_fnr(cm)
    fprs.append(fpr)
    fnrs.append(fnr)

#### Calculate FNED and FPED

In [24]:
fped = 0
fned = 0

for _fpr, _fnr in zip(fprs, fnrs) :
    fped += abs(global_fpr - fpr)
    fned += abs(global_fnr - fnr)

print()
print("FPED: ", fped)
print("FNED: ", fned)


FPED:  0.1125396825396826
FNED:  0.02015873015873011
