In [39]:
# process file 1
# extract probabilities of being positive

def read_file_1(filename):
    """Read prediction scores from a comma-separated text file.

    Every non-blank line is split on commas and its second field is parsed
    as a float. Blank / whitespace-only lines are skipped.

    Args:
        filename: Path of the predictions file.

    Returns:
        list[float]: One score per non-blank line, in file order.

    Raises:
        IndexError: If a non-blank line has no second comma-separated field.
        ValueError: If the second field cannot be parsed as a float.
    """
    y_score = []
    with open(filename) as f:
        # Stream the file line by line instead of loading it with readlines().
        for line in f:
            line = line.strip()
            if not line:
                # A blank line carries no record; indexing its fields would
                # raise IndexError, so skip it.
                continue
            y_score.append(float(line.split(',')[1]))
    return y_score

In [40]:
# process file 2
# extract ground-truth labels (P or N)

def read_file_2(filename):
    """Read ground-truth class labels from a comma-separated text file.

    Every non-blank line is split on commas; its second field, stripped of
    surrounding whitespace, must be 'P' (mapped to 1) or 'N' (mapped to 0).
    Blank / whitespace-only lines are skipped.

    Args:
        filename: Path of the labels file.

    Returns:
        list[int]: One 0/1 label per non-blank line, in file order.

    Raises:
        IndexError: If a non-blank line has no second comma-separated field.
        ValueError: If the label is neither 'P' nor 'N'.
    """
    label_to_int = {'P': 1, 'N': 0}
    y_true = []
    with open(filename) as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            class_ = line.split(',')[1].strip()
            if class_ not in label_to_int:
                # Fail loudly: an unrecognised label must not fall through and
                # reuse whatever value the previous line produced.
                raise ValueError(
                    "{}:{}: unknown class label {!r} (expected 'P' or 'N')".format(
                        filename, line_no, class_))
            y_true.append(label_to_int[class_])
    return y_true

In [41]:
# calculates true positive rate (tpr) and false positive rate (fpr)

def tpr_fpr(y_true, y_score, threshold):
    """Return the ROC operating point (fpr, tpr) for a single threshold.

    A sample is predicted positive when its score is >= threshold and
    predicted negative when its score is < threshold. A label equal to 1 is an
    actual positive; any other label is an actual negative. A score that
    satisfies neither comparison (e.g. NaN) is left out of every count.

    Args:
        y_true: Labels, indexed in parallel with y_score.
        y_score: Scores; their count determines how many samples are read.
        threshold: Decision threshold.

    Returns:
        tuple: (false positive rate, true positive rate).

    Raises:
        ZeroDivisionError: If no actual positives or no actual negatives
            were counted.
    """
    true_pos = false_pos = false_neg = true_neg = 0

    for idx, score in enumerate(y_score):
        if score >= threshold:
            predicted_positive = True
        elif score < threshold:
            predicted_positive = False
        else:
            continue  # score is not comparable with the threshold

        actual_positive = y_true[idx] == 1
        if predicted_positive:
            if actual_positive:
                true_pos += 1
            else:
                false_pos += 1
        else:
            if actual_positive:
                false_neg += 1
            else:
                true_neg += 1

    tpr = true_pos / (true_pos + false_neg)
    fpr = false_pos / (true_neg + false_pos)

    return (fpr, tpr)

In [42]:
def auc(y_true, y_score):
    """Compute the area under the ROC curve with the trapezoidal rule.

    Samples are swept from the highest score to the lowest. After all samples
    sharing one score have been counted as predicted-positive, one ROC point
    (fpr, tpr) is emitted, so tied scores produce a single point. The curve
    starts at (0, 0) and, once every sample is counted, ends at (1, 1).

    A label equal to 1 is a positive; any other label is a negative.

    Args:
        y_true: Labels, indexed in parallel with y_score. Entries beyond
            len(y_score) are ignored.
        y_score: Scores, higher meaning "more likely positive".

    Returns:
        The area under the ROC curve as a float, or 0 when y_score is empty.

    Raises:
        ZeroDivisionError: If the labels read contain only one class, because
            one of the two rates is then undefined.
        IndexError: If y_true is shorter than y_score.
    """
    from itertools import groupby

    n = len(y_score)
    if n == 0:
        return 0

    # Pair each score with "is this sample a positive?" and rank by score,
    # highest first.
    ranked = sorted(
        ((y_score[i], y_true[i] == 1) for i in range(n)),
        key=lambda pair: pair[0],
        reverse=True,
    )
    n_pos = sum(1 for _, is_pos in ranked if is_pos)
    n_neg = n - n_pos
    if n_pos == 0 or n_neg == 0:
        raise ZeroDivisionError(
            "ROC AUC is undefined: y_true must contain both positive and "
            "negative samples")

    area = 0.0
    tp = fp = 0
    prev_fpr = prev_tpr = 0.0  # the curve is anchored at (0, 0)
    for _, tied in groupby(ranked, key=lambda pair: pair[0]):
        # Lowering the threshold to this score flips every tied sample to
        # predicted-positive at once.
        for _, is_pos in tied:
            if is_pos:
                tp += 1
            else:
                fp += 1
        fpr = fp / n_neg
        tpr = tp / n_pos
        area += (fpr - prev_fpr) * (tpr + prev_tpr) / 2
        prev_fpr, prev_tpr = fpr, tpr

    return area

In [43]:
# Input files, named once so they are easy to find and change.
PREDICTIONS_FILE = '3.txt'
LABELS_FILE = '4.txt'

y_score = read_file_1(PREDICTIONS_FILE) # first file (predictions)
y_true = read_file_2(LABELS_FILE) # second file (true labels)

# Scores and labels are paired by position downstream, so both files must
# yield the same number of records.
assert len(y_score) == len(y_true), (
    "predictions and labels differ in length: {} vs {}".format(
        len(y_score), len(y_true)))

In [44]:
# Evaluate auc() on the loaded labels and scores; as the cell's last
# expression, its value is what the notebook displays.
auc(y_true, y_score)

0.8333333333333334