In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sbn

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt

from balancers import PredictionBalancer
import tools

First, let's load our data. Here we're using some COVID outbreak investigation data with information about symptoms, test results, and demographics for each person.

In [2]:
# Read the investigation records from the CSV next to this notebook.
records = pd.read_csv(filepath_or_buffer='records.csv')

Race will be our protected category. To keep things simple, we'll limit the category to 4 groups: `Black / African American`, `White`, `Asian`, and `Undisclosed`.

In [3]:
# Keep only the rows whose race label is one of the four groups under study;
# every other label is dropped from `records`.
records = records[records['race'].isin([
    'Black / African American',
    'White',
    'Asian',
    'Undisclosed',
])]
# Protected-attribute label for each remaining row, in row order.
race = records['race'].values

Next, we'll set the true outcome variable, Y. This is the thing we're trying to predict fairly across groups. Since PCR is the gold standard for SARS-CoV-2 infection, that's what we'll use here.

In [4]:
# True outcome (Y): the PCR column of the filtered records, as an array.
pcr = records['pcr'].values

We also need to come up with some (potentially unfair) predictors to predict test status. We'll start with a few binary symptom variables.

In [5]:
# Extract each symptom column as an array. The taste/smell column is stored
# as `losstastesmell` but is referred to as `taste` from here on.
cough, fever, taste = (
    records[column].values
    for column in ('cough', 'fever', 'losstastesmell')
)

# Parallel lists: symptom_names[i] is the display name for symptoms[i].
symptom_names = ['cough', 'fever', 'taste']
symptoms = [cough, fever, taste]

Before stratifying by race, let's start by seeing how well these predict test status overall. `tools.clf_metrics()` produces lots of nice info, so let's use that.

In [6]:
# Evaluate each symptom on its own as a predictor of PCR status, labelling
# each result with the symptom's name, then stack the results along axis 0.
symptom_stats = pd.concat(
    [
        tools.clf_metrics(true=pcr, pred=values, mod_name=label)
        for label, values in zip(symptom_names, symptoms)
    ],
    axis=0,
)

In [7]:
# Show the combined per-symptom metrics table as this cell's output.
symptom_stats

Unnamed: 0,tp,fp,tn,fn,sens,spec,ppv,npv,j,f1,mcc,brier,auc,ap,true_prev,pred_prev,prev_diff,rel_prev_diff,model
0,114.0,301.0,2729.0,182.0,0.3851,0.9007,0.2747,0.9375,0.2858,0.3207,0.24625,0.1452,0,0,296.0,415.0,119.0,0.402,cough
0,90.0,144.0,2886.0,206.0,0.3041,0.9525,0.3846,0.9334,0.2566,0.3396,0.285612,0.1052,0,0,296.0,234.0,-62.0,-0.2095,fever
0,91.0,80.0,2950.0,205.0,0.3074,0.9736,0.5322,0.935,0.281,0.3897,0.362345,0.0857,0,0,296.0,171.0,-125.0,-0.4223,taste


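All 3 symptoms predict reasonably well: each is highly specific (specificity ≥ 0.90), though none is very sensitive (sensitivity < 0.40). Loss of taste and/or smell has the highest F1-score and MCC and is nearly tied with cough on Youden's J, so let's say it's the best overall.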

In [8]:
# Create a PredictionBalancer (from the local `balancers` module) with its default arguments.
pb = PredictionBalancer()