## CORELS

## Imports and setup

In [None]:
from corels import *
import numpy as np
import time
import matplotlib.pyplot as plt
# Read the COMPAS csv. The four returned values are used below as the sample
# matrix (X), the labels (y), the list of feature names (features) and the
# name of the predicted outcome (prediction).
compas_csv = "/home/scott/data/corels/compas.csv"
X, y, features, prediction = load_from_csv(compas_csv)

## Set up data

In [None]:
# Column range of the race indicator features. The columns of X are assumed
# to line up one-to-one with the entries of `features`, so ONE range is used
# for both. Previously X was sliced with 9:14, columns 10..15 were deleted
# from X, and 9:15 was removed from `features`, which left the training
# matrix out of step with the feature names handed to the classifier.
# NOTE(review): 9..14 is taken from the range that is printed and removed from
# `features` -- confirm against the CSV header.
race_start, race_stop = 9, 15
race_columns = np.arange(race_start, race_stop)

# Save race data for later analysis (only columns 0 and 1 are read afterwards)
Xr = X[..., race_start:race_stop]

# Remove race data from the matrix used for training / testing
Xnr = np.delete(X, race_columns, 1)
print("Xnr shape: ",Xnr.shape)

# Show which feature names are being dropped, then drop them so that
# `features` keeps describing the columns of Xnr
racefeat = features[race_start:race_stop]
print(racefeat)
del features[race_start:race_stop]

## Set up splits and hyperparameter lists

In [None]:
# Fraction of the rows that goes into the training set; the rows are taken
# in file order (first part trains, remainder tests)
train_proportion = 0.8

# Row index at which the data is cut into its train and test parts
train_split = int(Xnr.shape[0] * train_proportion)

# Race-free features and labels, split at the same row
X_train, X_test = Xnr[:train_split], Xnr[train_split:]
y_train, y_test = y[:train_split], y[train_split:]

# Race columns of the test rows only, kept for the per-group statistics
Xr_test = Xr[train_split:]

# Hyperparameter grids that the search iterates over
# regularization constant
reg = [0.1, 0.05, 0.025, 0.01, 0.005, 0.0025, 0.001]
# search policy ('dfs' is intentionally left out of the list)
pol = ['lower_bound', 'bfs', 'curious', 'objective']
# maximum cardinality
card = [1, 2, 3]
# minimum support
supp = [0.01, 0.025, 0.05, 0.1, 0.25]

## Run CORELS on all combinations of hyperparameters

In [None]:
def _group_rates(yhat, y_true, in_group):
    """Return (accuracy, false positive rate) over the samples selected by in_group.

    yhat     -- predictions, compared element-wise against True / False
    y_true   -- ground-truth labels, compared element-wise against 1 / 0
    in_group -- boolean mask choosing the samples that belong to the group

    The counts are numpy integers, so an empty denominator produces nan
    (with a numpy RuntimeWarning) rather than raising.
    """
    # Element-wise comparisons on arrays: `== True` / `== False` are intentional
    predicted_pos = yhat == True
    predicted_neg = yhat == False
    tp = np.logical_and(np.logical_and(predicted_pos, y_true == 1), in_group).sum()
    tn = np.logical_and(np.logical_and(predicted_neg, y_true == 0), in_group).sum()
    fp = np.logical_and(np.logical_and(predicted_pos, y_true == 0), in_group).sum()
    fn = np.logical_and(np.logical_and(predicted_neg, y_true == 1), in_group).sum()
    accuracy = (tp + tn) / (fp + fn + tp + tn)
    fpr = fp / (fp + tn)
    return accuracy, fpr


# Grid search over every combination of the hyperparameter lists. For each
# fitted model the overall test accuracy and the per-group accuracy / false
# positive rate are stored in `scores`; the textual rule list goes to `lists`.
i = 0
scores = []
uniques = []  # not used by this cell
lists = []
highestfpr = 0
highestdiff = 0
highesti = 0
# Hyperparameters of the run with the largest (black - white) FPR gap.
# Pre-set to None so the names exist even if no run has a positive gap.
highr = highp = highca = highs = None
timethen = time.time()
elapsed = 0

for r in reg:
    for p in pol:
        for ca in card:
            for s in supp:
                print("Iteration ",i)
                print("reg: ",r,", policy: ",p,", cardinality: ",ca,", min support: ",s)
                # Create classifier
                c = CorelsClassifier(
                    c=r,
                    n_iter=10000,
                    policy=p,
                    verbosity=[],
                    max_card=ca,
                    min_support=s
                )
                # Fit the model. Features is a list of the feature names
                c.fit(X_train, y_train, features=features, prediction_name=prediction)
                # Score model on test set for overall accuracy
                a = c.score(X_test, y_test)
                # Generate predictions on test set
                yhat = c.predict(X_test)

                # Per-group statistics: race column 0 selects the "black"
                # group and race column 1 the "white" group
                blacc, blfpr = _group_rates(yhat, y_test, Xr_test[...,0]==1)
                whacc, whfpr = _group_rates(yhat, y_test, Xr_test[...,1]==1)

                diff = blfpr - whfpr

                # Fetch the learned rule list once and reuse it below
                rule_list = c.rl()

                # Save results to scores list
                scores.append(
                    {
                        'regularization': r,
                        'policy': p,
                        'cardinality': ca,
                        'support': s,
                        'accuracy': a,
                        'blaccuracy': blacc,
                        'blfpr': blfpr,
                        'whaccuracy': whacc,
                        'whfpr': whfpr,
                        'list': rule_list
                    }
                )
                lists.append(str(rule_list))

                # Keep track of highest differential in FPR. `highestfpr` is
                # the black FPR of that run, not the maximum black FPR seen.
                # A nan diff never compares greater, so such runs are skipped.
                if diff > highestdiff:
                    highestdiff = diff
                    highestfpr = blfpr
                    highesti = i
                    highr, highp, highca, highs = r, p, ca, s
                print("Current black FPR: ",blfpr,", highest black FPR: ",highestfpr)

                # Keep track of time elapsed per iteration
                i = i+1
                elapsed = time.time() - timethen
                print("Elapsed: ",elapsed," seconds")
                timethen = time.time()
                print("")


## Print and Plot Results

In [None]:
def _summarize(values):
    """Return (mean, lowest, highest) of a non-empty list of numbers."""
    return sum(values) / len(values), min(values), max(values)


# Run with the largest black-minus-white gap in false positive rate
print("Highest black false positive rate: ",highestfpr)
print("Hyperparameters: ",highr,", ",highp,", ",highca,", ",highs)
print("found on iteration ",highesti)

# Many hyperparameter settings yield the same rule list; keep the index of
# the first run for each distinct rule-list text (dicts keep insertion order)
first_index = {}
for idx, text in enumerate(lists):
    first_index.setdefault(text, idx)
uniqueindices = list(first_index.values())

print("Number of unique rule lists: ",len(uniqueindices))
print("Largest difference, black FPR: ",scores[highesti]['blfpr'])
print("Largest difference, white FPR: ",scores[highesti]['whfpr'])
print(str(scores[highesti]['list']))
print("")

# Overall test accuracy across the unique rule lists
cumuacc, low, high = _summarize([scores[i]['accuracy'] for i in uniqueindices])
print("Average accuracy: ",cumuacc)
print("Low: ",low)
print("High: ",high)
print("")

# Black group: uses the per-group 'blaccuracy' entry (the overall 'accuracy'
# entry was read here before, so the figures merely repeated the section above)
blfprlist = [scores[i]['blfpr'] for i in uniqueindices]
blcumuacc, low, high = _summarize([scores[i]['blaccuracy'] for i in uniqueindices])
blavfpr = sum(blfprlist) / len(blfprlist)
print("Average black accuracy: ",blcumuacc)
print("Low: ",low)
print("High: ",high)
print("Average black FPR: ",blavfpr)
print("")

# White group: uses the per-group 'whaccuracy' entry for the same reason
whfprlist = [scores[i]['whfpr'] for i in uniqueindices]
whcumuacc, low, high = _summarize([scores[i]['whaccuracy'] for i in uniqueindices])
whavfpr = sum(whfprlist) / len(whfprlist)
print("Average white accuracy: ",whcumuacc)
print("Low: ",low)
print("High: ",high)
print("Average white FPR: ",whavfpr)
print("")

# Side-by-side box plots of the false positive rates of the unique rule lists
data = [blfprlist, whfprlist]
fig1, ax1 = plt.subplots()
ax1.boxplot(data)
plt.xticks([1,2],['Black','White'])
plt.ylabel("False Positive Rate")

plt.show()