# Set up

## Imports

In [None]:
!pip install dill

In [None]:
# Imports
import math
import os

import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import *
import dill as pkl
from sklearn.metrics import mean_squared_error

In [None]:
# load data and make subsets
x_train = pd.read_csv('training_data.csv')

# Labels are stored as an (n_samples, 1) column. reshape(-1, 1) infers the row
# count from the file instead of hard-coding one specific dataset size.
# NOTE(review): the column shape is presumably needed so the labels line up with
# the transposed prediction frames in element-wise arithmetic - confirm.
y_train = np.genfromtxt('training_labels.csv', delimiter=',', dtype=float).reshape(-1, 1)

# fail early with a clear message if features and labels are out of sync
assert len(x_train) == len(y_train), "training_data.csv and training_labels.csv differ in row count"

# hold out 15% of the rows as a local validation split (fixed seed)
x_train_subset, x_val, y_train_subset, y_val = sk.model_selection.train_test_split(
    x_train, y_train, test_size=.15, random_state=42
)
# PDL = PointerDecisionList(base_clf, x_train_subset, y_train_subset, x_val, y_val, 1, 1)

In [None]:
# Load the prediction files currently on disk for the global model and for PAS.
# Note, may not be up to date with global git
global_preds_path = 'models/global_model/training_predictions.csv'
PAS_preds_path = 'models/PAS/training_predictions.csv'

# each file is read without a header row and then transposed
global_preds = pd.read_csv(global_preds_path, header=None).T
PAS_preds = pd.read_csv(PAS_preds_path, header=None).T

In [None]:
# sanity check: one shape per line for features, labels and both prediction sets
print(x_train.shape, y_train.shape, global_preds.shape, PAS_preds.shape, sep='\n')

## Helper Functions

In [201]:
def check_improvement(old_pred, g, h):
    """Return how much h lowers the RMSE on the rows of x_train selected by g.

    Args:
        old_pred: existing predictions for x_train; indexed with the mask g returns.
        g: group function, called on x_train to obtain the row mask.
        h: hypothesis function, called on the selected rows of x_train.

    Returns:
        Old RMSE minus new RMSE on the group, both measured against y_train.
        A positive value means h has the lower error on that group.
    """
    mask = g(x_train)
    group_labels = y_train[mask]

    def rmse(predictions):
        # root of the mean squared error against the group's labels
        return math.sqrt(mean_squared_error(group_labels, predictions))

    baseline_error = rmse(old_pred[mask])
    candidate_error = rmse(h(x_train[mask]))
    return baseline_error - candidate_error

def check_local_improvement(g,h):
    """RMSE improvement of the (g, h) pair measured against the PAS_preds predictions."""
    return check_improvement(old_pred=PAS_preds, g=g, h=h)

def check_global_improvement(g,h):
    """RMSE improvement of the (g, h) pair measured against the global_preds predictions."""
    return check_improvement(old_pred=global_preds, g=g, h=h)

In [194]:
def train_basic_h(g, max_depth=7, random_state=42):
    """Fit a decision-tree regressor on the rows of x_train selected by g.

    Args:
        g: group function; it is called on x_train and its result is used as
            the row mask for both x_train and y_train.
        max_depth: depth limit of the tree. Defaults to 7, the value that was
            previously hard-coded, so existing calls behave as before.
        random_state: seed passed to the tree. Defaults to 42, as before.

    Returns:
        The fitted regressor's bound ``predict`` method, usable as h.
    """
    clf = sk.tree.DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)

    # find group indices on data
    indices = g(x_train)

    # fit model specifically to group
    clf.fit(x_train[indices], y_train[indices])

    # the hypothesis function is the bound clf.predict
    return clf.predict

# Starter Code

In order to help minimize start up difficulties, we have provided you with a basic ML workflow for this project, as well as a few possible avenues to explore. 

## Section 1: ML Workflow for Submitting *(g,h)* pairs

### 1.0 Pip Installs and Imports

We will be using the package *dill*, a variant of *pickle* that allows more expressive byte-code serialization. This package is essential for saving your *(g,h)* pairs!

In [None]:
!pip install dill

Here is a non-exhaustive list of packages you may find helpful:

In [None]:
# Imports
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import *
import dill as pkl

### 1.1 Download/Load Data

Navigate to the project [webpage](https://declancharrison.github.io/CIS_5230_Bias_Bounty_2023/) and click "Download Training Data". Extract the .zip files in the folder where this notebook is located, then run the cell below.

In [None]:
# (re)load the full training features and labels from the extracted CSV files
# NOTE(review): this rebinds y_train as returned by genfromtxt, without the
# reshape applied in the set-up cell at the top of the notebook.
x_train = pd.read_csv('training_data.csv')
y_train = np.genfromtxt('training_labels.csv', dtype=float, delimiter=',')

### 1.2 Define a (g,h) pair

Below is an example of training a Decision Tree Regressor on individuals identified as white from the dataset.

In [None]:
# group function: boolean mask of the rows whose RAC1P column equals 1
def g(X):
    in_group = X['RAC1P'] == 1
    return in_group

# hypothesis class: depth-5 regression tree with a fixed seed
clf = sk.tree.DecisionTreeRegressor(random_state=42, max_depth=5)

# rows of the training data that belong to the group
indices = g(x_train)

# train on the group's rows only
clf.fit(x_train.loc[indices], y_train[indices])

# the hypothesis function is the fitted tree's bound predict method
h = clf.predict

### 1.3 Save Objects

The following cell will save your group function *g* to *cluster_pkls/g.pkl* and your hypothesis function *h* to *cluster_pkls/h.pkl*.

In [205]:
# make sure the output folder exists; open() does not create missing directories
os.makedirs('cluster_pkls', exist_ok=True)

# save group function to cluster_pkls/g.pkl
with open('cluster_pkls/g.pkl', 'wb') as file:
    pkl.dump(g, file)

# save hypothesis function to cluster_pkls/h.pkl
with open('cluster_pkls/h.pkl', 'wb') as file:
    pkl.dump(h, file)

### 1.4 Upload Models to Google Drive and Submit a Pull Request with Links

Follow the instructions in the GitHub repo to submit a *(g,h)* pair update request!

## Section 2: Reducing Workflow Time Requirements by Creating a Local PDL

As you have probably noticed, submitting a *(g,h)* pair to the GitHub repository can take a long time depending on the current workload of the server. To approximate whether or not an update will be accepted, we have provided you the PDL architecture file and a workflow that will mimic your team's private PDL maintained by the server. 

**NOTE: One major caveat is the validation data this workflow uses is a cut from the training data, meaning you will want to refrain from training on it to prevent overfitting.**

The way we suggest getting around this without losing data efficacy is to train a *(g,h)* pair on the subset of training data that does not include the validation set, and attempt the *(g,h)* pair update on the local PDL. If the pair is rejected, you can continue tuning hyperparameters or searching for new groups. If the pair is accepted, you can retrain a new *(g,h)* pair over ALL the training data, and submit this pair to the server for an update. This will allow you to "squeeze all the juice" from your training data and test potential updates much quicker.  

In [None]:
### DONT CHANGE THIS CELL ###
# PointerDecisionList is imported from the separate pdl module (not part of this notebook).
from pdl import PointerDecisionList

# hold out 15% of the training data as the local validation set; the seed is fixed
x_train_subset, x_val, y_train_subset, y_val = sk.model_selection.train_test_split(x_train, y_train, test_size = .15, random_state = 42)
# depth-1 regression tree fit on the training subset; it is passed to the PDL as its first argument
base_clf = sk.tree.DecisionTreeRegressor(max_depth = 1, random_state = 42)
base_clf.fit(x_train_subset, y_train_subset)
# NOTE(review): the meaning of the two trailing `1` arguments is not visible here - check the pdl module
PDL = PointerDecisionList(base_clf, x_train_subset, y_train_subset, x_val, y_val, 1, 1)

Train your *(g,h)* pair on the subset of training data below:

In [None]:
# group function: boolean mask of the rows whose RAC1P column equals 2
def g(X):
    in_group = X['RAC1P'] == 2
    return in_group

# hypothesis class: depth-5 regression tree with a fixed seed
clf = sk.tree.DecisionTreeRegressor(random_state=42, max_depth=5)

# rows of the training subset (validation rows excluded) that belong to the group
indices = g(x_train_subset)

# train on the group's rows of the subset only
clf.fit(x_train_subset.loc[indices], y_train_subset[indices])

# the hypothesis function is the fitted tree's bound predict method
h = clf.predict

Attempt an update using the following syntax

In [None]:
# try the candidate (g, h) pair on the local PDL and keep the returned flag
update_flag = PDL.update(
    g, h,
    x_train_subset, y_train_subset,
    x_val, y_val,
)

You can put these two steps together to retrain the model on the whole training dataset once the update has been accepted:

In [None]:
# group function: boolean mask of the rows whose RAC1P column equals 1
def g(X):
    in_group = X['RAC1P'] == 1
    return in_group

# hypothesis class: depth-10 regression tree with a fixed seed
clf = sk.tree.DecisionTreeRegressor(random_state=42, max_depth=10)

# first pass: fit on the group's rows of the training subset only
indices = g(x_train_subset)
clf.fit(x_train_subset.loc[indices], y_train_subset[indices])
h = clf.predict

# attempt the update on the local PDL
update_flag = PDL.update(g, h, x_train_subset, y_train_subset, x_val, y_val)

# on a truthy flag, refit the same tree on the group's rows of the full training data
if update_flag:
    indices = g(x_train)
    clf.fit(x_train.loc[indices], y_train[indices])

    # rebind h to the refitted tree's predict method
    h = clf.predict

Submit *(g,h)* pair to GitHub!

**NOTE: You can save your PDL, but this requires that your validation set does not change! You should therefore not change the random state used to split your training data once you have created your PDL.**

In [None]:
# persist the current PDL through its own save method
PDL.save_model()

# read the pickled PDL structure back from PDL/model.pkl
with open('PDL/model.pkl', 'rb') as model_file:
    PDL = pkl.load(model_file)

# reload group/hypothesis functions to PDL
PDL.reload_functions()

# Automated Group Finding

## Epsilon above/below

In [None]:
# view how different global is from labels
# Force the labels into an (n_samples, 1) column first: the starter-code load
# cell rebinds y_train to the flat array returned by genfromtxt, and pandas
# cannot align a one-column frame with a flat array of n_samples values.
# Assumes global_preds holds a single column of predictions - TODO confirm.
abs_diff = (global_preds - np.asarray(y_train).reshape(-1, 1)).abs()
abs_diff.describe()

In [None]:
# Train clf to identify rows with big difference

def _signed_error():
    """Return global prediction minus label for every training row as a flat float array."""
    # ravel both sides so the result is 1-D whether y_train is flat or a column
    preds = np.asarray(global_preds, dtype=float).ravel()
    labels = np.asarray(y_train, dtype=float).ravel()
    return preds - labels


def _fit_error_group(signed_error, epsilon):
    """Fit a tree classifier on x_train that flags rows whose signed_error is >= epsilon."""
    # boolean target: True where the error reaches the threshold
    binary_labels = np.asarray(signed_error, dtype=float).ravel() >= epsilon

    # define group classifier class
    clf = sk.tree.DecisionTreeClassifier(max_depth=10, random_state=42)

    # fit classifier to binary labels
    clf.fit(x_train, binary_labels)

    # the group function is the classifier's bound predict method
    return clf.predict


def epsilon_above(epsilon):
    """Learn a group g for rows where the global predictions OVERESTIMATE by at least epsilon.

    The previous implementation compared with ``< epsilon`` and therefore
    labelled the complement of the group described here.

    Args:
        epsilon: minimum value of (prediction - label) for a row to be labelled positive.

    Returns:
        The bound ``predict`` of a depth-10 DecisionTreeClassifier fit on x_train.
    """
    return _fit_error_group(_signed_error(), epsilon)


def epsilon_below(epsilon):
    """Learn a group g for rows where the global predictions UNDERESTIMATE by at least epsilon.

    Args:
        epsilon: minimum value of (label - prediction) for a row to be labelled positive.

    Returns:
        The bound ``predict`` of a depth-10 DecisionTreeClassifier fit on x_train.
    """
    return _fit_error_group(-_signed_error(), epsilon)

In [None]:
# sweep epsilon from 0 to 9000 and report the RMSE improvement for each threshold
for i in range(10):
    epsilon = i*1000
    g = epsilon_below(epsilon)
    if not g(x_train).any():
        # no row is selected: there is nothing to train or score on
        print(epsilon, "empty group - skipped")
        continue
    h = train_basic_h(g)
    # show the result; the value was previously computed and then thrown away
    print(epsilon, check_global_improvement(g,h))

## Targeted Correction

In [None]:
class targeted_correction:
    """Group function selecting rows whose clf prediction lies within epsilon of value."""

    def __init__(self, clf, value, epsilon):
        # clf: model exposing predict(X)
        # value: centre of the band; epsilon: half-width (strict inequality)
        self.clf, self.value, self.epsilon = clf, value, epsilon

    def predict(self, X):
        """Return True for rows where |clf.predict(X) - value| < epsilon."""
        distance = abs(self.clf.predict(X) - self.value)
        return distance < self.epsilon

    def __call__(self, X):
        # calling the object is the same as calling predict
        return self.predict(X)

In [None]:
# data exploration
# NOTE(review): `clf` is whichever model an earlier cell last bound to that name.
for i in range(100):
    g = targeted_correction(clf, i*1000, 5000)
    indices = g(x_train)
    if not indices.any():
        # RMSE is undefined on an empty group; skip instead of raising
        print(i*1000, "empty group - skipped")
        continue
    old_RMSE = math.sqrt(mean_squared_error(y_train[indices], global_preds[indices]))
    print(old_RMSE)

In [None]:
# Grid search over band centre v and half-width e. For each v, stop at the
# first e whose (g, h) pair improves on the global predictions.
for i in range(0, 100):
    v = i*1000
    for j in range(5, 35):
        e = j*400
        # the band is centred on v with half-width e
        # (previously e was passed as the centre and j as the width)
        g = targeted_correction(clf, v, e)
        if not g(x_train).any():
            # empty group: training h would fail, so try the next width
            continue
        print(str(v), str(e))
        h = train_basic_h(g)
        if check_global_improvement(g,h) > 0:
            print(v,e)
            break

## Clustering

In [182]:
class cluster_n:
    """Group function selecting the rows that clf assigns to cluster number n."""

    def __init__(self, clf, n):
        # clf: clustering model exposing predict(X); n: the cluster label to select
        self.clf, self.n = clf, n

    # DO NOT CHANGE CALL FUNCTION, FORMAT .predict
    def __call__(self, X):
        return self.predict(X)

    def predict(self, X):
        """Return True for the rows whose predicted cluster equals n."""
        assigned = self.clf.predict(X)
        return assigned == self.n

    
# fit a 5-cluster KMeans on the training data and select cluster 1 as the group
cluster_clf = sk.cluster.KMeans(n_clusters=5, random_state=42).fit(x_train)
g = cluster_n(cluster_clf, 1)

# visualize results
g(x_train)

array([False, False, False, ...,  True, False,  True])

In [215]:
# NOTE: save_cluster_pkls is defined in a later cell of this notebook; that cell
# must be run before this one.
n_list = [100]

for n in n_list:
    print("\n starting n=" + str(n) + "\n")
    cluster_clf = sk.cluster.KMeans(n_clusters=n, random_state = 42)
    cluster_clf.fit(x_train)

    for i in range(0, n):
        g = cluster_n(cluster_clf, i)
        h = train_basic_h(g)
        # evaluate once and reuse: each call re-runs g and h over the whole training set
        improvement = check_global_improvement(g,h)
        if improvement > 0:
            # columns: n, cluster id, RMSE improvement, number of rows in the cluster
            print(n, i, improvement, g(x_train).sum())
            save_cluster_pkls(g,h,i)
            
        


 starting n=100

100 8 1210.9636548417839 903
100 11 651.6711999040526 1920
100 12 355.0483177453916 1110
100 21 244.57656546830913 2460
100 23 1196.7789898890805 1030
100 27 1129.2207599268186 763
100 32 418.1430378817604 1295
100 35 82.1676859444051 3825
100 44 176.93248377819145 2317
100 45 1114.810844262258 959
100 49 2846.9621337024873 340
100 51 15.101148062376524 3767
100 52 265.1659412173358 954
100 53 244.31999900599294 1964
100 61 36.70977274042525 2754
100 62 163.86897497259451 3426
100 67 338.0181905722693 857
100 70 926.831371248596 828
100 78 390.9525104534441 1303
100 79 959.1681221623403 1006
100 80 6.62241010802245 1280
100 82 886.087105286424 892
100 84 1246.7705163471073 550
100 86 192.10248022272572 1691
100 88 85.96495988819697 4099
100 89 3.5662618013120664 8196
100 90 432.775939060597 1269
100 92 168.3088534522467 1844
100 93 188.40742932956164 3165
100 94 1470.4657391122491 776
100 95 393.2179664145042 1764
100 98 43.77250848244694 3447


In [212]:
def save_cluster_pkls(g, h, i):
    """Pickle a (g, h) pair to cluster_pkls/g{i}.pkl and cluster_pkls/h{i}.pkl.

    Args:
        g: group function to serialize.
        h: hypothesis function to serialize.
        i: identifier inserted into both file names.
    """
    # open() does not create missing folders, so make sure the target exists
    os.makedirs("cluster_pkls", exist_ok=True)

    g_path = "cluster_pkls/g{}.pkl".format(i)
    h_path = "cluster_pkls/h{}.pkl".format(i)

    # save group function to g{i}.pkl
    with open(g_path, 'wb') as file:
        pkl.dump(g, file)

    # save hypothesis function to h{i}.pkl
    with open(h_path, 'wb') as file:
        pkl.dump(h, file)