# Set up

## Imports

In [None]:
# installs
!pip install dill
!pip install xgboost
!pip install sklearn==0.0.post2

In [None]:
# Imports
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import *
import dill as pkl
from xgboost.sklearn import XGBRegressor

import math
from sklearn.metrics import mean_squared_error

print(sk.__version__)

In [None]:
import xgboost as xg
print(xg.__version__)

In [7]:
# load data and make subsets
x_train = pd.read_csv('training_data.csv')
# Labels are kept as a single column. -1 lets NumPy infer the row count from the
# file, so the cell does not depend on one specific dataset size.
y_train = np.genfromtxt('training_labels.csv', delimiter=',', dtype=float).reshape(-1, 1)

# Hold out 15% of the rows as a local validation set; the fixed random_state keeps
# the split identical between runs.
x_train_subset, x_val, y_train_subset, y_val = sk.model_selection.train_test_split(
    x_train, y_train, test_size=.15, random_state=42
)
# PDL = PointerDecisionList(base_clf, x_train_subset, y_train_subset, x_val, y_val, 1, 1)

In [None]:
# Load the predictions currently on record for the training rows.
# Note: these local files may be behind the shared git repository.
global_preds_path = 'models/global_model/training_predictions.csv'
PAS_preds_path = 'models/PAS/training_predictions.csv'

# Both files are read without a header row and then transposed.
global_preds = pd.read_csv(global_preds_path, header=None).T
PAS_preds = pd.read_csv(PAS_preds_path, header=None).T

In [35]:
# Split the global predictions with the same test_size / random_state as the
# feature split, so the pieces line up row-for-row with x_train_subset and x_val.
# Only the prediction halves are needed, so x_train is not passed through the
# splitter; this also avoids assigning to `_`, which IPython uses for the last output.
assert len(global_preds) == len(x_train), "global_preds must have one row per training row"
old_train_preds, old_val_preds = sk.model_selection.train_test_split(
    global_preds, test_size=.15, random_state=42
)

In [None]:
# Sanity check: shapes of the full-length objects, a blank line, then the lengths
# of the train/validation pieces (labels and old predictions should agree pairwise).
print(*(obj.shape for obj in (x_train, y_train, global_preds, PAS_preds)), sep="\n")
print()
print(*(len(part) for part in (y_train_subset, old_train_preds, y_val, old_val_preds)), sep="\n")

## Helper Functions

In [66]:
# check improvement on validation data
def check_improvement(old_pred, g, h):
    """Return how much ``h`` lowers validation RMSE on the group selected by ``g``.

    Parameters
    ----------
    old_pred : pandas.DataFrame or pandas.Series
        Baseline predictions, one row per training row. The validation rows are
        picked out by label with ``x_val.index``, so ``old_pred`` must carry the
        same index labels as ``x_train`` (assumed to hold for the prediction
        files loaded in this notebook -- TODO confirm).
    g : callable
        Group function; ``g(x_val)`` yields one truthy/falsy value per validation row.
    h : callable
        Hypothesis function; ``h(X)`` yields one prediction per row of ``X``.

    Returns
    -------
    float
        ``old_RMSE - new_RMSE`` over the validation rows in the group (positive
        means ``h`` is better than the baseline), or ``-100000`` when the group
        contains no validation rows.
    """
    # Normalise whatever g returns (Series, array, 0/1 labels) to a flat boolean mask.
    mask = np.asarray(g(x_val), dtype=bool).ravel()

    improvement = -100000
    if not mask.any():
        # Nothing to score; returning here also avoids calling h on an empty frame.
        return improvement

    # Score against the baseline that was passed in, restricted to validation rows.
    old_val = old_pred.loc[x_val.index]

    new_pred = h(x_val[mask])
    old_RMSE = math.sqrt(mean_squared_error(y_val[mask], old_val[mask]))
    new_RMSE = math.sqrt(mean_squared_error(y_val[mask], new_pred))
    improvement = old_RMSE - new_RMSE
    return improvement

def check_local_improvement(g,h):
    """Run ``check_improvement`` on ``(g, h)`` with ``PAS_preds`` passed as the baseline argument."""
    baseline = PAS_preds
    return check_improvement(baseline, g, h)

def check_global_improvement(g,h):
    """Run ``check_improvement`` on ``(g, h)`` with ``global_preds`` passed as the baseline argument."""
    baseline = global_preds
    return check_improvement(baseline, g, h)

In [28]:
# def check_improvement(old_pred, g, h):
#     indices = g(x_val)
#     old_pred = old_pred[indices]
#     new_pred = h(x_train[indices])
#     old_RMSE = math.sqrt(mean_squared_error(y_train[indices], old_pred))
#     new_RMSE = math.sqrt(mean_squared_error(y_train[indices], new_pred))
#     # print(f"improvement: {old_RMSE-new_RMSE}")
#     if (old_RMSE-new_RMSE>0):
#         print("\n IMPROVEMENT \n IMPROVEMENT \n IMPROVEMENT \n")
#     return old_RMSE-new_RMSE

# def check_local_improvement(g,h):
#     return check_improvement(PAS_preds, g, h)

# def check_global_improvement(g,h):
#     return check_improvement(global_preds, g, h)

In [11]:
def save_pkls(g,h):
    """Serialize a (g, h) pair with ``pkl`` (dill) into the working directory.

    ``g`` is written to ``g.pkl`` first, then ``h`` to ``h.pkl``; existing files
    with those names are overwritten.
    """
    for filename, obj in (('g.pkl', g), ('h.pkl', h)):
        with open(filename, 'wb') as file:
            pkl.dump(obj, file)

In [12]:
def train_basic_h(g, X=None, y=None, max_depth=7):
    """Fit a decision-tree regressor on the rows selected by ``g`` and return its predictor.

    Parameters
    ----------
    g : callable
        Group function; ``g(X)`` yields a boolean mask over the rows of ``X``.
    X, y : optional
        Features and labels to train on. When both are omitted, the full
        ``x_train`` / ``y_train`` are used (the original behaviour). The full
        training set includes the rows of ``x_val``, so pass ``x_train_subset`` /
        ``y_train_subset`` when the result will be scored on the validation split.
    max_depth : int, default 7
        Maximum depth of the tree.

    Returns
    -------
    callable
        The fitted tree's bound ``predict`` method.

    Raises
    ------
    ValueError
        If only one of ``X`` and ``y`` is supplied.
    """
    if (X is None) != (y is None):
        raise ValueError("pass both X and y, or neither")
    if X is None:
        X, y = x_train, y_train

    clf = sk.tree.DecisionTreeRegressor(max_depth = max_depth, random_state = 42)

    # find group indices on data
    indices = g(X)

    # fit model specifically to group
    clf.fit(X[indices], y[indices])

    # define hypothesis function as bound clf.predict
    h = clf.predict

    return h

In [13]:
def train_XGBRegressor(g):
    """Fit an ``XGBRegressor`` on the rows of the training subset selected by ``g``.

    Uses ``x_train_subset`` / ``y_train_subset`` (the non-validation part of the
    data) and returns the fitted model's bound ``predict`` method.
    """
    in_group = g(x_train_subset)
    group_X, group_y = x_train_subset[in_group], y_train_subset[in_group]

    model = XGBRegressor(max_depth = 10, random_state = 42)
    model.fit(group_X, group_y)
    return model.predict

# Starter Code

In order to help minimize start up difficulties, we have provided you with a basic ML workflow for this project, as well as a few possible avenues to explore. 

## Section 1: ML Workflow for Submitting *(g,h)* pairs

### 1.0 Pip Installs and Imports

We will be using the *dill* package, a variant of *pickle* that allows more expressive byte-code serialization. This package is essential for saving your *(g,h)* pairs!

In [None]:
!pip install dill

Here is a non-exhaustive list of packages you may find helpful:

In [None]:
# Imports
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import *
import dill as pkl

### 1.1 Download/Load Data

Navigate to the project [webpage](https://declancharrison.github.io/CIS_5230_Bias_Bounty_2023/) and click "Download Training Data". Extract the .zip files in the folder where this notebook is located, then run the cell below.

In [None]:
# Features are loaded as a DataFrame; labels are parsed by NumPy as floats.
x_train = pd.read_csv('training_data.csv')
y_train = np.genfromtxt('training_labels.csv', dtype=float, delimiter=',')

### 1.2 Define a (g,h) pair

Below is an example of training a Decision Tree Regressor on individuals identified as white from the dataset.

In [None]:
# Group function: boolean mask of the rows whose RAC1P value is 1.
def g(X):
    return X['RAC1P'] == 1

# Rows of the training data that belong to the group.
indices = g(x_train)

# Depth-5 regression tree with a fixed seed, trained on the group's rows only.
clf = sk.tree.DecisionTreeRegressor(random_state=42, max_depth=5)
clf.fit(x_train[indices], y_train[indices])

# The hypothesis function is the fitted tree's bound predict method.
h = clf.predict

### 1.3 Save Objects

The following cell will save your group model *g* with filename *g.pkl*, and your hypothesis function *h* with filename *h.pkl*.

In [None]:
# Both files are written to the current working directory with `pkl` (dill);
# opening in 'wb' mode replaces any existing g.pkl / h.pkl.

# save group function to g.pkl
with open('g.pkl', 'wb') as file:
    pkl.dump(g, file)

# save hypothesis function to h.pkl
with open('h.pkl', 'wb') as file:
    pkl.dump(h, file)

### 1.4 Upload Models to Google Drive and Submit PR Request with Links

Follow instructions on GitHub Repo to submit a *(g,h)* pair update request!

## Section 2: Reducing Workflow Time Requirements by Creating a Local PDL

As you have probably noticed, submitting a *(g,h)* pair to the GitHub repository can take a long time depending on the current workload of the server. To approximate whether or not an update will be accepted, we have provided you with the PDL architecture file and a workflow that mimics your team's private PDL maintained by the server. 

**NOTE: One major caveat is the validation data this workflow uses is a cut from the training data, meaning you will want to refrain from training on it to prevent overfitting.**

The way we suggest getting around this without losing data efficacy is to train a *(g,h)* pair on the subset of training data that does not include the validation set, and attempt the *(g,h)* pair update on the local PDL. If the pair is rejected, you can continue tuning hyperparameters or searching for new groups. If the pair is accepted, you can retrain a new *(g,h)* pair over ALL the training data, and submit this pair to the server for an update. This will allow you to "squeeze all the juice" from your training data and test potential updates much quicker.  

In [None]:
### DONT CHANGE THIS CELL ###
from pdl import PointerDecisionList

# Hold out 15% of the training rows as the local validation set (fixed seed, so
# the same rows are held out on every run).
x_train_subset, x_val, y_train_subset, y_val = sk.model_selection.train_test_split(x_train, y_train, test_size = .15, random_state = 42)
# Depth-1 regression tree fitted on the non-validation rows; it is the model the
# local PDL is constructed with.
base_clf = sk.tree.DecisionTreeRegressor(max_depth = 1, random_state = 42)
base_clf.fit(x_train_subset, y_train_subset)
# NOTE(review): the meaning of the two trailing `1` arguments is defined in pdl.py,
# which is not shown here.
PDL = PointerDecisionList(base_clf, x_train_subset, y_train_subset, x_val, y_val, 1, 1)

Train your *(g,h)* pair on the subset of training data below:

In [None]:
# Group function: boolean mask of the rows whose RAC1P value is 2.
def g(X):
    return X['RAC1P'] == 2

# Rows of the training subset (validation rows excluded) that belong to the group.
indices = g(x_train_subset)

# Depth-5 regression tree with a fixed seed, trained on the group's rows only.
clf = sk.tree.DecisionTreeRegressor(random_state=42, max_depth=5)
clf.fit(x_train_subset[indices], y_train_subset[indices])

# The hypothesis function is the fitted tree's bound predict method.
h = clf.predict

Attempt an update using the following syntax

In [None]:
# Offer the (g, h) pair to the local PDL together with the train/validation split.
# update_flag is presumably truthy when the update is accepted (it gates the
# full-data refit later in this notebook) -- confirm in pdl.py.
update_flag = PDL.update(g, h, x_train_subset, y_train_subset, x_val, y_val)

You can put these two steps together: train on the subset, attempt the update, and, if it is accepted, retrain the classifier on the whole training dataset:

In [None]:
# define group function
def g(X):
    return X['RAC1P'] == 1

# initialize ML hypothesis class
clf = sk.tree.DecisionTreeRegressor(max_depth = 10, random_state = 42)

# find group indices on training subset
indices = g(x_train_subset)

# fit model specifically to group subset
clf.fit(x_train_subset[indices], y_train_subset[indices])

# define hypothesis function as bound clf.predict
h = clf.predict

# compute PDL update
update_flag = PDL.update(g, h, x_train_subset, y_train_subset, x_val, y_val)

if update_flag:

    # recompute indices over whole training dataset
    indices = g(x_train)

    # Refit on a fresh, unfitted copy with the same hyperparameters. The h given to
    # PDL.update above is a bound method of the original estimator, so refitting
    # that object in place would also change what that h predicts (and would do so
    # using the validation rows) if the PDL keeps a reference to it.
    clf = sk.base.clone(clf)
    clf.fit(x_train[indices], y_train[indices])

    # define hypothesis function as bound clf.predict
    h = clf.predict

Submit *(g,h)* pair to GitHub!

**NOTE: You can save your PDL but it will require that your validation set does not change! Thus, you should not change the random state used to split your training data once you create your PDL**

In [None]:
# save PDL
# NOTE(review): save_model() presumably writes the PDL/model.pkl file that is
# reopened below -- confirm in pdl.py.
PDL.save_model()

# open PDL structure
# `pkl` is dill; unpickling can execute arbitrary code, so only load files you
# created yourself.
with open('PDL/model.pkl', 'rb') as file:
    PDL = pkl.load(file)

# reload group/hypothesis functions to PDL
PDL.reload_functions()

# Automated Group Finding

## Epsilon above/below

In [None]:
# view how different global is from labels
# y_train is defined in two shapes in this notebook: a column (n, 1) and a flat (n,).
# Forcing a column makes the subtraction line up with global_preds' single column
# in either case.
abs_diff = (global_preds - np.asarray(y_train).reshape(-1, 1)).abs()
abs_diff.describe()

In [None]:
# Train clf to identify rows with big difference

def epsilon_above(epsilon):
    """Learn a group function for rows the global model overestimates by at least ``epsilon``.

    A depth-10 decision-tree classifier is fitted on ``x_train`` to predict whether
    ``global_preds - y_train >= epsilon`` for each row.

    Parameters
    ----------
    epsilon : float
        Minimum overestimate for a row to be labelled as belonging to the group.

    Returns
    -------
    callable
        The fitted classifier's bound ``predict`` method, usable as ``g``.
    """
    # Signed error per row, flattened so it does not matter whether the
    # predictions / labels are stored as columns or as flat arrays.
    overshoot = np.asarray(global_preds).ravel() - np.asarray(y_train).ravel()

    # define 0,1 labels where current predictions OVERESTIMATE by at least epsilon
    binary_labels = overshoot >= epsilon

    # define group classifier class
    clf = sk.tree.DecisionTreeClassifier(max_depth = 10, random_state = 42)

    # fit classifier to binary labels
    clf.fit(x_train, binary_labels)

    # define g
    g = clf.predict

    return g

def epsilon_below(epsilon):
    """Learn a group function for rows the global model underestimates by at least ``epsilon``.

    A depth-10 decision-tree classifier is fitted on ``x_train`` to predict whether
    ``y_train - global_preds >= epsilon`` for each row.

    Parameters
    ----------
    epsilon : float
        Minimum underestimate for a row to be labelled as belonging to the group.

    Returns
    -------
    callable
        The fitted classifier's bound ``predict`` method, usable as ``g``.
    """
    # Signed error per row, flattened so it does not matter whether the
    # predictions / labels are stored as columns or as flat arrays.
    undershoot = np.asarray(y_train).ravel() - np.asarray(global_preds).ravel()

    # define 0,1 labels where current predictions UNDERESTIMATE by at least epsilon
    binary_labels = undershoot >= epsilon

    # define group classifier class
    clf = sk.tree.DecisionTreeClassifier(max_depth = 10, random_state = 42)

    # fit classifier to binary labels
    clf.fit(x_train, binary_labels)

    # define g
    g = clf.predict

    return g

In [None]:
# Sweep the threshold from 0 to 95000 in steps of 5000.
for i in range(20):
    epsilon = i*5000
    g = epsilon_below(epsilon)
    # Keep the hypothesis trained for this group: it is what gets scored below.
    h = train_XGBRegressor(g)
    # Report the threshold actually used together with its validation score.
    print(epsilon, check_global_improvement(g,h))

## Targeted Correction

In [14]:
class targeted_correction:
    """Group function selecting rows whose predicted value lies near a target.

    ``clf`` is any object exposing ``predict``. A row is selected when
    ``clf.predict`` returns a value strictly closer than ``epsilon`` to ``value``.
    """

    def __init__(self, clf, value, epsilon):
        self.clf, self.value, self.epsilon = clf, value, epsilon

    def predict(self, X):
        """Return ``abs(clf.predict(X) - value) < epsilon``."""
        distance = abs(self.clf.predict(X) - self.value)
        return distance < self.epsilon

    def __call__(self, X):
        """Calling the instance is the same as calling ``predict``."""
        return self.predict(X)

In [None]:
class XGBRegressor_wrap:
    """Callable wrapper around a model so that it can be used as a hypothesis ``h``.

    NOTE(review): the original cell stopped after the ``__call__`` signature,
    which is a syntax error. Delegating to ``clf.predict`` mirrors the other
    wrapper classes in this notebook and is the presumed intent -- confirm.
    """
    def __init__(self, clf):
        self.clf = clf

    def __call__(self, X):
        """Calling the instance is the same as calling ``predict``."""
        return self.predict(X)

    def predict(self, X):
        """Return ``clf.predict(X)`` unchanged."""
        return self.clf.predict(X)

In [15]:
# Depth-7 regression tree fitted on every training row; the cells that follow use
# its predictions to define targeted_correction groups.
clf = sk.tree.DecisionTreeRegressor(random_state=42, max_depth=7)
clf.fit(X=x_train, y=y_train)

In [None]:
# data exploration: for targets 0, 1000, ..., 99000, print the global model's RMSE
# on the training rows whose tree prediction is within 5000 of the target.
for i in range(100):
    g = targeted_correction(clf, i*1000, 5000)
    indices = g(x_train)
    if not indices.any():
        # no training rows near this target
        continue
    old_RMSE = math.sqrt(mean_squared_error(y_train[indices], global_preds[indices]))
    print(i, old_RMSE)

In [58]:
# Try one targeted group: rows whose tree prediction is within 5000 of 91000.
g = targeted_correction(clf, 91_000, 5000)
h = train_XGBRegressor(g)

# Persist the pair only when it scores a positive improvement on the validation split.
if check_global_improvement(g, h) > 0:
    save_pkls(g, h)

improvement: -100000


In [None]:
# Scan tolerances e = 2000, 2400, ..., 13600 around each target value v and stop
# at the first (v, e) that scores a positive improvement.
for i in range(0, 1):
    # print("i = {}".format(i))
    v = i*1000
    for j in range(5, 35):
        e = j*400
        print("v = {}, e = {}".format(v, e))
        # Centre the group on v with tolerance e, i.e. the values printed above.
        g = targeted_correction(clf, v, e)
        # NOTE(review): h is not retrained for this group; whichever h an earlier
        # cell left behind is the one being scored. Uncomment to train per group.
        # h = train_XGBRegressor(g)
        if (check_global_improvement(g,h) > 0):
            print(v,e)
            break

## Clustering

In [62]:
class cluster_n:
    """Group function selecting the rows that a clustering model assigns to cluster ``n``.

    ``clf`` is any object exposing ``predict``; ``n`` is the label to match.
    """

    def __init__(self, clf, n):
        self.clf, self.n = clf, n

    def predict(self, X):
        """Return a mask that is true where ``clf.predict(X)`` equals ``n``."""
        labels = self.clf.predict(X)
        return labels == self.n

    # DO NOT CHANGE CALL FUNCTION, FORMAT .predict
    def __call__(self, X):
        return self.predict(X)

    
# Partition the training rows into 5 k-means clusters (fixed seed, n_init=10).
cluster_clf = sk.cluster.KMeans(n_clusters=5, n_init=10, random_state=42)
cluster_clf.fit(x_train)

# Group function for the rows assigned to cluster label 1.
g = cluster_n(cluster_clf, 1)

# visualize results
g(x_train)



array([False, False, False, ...,  True, False,  True])

In [64]:
def save_cluster_pkls(g, h, i=""):
    """Serialize a (g, h) pair to ``cluster_pkls/g{i}.pkl`` and ``cluster_pkls/h{i}.pkl``.

    Parameters
    ----------
    g, h : callable
        Group and hypothesis functions to store with ``pkl`` (dill).
    i : str or int, default ""
        Suffix inserted into both file names, e.g. the cluster index.
    """
    import os  # local import: only this helper needs it

    # open(..., 'wb') does not create missing directories, so make sure the
    # output folder exists before writing.
    os.makedirs("cluster_pkls", exist_ok=True)

    g_path = "cluster_pkls/g{}.pkl".format(i)
    h_path = "cluster_pkls/h{}.pkl".format(i)

    # save group function
    with open(g_path, 'wb') as file:
        pkl.dump(g, file)

    # save hypothesis function
    with open(h_path, 'wb') as file:
        pkl.dump(h, file)

In [68]:
# For each cluster count n, fit k-means on the training rows and try every cluster
# as a group; pairs with a positive validation improvement are printed and saved.
n_list = [100]

for n in n_list:
    print("\n starting n=" + str(n) + "\n")
    cluster_clf = sk.cluster.KMeans(n_clusters=n, random_state = 42, n_init=10)
    cluster_clf.fit(x_train)

    for i in range(0, n):
        g = cluster_n(cluster_clf, i)
        # NOTE(review): train_basic_h fits on x_train, which contains the validation
        # rows that check_global_improvement scores on, so the gains printed here
        # are likely optimistic.
        h = train_basic_h(g)
        # Score once and reuse the value; each evaluation re-runs g and h on x_val.
        improvement = check_global_improvement(g,h)
        if improvement > 0:
            print(n, i, improvement, g(x_train).sum())
            save_cluster_pkls(g,h,i)
            
        


 starting n=100

100 7 29.761137795778268 1297
100 8 1011.7760929571778 903
100 21 583.0098874141149 2460
100 23 2177.5774131573216 1030
100 27 2496.4230284159366 763
100 32 34.730515861785534 1295
100 33 270.66115488817013 1678
100 45 527.9166802486798 959
100 49 1058.0636988506922 340
100 50 498.18044380775973 3824
100 53 175.58684390818598 1964
100 57 128.29670803504268 1692
100 62 606.3828751836518 3426
100 75 199.60253681036556 2007
100 76 116.00687099235074 979
100 78 1701.48280786115 1303
100 79 283.8631980173668 1006
100 80 686.2639668247757 1280
100 82 533.5565123081724 892
100 83 13.930983860580454 3478
100 84 2486.4485488541814 550
100 90 304.7011411597141 1269
100 94 1936.7942408894487 776
