In [2]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.metrics import roc_curve
# from utilities import *
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Download ProPublica's two-year COMPAS scores CSV and index the rows by `id`.
df = (
    pd.read_csv(
        "https://github.com/propublica/compas-analysis/raw/master/compas-scores-two-years.csv",
        header=0,
    )
    .set_index('id')
)

In [5]:
def clean_compas(df):
    """Filter the COMPAS data as described by ProPublica and add ``length_of_stay``.

    In the original notebook the authors give these reasons for removing rows:

    * If the charge date of a defendant's COMPAS-scored crime was not within
      30 days of the arrest, the offense is assumed to be the wrong one
      (data-quality issue).
    * The recidivist flag ``is_recid`` is coded as -1 when no COMPAS case
      could be found at all.
    * Ordinary traffic offenses (``c_charge_degree`` of 'O') do not result in
      jail time and are removed.
    * ProPublica also state that the underlying Broward county data only
      includes people who recidivated within two years or spent at least two
      years outside a correctional facility. This function applies no filter
      for that; it is recorded here only as context from the source notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``days_b_screening_arrest``, ``is_recid``,
        ``c_charge_degree``, ``score_text``, ``c_jail_in`` and ``c_jail_out``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding only the rows that
        pass every filter, plus an integer-day ``length_of_stay`` column.

    Side effects
    ------------
    Prints the number of rows removed and the resulting column list.
    """
    rows_start = len(df)

    # `keep` is the boolean mask of rows we want to retain.
    # Remove entries with inconsistent arrest information. Missing values
    # compare False on both sides and are therefore dropped as well.
    keep = (df['days_b_screening_arrest'] >= -30) & (df['days_b_screening_arrest'] <= 30)

    # Remove entries where the compas case could not be found.
    keep &= df['is_recid'] != -1

    # Remove traffic offenses.
    keep &= df['c_charge_degree'] != "O"

    # Remove entries without available text scores. pandas.read_csv converts
    # the literal "N/A" to NaN by default, and NaN != 'N/A' evaluates to True,
    # so the string comparison alone would let those rows through.
    keep &= (df['score_text'] != 'N/A') & df['score_text'].notna()

    # Trim the dataset; copy so that adding a column below writes to an
    # independent frame rather than to a slice of the caller's data.
    df = df.loc[keep, :].copy()

    # Create new attribute "length of stay": whole days between jail-in and jail-out.
    jail_time = pd.to_datetime(df['c_jail_out']) - pd.to_datetime(df['c_jail_in'])
    df['length_of_stay'] = jail_time.dt.days

    # Print number of rows removed.
    print('Number of rows removed: '+str(rows_start - len(df)))
    # Print the list of features again.
    print('Features: '+str(list(df)))
    return df


In [6]:
# Show which columns are currently loaded before narrowing the frame.
# (The former mid-cell `df.head()` expressions were removed: only the last
# expression of a cell is rendered, so they computed a preview and discarded it.)
print(list(df))

# Select features that will be analyzed
features_to_keep = ['age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', 'priors_count', 
                    'days_b_screening_arrest', 'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out']
df = df[features_to_keep]

# Apply the ProPublica row filters and add the `length_of_stay` column.
df = clean_compas(df)
print("\ndataset shape (rows, columns)", df.shape)

['age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', 'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out']
Number of rows removed: 1042
Features: ['age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', 'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out', 'length_of_stay']

dataset shape (rows, columns) (6172, 14)


In [8]:
# Keep only African-American and Caucasian defendants, then take an
# independent copy that the quantization steps below can modify.
df = df[df['race'].isin(['African-American', 'Caucasian'])]
dfQ = df.copy(deep=True)

# Quantize priors count between 0, 1-3, and >3
def quantizePrior(x):
    """Bucket a priors count into '0', '1 to 3' or 'More than 3'.

    Values at or below zero map to '0', values from 1 through 3 (inclusive)
    map to '1 to 3', and everything else falls into 'More than 3'.
    """
    if x <= 0:
        return '0'
    if 1 <= x <= 3:
        return '1 to 3'
    return 'More than 3'

    
# Quantize length of stay
def quantizeLOS(x):
    """Bucket a length of stay (in days) into three categories.

    Returns '<week' for stays of at most 7 days, '<3months' for stays of
    8 through 93 days, and '>3 months' for anything longer.

    The middle bucket is tested with ``x <= 93`` after the first guard has
    already excluded ``x <= 7``. The earlier ``8 < x <= 93`` form skipped
    exactly 8 days, which then fell through to '>3 months'.

    A value that compares False against both thresholds (e.g. NaN) still
    lands in '>3 months', as before.
    """
    if x <= 7:
        return '<week'
    if x <= 93:
        return '<3months'
    return '>3 months'
    
# Normalize the age category label ('25 - 45' -> '25 to 45')
def adjustAge(x):
    """Rewrite the age category '25 - 45' as '25 to 45'; pass any other value through unchanged."""
    return '25 to 45' if x == '25 - 45' else x

# Quantize score_text to MediumHigh
def quantizeScore(x):
    """Binarize a score label: 1 for 'High' or 'Medium', 0 for anything else."""
    return 1 if x in ('High', 'Medium') else 0

    
# Replace each raw column with its quantized / relabelled version.
dfQ = dfQ.assign(
    priors_count=dfQ['priors_count'].apply(quantizePrior),
    length_of_stay=dfQ['length_of_stay'].apply(quantizeLOS),
    score_text=dfQ['score_text'].apply(quantizeScore),
    age_cat=dfQ['age_cat'].apply(adjustAge),
)

In [28]:
from sklearn.model_selection import train_test_split
# Write the quantized frame to compas.csv with a header row and without the index column.
dfQ.to_csv(
    'compas.csv',
    header=True,
    index=False,
)
# x_train, x_test, y_train, y_test = train_test_split(
#             x, y, test_size=0.2, random_state=random_state,
#         )

In [33]:
# Let's measure the disparate impact according to the EEOC rule.
# Count rows per (score_text, race) pair, pivot race into columns, and
# turn score_text back into an ordinary column.
means_score = (
    dfQ
    .groupby(['score_text', 'race'])
    .size()
    .unstack()
    .reset_index()
)
# means_score = means_score/means_score.sum()
means_score

race,score_text,African-American,Caucasian
0,0,1346,1407
1,1,1829,696


In [36]:
# Column-wise totals. Note that the `score_text` label column is summed too.
means_score.sum(axis=0)

race
score_text             1
African-American    3175
Caucasian           2103
dtype: int64

In [35]:
# Compute disparate impact.
# Row label 1 of `means_score` is read for each race; presumably this is the
# Medium/High score group (score_text == 1) -- confirm against the table above.
AA_with_high_score = means_score.at[1, 'African-American']
C_with_high_score = means_score.at[1, 'Caucasian']

# C_with_high_score/AA_with_high_score
AA_with_high_score

1829

In [32]:
# %load_ext autoreload
# %autoreload 2
# %matplotlib inline

# import sys
# sys.path.append("../")

# import gurobipy
# from json import dumps, loads
# from time import time

# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# from sklearn.linear_model import LogisticRegression as skLogisticRegression
# from sklearn.metrics import (classification_report, f1_score, precision_score, recall_score)
# from tqdm import tnrange, trange
# import tensorflow as tf

# from mlsql.influence import InfluenceRanker
# from mlsql.fixer import AutoFixer
# from mlsql.manager import ModelManagerLM
# from mlsql.manager_test import ModelManagerTest

# from models.simple_cnn import SimpleCNN
# from models.logreg import LogReg
# from models.linear_comb import LinearComb
# from models.linear_comb_test import LinearCombTest
# from processors.compas import CompasProcessor


# import logging
# logging.getLogger("tensorflow").setLevel(logging.CRITICAL)

# import time
# import altair as alt
# alt.data_transformers.disable_max_rows()

race,score_text,African-American,Caucasian
0,0.0,0.423937,0.669044
1,1.0,0.576063,0.330956


race
score_text          1.0
African-American    1.0
Caucasian           1.0
dtype: float64