# IMPORTING PACKAGES

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model, decomposition, datasets
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from fairlearn.metrics import demographic_parity_difference
from fairlearn.reductions import ExponentiatedGradient, DemographicParity
from sklearn.ensemble import AdaBoostClassifier

# READING DATA

In [3]:
# Load the raw dataset and report how many columns it has.
train = pd.read_csv("recid.csv")
num_cols = train.shape[1]
print("Number of columns in the dataset are : ",num_cols)

# Columns excluded from the feature set before encoding.
# NOTE(review): the columns that survive this drop are not listed here; if any
# of them is recorded after the outcome is known it would leak the target
# `two_year_recid` into the features -- confirm the retained set.
DROPPED_COLUMNS = [
    'id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
    'c_offense_date', 'c_arrest_date', 'r_case_number', 'r_charge_degree',
    'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in',
    'r_jail_out', 'violent_recid', 'start', 'end', 'vr_case_number',
    'vr_charge_degree', 'v_type_of_assessment', 'vr_offense_date',
    'vr_charge_desc', 'screening_date', 'days_b_screening_arrest',
    'c_jail_in', 'c_jail_out', 'c_case_number', 'c_days_from_compas',
    'v_screening_date', 'c_charge_desc', 'in_custody', 'out_custody',
    'type_of_assessment', 'is_recid',
]
train_new = train.drop(columns=DROPPED_COLUMNS)

# One-hot encode whatever non-numeric columns remain so the models receive a
# purely numeric frame, then report the resulting width.
train_final = pd.get_dummies(train_new)
num_cols = train_final.shape[1]
print("Number of columns after numeric conversion : ",num_cols)

Number of columns in the dataset are :  53
Number of columns after numeric conversion :  31


# REMOVING SENSITIVE FEATURES

In [4]:
# Disabled: dropping `race` from the raw frame up front. The notebook instead
# removes the one-hot race indicator columns after the train/test split, so the
# split frames still carry them for the per-group analysis further down.
# train_new = train.drop(['race'],axis=1)
# num_cols = train_new.shape[1]
# print("Number of columns in the dataset after removing unwanted columns are : ",num_cols)
# train_new.head(5)

# TRAIN TEST SPLIT ON DATA

In [5]:
# Separate the features from the target. `errors='ignore'` keeps this cell
# idempotent: re-running it after the target column has already been dropped
# is a no-op instead of a KeyError.
train_final = train_final.drop(columns='two_year_recid', errors='ignore')
X = train_final
Y = train['two_year_recid']
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.33,random_state=18)

# One-hot race indicator columns. They stay in X_train / X_test (used later for
# the per-group analysis) and are removed only from the frames fed to the models.
RACE_COLUMNS = ['race_African-American', 'race_Asian', 'race_Caucasian',
                'race_Hispanic', 'race_Native American', 'race_Other']
X_train_no_race = X_train.drop(columns=RACE_COLUMNS)
X_test_no_race = X_test.drop(columns=RACE_COLUMNS)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (4833, 30)
Shape of X_test: (2381, 30)
Shape of y_train: (4833,)
Shape of y_test: (2381,)


# TRAINING LOGISTIC REGRESSION MODEL

In [8]:
# Fit a class-weighted, L2-penalised logistic regression on the feature frame
# that excludes the race indicator columns.
logreg = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    class_weight='balanced',
    max_iter=1000,
    random_state=18,
    n_jobs=-1,
)
logreg.fit(X_train_no_race, y_train)

# Score the held-out split.
y_pred_no_race = logreg.predict(X_test_no_race)
accuracy = accuracy_score(y_test, y_pred_no_race)
print("Accuracy:", accuracy)
print(classification_report(y_test,y_pred_no_race))

# labels=[1,0] orders rows/columns with the positive class first, matching the
# row and column captions of the table below.
cm = np.asarray(confusion_matrix(y_test, y_pred_no_race, labels=[1,0]))
confusion = pd.DataFrame(
    cm,
    index=['is_recividated','is_not_recividated'],
    columns=['predicted_recividated','predicted_not_recividated'],
)
print(confusion)

Accuracy: 0.9042419151616967
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1312
           1       0.93      0.85      0.89      1069

    accuracy                           0.90      2381
   macro avg       0.91      0.90      0.90      2381
weighted avg       0.91      0.90      0.90      2381

                    predicted_recividated  predicted_not_recividated
is_recividated                        910                        159
is_not_recividated                     69                       1243


In [9]:
""" Predicted +ve and are categorized as African American by the race variable"""
# Count by summing the indicator over the predicted-positive rows rather than
# via `value_counts()[1]`: that lookup raises KeyError when the group has no
# predicted positives, and on boolean indicator columns (the `get_dummies`
# default in pandas >= 2.0) the integer key `1` is not a plain label lookup,
# so it can return the wrong count.
int(X_test.loc[y_pred_no_race == 1, 'race_African-American'].sum())

596

In [10]:
""" Predicted +ve and are categorized as Caucasian by the race variable"""
# Count by summing the indicator over the predicted-positive rows rather than
# via `value_counts()[1]`: that lookup raises KeyError when the group has no
# predicted positives, and on boolean indicator columns (the `get_dummies`
# default in pandas >= 2.0) the integer key `1` is not a plain label lookup,
# so it can return the wrong count.
int(X_test.loc[y_pred_no_race == 1, 'race_Caucasian'].sum())

285

In [11]:
# True positives (predicted 1 and actually 1) of the model trained without the
# race columns, broken down by the race indicators kept in X_test.
true_no_race = y_test
pred_no_race = y_pred_no_race

# Encode every (prediction, truth) pair as pred + 2*truth, so that
# 0 = TN, 1 = FP, 2 = FN, 3 = TP. Done on plain arrays so the Series index of
# y_test plays no role.
unq_no_race = np.asarray(pred_no_race) + 2 * np.asarray(true_no_race)

# Positional row numbers of the true positives. X_test and y_test come from the
# same train_test_split call, so positions line up and .iloc is the right accessor.
tp_no_race = np.where(unq_no_race == 3)[0].tolist()
true_positives_no_race = X_test.iloc[tp_no_race]

# Sum the indicator columns instead of `value_counts()[1]`, which raises
# KeyError when a group has no true positives and is not a plain label lookup
# on boolean indicator columns.
tp_African_no_race = int(true_positives_no_race['race_African-American'].sum())
tp_Caucasian_no_race = int(true_positives_no_race['race_Caucasian'].sum())
tp_total_no_race = true_positives_no_race.shape[0]

#Predicted +ve & actually +ve
print("         True positive predictions by the model             ")
print("African-American         : ",tp_African_no_race)
print("Caucasian                : ",tp_Caucasian_no_race)
print("Total True Positives     : ",tp_total_no_race)

# The values printed below are fractions of all true positives (0-1), not
# percentages. Guard the division so an empty true-positive set prints nan
# instead of raising.
share_African_no_race = tp_African_no_race / tp_total_no_race if tp_total_no_race else float('nan')
share_Caucasian_no_race = tp_Caucasian_no_race / tp_total_no_race if tp_total_no_race else float('nan')

print("                 % of True positives                        ")
print("African-American         :  %.2f" % share_African_no_race)
print("Caucasian                :  %.2f" % share_Caucasian_no_race)

         True positive predictions by the model             
African-American         :  556
Caucasian                :  266
Total True Positives     :  910
                 % of True positives                        
African-American         :  0.61
Caucasian                :  0.29


In [13]:
# Two-column indicator matrices, one row per sample:
# column 0 = Caucasian indicator, column 1 = African-American indicator.
sensitive_features = np.column_stack(
    (X_train['race_Caucasian'], X_train['race_African-American'])
)
sensitive_features_test = np.column_stack(
    (X_test['race_Caucasian'], X_test['race_African-American'])
)

In [15]:
# Per-group false/true positive counts for the current `y_pred_no_race`.
# `confusion_matrix` is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.

# Row masks over the test set: column 0 of `sensitive_features_test` is the
# Caucasian indicator, column 1 the African-American indicator.
caucasian_mask = sensitive_features_test[:, 0] == 1
african_american_mask = sensitive_features_test[:, 1] == 1

# `labels=[0, 1]` pins each matrix to 2x2 even when a group's subset contains a
# single class; without it the [0, 1] / [1, 1] lookups below raise IndexError.
# Each matrix is computed once and reused for both counts.
cm_caucasian = confusion_matrix(y_test[caucasian_mask], y_pred_no_race[caucasian_mask], labels=[0, 1])
cm_african_american = confusion_matrix(y_test[african_american_mask], y_pred_no_race[african_american_mask], labels=[0, 1])

# Rows are actual classes and columns are predicted classes:
# [0, 1] = false positives, [1, 1] = true positives.
false_positives_caucasian = cm_caucasian[0, 1]
false_positives_african_american = cm_african_american[0, 1]
true_positives_caucasian = cm_caucasian[1, 1]
true_positives_african_american = cm_african_american[1, 1]

print("False positives for the Caucasian group: ", false_positives_caucasian)
print("False positives for the African-American group: ", false_positives_african_american)
# This total covers only the two groups above, not every row of the test set.
total_false_positives = false_positives_caucasian + false_positives_african_american
print("Total False positives are: ", total_false_positives)

print("True positives for the Caucasian group: ", true_positives_caucasian)
print("True positives for the African-American group: ", true_positives_african_american)

False positives for the Caucasian group:  19
False positives for the African-American group:  40
Total False positives are:  59
True positives for the Caucasian group:  266
True positives for the African-American group:  556


# RANDOM FOREST MODEL

In [18]:
# Random forest on the same race-free features. `random_state` is fixed (to the
# seed already used for the split and the logistic regression) so that the
# accuracy and the per-group counts below are reproducible across runs.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=18)
rf_classifier.fit(X_train_no_race, y_train)

# NOTE: this reassigns `y_pred_no_race`; every cell below this point therefore
# evaluates the random-forest predictions, not the logistic-regression ones.
y_pred_no_race = rf_classifier.predict(X_test_no_race)
accuracy = accuracy_score(y_test, y_pred_no_race)
print(accuracy)


0.8945821083578328


In [19]:
# Confusion table for the current `y_pred_no_race`; labels=[1,0] lists the
# positive class first so it matches the row and column captions.
cm = np.asarray(confusion_matrix(y_test, y_pred_no_race, labels=[1,0]))
confusion = pd.DataFrame(
    cm,
    index=['is_recividated','is_not_recividated'],
    columns=['predicted_recividated','predicted_not_recividated'],
)
print(confusion)

                    predicted_recividated  predicted_not_recividated
is_recividated                        932                        137
is_not_recividated                    114                       1198


In [20]:
# Two-column indicator matrices, one row per sample:
# column 0 = Caucasian indicator, column 1 = African-American indicator.
sensitive_features = np.column_stack(
    (X_train['race_Caucasian'], X_train['race_African-American'])
)
sensitive_features_test = np.column_stack(
    (X_test['race_Caucasian'], X_test['race_African-American'])
)


In [21]:
# Per-group false/true positive counts for the current `y_pred_no_race`.
# `confusion_matrix` is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.

# Row masks over the test set: column 0 of `sensitive_features_test` is the
# Caucasian indicator, column 1 the African-American indicator.
caucasian_mask = sensitive_features_test[:, 0] == 1
african_american_mask = sensitive_features_test[:, 1] == 1

# `labels=[0, 1]` pins each matrix to 2x2 even when a group's subset contains a
# single class; without it the [0, 1] / [1, 1] lookups below raise IndexError.
# Each matrix is computed once and reused for both counts.
cm_caucasian = confusion_matrix(y_test[caucasian_mask], y_pred_no_race[caucasian_mask], labels=[0, 1])
cm_african_american = confusion_matrix(y_test[african_american_mask], y_pred_no_race[african_american_mask], labels=[0, 1])

# Rows are actual classes and columns are predicted classes:
# [0, 1] = false positives, [1, 1] = true positives.
false_positives_caucasian = cm_caucasian[0, 1]
false_positives_african_american = cm_african_american[0, 1]
true_positives_caucasian = cm_caucasian[1, 1]
true_positives_african_american = cm_african_american[1, 1]

print("False positives for the Caucasian group: ", false_positives_caucasian)
print("False positives for the African-American group: ", false_positives_african_american)
# This total covers only the two groups above, not every row of the test set.
total_false_positives = false_positives_caucasian + false_positives_african_american
print("Total False positives are: ", total_false_positives)

print("True positives for the Caucasian group: ", true_positives_caucasian)
print("True positives for the African-American group: ", true_positives_african_american)

False positives for the Caucasian group:  28
False positives for the African-American group:  64
Total False positives are:  92
True positives for the Caucasian group:  270
True positives for the African-American group:  570
