# Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

# Get Data

In [3]:
data = pd.read_csv('compas-scores-two-years.csv')
data
#data.columns

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,10996,steven butler,steven,butler,2013-11-23,Male,1992-07-17,23,Less than 25,African-American,...,5,Medium,2013-11-23,2013-11-22,2013-11-24,0,1,860,0,0
7210,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,...,5,Medium,2014-02-01,2014-01-31,2014-02-02,0,1,790,0,0
7211,10999,winston gregory,winston,gregory,2014-01-14,Male,1958-10-01,57,Greater than 45,Other,...,1,Low,2014-01-14,2014-01-13,2014-01-14,0,0,808,0,0
7212,11000,farrah jean,farrah,jean,2014-03-09,Female,1982-11-17,33,25 - 45,African-American,...,2,Low,2014-03-09,2014-03-08,2014-03-09,3,0,754,0,0


In [4]:
data['race'].value_counts()

African-American    3696
Caucasian           2454
Hispanic             637
Other                377
Asian                 32
Native American       18
Name: race, dtype: int64

# Removing Unnecessary columns such as name,dates..

In [5]:
df = data.drop(['id','name','first','last','compas_screening_date','dob','c_offense_date','c_arrest_date','r_case_number','r_charge_degree',
                'r_days_from_arrest','r_offense_date','r_charge_desc','r_jail_in','r_jail_out','violent_recid','start','end','vr_case_number',
                'vr_charge_degree','v_type_of_assessment','vr_offense_date','vr_charge_desc','screening_date','days_b_screening_arrest',
                'c_jail_in','c_jail_out','c_case_number','c_days_from_compas','v_screening_date','c_charge_desc','in_custody','out_custody',
                'type_of_assessment','is_recid'],axis=1)

df
#df.columns

Unnamed: 0,sex,age,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,c_charge_degree,is_violent_recid,decile_score.1,score_text,v_decile_score,v_score_text,priors_count.1,event,two_year_recid
0,Male,69,Greater than 45,Other,0,1,0,0,0,F,0,1,Low,1,Low,0,0,0
1,Male,34,25 - 45,African-American,0,3,0,0,0,F,1,3,Low,1,Low,0,1,1
2,Male,24,Less than 25,African-American,0,4,0,1,4,F,0,4,Low,3,Low,4,0,1
3,Male,23,Less than 25,African-American,0,8,1,0,1,F,0,8,High,6,Medium,1,0,0
4,Male,43,25 - 45,Other,0,1,0,0,2,F,0,1,Low,1,Low,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,Male,23,Less than 25,African-American,0,7,0,0,0,F,0,7,Medium,5,Medium,0,0,0
7210,Male,23,Less than 25,African-American,0,3,0,0,0,F,0,3,Low,5,Medium,0,0,0
7211,Male,57,Greater than 45,Other,0,1,0,0,0,F,0,1,Low,1,Low,0,0,0
7212,Female,33,25 - 45,African-American,0,2,0,0,3,M,0,2,Low,2,Low,3,0,0


Converting String(categorical) objects to numerics using pd.get_dummies()

In [6]:
df = pd.get_dummies(df)
df.columns
#df

Index(['age', 'juv_fel_count', 'decile_score', 'juv_misd_count',
       'juv_other_count', 'priors_count', 'is_violent_recid', 'decile_score.1',
       'v_decile_score', 'priors_count.1', 'event', 'two_year_recid',
       'sex_Female', 'sex_Male', 'age_cat_25 - 45', 'age_cat_Greater than 45',
       'age_cat_Less than 25', 'race_African-American', 'race_Asian',
       'race_Caucasian', 'race_Hispanic', 'race_Native American', 'race_Other',
       'c_charge_degree_F', 'c_charge_degree_M', 'score_text_High',
       'score_text_Low', 'score_text_Medium', 'v_score_text_High',
       'v_score_text_Low', 'v_score_text_Medium'],
      dtype='object')

Training data -> X [Features]     
two_year_recid -> Y [Target]

In [7]:
df = df.drop('two_year_recid',axis=1)
X = df
Y = data['two_year_recid']
#Y
#X

# Splitting train and test 

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.33,random_state=18)

In [11]:
X_train_no_race = X_train.drop(['race_African-American', 'race_Asian',
    'race_Caucasian', 'race_Hispanic', 'race_Native American', 'race_Other'],axis=1)
X_test_no_race = X_test.drop(['race_African-American', 'race_Asian',
    'race_Caucasian', 'race_Hispanic', 'race_Native American', 'race_Other'],axis=1)
X_test_no_race.columns

Index(['age', 'juv_fel_count', 'decile_score', 'juv_misd_count',
       'juv_other_count', 'priors_count', 'is_violent_recid', 'decile_score.1',
       'v_decile_score', 'priors_count.1', 'event', 'sex_Female', 'sex_Male',
       'age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25',
       'c_charge_degree_F', 'c_charge_degree_M', 'score_text_High',
       'score_text_Low', 'score_text_Medium', 'v_score_text_High',
       'v_score_text_Low', 'v_score_text_Medium'],
      dtype='object')

# Fitting Random Forest Classifier

In [12]:
rfc = RandomForestClassifier(n_estimators = 100)
rfc.fit(X_train,y_train)

y_pred_test = rfc.predict(X_test)
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1312
           1       0.90      0.87      0.89      1069

    accuracy                           0.90      2381
   macro avg       0.90      0.90      0.90      2381
weighted avg       0.90      0.90      0.90      2381



In [9]:
#cross validation scores
scores = cross_val_score(rfc,X_train,y_train,cv=5)
np.mean(scores)

0.8955112929574509

# Scaling the Dataset using StandardScalar()

In [10]:
scaler = StandardScaler()

# Scaling for training data
scaled_X_train = pd.DataFrame(scaler.fit_transform(X_train),columns=X_train.columns)
scaled_X_train

# Scaling for test data
scaled_X_test = pd.DataFrame(scaler.transform(X_test),columns=X_test.columns)
scaled_X_test

Unnamed: 0,age,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,is_violent_recid,decile_score.1,v_decile_score,priors_count.1,...,race_Native American,race_Other,c_charge_degree_F,c_charge_degree_M,score_text_High,score_text_Low,score_text_Medium,v_score_text_High,v_score_text_Low,v_score_text_Medium
0,-0.410621,-0.135285,1.215746,-0.194043,-0.243268,-0.500756,-0.353553,1.215746,0.912465,-0.500756,...,-0.047762,-0.238444,0.733282,-0.733282,2.028370,-1.082201,-0.600535,-0.333601,-1.385091,1.767405
1,-0.662775,-0.135285,-0.178004,-0.194043,-0.243268,-0.096150,-0.353553,-0.178004,-0.275879,-0.096150,...,-0.047762,-0.238444,0.733282,-0.733282,-0.493007,0.924043,-0.600535,-0.333601,0.721974,-0.565801
2,-0.074417,-0.135285,-1.223316,-0.194043,-0.243268,-0.703060,-0.353553,-1.223316,-1.068108,-0.703060,...,-0.047762,4.193860,-1.363731,1.363731,-0.493007,0.924043,-0.600535,-0.333601,0.721974,-0.565801
3,-1.167082,-0.135285,1.912622,-0.194043,6.361878,-0.703060,2.828427,1.912622,2.100810,-0.703060,...,-0.047762,-0.238444,0.733282,-0.733282,2.028370,-1.082201,-0.600535,2.997589,-1.385091,-0.565801
4,1.690659,-0.135285,-1.223316,-0.194043,-0.243268,-0.703060,-0.353553,-1.223316,-1.068108,-0.703060,...,-0.047762,-0.238444,-1.363731,1.363731,-0.493007,0.924043,-0.600535,-0.333601,0.721974,-0.565801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2376,-0.746826,-0.135285,1.564184,-0.194043,1.958447,1.724581,-0.353553,1.564184,0.120236,1.724581,...,-0.047762,-0.238444,-1.363731,1.363731,2.028370,-1.082201,-0.600535,-0.333601,0.721974,-0.565801
2377,-0.746826,-0.135285,0.170434,-0.194043,-0.243268,-0.500756,-0.353553,0.170434,0.120236,-0.500756,...,-0.047762,-0.238444,0.733282,-0.733282,-0.493007,-1.082201,1.665183,-0.333601,0.721974,-0.565801
2378,-0.998980,-0.135285,1.912622,-0.194043,-0.243268,-0.500756,-0.353553,1.912622,2.496924,-0.500756,...,-0.047762,-0.238444,0.733282,-0.733282,2.028370,-1.082201,-0.600535,2.997589,-1.385091,-0.565801
2379,1.102300,-0.135285,-1.223316,-0.194043,-0.243268,-0.703060,-0.353553,-1.223316,-1.068108,-0.703060,...,-0.047762,-0.238444,-1.363731,1.363731,-0.493007,0.924043,-0.600535,-0.333601,0.721974,-0.565801


# Fitting Logistic Regression Classifier

In [11]:
lr = LogisticRegression()
lr.fit(scaled_X_train,y_train)

y_test_pred = lr.predict(scaled_X_test)
print("Test accuracy by Logistic Regression Classifier : ",accuracy_score(y_test,y_test_pred))
log_test = accuracy_score(y_test,y_test_pred)

Test accuracy by Logistic Regression Classifier :  0.9046619067618648


# Fitting KNN Classifier

In [10]:
clf = KNeighborsClassifier(n_neighbors = 5) 
clf.fit(X_train, y_train) 
#training_score = clf.score(X_train, y_train)
#print("Train accuracy:",training_score)
test_score = clf.score(X_test, y_test) 
print("Test accuracy by KNN Classifier : ",test_score)

Test accuracy by KNN Classifier :  0.7450650986980261


# Fitting XGBoost Classifier

In [11]:
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
preds = xgb.predict(X_test)
#print("train accuracy: ",accuracy_score(y_train,xgb.predict(X_train)))
print("Test accuracy by XGBoost Classifier : ",accuracy_score(preds,y_test))

Test accuracy by XGBoost Classifier :  0.8992020159596809


# Confusion Matrix by Random Forest Classifier

In [12]:
rfc = RandomForestClassifier(n_estimators = 100)
rfc.fit(X_train,y_train)

y_pred_test = rfc.predict(X_test)
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1312
           1       0.90      0.87      0.89      1069

    accuracy                           0.90      2381
   macro avg       0.90      0.90      0.90      2381
weighted avg       0.90      0.90      0.90      2381



In [18]:
#y_positive = [y_pred_test == 1]
""" Predicted +ve and are categorized as African American by the race variable"""
X_test[y_pred_test == 1]['race_African-American'].value_counts()[1]

630

In [19]:
""" Predicted +ve and are categorized as Caucasian by the race variable"""
X_test[y_pred_test == 1]['race_Caucasian'].value_counts()[1]

294

In [15]:
#without race variable
rfc.fit(X_train_no_race,y_train)

y_pred_no_race = rfc.predict(X_test_no_race)
print(classification_report(y_test,y_pred_no_race))

              precision    recall  f1-score   support

           0       0.90      0.91      0.90      1312
           1       0.89      0.87      0.88      1069

    accuracy                           0.89      2381
   macro avg       0.89      0.89      0.89      2381
weighted avg       0.89      0.89      0.89      2381



In [24]:
""" Predicted +ve and are categorized as African American by the race variable"""
X_test[y_pred_no_race == 1]['race_African-American'].value_counts()[1]

637

In [25]:
""" Predicted +ve and are categorized as Caucasian by the race variable"""
X_test[y_pred_no_race == 1]['race_Caucasian'].value_counts()[1]

300

In [14]:
[[tn , fp],[fn , tp]] = confusion_matrix(y_test,y_pred_test)
print("True Negatives       : ", tn)
print("False Positives      : ", fp)
print("False Negatives      : ", fn)
print("True Positives       : ", tp)
y_test.shape

True Negatives       :  1216
False Positives      :  96
False Negatives      :  142
True Positives       :  927


(2381,)

In [16]:
true = y_test
pred = y_pred_test

unq = np.array([x + 2*y for x,y in zip(pred,true)])

fp = np.array(np.where(unq==1)).tolist()[0]

CHECK = X_test.iloc[fp]
fp_African = CHECK['race_African-American'].value_counts()[1]
fp_Caucasian = CHECK['race_Caucasian'].value_counts()[1]
fp_total = CHECK.shape[0]
#print("Hispanic             : ",CHECK['race_Hispanic'].value_counts()[1])
#print("Native American      :  -") #CHECK['race_Native American'].value_counts())
#print("Others               : ",CHECK['race_Other'].value_counts()[1])
#print("Asians               :  -")#,CHECK['race_Asian'].value_counts())

#Predicted +ve but actually not +ve
print("         False positive predictions by the model            ")
print("African-American         :  ",fp_African)
print("Caucasian                :  ",fp_Caucasian)
print("Total False Positives    :  ",fp_total)

print("             % of False positives                           ")
print("African-American         :  %.2f" % (fp_African/fp_total)    )
print("Caucasian                :  %.2f" % (fp_Caucasian/fp_total)  )

         False positive predictions by the model            
African-American         :   56
Caucasian                :   22
Total False Positives    :   96
             % of False positives                           
African-American         :  0.58
Caucasian                :  0.23


In [17]:
tp = np.array(np.where(unq==3)).tolist()[0]
#tn = np.array(np.where(unq==0)).tolist()[0]
#fn = np.array(np.where(unq==2)).tolist()[0]
true_positives = X_test.iloc[tp]

tp_African = true_positives['race_African-American'].value_counts()[1]
tp_Caucasian = true_positives['race_Caucasian'].value_counts()[1]
tp_total = true_positives.shape[0]

#Predicted +ve & actually +ve
print("         True positive predictions by the model             ")
print("African-American         : ",tp_African)
print("Caucasian                : ",tp_Caucasian)
print("Total True Positives     : ",tp_total)

print("                 % of True positives                        ")
print("African-American         :  %.2f" % (tp_African/tp_total)    )
print("Caucasian                :  %.2f" % (tp_Caucasian/tp_total)  )


         True positive predictions by the model             
African-American         :  568
Caucasian                :  268
Total True Positives     :  927
                 % of True positives                        
African-American         :  0.61
Caucasian                :  0.29


In [27]:
#true positives without race variable
true_no_race = y_test
pred_no_race = y_pred_no_race

unq_no_race = np.array([x + 2*y for x,y in zip(pred_no_race,true_no_race)])

tp_no_race = np.array(np.where(unq_no_race==3)).tolist()[0]
true_positives_no_race = X_test.iloc[tp_no_race]

tp_African_no_race = true_positives_no_race['race_African-American'].value_counts()[1]
tp_Caucasian_no_race = true_positives_no_race['race_Caucasian'].value_counts()[1]
tp_total_no_race = true_positives_no_race.shape[0]

#Predicted +ve & actually +ve
print("         True positive predictions by the model             ")
print("African-American         : ",tp_African_no_race)
print("Caucasian                : ",tp_Caucasian_no_race)
print("Total True Positives     : ",tp_total_no_race)

print("                 % of True positives                        ")
print("African-American         :  %.2f" % (tp_African_no_race/tp_total_no_race)    )
print("Caucasian                :  %.2f" % (tp_Caucasian_no_race/tp_total_no_race)  )


         True positive predictions by the model             
African-American         :  569
Caucasian                :  271
Total True Positives     :  931
                 % of True positives                        
African-American         :  0.61
Caucasian                :  0.29


In [None]:
# using hyperparameters 
grid_param = {
    'solver' : ['newton-cg','lfbgs','liblinear'],
    'C' : [100,10,1.0,0.1,0.01],
    'max_iters' : 1000,
    }

# initialization of GridSearch with the parameters- ModelName and the dictionary of parameters
clf = LogisticRegression()
grid_search = GridSearchCV(estimator=clf, param_grid=grid_param)

# applying gridsearch onto dataset
grid_search.fit(X_train, y_train)

# best parameters
grid_search.best_params_