In [1]:
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import numpy
%matplotlib inline

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from sklearn.ensemble import GradientBoostingClassifier

In [2]:

GradBoost = GradientBoostingClassifier()

In [3]:
data = pd.read_csv(".\\Recidivism__Beginning_2008.csv")
data = data[data['Return Status']!='Returned Parole Violation']
data.sample(n=5)

Unnamed: 0,Release Year,County of Indictment,Gender,Age at Release,Return Status
83334,2009,NEW YORK,FEMALE,46,New Felony Offense
118617,2008,QUEENS,MALE,44,Not Returned
70959,2010,WAYNE,MALE,29,Not Returned
37902,2011,NEW YORK,MALE,50,Not Returned
3124,2012,BROOME,MALE,30,Not Returned


In [4]:
# Prepare replacements for categorical variables
county_labels = data['County of Indictment'].astype('category').cat.categories.tolist()
replace_county = {'County of Indictment' : {k: v for k,v in zip(county_labels,list(range(1,len(county_labels)+1)))}}

gender_labels = data['Gender'].astype('category').cat.categories.tolist()
replace_gender = {'Gender' : {'FEMALE': 0, 'MALE': 1}}

replace_status = {'Return Status':{'Returned Parole Violation': 1, 'New Felony Offense': 1, 'Not Returned': 0}}

In [5]:
# Copy and add dummies for categorical variables
data_w_dummies = data.copy()
data_w_dummies.replace(replace_county, inplace=True)
data_w_dummies.replace(replace_gender, inplace=True)
data_w_dummies.replace(replace_status, inplace=True)
# data_w_dummies = data_w_dummies.drop('Release Year', axis=1)
# data_w_dummies = data_w_dummies.drop('County of Indictment', axis=1)
# data_w_dummies = data_w_dummies[data_w_dummies['Gender']==1]
# data_w_dummies = data_w_dummies.drop('Gender', axis=1)

df_majority = data_w_dummies[data_w_dummies['Return Status']==0]
df_minority = data_w_dummies[data_w_dummies['Return Status']==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=72477,    # to match majority class
                                 random_state=42) # reproducible results
 
# Combine majority class with upsampled minority class
data_w_dummies = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
data_w_dummies['Return Status'].value_counts()

1    72477
0    72477
Name: Return Status, dtype: int64

In [6]:
x_data = data_w_dummies.iloc[:,0:-1]
y_data = data_w_dummies.iloc[:,-1:]
X_train, X_cv, y_train, y_cv = train_test_split(x_data, y_data, test_size=.20,
                                                    random_state=numpy.random.randint(200))

In [7]:
clf = GradBoost
scores, mean_auc, mean_tpr, mean_spec, mean_prec, mean_rec, mean_f1 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
cm0 = numpy.zeros((2, 2))

fit_data = clf.fit(X_train, y_train)
preds = clf.predict(X_cv)

'''Calculate performance metrics'''
scores = metrics.accuracy_score(y_cv, preds)
fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
mean_auc = roc_auc
cm = metrics.confusion_matrix(y_cv, preds)
mean_tpr = float(cm[0][0]) / numpy.sum(cm[0])
cm0 = cm
prec, rec, _, _ = metrics.precision_recall_fscore_support(y_cv, preds, pos_label=1, average='binary')
mean_prec = numpy.mean(prec)
mean_rec = numpy.mean(rec)
mean_spec = rec
mean_f1 = 2 * numpy.mean(prec) * numpy.mean(rec) / (numpy.mean(prec) + numpy.mean(rec))

metrics_dict = {'Accuracy': float(scores),
            'AUC': float(mean_auc),
            'Sensitivity': float(mean_tpr),
            'Specificity': float(mean_spec),
            'Precision': float(mean_prec),
            'Recall': float(mean_rec),
            'F1-Score': float(mean_f1),
            'True Positive': int(cm0[1, 1]),
            'True Negative': int(cm0[0, 0]),
            'False Positive': int(cm0[0, 1]),
            'False Negative': int(cm0[1, 0])}

metrics_dict = {'Accuracy': float(scores),
            'AUC': float(mean_auc),
            'Sensitivity': float(mean_tpr),
            'Specificity': float(mean_spec),
            'Precision': float(mean_prec),
            'Recall': float(mean_rec),
            'F1-Score': float(mean_f1),
            'True Positive': int(cm0[1, 1]),
            'True Negative': int(cm0[0, 0]),
            'False Positive': int(cm0[0, 1]),
            'False Negative': int(cm0[1, 0]),
            'Incorrect': int(cm0[0, 1]) + int(cm0[1, 0]),
            'Correct': int(cm0[1, 1]) + int(cm0[0, 0])}
metrics_dict['Classifier'] = str(clf).split('(')[0]
metrics_df = pd.DataFrame.from_records(metrics_dict, index=[0], exclude=None, columns=None, coerce_float=False, nrows=None)
metrics_df.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,AUC,Accuracy,Classifier,Correct,F1-Score,False Negative,False Positive,Incorrect,Precision,Recall,Sensitivity,Specificity,True Negative,True Positive
0,0.584537,0.584216,GradientBoostingClassifier,16937,0.606053,5144,6910,12054,0.572982,0.643174,0.525901,0.643174,7665,9272
