In [1]:
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import numpy
import pprint
%matplotlib inline

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier


pp = pprint.PrettyPrinter(indent=4)

In [2]:
data = pd.read_csv(".\\Recidivism__Beginning_2008.csv")
GradBoost = GradientBoostingClassifier()

In [3]:
# Prepare replacements for categorical variables
county_labels = data['County of Indictment'].astype('category').cat.categories.tolist()
replace_county = {'County of Indictment' : {k: v for k,v in zip(county_labels,list(range(1,len(county_labels)+1)))}}

gender_labels = data['Gender'].astype('category').cat.categories.tolist()
replace_gender = {'Gender' : {'FEMALE': 0, 'MALE': 1}}

replace_status = {'Return Status':{'Returned Parole Violation': 1, 'New Felony Offense': 1, 'Not Returned': 0}}

In [4]:
# Copy and add dummies for categorical variables
data_w_dummies = data.copy()
data_w_dummies.replace(replace_county, inplace=True)
data_w_dummies.replace(replace_gender, inplace=True)
data_w_dummies.replace(replace_status, inplace=True)
data_w_dummies = data_w_dummies.drop('Release Year', axis=1)

In [5]:
x_data = data_w_dummies.iloc[:,0:-1]
y_data = data_w_dummies.iloc[:,-1:]
X_train, X_cv, y_train, y_cv = train_test_split(x_data, y_data, test_size=.20,
                                                    random_state=numpy.random.randint(200))

In [6]:
clf = GradBoost

scores, mean_auc, mean_tpr, mean_spec, mean_prec, mean_rec, mean_f1 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
cm0 = numpy.zeros((2, 2))

fit_data = clf.fit(X_train, y_train)
preds = clf.predict(X_cv)

'''Calculate performance metrics'''
scores = metrics.accuracy_score(y_cv, preds)
fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
mean_auc = roc_auc
cm = metrics.confusion_matrix(y_cv, preds)
mean_tpr = float(cm[0][0]) / numpy.sum(cm[0])
cm0 = cm
prec, rec, _, _ = metrics.precision_recall_fscore_support(y_cv, preds, pos_label=1, average='binary')
mean_prec = numpy.mean(prec)
mean_rec = numpy.mean(rec)
mean_spec = rec
mean_f1 = 2 * numpy.mean(prec) * numpy.mean(rec) / (numpy.mean(prec) + numpy.mean(rec))

metrics_dict = {'Accuracy': float(scores),
            'AUC': float(mean_auc),
            'Sensitivity': float(mean_tpr),
            'Specificity': float(mean_spec),
            'Precision': float(mean_prec),
            'Recall': float(mean_rec),
            'F1-Score': float(mean_f1),
            'Confusion Matrix': str(cm0),
            'True Positive': int(cm0[1, 1]),
            'True Negative': int(cm0[0, 0]),
            'False Positive': int(cm0[0, 1]),
            'False Negative': int(cm0[1, 0])}

metrics_dict = {'Accuracy': float(scores),
            'AUC': float(mean_auc),
            'Sensitivity': float(mean_tpr),
            'Specificity': float(mean_spec),
            'Precision': float(mean_prec),
            'Recall': float(mean_rec),
            'F1-Score': float(mean_f1),
            'Confusion Matrix': str(cm0),
            'True Positive': int(cm0[1, 1]),
            'True Negative': int(cm0[0, 0]),
            'False Positive': int(cm0[0, 1]),
            'False Negative': int(cm0[1, 0])}
metrics_dict['Classifier'] = str(clf).split('(')[0]
metrics_df = pd.DataFrame.from_records(metrics_dict, index=[0], exclude=None, columns=None, coerce_float=False, nrows=None)
metrics_df.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,AUC,Accuracy,Classifier,Confusion Matrix,F1-Score,False Negative,False Positive,Precision,Recall,Sensitivity,Specificity,True Negative,True Positive
0,0.529966,0.59754,GradientBoostingClassifier,[[13332 1173]\n [ 8775 1438]],0.224267,8775,1173,0.550747,0.140801,0.919131,0.140801,13332,1438
