In [1]:
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import numpy
import pprint
%matplotlib inline

from sklearn import metrics
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split

from sklearn import tree
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

pp = pprint.PrettyPrinter(indent=4)

In [2]:
data = pd.read_csv(".\\Recidivism__Beginning_2008.csv")

In [3]:
county_labels = data['County of Indictment'].astype('category').cat.categories.tolist()
replace_county = {'County of Indictment' : {k: v for k,v in zip(county_labels,list(range(1,len(county_labels)+1)))}}

gender_labels = data['Gender'].astype('category').cat.categories.tolist()
replace_gender = {'Gender' : {'FEMALE': 0, 'MALE': 1}}

replace_status = {'Return Status':{'Returned Parole Violation': 1, 'New Felony Offense': 1, 'Not Returned': 0}}

In [4]:
data_w_dummies = data.copy()
data_w_dummies.replace(replace_county, inplace=True)
data_w_dummies.replace(replace_gender, inplace=True)
data_w_dummies.replace(replace_status, inplace=True)

In [5]:
data_w_dummies = data_w_dummies.drop('Release Year', axis=1)

In [6]:
x_data = data_w_dummies.iloc[:,0:-1]
y_data = data_w_dummies.iloc[:,-1:]
X_train, X_cv, y_train, y_cv = train_test_split(x_data, y_data, test_size=.20,
                                                    random_state=numpy.random.randint(200))

In [7]:
RandForest = RandomForestClassifier()  # Create Random Forest estimator object
GradBoost = GradientBoostingClassifier()
classifier_list = [GradBoost]

In [9]:
for classifier in classifier_list:
        clf = classifier

        scores, mean_auc, mean_tpr, mean_spec, mean_prec, mean_rec, mean_f1 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
        cm0 = numpy.zeros((2, 2))

        fit_data = clf.fit(X_train, y_train)
        preds = clf.predict(X_cv)

        '''Calculate performance metrics'''
        scores += metrics.accuracy_score(y_cv, preds)
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds, pos_label=1)
        roc_auc = metrics.auc(fpr, tpr)
        mean_auc += roc_auc
        cm = metrics.confusion_matrix(y_cv, preds)
        mean_tpr += float(cm[0][0]) / numpy.sum(cm[0])
        cm0 += cm
        prec, rec, _, _ = metrics.precision_recall_fscore_support(y_cv, preds, pos_label=1, average='binary')
        mean_prec += numpy.mean(prec)
        mean_rec += numpy.mean(rec)
        mean_spec += rec
        mean_f1 += 2 * numpy.mean(prec) * numpy.mean(rec) / (numpy.mean(prec) + numpy.mean(rec))

        metrics_dict = {'Accuracy': float(scores),
                    'AUC': float(mean_auc),
                    'Sensitivity': float(mean_tpr),
                    'Specificity': float(mean_spec),
                    'Precision': float(mean_prec),
                    'Recall': float(mean_rec),
                    'F1-Score': float(mean_f1),
                    'Confusion Matrix': str(cm0),
                    'True Positive': int(cm0[1, 1]),
                    'True Negative': int(cm0[0, 0]),
                    'False Positive': int(cm0[0, 1]),
                    'False Negative': int(cm0[1, 0])}
        metrics_dict['Classifier'] = str(classifier).split('(')[0]
#         metrics_dict['Threshold'] = threshold
        pp.pprint(metrics_dict)

  y = column_or_1d(y, warn=True)


{   'AUC': 0.5359730305528906,
    'Accuracy': 0.6075734282708957,
    'Classifier': 'GradientBoostingClassifier',
    'Confusion Matrix': '[[13506.  1147.]\n [ 8553.  1512.]]',
    'F1-Score': 0.23766111285759198,
    'False Negative': 8553,
    'False Positive': 1147,
    'Precision': 0.5686348251222264,
    'Recall': 0.15022354694485843,
    'Sensitivity': 0.9217225141609227,
    'Specificity': 0.15022354694485843,
    'True Negative': 13506,
    'True Positive': 1512}
