# Imports

In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Carregamento e Pré-Processamento dos Dados

In [None]:
ds = pd.read_csv('../datasets/compas-propublica/compas-scores-two-years.csv')

def preProcess(ds):
    # select African-American and Caucasian people
    ds = ds[(ds['race'] == 'African-American') | (ds['race'] == 'Caucasian')]
    
    # drop duplicated feature columns
    ds.drop(ds[['decile_score.1', 'screening_date', 'v_screening_date', 'priors_count.1']], axis=1, inplace=True)

    # convert dates in string to date/time format
    ds['compas_screening_date'] = pd.to_datetime(ds['compas_screening_date'])
    ds['dob'] = pd.to_datetime(ds['dob'])
    ds['c_jail_in'] = pd.to_datetime(ds['c_jail_in'])
    ds['c_jail_out'] = pd.to_datetime(ds['c_jail_out'])
    ds['c_offense_date'] = pd.to_datetime(ds['c_offense_date'])
    ds['c_arrest_date'] = pd.to_datetime(ds['c_arrest_date'])
    ds['r_offense_date'] = pd.to_datetime(ds['r_offense_date'])
    ds['r_jail_in'] = pd.to_datetime(ds['r_jail_in'])
    ds['r_jail_out'] = pd.to_datetime(ds['r_jail_out'])
    ds['vr_offense_date'] = pd.to_datetime(ds['vr_offense_date'])
    ds['in_custody'] = pd.to_datetime(ds['in_custody'])
    ds['out_custody'] = pd.to_datetime(ds['out_custody'])

    # converting sex categorical variable in numeric variable
    sex = {'Female':1, 'Male':2}
    ds['sex'] = ds['sex'].map(sex)
    # converting race categorical variable in numeric variable
    race = {'African-American':1, 'Caucasian':2}
    ds['race'] = ds['race'].map(race)
    # converting c_charge_degree categorical variable in numeric variable
    c_charge_degree = {'F':1, 'M':2}
    ds['c_charge_degree'] = ds['c_charge_degree'].map(c_charge_degree)
    # converting r_charge_degree and vr_charge_degree categorical variables in numeric variables
    r_vr_charge_degree = {np.nan:0, '(CO3)':1, '(F1)':2, '(F2)':3, '(F3)':4, '(F5)':5, '(F6)':6, '(F7)':7, '(M1)':8, '(M2)':9, '(MO3)':10}
    ds['r_charge_degree'] = ds['r_charge_degree'].map(r_vr_charge_degree)
    ds['vr_charge_degree'] = ds['vr_charge_degree'].map(r_vr_charge_degree)

    # replace nan values in number 1000
    ds['days_b_screening_arrest'] = [1000 if np.isnan(days) else days for days in ds['days_b_screening_arrest']]
    ds['r_days_from_arrest'] = [1000 if np.isnan(days) else days for days in ds['r_days_from_arrest']]

    # calculate the prison days and turns into int values type
    ds['prison_days'] = (ds['c_jail_out'] - ds['c_jail_in'])
    ds['prison_days'] = [str(days).split()[0] for days in ds['prison_days']]
    ds['prison_days'] = [1000 if days=='NaT' else int(days) for days in ds['prison_days']]
    ds['prison_days'] = pd.to_numeric(ds['prison_days'])

    # calculate the custody days and turns into int values type
    ds['custody_days'] = (ds['out_custody'] - ds['in_custody'])
    ds['custody_days'] = [str(days).split()[0] for days in ds['custody_days']]
    ds['custody_days'] = [1000 if days=='NaT' else int(days) for days in ds['custody_days']]
    ds['custody_days'] = pd.to_numeric(ds['custody_days'])

    # classificate medium and high score like highscore and convert to 0 and 1
    ds['score_binary'] = np.where(ds['score_text'] != 'Low', 1, 0)

    return ds

ds = preProcess(ds)

## Testes de Visualizações

In [None]:
# variaveis categoricas: 'sex', 'race', 'c_charge_degree', 'r_charge_degree', 'vr_charge_degree'

#print(ds['race'].value_counts().to_string())
#print(ds['days_b_screening_arrest'].value_counts().sort_values())

#print(ds['r_days_from_arrest'].isna().sum())

#print(ds[(ds['race'] == 1) | (ds['race'] == 3)]['race'].value_counts())

#print(ds.dtypes)

#print(ds['days_b_screening_arrest'][3])

In [None]:
# for i in range(len(ds)):
#     if ds.loc[i]['priors_count'] != ds.loc[i]['priors_count.1']:
#         print(i, ds.loc[i]['priors_count'], ds.loc[i]['priors_count.1'])

# for i in range(len(ds)):
#     print(i, ds.loc[i]['is_recid'], ds.loc[i]['y_pred'])

In [None]:
# for linha in ds.index:
#     if ds['c_jail_in'][linha] > ds['c_jail_out'][linha]:
#         print(linha, ds['c_jail_in'][linha], ds['c_jail_out'][linha])

# Seleção dos Atributos de Treino e Rótulo

In [None]:
data_x = ds[['sex', 'age', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count', 'priors_count', 'days_b_screening_arrest', 'c_charge_degree', 'is_recid', 'r_charge_degree', 'r_days_from_arrest', 'is_violent_recid', 'vr_charge_degree', 'prison_days', 'custody_days']]

data_y = ds['two_year_recid']

prev_compas = ds['score_binary']

# Divisão do Dataset para Treino e Teste

In [None]:
qtd70 = (len(data_x) * 70) // 100

train_x = data_x.loc[:qtd70]
train_y = data_y.loc[:qtd70]

test_x = data_x.loc[qtd70:]
test_y = data_y.loc[qtd70:]
test_prev_compas = prev_compas.loc[qtd70:]

# Carregamento e Execução de Modelos

In [None]:
random_forest = RandomForestClassifier()
gradient_boost = GradientBoostingClassifier()

In [None]:
random_forest.fit(train_x, train_y)
gradient_boost.fit(train_x, train_y)

prev_random_forest = random_forest.predict(test_x)
prev_gradient_boost = gradient_boost.predict(test_x)

# Cálculo de Métricas

In [184]:
print('======= Random Forest with True Values =======')
print(f'Accuracy: {accuracy_score(test_y, prev_random_forest):.4f}')
print(f'Precision: {precision_score(test_y, prev_random_forest):.4f}')
print(f'Recall: {recall_score(test_y, prev_random_forest):.4f}')
#TN[0][0], FN[1][0], TP[1][1], FP[0][1]
print(f'Confusion Matrix: \n{confusion_matrix(test_y, prev_random_forest)}')

print('\n======= Gradient Boosting with True Values =======')
print(f'Accuracy: {accuracy_score(test_y, prev_gradient_boost):.4f}')
print(f'Precision: {precision_score(test_y, prev_gradient_boost):.4f}')
print(f'Recall: {recall_score(test_y, prev_gradient_boost):.4f}')
print(f'Confusion Matrix: \n{confusion_matrix(test_y, prev_gradient_boost)}')

Accuracy: 0.9661
Precision: 0.9330
Recall: 1.0000
Confusion Matrix: 
[[1222   84]
 [   0 1169]]

Accuracy: 0.9653
Precision: 0.9329
Recall: 0.9983
Confusion Matrix: 
[[1222   84]
 [   2 1167]]


In [185]:
print('======= Random Forest with COMPAS Prevision =======')
print(f'Accuracy: {accuracy_score(test_prev_compas, prev_random_forest):.4f}')
print(f'Precision: {precision_score(test_prev_compas, prev_random_forest):.4f}')
print(f'Recall: {recall_score(test_prev_compas, prev_random_forest):.4f}')
#TN[0][0], FN[1][0], TP[1][1], FP[0][1]
print(f'Confusion Matrix: \n{confusion_matrix(test_prev_compas, prev_random_forest)}')

print('\n======= Gradient Boosting with COMPAS Prevision =======')
print(f'Accuracy: {accuracy_score(test_prev_compas, prev_gradient_boost):.4f}')
print(f'Precision: {precision_score(test_prev_compas, prev_gradient_boost):.4f}')
print(f'Recall: {recall_score(test_prev_compas, prev_gradient_boost):.4f}')
print(f'Confusion Matrix: \n{confusion_matrix(test_prev_compas, prev_gradient_boost)}')

Accuracy: 0.6396
Precision: 0.6464
Recall: 0.6434
Confusion Matrix: 
[[773 443]
 [449 810]]

Accuracy: 0.6388
Precision: 0.6459
Recall: 0.6418
Confusion Matrix: 
[[773 443]
 [451 808]]


# --------------- ANTIGO ---------------

# Pré-Processamento do Dataset

Utilizando [função](https://docs.responsibly.ai/_modules/responsibly/dataset/compas.html#COMPASDataset) de pré-processamento do pacote responsibly

In [None]:
# # load of the dataset
# #ds = pd.read_csv("../datasets/compas/cox-violent-parsed.csv")

# # export the dataset to csv file
# #ds.to_csv('dataset.csv', index=False, encoding='utf8')

# def preProcess(ds):
#     # filter row of:
#     # screening arrest between and -30 and 30
#     # without recid information (-1)
#     # charge degree other than O
#     # score text not empty
#     ds = ds[(ds['days_b_screening_arrest'] <= 30) & (ds['days_b_screening_arrest'] >= -30) & (ds['is_recid'] != -1) & (ds['c_charge_degree'] != 'O') & (ds['score_text'] != 'N/A')]

#     # convert values of jail in and jail out to date/time format
#     ds['c_jail_out'] = pd.to_datetime(ds['c_jail_out'])
#     ds['c_jail_in'] = pd.to_datetime(ds['c_jail_in'])
#     # calculate the prison days
#     ds['length_of_stay'] = (ds['c_jail_out'] - ds['c_jail_in'])

#     # classificate medium and high score like highscore
#     ds['score_factor'] = np.where(ds['score_text'] != 'Low', 'HighScore', 'LowScore')
#     # create the prediction values, highscore is 1 and lowscore is 0
#     ds['y_pred'] = (ds['score_factor'] == 'HighScore')

#     return ds

# #ds = preProcess(ds)

# # export the dataset pre processed to csv file
# #ds.to_csv('dataset-pre-processed.csv', index=False, encoding='utf8')

# Carregamento do Dataset Pré-Processado

In [None]:
# ds = pd.read_csv("dataset-pre-processed.csv")

In [None]:
# # transform negative days in positive values of lenght_of_stay feature
# ds['length_of_stay_pp'] = [int(a.split()[0]) * -1 if int(a.split()[0]) < 0 else int(a.split()[0]) for a in ds['length_of_stay']]

# #drop features not importants to prediction
# #ds.drop(ds[['id', 'name', 'first', 'last', 'compas_screening_date', 'dob', 'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date', 'c_arrest_date', 'is_recid', 'r_case_number', 'r_offense_date', 'r_jail_in', 'r_jail_out', 'violent_recid', 'vr_case_number', 'vr_offense_date', 'type_of_assessment', 'decile_score.1', 'score_text', 'screening_date', 'v_type_of_assessment', 'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1', 'y_pred']], axis=1, inplace=True)

# # data features to train and test models
# data_x = ds[['sex', 'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count', 'priors_count', 'days_b_screening_arrest', 'c_charge_degree', 'length_of_stay']]

# label = ds['is_recid']
# prev = ds['y_pred']

# # interessante fazer um tratamento das datas de custódia, calcular os dias talvez
# # tratar a coluna length_of_stay
# # fazer tratamento de linhas repetidas para o mesmo detento, diferencia apenas as datas de custódia, pode ser possível filtrar utilizando o nome e a data de ocorrência do crime, já que existem mais de um crime para determinadas pessoas

In [None]:
# # for linha in ds.index:
# #     if ds['c_jail_in'][linha] > ds['c_jail_out'][linha]:
# #         print(linha, ds['c_jail_in'][linha], ds['c_jail_out'][linha])

# for linha in ds.index:
#     print(ds['name'][linha])

# # print(ds[['c_jail_in', 'c_jail_out', 'length_of_stay']].loc[792])


# # print(ds['length_of_stay_pp'])
# # print(ds['length_of_stay'])

In [None]:
# print(ds['name'].value_counts().sort_values().to_string())

# # for i in range(len(ds)):
# #     if str(ds.loc[i]['violent_recid']) != 'nan':
# #         print(i, ds.loc[i]['violent_recid'])