# Team 6 Project: MINJUSTICIA

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import powerlaw as plw
import networkx as nx
import community as community_louvain
import seaborn as sns
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)

# Location of the cleaned recidivism dataset (comma-separated, UTF-8).
path = "https://raw.githubusercontent.com/sagilar/ds4a-team6/master/retomintic/Data_UpdateJune13/reincidencia11junio2020_clean.csv"
df_mj = pd.read_csv(path, encoding="utf-8", sep=",")

# Data

In [None]:
# Parse the three date columns to datetime.
for column in ['FECHA_CAPTURA', 'FECHA_INGRESO', 'FECHA_SALIDA']:
    df_mj[column] = pd.to_datetime(df_mj[column])

# Month and two-digit year of entry. strftime returns strings here.
# NOTE(review): the "_INT" suffix suggests integers were intended, and '%y'
# drops the century -- confirm before using these as numeric features.
df_mj["MES_INGRESO_INT"] = df_mj["FECHA_INGRESO"].dt.strftime('%m')
df_mj["ANO_INGRESO_INT"] = df_mj["FECHA_INGRESO"].dt.strftime('%y')

one_day = timedelta(days=1)
# A single reference instant so the three DIAS_* columns are measured
# against the same "now".
reference_date = datetime.today()

# For each date column, compute the number of days between consecutive
# records of the same individual (DIAS_INGRESO, DIAS_SALIDA, DIAS_CAPTURA).
for column in ['FECHA_INGRESO', 'FECHA_SALIDA', 'FECHA_CAPTURA']:
    days_column = 'DIAS' + column[5:]

    # Newest record first within each individual, so diff() is
    # (this date - next later date); the sign is flipped to get positive days.
    df_mj = df_mj.sort_values(['INTERNOEN', column], ascending=False)
    df_mj[days_column] = -1 * (df_mj[column].diff() / one_day)

    # The first row of each individual has no later record: measure up to now.
    # Rows whose date equals the previous row's date (diff == 0) are treated
    # the same way.
    # NOTE(review): for an older entry with several rows this replaces 0 with
    # "days until today" rather than the gap to the next entry -- confirm.
    starts_individual = df_mj['INTERNOEN'] != df_mj['INTERNOEN'].shift(1)
    repeats_date = df_mj[days_column] == 0
    restart = starts_individual | repeats_date
    df_mj.loc[restart, days_column] = (
        (reference_date - df_mj.loc[restart, column]) / one_day
    )

# Entry and exit dates sometimes appear swapped, hence the absolute value.
df_mj['DIAS_CONDENA'] = (df_mj['FECHA_SALIDA'] - df_mj['FECHA_INGRESO']).abs() / one_day
# Expressed in days (float) like every other DIAS_* column, not as a Timedelta.
df_mj['DIAS_JUDICIALIZACION'] = (df_mj['FECHA_INGRESO'] - df_mj['FECHA_CAPTURA']) / one_day
df_mj['DIAS_LIBRE'] = df_mj['DIAS_INGRESO'] - df_mj['DIAS_CONDENA']

# The individual finished a sentence but was incarcerated immediately for
# another crime: negative durations are clipped to zero.
for days_column in ['DIAS_CAPTURA', 'DIAS_INGRESO', 'DIAS_LIBRE']:
    df_mj.loc[df_mj[days_column] < 0, days_column] = 0

# The individual is still in jail (no computable free time): zero days free.
df_mj.loc[df_mj['DIAS_LIBRE'].isnull(), 'DIAS_LIBRE'] = 0

# Find each individual's most recent entry date, so that the rows of that
# entry can be marked as censored (CENSURADO_LIBRES = 0).
# One vectorized max per individual yields one (INTERNOEN, FECHA_INGRESO)
# row each, sorted by INTERNOEN.
last_df = df_mj.groupby('INTERNOEN')['FECHA_INGRESO'].max().reset_index()

# Censored: rows that match the individual's latest entry.
last_df['CENSURADO_LIBRES'] = 0
df_mj = df_mj.merge(last_df, on=['INTERNOEN', 'FECHA_INGRESO'], how='left')

# Event: every row that did not match a latest entry.
df_mj['CENSURADO_LIBRES'] = df_mj['CENSURADO_LIBRES'].fillna(1)

# Individuals who have not left jail yet have zero days out and are not
# censored.
df_mj.loc[df_mj['FECHA_SALIDA'].isnull(), 'CENSURADO_LIBRES'] = 1

# The left merge introduced NaN, so the flag became float; store it as int.
df_mj['CENSURADO_LIBRES'] = df_mj['CENSURADO_LIBRES'].astype('int64')

# NUMERO_REINCIDENCIAS: number of distinct entry dates recorded for each
# individual, attached to every row of that individual.
distinct_entries = df_mj.drop_duplicates(['INTERNOEN', 'FECHA_INGRESO'])
entries_per_person = (
    distinct_entries.groupby(['INTERNOEN'])
    .size()
    .reset_index(name='NUMERO_REINCIDENCIAS')
)
df_mj = df_mj.merge(entries_per_person, how='left', on='INTERNOEN')

# CRIMENES: number of rows (crimes) recorded for each individual's entry.
crimes_per_entry = (
    df_mj.groupby(['INTERNOEN', 'FECHA_INGRESO'])
    .size()
    .to_frame('CRIMENES')
    .reset_index()
)
df_mj = df_mj.merge(crimes_per_entry, on=['INTERNOEN', 'FECHA_INGRESO'])

# SITUACION_JURIDICA and REINCIDENTE are dropped; per the authors' note both
# columns are constant in this dataset.
df_mj = df_mj.drop(columns=['SITUACION_JURIDICA', 'REINCIDENTE'])

# Build an individual-by-crime count matrix and correlate its columns, i.e.
# how strongly pairs of crimes co-occur across individuals.
crime = (
    df_mj.groupby(['INTERNOEN', 'DELITO'])
    .size()
    .reset_index(name='count')
    .rename(columns={'INTERNOEN': 'id', 'DELITO': 'crime'})
)
# One row per individual, one column per crime; absent pairs count as 0.
crime = crime.pivot(index='id', columns='crime', values='count').fillna(0)
corr = crime.corr()

# Turn the strict lower triangle of the correlation matrix into an edge list:
# one row per unordered pair of crimes, in row-major order.
lower_rows, lower_cols = np.tril_indices(corr.shape[0], k=-1)
crime_names = corr.columns.to_numpy()

pd_edges = pd.DataFrame(
    {
        'source': crime_names[lower_rows],
        'target': crime_names[lower_cols],
        'weight': corr.to_numpy(dtype=float)[lower_rows, lower_cols],
    },
    # Row label = flattened (row, col) position of the pair in the matrix.
    index=lower_rows * corr.shape[1] + lower_cols,
)

# Convert correlation into a distance: the most correlated pair gets 0 and
# less correlated pairs get larger values.
pd_edges['weight'] = pd_edges['weight'].max() - pd_edges['weight']

# Keep only pairs whose distance does not exceed the threshold.
distance_threshold = 0.6925
over_threshold = pd_edges['weight'] > distance_threshold

# Distances above the threshold are zeroed out in reduced_weight.
pd_edges['reduced_weight'] = pd_edges['weight'].mask(over_threshold, 0)

# Rescale the kept distances into a similarity: 1 for distance 0, down to 0
# for the largest kept distance.
pd_edges['similarity'] = 1 - pd_edges['reduced_weight'] / pd_edges['reduced_weight'].max()

# Discard the over-threshold pairs explicitly by mask. Testing for
# similarity == 1 would also discard the most correlated pair, whose true
# distance is 0.
pd_edges.loc[over_threshold, 'similarity'] = 0

# Graph of the pairs that keep a positive similarity.
# NOTE(review): no edge_attr is passed, so the 'weight' column is presumably
# not stored on the edges -- confirm that an unweighted graph is intended.
keep_edge = pd_edges['similarity'] > 0
graph = nx.from_pandas_edgelist(pd_edges.loc[keep_edge, ['source', 'target', 'weight']])
edges_result = pd_edges.loc[keep_edge, ['source', 'target', 'similarity']]

# Community of every node according to the Louvain heuristic.
partition = community_louvain.best_partition(graph)

# Eigenvector centrality of every node, as an (Id, eigencentrality) table.
centrality = pd.DataFrame(list(nx.eigenvector_centrality(graph).items()))
centrality.columns = ['Id', 'eigencentrality']

# Degree of every node, as an (Id, degree) table.
degree_table = pd.DataFrame(list(graph.degree), columns=['Id', 'degree'])

# Node list: one row per crime with its community; the label repeats the Id.
nodes_result = pd.DataFrame(list(partition.items()))
nodes_result.columns = ['Id', 'community']
nodes_result['label'] = nodes_result['Id']

# Attach degree first and centrality second, both joined on the node Id.
nodes_result = nodes_result.merge(degree_table, on='Id')
nodes_result = nodes_result.merge(centrality, on='Id')

# Attach each crime's community, eigenvector centrality and degree.
# This is an inner merge, so rows whose DELITO is not a graph node are
# dropped, and the join key is kept as an extra 'Id' column.
df_mj = df_mj.merge(
    nodes_result[['Id', 'community', 'eigencentrality', 'degree']],
    left_on='DELITO',
    right_on='Id',
)

# How many times each crime appears, in ascending order. The columns are
# named explicitly ('index' = crime, 'DELITO' = count) so the result does not
# depend on the pandas version's default value_counts()/reset_index() naming.
delitos_df = (
    df_mj['DELITO']
    .value_counts()
    .sort_values()
    .rename_axis('index')
    .reset_index(name='DELITO')
)
delitos_df['logDelito'] = np.log(delitos_df['DELITO']) + 1

# Attach the log-frequency to every row; the join key is kept as 'index'.
df_mj = df_mj.merge(delitos_df[['index', 'logDelito']], left_on='DELITO', right_on='index')

# Dummy that is 1 when ESTADO_INGRESO is one of the two listed states and 0
# otherwise (including missing values).
in_jail_states = ['Espera Traslado', 'Intramuros']
df_mj['EN_CARCEL'] = df_mj['ESTADO_INGRESO'].isin(in_jail_states).astype('int64')

# Replace the education-level labels with numbers (presumably years of
# schooling); labels missing from the mapping become NaN.
education_years = {
    'ANALFABETA': 0,
    'CICLO I': 2,
    'CICLO II': 5,
    'CICLO III': 9,
    'CICLO IV': 11,
    'TECNICO': 13,
    'TECNOLOGICO': 14,
    'PROFESIONAL': 16,
    'MAGISTER': 18,
    'POST GRADO': 18,
}
df_mj['NIVEL_EDUCATIVO'] = df_mj['NIVEL_EDUCATIVO'].map(education_years)

In [38]:
# Number of rows (crimes) per individual and entry date. Only added when the
# column is missing: merging it a second time would produce CRIMENES_x and
# CRIMENES_y and leave no column named CRIMENES.
if 'CRIMENES' not in df_mj.columns:
    df_mj = df_mj.merge(
        df_mj.groupby(['INTERNOEN', 'FECHA_INGRESO']).size().to_frame('CRIMENES').reset_index(),
        on=['INTERNOEN', 'FECHA_INGRESO'],
    )

In [41]:
# Preview the first rows of the enriched dataset.
df_mj.head()

# Column names treated as quantitative features.
cuantitativas = [
    'EDAD', 'NIVEL_EDUCATIVO', 'MES_INGRESO_INT', 'ANO_INGRESO_INT',
    'DIAS_INGRESO', 'DIAS_SALIDA', 'DIAS_CAPTURA', 'DIAS_JUDICIALIZACION',
    'DIAS_LIBRE', 'NUMERO_REINCIDENCIAS', 'CRIMENES', 'eigencentrality',
    'degree', 'logDelito',
]
# Column names treated as categorical features. Each name is listed once, so
# selecting these columns never yields duplicated columns.
categoricas = [
    'TENTATIVA', 'AGRAVADO', 'CALIFICADO', 'GENERO', 'REGIONAL',
    'HIJOS_MENORES', 'ESTADO', 'community', 'EN_CARCEL', 'CENSURADO_LIBRES',
]