In [58]:
# Decision-tree classifiers on the linea-144 yearly CSV exports
# Data: ~/dev/github/profile/linea-144/data/linea144-<year>.csv (2020-2023)


In [72]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, f1_score


In [81]:
def get_data(data_, target_):
    """Split a DataFrame into a feature matrix and a target.

    Args:
        data_: DataFrame holding both the features and the target column(s).
        target_: Column label (or list of labels) to separate out.

    Returns:
        A ``(X, y)`` tuple where ``X`` is ``data_`` without ``target_`` and
        ``y`` is ``data_[target_]``. ``data_`` itself is not modified.
    """
    features = data_.drop(columns=target_)
    labels = data_[target_]
    return features, labels

def results_classifier(y_test_, y_pred_, print_ = False):
    """Score predictions with F1 and optionally print diagnostics.

    Args:
        y_test_: True labels.
        y_pred_: Predicted labels; any array-like, cast to ``np.int64``
            before scoring.
        print_: When true, print the confusion matrix and the
            classification report.

    Returns:
        The value of ``f1_score(y_test_, y_pred_)`` using scikit-learn's
        default averaging, with undefined scores reported as 0.0.
    """
    # np.asarray lets plain lists through as well as arrays / Series.
    y_pred_ = np.asarray(y_pred_).astype(np.int64)

    # Same zero_division policy as the report below, so an undefined score
    # is a quiet 0.0 rather than a warning in one place and not the other.
    score = f1_score(y_test_, y_pred_, zero_division=0.0)

    if print_:
        # The diagnostics are only built when they are actually shown.
        cm = confusion_matrix(y_test_, y_pred_)
        report = classification_report(y_test_, y_pred_, zero_division=0.0)
        print("Matriz de confusão:\n", cm)
        print("Report:\n", report)

    return score

def compute_tree(X_train_, X_test_, y_train_, y_test_, random_state_=42):
    """Fit a decision tree on the training split and score it on the test split.

    Args:
        X_train_: Training features.
        X_test_: Test features.
        y_train_: Training labels.
        y_test_: Test labels.
        random_state_: Seed forwarded to ``DecisionTreeClassifier`` so that
            repeated runs build the same tree. Pass ``None`` for the
            unseeded scikit-learn default.

    Returns:
        The F1 score returned by ``results_classifier``, which is called
        with printing enabled.
    """
    # Without a seed the tree can differ between runs, so the printed
    # metrics would not be reproducible.
    model = DecisionTreeClassifier(random_state=random_state_)
    model.fit(X_train_, y_train_)
    y_pred = model.predict(X_test_)
    return results_classifier(y_test_, y_pred, True)


In [61]:
# Read each yearly export, then concatenate them in a single pass.
# Growing a frame with pd.concat inside the loop re-copies all earlier rows on
# every iteration and seeds the result with an empty DataFrame.
# NOTE(review): the path is hard-coded to the author's home directory.
yearly_frames = [
    pd.read_csv(f'~/dev/github/profile/linea-144/data/linea144-{year}.csv')
    for year in range(2020, 2024)  # 2020 through 2023
]
data = pd.concat(yearly_frames)

data.shape

(84861, 21)

In [62]:
# Label-encode every object-dtype column. One fitted encoder is kept per
# column in `encoders` so the integer codes can be mapped back to the original
# labels later; a single shared encoder is refit on each column and only
# remembers the last one. `encoder` still ends up bound to the last fitted
# encoder, as before.
encoders = {}
for col in data.columns[data.dtypes == 'object']:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    encoders[col] = encoder


In [63]:
# Keep every column except the last two, which are treated here as duplicates
# of earlier columns. The slice is positional, so re-running this cell would
# drop two more columns.
data = data.iloc[:, :-2]

In [64]:
# Fill missing ages with the median age, truncated to an integer.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on `data[col]` is chained assignment, which newer
# pandas versions warn about and which does not update `data` under
# copy-on-write.
age_col = 'edad_persona_en_situacion_de_violencia'
data = data.assign(**{age_col: data[age_col].fillna(int(data[age_col].median()))})
data.isnull().sum()

fecha                                                0
prov_residencia_persona_en_situacion_violencia       0
genero_persona_en_situacion_de_violencia             0
edad_persona_en_situacion_de_violencia               0
pais_nacimiento_persona_en_situacion_de_violencia    0
tipo_de_violencia_fisica                             0
tipo_de_violencia_psicologica                        0
tipo_de_violencia_sexual                             0
tipo_de_violencia_economica_y_patrimonial            0
tipo_de_violencia_simbolica                          0
tipo_de_violencia_domestica                          0
modalidad_de_violencia_institucional                 0
modalidad_de_violencia_laboral                       0
modalidad_violencia_contra_libertad_reproductiva     0
modalidad_de_violencia_obstetrica                    0
modalidad_de_violencia_mediatica                     0
modalidad_de_violencia_otras                         0
vinculo_con_la_persona_agresora                      0
genero_de_

In [65]:
# Remove fully repeated rows (the first occurrence is kept), then confirm
# that no duplicates remain.
data = data.drop_duplicates(keep='first')
data.duplicated().sum()

0

In [102]:
# Train and evaluate one decision tree per binary (0/1) indicator column,
# collecting the F1 score of each target in `f1_scores`.
f1_scores = {}
for target in data.columns:
    # Only columns whose values are exactly {0, 1} are used as targets;
    # every other column is skipped. This is an explicit check rather than
    # an `assert`, which is stripped when Python runs with -O.
    if set(data[target].unique()) != {0, 1}:
        continue

    try:
        X, y = get_data(data, target)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
        print(f"Resultado violência {target}")
        f1_scores[target] = compute_tree(X_train, X_test, y_train, y_test)
    except Exception as exc:
        # Best effort: carry on with the remaining targets, but report why
        # this one failed instead of dropping it silently.
        print(f"  skipped {target}: {type(exc).__name__}: {exc}")

Resultado violência tipo_de_violencia_fisica
Matriz de confusão:
 [[2220 2826]
 [3264 6169]]
Report:
               precision    recall  f1-score   support

           0       0.40      0.44      0.42      5046
           1       0.69      0.65      0.67      9433

    accuracy                           0.58     14479
   macro avg       0.55      0.55      0.55     14479
weighted avg       0.59      0.58      0.58     14479

Resultado violência tipo_de_violencia_psicologica
Matriz de confusão:
 [[  231   575]
 [  694 12979]]
Report:
               precision    recall  f1-score   support

           0       0.25      0.29      0.27       806
           1       0.96      0.95      0.95     13673

    accuracy                           0.91     14479
   macro avg       0.60      0.62      0.61     14479
weighted avg       0.92      0.91      0.92     14479

Resultado violência tipo_de_violencia_economica_y_patrimonial
Matriz de confusão:
 [[5523 3202]
 [3337 2417]]
Report:
               