## Classifying with  _sklearn.neighbors.KNeighborsClassifier_

Using k-nearest neighbors (kNN) to predict several target columns of the Línea 144 data source (`linea144-*.csv`).

### Imports

In [62]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

### Loading data

Loading four years of data (2020–2023), one CSV file per year.

In [63]:
# One CSV file per year; 'fecha' is parsed as a datetime while reading.
years = [2020, 2021, 2022, 2023]
data_dict = {
    year: pd.read_csv(f'./data/linea144-{year}.csv', parse_dates=['fecha'])
    for year in years
}

# Stack the yearly frames under a fresh index, then drop the trailing
# column, which carries no data.
data = pd.concat(data_dict, ignore_index=True).iloc[:, :-1]

# Sanity checks on the loaded frame.
assert data.shape == (84861, 19)
assert data.fecha.dtype == 'datetime64[ns]', "'fecha' must be a datetime64 type"

### Cleaning data

* Filling NaN values
* Normalizing inconsistent label spellings (e.g. `Si` / `SI`) to a single standard value

In [64]:
# Filling NaN rows with the mode
def fill_with_mode_the_nan_values(field, frame=None):
    """Replace the NaN values of one column with that column's mode.

    Prints how many NaN records the column holds and, if there are any,
    fills them with the first value returned by ``Series.mode()``.

    Parameters
    ----------
    field : str
        Name of the column to process.
    frame : pandas.DataFrame, optional
        Frame whose column is overwritten. Defaults to the notebook-level
        ``data`` frame.

    Returns
    -------
    None
        The frame is modified in place.
    """
    if frame is None:
        frame = data

    # Number of NaN records in this field
    fields_nan = frame[field].isna().sum()
    print(f'Processing records NaN to {field}: {fields_nan}')

    if fields_nan == 0:
        return

    modes = frame[field].mode()
    if modes.empty:
        # The column is entirely NaN, so there is no mode to fill with.
        return

    # Assign the filled column back instead of calling
    # `frame[field].fillna(..., inplace=True)`: the in-place call operates on
    # a temporary column selection and does not update the frame when pandas
    # Copy-on-Write is active.
    frame[field] = frame[field].fillna(modes.iloc[0])

# Categorical columns whose missing values are replaced by the column mode.
fields = [
    'prov_residencia_persona_en_situacion_violencia',
    'genero_persona_en_situacion_de_violencia',
    'pais_nacimiento_persona_en_situacion_de_violencia',
    'vinculo_con_la_persona_agresora',
    'genero_de_la_persona_agresora',
]

# A plain loop rather than a list comprehension: the call is made only for
# its side effect, and a comprehension would leave a list of None values as
# the cell's output.
for field in fields:
    fill_with_mode_the_nan_values(field)

Processing records NaN to prov_residencia_persona_en_situacion_violencia: 1229
Processing records NaN to genero_persona_en_situacion_de_violencia: 1776
Processing records NaN to pais_nacimiento_persona_en_situacion_de_violencia: 28706
Processing records NaN to vinculo_con_la_persona_agresora: 3192
Processing records NaN to genero_de_la_persona_agresora: 8737


[None, None, None, None, None]

In [65]:
# Median of the age column (median() skips NaN values by default).
age_median = data.edad_persona_en_situacion_de_violencia.median()

# Fill the missing ages with the median. Only the age column is touched:
# a frame-wide `data.fillna(age_median)` would also write the age median
# into every other column that still contains NaN.
data['edad_persona_en_situacion_de_violencia'] = (
    data['edad_persona_en_situacion_de_violencia'].fillna(age_median)
)

In [66]:
# Yes/No columns whose answers appear with more than one capitalisation.
boolean_fields = [
    'tipo_de_violencia_fisica',
    'tipo_de_violencia_psicologica',
    'tipo_de_violencia_sexual',
    'tipo_de_violencia_economica_y_patrimonial',
    'tipo_de_violencia_simbolica',
    'tipo_de_violencia_domestica',
    'modalidad_de_violencia_institucional',
    'modalidad_de_violencia_laboral',
    'modalidad_violencia_contra_libertad_reproductiva',
    'modalidad_de_violencia_obstetrica',
    'modalidad_de_violencia_mediatica',
    'modalidad_de_violencia_otras',
]

# Canonical spellings, and the variants that collapse onto them.
sim = 'Si'
nao = 'No'
mapping = {'Si': sim, 'SI': sim, 'No': nao, 'NO': nao}

# Series.map turns every value that is not a key of `mapping` into NaN.
data[boolean_fields] = data[boolean_fields].apply(lambda column: column.map(mapping))


### Label-encoding the data

In [67]:
# Every column except the timestamp and the age is label-encoded.
fields_to_encode = data.columns.drop(['fecha', 'edad_persona_en_situacion_de_violencia'])

for field in fields_to_encode:
    # A fresh encoder per column, fitted and applied in a single step.
    le = LabelEncoder()
    data[field] = le.fit_transform(data[field])

### Classifying ...

In [68]:
def knn(_target):
    """Train and score a k-nearest-neighbours classifier for one target column.

    Features are every column of the notebook-level ``data`` frame except
    'fecha' and the target itself. 80% of the rows are used for training
    (fixed ``random_state`` so the split is reproducible); a
    ``KNeighborsClassifier`` with default settings is fitted and its accuracy
    on the remaining rows is printed.

    Parameters
    ----------
    _target : str
        Name of the column to predict.

    Returns
    -------
    float
        Accuracy on the held-out rows (the same value that is printed).
    """
    y = data[_target]
    X = data.drop(['fecha', _target], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)

    # Named `model` so that it does not shadow the enclosing function's name.
    model = KNeighborsClassifier()
    model.fit(X_train, y_train)

    predicted = model.predict(X_test)
    # accuracy_score's signature is (y_true, y_pred).
    accuracy = accuracy_score(y_test, predicted)

    print("Accuracy from {} is {}".format(_target, accuracy))
    return accuracy


In [72]:
# Every column except the timestamp takes a turn as the prediction target.
fields = data.columns.drop(labels="fecha")
for target in fields:
    knn(_target=target)

Accuracy from prov_residencia_persona_en_situacion_violencia is 0.5426265244800565
Accuracy from genero_persona_en_situacion_de_violencia is 0.9919283568019797
Accuracy from edad_persona_en_situacion_de_violencia is 0.09114475932363166
Accuracy from pais_nacimiento_persona_en_situacion_de_violencia is 0.9280032993578036
Accuracy from tipo_de_violencia_fisica is 0.6575148765686679
Accuracy from tipo_de_violencia_psicologica is 0.9507453013609851
Accuracy from tipo_de_violencia_sexual is 0.8672008484062923
Accuracy from tipo_de_violencia_economica_y_patrimonial is 0.5998350321098215
Accuracy from tipo_de_violencia_simbolica is 0.6358922995345548
Accuracy from tipo_de_violencia_domestica is 0.9358392741412832
Accuracy from modalidad_de_violencia_institucional is 0.9865669003711778
Accuracy from modalidad_de_violencia_laboral is 0.9863901490602722
Accuracy from modalidad_violencia_contra_libertad_reproductiva is 0.9948742119837389
Accuracy from modalidad_de_violencia_obstetrica is 0.999351