In [1]:
import pandas as pd

In [2]:
# Load the raw CSV export; the first line of the file holds the column names.
# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, which avoids mixed-type inference inside a single column.
datalake = pd.read_csv('./datalake/datalake.csv', header=0, low_memory=False)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
datalake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850013 entries, 0 to 850012
Data columns (total 42 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   COMISARIA           850013 non-null  object
 1   CUADRA              53118 non-null   object
 2   DERIVADA_FISCALIA   715460 non-null  object
 3   DIRECCION           850013 non-null  object
 4   DIST_CIA            850013 non-null  object
 5   DIST_HECHO          850013 non-null  object
 6   DPTO_CIA            850013 non-null  object
 7   DPTO_HECHO          850013 non-null  object
 8   EDAD                847713 non-null  object
 9   ESTADO_DEN          850013 non-null  object
 10  EST_CIVIL           849957 non-null  object
 11  FECHA_HORA_HECHO    850013 non-null  object
 12  ID_COMISARIA        850013 non-null  int64 
 13  ID_EST_CIVIL        849962 non-null  object
 14  ID_LIBRO            850013 non-null  object
 15  ID_MATERIA          850013 non-null  int64 
 16  ID

In [3]:
# Split the column labels by dtype: object (text-like) versus int64.
object_columns = datalake.select_dtypes(include=['object']).columns
int64_columns = datalake.select_dtypes(include=['int64']).columns
print(object_columns, int64_columns, sep='\n')

Index(['COMISARIA', 'CUADRA', 'DERIVADA_FISCALIA', 'DIRECCION', 'DIST_CIA',
       'DIST_HECHO', 'DPTO_CIA', 'DPTO_HECHO', 'EDAD', 'ESTADO_DEN',
       'EST_CIVIL', 'FECHA_HORA_HECHO', 'ID_EST_CIVIL', 'ID_LIBRO',
       'ID_NIVEL_EDUCATIVO', 'ID_SIT_PERSONA', 'ID_TIPO_DENUNCIA', 'LIBRO',
       'MATERIA', 'MODALIDAD', 'NIVEL_EDUCATIVO', 'OCUPACION', 'PROV_CIA',
       'PROV_HECHO', 'REGION', 'SEXO', 'SIT_PERSONA', 'SUB_TIPO', 'TIPO',
       'TIPO_DENUNCIA', 'UBICACION', 'VIA', 'fec_registro', 'pais_natal'],
      dtype='object')
Index(['ID_COMISARIA', 'ID_MATERIA', 'ID_MODALIDAD', 'ID_REGION', 'ID_SUBTIPO',
       'ID_TIPO', 'UBIGEO_CIA', 'UBIGEO_HECHO'],
      dtype='object')


#### Exploración de datos

##### Valores perdidos

In [4]:
import numpy as np

# Trim surrounding whitespace in every text column; any string that ends up
# empty is then turned into NaN so it counts as a missing value.
datalake = datalake.assign(
    **{name: datalake[name].str.strip() for name in object_columns}
).replace('', np.nan)

In [5]:
# Missing-value report: total row count, total number of missing cells, and
# then one line per column that has at least one missing value.
row_count = len(datalake)
print(f'Número de registros: {row_count}')

# Count the nulls of every column once and reuse the result below.
nulls_per_column = datalake.isnull().sum()
global_null_count = nulls_per_column.sum()
print(f'Valores perdidos: {global_null_count}')

for col in datalake.columns:
    null_count = nulls_per_column[col]
    null_percent = null_count / row_count * 100
    if null_count > 0:
        print(f'{col}: {null_count} ({null_percent:.2f}%)')

Número de registros: 850013
Valores perdidos: 3380332
CUADRA: 796895 (93.75%)
DERIVADA_FISCALIA: 134560 (15.83%)
DIRECCION: 1 (0.00%)
EDAD: 9404 (1.11%)
ESTADO_DEN: 264066 (31.07%)
EST_CIVIL: 56 (0.01%)
FECHA_HORA_HECHO: 40302 (4.74%)
ID_EST_CIVIL: 51 (0.01%)
ID_LIBRO: 3 (0.00%)
ID_NIVEL_EDUCATIVO: 759851 (89.39%)
NIVEL_EDUCATIVO: 794249 (93.44%)
OCUPACION: 539855 (63.51%)
SEXO: 666 (0.08%)
UBICACION: 18 (0.00%)
fec_registro: 40302 (4.74%)
pais_natal: 53 (0.01%)


##### Valores atípicos

###### Edad

In [6]:
# Store ages as int64. Missing ages are filled with 0 first (integers cannot
# hold NaN); the intermediate float cast presumably lets decimal-formatted
# text such as "31.0" be parsed before truncating to an integer.
datalake['EDAD'] = (
    datalake['EDAD']
    .fillna(0)
    .astype(float)
    .astype(np.int64)
)

In [78]:
# Show the smallest and largest age currently stored in the column.
# NOTE(review): the recorded output below (18 75) matches the limits enforced
# by the outlier-replacement cell, so this cell appears to have been last run
# after that one.
print(*datalake['EDAD'].agg(['min', 'max']))

18 75


In [80]:
# Look at both ends of the sorted ages and compute the central values.
# `mode` is kept as a notebook-level name because the outlier-replacement
# cell uses it as the substitute value.
age_sorted = datalake.sort_values(by='EDAD')['EDAD']
print(age_sorted.head(), age_sorted.tail(), sep='\n')

median = age_sorted.median()
# Series.mode() returns every tied value; keep the first one.
mode = age_sorted.mode()[0]
print(f'Mediana: {median}', f'Moda: {mode}', sep='\n')

748746    18
662351    18
662375    18
829176    18
245287    18
Name: EDAD, dtype: int64
14259     75
838370    75
163463    75
379706    75
509146    75
Name: EDAD, dtype: int64
Mediana: 32.0
Moda: 31


In [76]:
# We want to see the true mode: tabulate how many times each age value
# occurs (value_counts lists the most frequent values first by default).
datalake['EDAD'].value_counts()

31     26951
30     26686
27     26626
28     26539
29     26523
       ...  
447        1
440        1
335        1
103        1
627        1
Name: EDAD, Length: 164, dtype: int64

In [77]:
# Accepted age range. Ages outside it, as well as the 0 placeholder used for
# missing ages, are overwritten with the most frequent age (`mode`).
min_age, max_age = 18, 75
datalake.loc[
    datalake['EDAD'].lt(min_age)
    | datalake['EDAD'].gt(max_age)
    | datalake['EDAD'].eq(0),
    'EDAD'
] = mode

###### Fecha

In [9]:
import xlrd

def transformExcelDate(excelDate):
    """Convert an Excel serial date into an ISO ``yyyy-mm-dd`` string.

    Args:
        excelDate: Excel serial date number, given as anything ``float()``
            accepts (e.g. a numeric string).

    Returns:
        str: The calendar date in ISO format; the time-of-day part of the
        converted datetime is dropped.
    """
    serial = float(excelDate)
    # Second argument is xlrd's datemode; 0 is passed here.
    moment = xlrd.xldate.xldate_as_datetime(serial, 0)
    return moment.date().isoformat()

In [46]:
import re
# Pattern for a value made only of digits with an optional decimal part
# (e.g. "43831" or "43831.5"). transformDate uses it with re.fullmatch to
# decide whether a value should be treated as an Excel serial date.
excelDate = r'\d+(\.\d+)?'

In [67]:
def transformDate(date):
    """Normalise one raw date value to an ISO ``yyyy-mm-dd`` string.

    Values that fully match the ``excelDate`` pattern are converted with
    ``transformExcelDate``; everything else is parsed with ``pd.to_datetime``.

    Args:
        date: Raw cell value; a string or a missing value.

    Returns:
        str | None: The ISO date string, or ``None`` when the value is
        missing or could not be converted. Conversion errors are not raised:
        the exception and the offending value are printed instead.
    """
    # NOTE(review): non-Excel strings are parsed with pandas' default settings;
    # confirm that day/month order in the source data matches those defaults.
    try:
        if pd.isna(date):
            return None
        if re.fullmatch(excelDate, date):
            return transformExcelDate(date)
        return pd.to_datetime(date).date().isoformat()
    except Exception as e:
        # Best effort: report the failure and leave this value empty.
        print(e, date)
    return None

In [69]:
# Normalise both date columns to ISO yyyy-mm-dd strings, one column at a time.
for date_col in ('FECHA_HORA_HECHO', 'fec_registro'):
    datalake[date_col] = datalake[date_col].apply(transformDate)