# Medellin car accidents
(From 2021 to 2022)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

## Data collection:

Data source Open Data Medellín: https://geomedellin-m-medellin.opendata.arcgis.com/

In [2]:
# List of file names
file_crash_2020 = 'data/crashes_georef_2020.csv'
file_crash_2021 = 'data/crashes_georef_2021.csv'
file_crash_2022 = 'data/crashes_georef_2022.csv'

crash_2020 = pd.read_csv(file_crash_2020)
crash_2021 = pd.read_csv(file_crash_2021)
crash_2022 = pd.read_csv(file_crash_2022)

In [3]:
crash_2020.head(3)

Unnamed: 0,X,Y,OBJECTID,RADICADO,HORA,DIA_NOMBRE,PERIODO,CLASE,DIRECCION,DIRECCION_ENC,...,COMUNA,DISENO,MES,DIA,FECHA,MES_NOMBRE,X_MAGNAMED,Y_MAGNAMED,LONGITUD,LATITUD
0,-75.568507,6.210703,1,1717722,01:35:00,MIÉRCOLES,2020,Choque,CL 10 A CR 41,CL 010 A 041 000 00000,...,El Poblado,Tramo de via,7,1,2020/07/01 00:00:00+00,JULIO,834977.94,1178769.85,-75.568507,6.210703
1,-75.558959,6.244545,2,1714151,11:30:00,VIERNES,2020,Choque,CR 38 CL 49 A,CR 038 049 A 000 00000,...,La Candelaria,Tramo de via,5,8,2020/05/08 00:00:00+00,MAYO,836034.83,1182513.26,-75.558959,6.244545
2,-75.575041,6.253081,3,1727015,08:25:00,LUNES,2020,Otro,CR 57 CL 51,CR 057 051 000 00000,...,La Candelaria,Tramo de via,10,19,2020/10/19 00:00:00+00,OCTUBRE,834254.74,1183457.46,-75.575041,6.253081


In [4]:
crash_2021.head(3)

Unnamed: 0,X,Y,objectid,radicado,hora,dia_nombre,periodo,clase,direccion,direccion_enc,...,comuna,diseno,mes,dia,fecha,mes_nombre,x_magnamed,y_magnamed,longitud,latitud
0,-75.561546,6.272349,1,1734446,14:40:00,VIERNES,2021,Choque,CLL 78 CRA 50C 02,CL 078 050 C 000 00000,...,Aranjuez,Tramo de via,1,1,2021/01/01 00:00:00+00,ENERO,835748.42,1185588.76,-75.561546,6.272349
1,-75.561838,6.236723,2,1734392,05:25:00,VIERNES,2021,Choque,CR 36 39 47,CR 036 039 000 00000,...,Buenos Aires,Tramo de via,1,1,2021/01/01 00:00:00+00,ENERO,835716.16,1181648.07,-75.561838,6.236723
2,-75.56479,6.280457,3,1734448,10:20:00,VIERNES,2021,Choque,CLL 88 CRA 52C 28,CL 088 052 C 000 00000,...,Aranjuez,Tramo de via,1,1,2021/01/01 00:00:00+00,ENERO,835389.34,1186485.6,-75.56479,6.280457


In [5]:
crash_2022.head(3)

Unnamed: 0,X,Y,OBJECTID,RADICADO,HORA,DIA_NOMBRE,PERIODO,CLASE,DIRECCION,DIRECCION_ENC,...,COMUNA,DISENO,MES,DIA,FECHA,MES_NOMBRE,X_MAGNAMED,Y_MAGNAMED,LONGITUD,LATITUD
0,-75.592351,6.243512,1,1783170,08:05:00,MIÉRCOLES,2022,Choque,Circular 2 Con Carrera 74,CQ 002 074 000 00000,...,Laureles Estadio,Interseccion,3,16,2022/03/16 00:00:00+00,MARZO,832338.65,1182399.08,-75.592351,6.243512
1,-75.579832,6.230514,2,1783476,17:30:00,MIÉRCOLES,2022,Choque,Carrera 55 Con Calle 29 C,CR 055 029 C 000 00000,...,Guayabal,Interseccion,3,16,2022/03/16 00:00:00+00,MARZO,833724.32,1180961.28,-75.579832,6.230514
2,-75.599001,6.269136,3,1789709,17:00:00,MIÉRCOLES,2022,Choque,Calle 54 Con Carrera 83,CL 054 083 000 00000,...,San Javier,Interseccion,5,4,2022/05/04 00:00:00+00,MAYO,831602.7,1185233.45,-75.599001,6.269136


Converting all column names into upper case

In [6]:
crash_2020 = crash_2020.rename(columns=lambda x: x.upper())
crash_2021 = crash_2021.rename(columns=lambda x: x.upper())
crash_2022 = crash_2022.rename(columns=lambda x: x.upper())

Checking if the three data set have the same columns

In [7]:
columns_2020 = set(crash_2020.columns)
columns_2021 = set(crash_2021.columns)
columns_2022 = set(crash_2022.columns)

# Check if all dataframes have the same number of columns
if len(columns_2020) == len(columns_2021) == len(columns_2022):
    print("All data sets have the same number of columns")
else:
    print("One data set has a different number of colums")

# Check if all dataframes have the same columns
if columns_2020 == columns_2021 == columns_2022:
    print("\nAll data sets have the same columns")
else:
    print("\nDataframes have different columns")

All data sets have the same number of columns

All data sets have the same columns


## Concatenate all data sets into a single one

In [8]:
crashes_2020to2022 = pd.concat([crash_2020, crash_2021, crash_2022])
crashes_2020to2022 = crashes_2020to2022.reset_index(drop=True)

In [9]:
# Checking if cancatenate was successfully
print(crash_2020.size + crash_2021.size + crash_2022.size)
crashes_2020to2022.size

2397900


2397900

## Dropping useless columns

Because DIRECCION and DIRECCION_ENC have the same information (stree name and stree number), lets drop DIRECCION_ENC

In [10]:
crashes_2020to2022.drop(columns='DIRECCION_ENC', inplace=True)
crashes_2020to2022.columns.to_list()

['X',
 'Y',
 'OBJECTID',
 'RADICADO',
 'HORA',
 'DIA_NOMBRE',
 'PERIODO',
 'CLASE',
 'DIRECCION',
 'CBML',
 'TIPO_GEOCOD',
 'EXPEDIENTE',
 'GRAVEDAD',
 'BARRIO',
 'COMUNA',
 'DISENO',
 'MES',
 'DIA',
 'FECHA',
 'MES_NOMBRE',
 'X_MAGNAMED',
 'Y_MAGNAMED',
 'LONGITUD',
 'LATITUD']

Doing for X, Y and LATITUD and LONGITUD columns

In [11]:
# "Rounded" to int, taking into the account the first 6 figures
tmp_crashes = crashes_2020to2022[['X', 'Y', 'LONGITUD', 'LATITUD']].copy()

#
tmp_crashes['X'] = (tmp_crashes['X'] * 1e6).astype(int)
tmp_crashes['LONGITUD'] = (tmp_crashes['LONGITUD'] * 1e6).astype(int)
#
tmp_crashes['Y'] = (tmp_crashes['Y'] * 1e6).astype(int)
tmp_crashes['LATITUD'] = (tmp_crashes['LATITUD'] * 1e6).astype(int)
#
tmp_crashes.head(2)

Unnamed: 0,X,Y,LONGITUD,LATITUD
0,-75568506,6210702,-75568506,6210702
1,-75558958,6244544,-75558958,6244544


In [12]:
mask_x = tmp_crashes['X'] != tmp_crashes['LONGITUD']
mask_y = tmp_crashes['Y'] != tmp_crashes['LATITUD']

diff_rows_x = tmp_crashes[mask_x].copy()
diff_rows_y = tmp_crashes[mask_y].copy()

diff_rows_x['diff'] = diff_rows_x['X'] - diff_rows_x['LONGITUD']
diff_rows_y['diff'] = diff_rows_y['Y'] - diff_rows_y['LATITUD']

if diff_rows_x.empty:
    print("X is the same information as LONGITUD")
    crashes_2020to2022.drop(columns='X')
else:
    print("X is not the same information as LONGITUD, Needs fix")        
    print("Difference hist:", diff_rows_x['diff'].value_counts())

if diff_rows_y.empty:
    print("Y is the same information as LATITUD")
    crashes_2020to2022.drop(columns='Y')
else:
    print("Y is not the same information as LATITUD, Needs fix")
    print("Difference hist:", diff_rows_y['diff'].value_counts())

X is not the same information as LONGITUD, Needs fix
Difference hist: 1    362
Name: diff, dtype: int64
Y is not the same information as LATITUD, Needs fix
Difference hist: -1    422
Name: diff, dtype: int64


So, because the difference is just at the last significant figure, let's drop X and Y and keep LONGITUD and LATITUD, which meka more sense.

In [13]:
crashes_2020to2022.drop(columns=['X', 'Y'], inplace=True)
crashes_2020to2022.columns.to_list()

['OBJECTID',
 'RADICADO',
 'HORA',
 'DIA_NOMBRE',
 'PERIODO',
 'CLASE',
 'DIRECCION',
 'CBML',
 'TIPO_GEOCOD',
 'EXPEDIENTE',
 'GRAVEDAD',
 'BARRIO',
 'COMUNA',
 'DISENO',
 'MES',
 'DIA',
 'FECHA',
 'MES_NOMBRE',
 'X_MAGNAMED',
 'Y_MAGNAMED',
 'LONGITUD',
 'LATITUD']

Let's drop OBJECTID, RADICADO, CBML, EXPEDIENTE, X_MAGNAMED, and Y_MAGNAMED

In [14]:
crashes_2020to2022.drop(columns=['OBJECTID', 'RADICADO', 'CBML', 'EXPEDIENTE', 'X_MAGNAMED', 'Y_MAGNAMED'], inplace=True)
crashes_2020to2022.columns.to_list()

['HORA',
 'DIA_NOMBRE',
 'PERIODO',
 'CLASE',
 'DIRECCION',
 'TIPO_GEOCOD',
 'GRAVEDAD',
 'BARRIO',
 'COMUNA',
 'DISENO',
 'MES',
 'DIA',
 'FECHA',
 'MES_NOMBRE',
 'LONGITUD',
 'LATITUD']

## Checking for NaN values

In [15]:
crashes_2020to2022.isna().sum()

HORA              0
DIA_NOMBRE        0
PERIODO           0
CLASE             0
DIRECCION         0
TIPO_GEOCOD       0
GRAVEDAD          0
BARRIO         2348
COMUNA         1997
DISENO            0
MES               0
DIA               0
FECHA             0
MES_NOMBRE        0
LONGITUD          0
LATITUD           0
dtype: int64

In [16]:
non_string_comuna = crashes_2020to2022.loc[~crashes_2020to2022['COMUNA'].apply(lambda x: isinstance(x, str))].index
non_string_barrio = crashes_2020to2022.loc[~crashes_2020to2022['BARRIO'].apply(lambda x: isinstance(x, str))].index

print(crashes_2020to2022.iloc[non_string_comuna[0]])
print("\n\n")
print(crashes_2020to2022.iloc[non_string_barrio[10]])

HORA                         03:10:00
DIA_NOMBRE                  DOMINGO  
PERIODO                          2020
CLASE                          Choque
DIRECCION               CR 75 CL 71 A
TIPO_GEOCOD                No Ubicada
GRAVEDAD                   SOLO DAÑOS
BARRIO                            NaN
COMUNA                            NaN
DISENO                   Tramo de via
MES                                12
DIA                                 6
FECHA          2020/12/06 00:00:00+00
MES_NOMBRE                 DICIEMBRE 
LONGITUD                   -75.703816
LATITUD                      6.221806
Name: 114, dtype: object



HORA                         07:00:00
DIA_NOMBRE                  MIÉRCOLES
PERIODO                          2020
CLASE                       Atropello
DIRECCION                 CR 73 CL 69
TIPO_GEOCOD                No Ubicada
GRAVEDAD                  CON HERIDOS
BARRIO                            NaN
COMUNA                        Sin Inf
DISENO                

So, let's split crashes_2020to2022 into two dataframe, one without BARRIO and COMUNA info, and one with the correct values for this information 

In [17]:
crashes_2020to2022_neightborhood = crashes_2020to2022[(crashes_2020to2022['BARRIO'] == '') & (crashes_2020to2022['COMUNA'] == '')].copy()
crashes_2020to2022.drop(columns=['BARRIO', 'COMUNA'], inplace=True)

crashes_2020to2022.columns.to_list()

['HORA',
 'DIA_NOMBRE',
 'PERIODO',
 'CLASE',
 'DIRECCION',
 'TIPO_GEOCOD',
 'GRAVEDAD',
 'DISENO',
 'MES',
 'DIA',
 'FECHA',
 'MES_NOMBRE',
 'LONGITUD',
 'LATITUD']

## Applying profile 

In [18]:
profile = ProfileReport(crashes_2020to2022, title="Car crashes on Medellin city (from 2020 to 2022)")

crashes_2020to2022.to_csv('data/crashes_georef_2020_to_2022.csv', index=False)

In [19]:
profile.to_file('report_crashes_medellin_2020_to_2022.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]