# Colombia seimic global stats
(1st June, 1993 to 31st May, 2023)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

## Data Collection: 

The data was downloaded form Servicio geológico colombiano, http://bdrsnc.sgc.gov.co/paginas1/catalogo/index.php and splited in 14 files: "reporte???.xlsx" 

## Data Wrangling

According with the Servicio geológico colombiano, the data was collected and made public in two campaigns:

* Firsh batch: from 1st of June, 1993 to 28th of Febraury, 2018. Registered by SEISAN software.
* Firsh batch: from 1st of March, 2018 to 31st of May, 2023. Registered by SeisComp33 software.  

See more details in the previous link.

In line with the above, the data is coming with different fields between them.

In [2]:
# Reading data as Dataframe
seismic_1st_jan_93_31_dec_99 = pd.read_excel('data/reporte362.xlsx')
seismic_1st_mar_18_31_dec_18 = pd.read_excel('data/reporte97.xlsx')

In [3]:
print("Describe for first batch")
seismic_1st_jan_93_31_dec_99.describe()

Describe for first batch


Unnamed: 0,LATITUD (grados),LONGITUD (grados),PROFUNDIDAD (Km),MAGNITUD Ml,MAGNITUD Mw,# FASES,RMS (Seg),GAP (grados),ERROR LATITUD (Km),ERROR LONGITUD (Km),ERROR PROFUNDIDAD (Km)
count,19307.0,19307.0,19307.0,19307.0,4.0,19307.0,19120.0,19120.0,19120.0,19120.0,19120.0
mean,5.979314,-74.159555,93.640628,2.859859,3.85,6.514062,0.621967,226.296653,13.077814,15.651867,20.410722
std,1.588705,1.658059,66.114135,0.659376,0.58023,2.971662,0.388727,71.845641,14.753809,15.36401,20.087514
min,-2.646,-83.4,0.0,0.5,3.1,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.2365,-75.031,10.0,2.4,3.55,5.0,0.4,179.0,5.3,7.3,8.9
50%,6.782,-73.168,137.9,2.8,3.95,6.0,0.6,227.0,8.4,11.2,13.3
75%,6.842,-73.058,150.0,3.2,4.25,8.0,0.8,289.0,14.3,17.5,22.6
max,14.907,-70.663,700.0,6.8,4.4,23.0,9.3,360.0,99.9,99.9,99.9


In [4]:
print("Columns name\n")
for i in seismic_1st_jan_93_31_dec_99.columns.to_list():
    print(i)

Columns name

FECHA
HORA_UTC
LATITUD (grados)
LONGITUD (grados)
PROFUNDIDAD (Km)
MAGNITUD Ml
MAGNITUD Mw
DEPARTAMENTO
MUNICIPIO
# FASES
RMS (Seg)
GAP (grados)
ERROR LATITUD (Km)
ERROR LONGITUD (Km)
ERROR PROFUNDIDAD (Km)
ESTADO


In [5]:
print("Describe for second batch")
seismic_1st_mar_18_31_dec_18.describe()

Describe for second batch


Unnamed: 0,LATITUD (°),LONGITUD (°),PROF. (Km),MAGNITUD,FASES,RMS (Seg),GAP (°),ERROR LATITUD (Km),ERROR LONGITUD (Km),ERROR PROFUNDIDAD (Km)
count,18068.0,18068.0,18068.0,18068.0,18068.0,18068.0,18068.0,18068.0,18068.0,18068.0
mean,6.619257,-74.322648,88.14702,1.759182,17.946812,0.516421,135.059442,2.195343,2.221081,3.614744
std,2.181316,1.849835,58.705123,0.568986,12.256258,0.292753,54.626845,1.587369,1.628317,2.26425
min,0.277,-89.985,-2.0,0.1,4.0,0.0,32.0,0.141,0.141,0.0
25%,5.84675,-75.34925,20.38,1.4,10.0,0.3,98.0,1.202,1.202,2.0
50%,6.797,-73.4215,111.595,1.7,14.0,0.5,131.0,1.697,1.697,3.0
75%,6.928,-73.156,142.7,2.0,22.0,0.7,162.0,2.687,2.758,4.6
max,14.314,-70.514,248.78,6.1,101.0,4.4,335.0,56.781,56.781,53.39


In [6]:
print("Columns name\n")
for i in seismic_1st_mar_18_31_dec_18.columns.to_list():
    print(i)

Columns name

FECHA - HORA UTC
LATITUD (°)
LONGITUD (°)
PROF. (Km)
MAGNITUD
TIPO MAGNITUD
FASES
RMS (Seg)
GAP (°)
ERROR LATITUD (Km)
ERROR LONGITUD (Km)
ERROR PROFUNDIDAD (Km)
REGION
ESTADO


In [7]:
# Chosen columns of interest and renaming
seismic_1st_jan_93_31_dec_99 = seismic_1st_jan_93_31_dec_99[['FECHA', 'HORA_UTC', 'LATITUD (grados)', 'LONGITUD (grados)', 'PROFUNDIDAD (Km)', 'MAGNITUD Ml', 'MAGNITUD Mw', 'DEPARTAMENTO', 'MUNICIPIO', 'ERROR LATITUD (Km)', 'ERROR LONGITUD (Km)', 'ERROR PROFUNDIDAD (Km)', 'ESTADO']]
seismic_1st_jan_93_31_dec_99.rename(columns={"LATITUD (grados)": "LAT", "LONGITUD (grados)": "LONG", 'PROFUNDIDAD (Km)': "PROF.", "ERROR LATITUD (Km)": "ERR-LAT", "ERROR LONGITUD (Km)": "ERR-LONG", "ERROR PROFUNDIDAD (Km)": "ERR-PROF"}, inplace=True)

print("First batch", '\n')
for i in seismic_1st_jan_93_31_dec_99.columns.to_list():
    print(i)

seismic_1st_mar_18_31_dec_18 = seismic_1st_mar_18_31_dec_18[['FECHA - HORA UTC', 'LATITUD (°)', 'LONGITUD (°)', 'PROF. (Km)', 'MAGNITUD', 'TIPO MAGNITUD', 'ERROR LATITUD (Km)', 'ERROR LONGITUD (Km)', 'ERROR PROFUNDIDAD (Km)', 'REGION', 'ESTADO']]
seismic_1st_mar_18_31_dec_18.rename(columns={"LATITUD (°)": "LAT", "LONGITUD (°)": "LONG", 'PROF. (Km)': "PROF.", "ERROR LATITUD (Km)": "ERR-LAT", "ERROR LONGITUD (Km)": "ERR-LONG", "ERROR PROFUNDIDAD (Km)": "ERR-PROF"}, inplace=True)

print("\nSecond batch", '\n')
for i in seismic_1st_mar_18_31_dec_18.columns.to_list():
    print(i)

First batch 

FECHA
HORA_UTC
LAT
LONG
PROF.
MAGNITUD Ml
MAGNITUD Mw
DEPARTAMENTO
MUNICIPIO
ERR-LAT
ERR-LONG
ERR-PROF
ESTADO

Second batch 

FECHA - HORA UTC
LAT
LONG
PROF.
MAGNITUD
TIPO MAGNITUD
ERR-LAT
ERR-LONG
ERR-PROF
REGION
ESTADO


## Standardizing

### Checking for duplicate rows

In [8]:
seismic_1st_jan_93_31_dec_99[seismic_1st_jan_93_31_dec_99.duplicated()]

Unnamed: 0,FECHA,HORA_UTC,LAT,LONG,PROF.,MAGNITUD Ml,MAGNITUD Mw,DEPARTAMENTO,MUNICIPIO,ERR-LAT,ERR-LONG,ERR-PROF,ESTADO


In [9]:
seismic_1st_mar_18_31_dec_18[seismic_1st_mar_18_31_dec_18.duplicated()]

Unnamed: 0,FECHA - HORA UTC,LAT,LONG,PROF.,MAGNITUD,TIPO MAGNITUD,ERR-LAT,ERR-LONG,ERR-PROF,REGION,ESTADO


### Checking for NaN

In [10]:
print("NaN:\n")
print(seismic_1st_jan_93_31_dec_99.isna().sum())
print("\n Missing cells\n", seismic_1st_jan_93_31_dec_99.isna().sum().sum())

NaN:

FECHA               0
HORA_UTC            0
LAT                 0
LONG                0
PROF.               0
MAGNITUD Ml         0
MAGNITUD Mw     19303
DEPARTAMENTO        0
MUNICIPIO           0
ERR-LAT           187
ERR-LONG          187
ERR-PROF          187
ESTADO              0
dtype: int64

 Missing cells
 19864


In [11]:
print("NaN:\n")
print(seismic_1st_mar_18_31_dec_18.isna().sum())
print("\n Missing cells\n", seismic_1st_mar_18_31_dec_18.isna().sum().sum())

NaN:

FECHA - HORA UTC    0
LAT                 0
LONG                0
PROF.               0
MAGNITUD            0
TIPO MAGNITUD       0
ERR-LAT             0
ERR-LONG            0
ERR-PROF            0
REGION              0
ESTADO              0
dtype: int64

 Missing cells
 0


### Zoom in on missing values

In [12]:
tmp = np.where(seismic_1st_jan_93_31_dec_99['MAGNITUD Mw'].notnull())
for i in tmp:
    print(seismic_1st_jan_93_31_dec_99[['MAGNITUD Ml', 'MAGNITUD Mw']].iloc[i])

       MAGNITUD Ml  MAGNITUD Mw
5687           2.9          4.4
10641          3.0          4.2
10791          2.7          3.1
14910          0.7          3.7


As we can see, there are just four values for Mw with the respectiv Ml, so may no sense to keep the Mw magnitued column

In [13]:
seismic_1st_jan_93_31_dec_99.drop(columns='MAGNITUD Mw', inplace=True)
seismic_1st_jan_93_31_dec_99.rename(columns={'MAGNITUD Ml': 'MAGNITUD'}, inplace=True)
print("NaN:\n")
print(seismic_1st_jan_93_31_dec_99.isna().sum())

NaN:

FECHA             0
HORA_UTC          0
LAT               0
LONG              0
PROF.             0
MAGNITUD          0
DEPARTAMENTO      0
MUNICIPIO         0
ERR-LAT         187
ERR-LONG        187
ERR-PROF        187
ESTADO            0
dtype: int64


Regardin the missing values for Latitude, longitude, and deep errors, lets replace them with the respective average

In [14]:
aveErrLat = seismic_1st_jan_93_31_dec_99['ERR-LAT'].mean()
aveErrLon = seismic_1st_jan_93_31_dec_99['ERR-LONG'].mean()
aveErrPro = seismic_1st_jan_93_31_dec_99['ERR-PROF'].mean()
#
seismic_1st_jan_93_31_dec_99['ERR-LAT'].fillna(aveErrLat, inplace=True)
seismic_1st_jan_93_31_dec_99['ERR-LONG'].fillna(aveErrLon, inplace=True)
seismic_1st_jan_93_31_dec_99['ERR-PROF'].fillna(aveErrPro, inplace=True)
#
print("NaN:\n")
print(seismic_1st_jan_93_31_dec_99.isna().sum())

NaN:

FECHA           0
HORA_UTC        0
LAT             0
LONG            0
PROF.           0
MAGNITUD        0
DEPARTAMENTO    0
MUNICIPIO       0
ERR-LAT         0
ERR-LONG        0
ERR-PROF        0
ESTADO          0
dtype: int64


In [15]:
# Merging columns FECHA and HORA_UTC
seismic_1st_jan_93_31_dec_99['FECHA - HORA UTC'] = seismic_1st_jan_93_31_dec_99['FECHA'] + ' ' + seismic_1st_jan_93_31_dec_99['HORA_UTC']
seismic_1st_jan_93_31_dec_99.drop(columns=['FECHA', 'HORA_UTC'], inplace=True)

for i in seismic_1st_jan_93_31_dec_99.columns.to_list():
    print(i)

LAT
LONG
PROF.
MAGNITUD
DEPARTAMENTO
MUNICIPIO
ERR-LAT
ERR-LONG
ERR-PROF
ESTADO
FECHA - HORA UTC


### Checking for magnitudes in the second batch

In [16]:
for i in seismic_1st_mar_18_31_dec_18['TIPO MAGNITUD'].value_counts().index:
    print(i)

MLr_3
MLr_2
MLr_1
MLr
MLr_4
M_MLr
MLr_vmm
M
MLr_5
Mw(mB)
MLv
Mw
mb
Mwp
ML
Mw(Mwp)


As an approach, for this analysis we are considering all the magnitudes as Ml, __even though they are not__.  
So, lets drop the column 'TIPO DE MAGNITUD'

In [17]:
seismic_1st_mar_18_31_dec_18.drop(columns='TIPO MAGNITUD', inplace=True)
seismic_1st_mar_18_31_dec_18.columns.to_list()

['FECHA - HORA UTC',
 'LAT',
 'LONG',
 'PROF.',
 'MAGNITUD',
 'ERR-LAT',
 'ERR-LONG',
 'ERR-PROF',
 'REGION',
 'ESTADO']

Lest zoom in on MUNICIPO and DEPARTAMENTE, first batch, and for REGION, second batch

In [18]:
print(seismic_1st_jan_93_31_dec_99[['MUNICIPIO', 'DEPARTAMENTO']].iloc[1])
print("\n")
print(seismic_1st_mar_18_31_dec_18['REGION'].iloc[1])

MUNICIPIO       LOS_SANTOS
DEPARTAMENTO     SANTANDER
Name: 1, dtype: object


Los Santos - Santander, Colombia


In this case, "REGION: Los Santons - Santander, Colombia" means:  
- MUNICIPIO = Los Santos
- DEPARTAMENTO = Santander  

So, lets put into the column REGION just the MUNICIPIO, create a new column DEPARTAMENTO by putting the respective value, and rename REGION as MUNICIPIO

In [23]:
tmp_a = seismic_1st_mar_18_31_dec_18['REGION'].str.split('-').str[0].str.strip()
tmp_b = seismic_1st_mar_18_31_dec_18['REGION'].str.split(',').str[0].str.strip()
tmp_c = tmp_b.str.split('-').str[1].str.strip()
#
seismic_1st_mar_18_31_dec_18['REGION'] = tmp_a
seismic_1st_mar_18_31_dec_18['DEPARTAMENTO'] = tmp_c

seismic_1st_mar_18_31_dec_18.rename(columns={'REGION': 'MUNICIPIO'}, inplace=True)

In [26]:
seismic_1st_mar_18_31_dec_18[['MUNICIPIO', 'DEPARTAMENTO']].iloc[1]

MUNICIPIO       Los Santos
DEPARTAMENTO     Santander
Name: 1, dtype: object

### Checking if missing values

In [68]:
seismic_1st_mar_18_31_dec_18.isna().sum()

FECHA - HORA UTC       0
LAT                    0
LONG                   0
PROF.                  0
MAGNITUD               0
ERR-LAT                0
ERR-LONG               0
ERR-PROF               0
MUNICIPIO              0
ESTADO                 0
DEPARTAMENTO        1395
dtype: int64

In [69]:
tmp_index = seismic_1st_mar_18_31_dec_18[seismic_1st_mar_18_31_dec_18.isna().any(axis=1)].index
seismic_1st_mar_18_31_dec_18 = seismic_1st_mar_18_31_dec_18.drop(tmp_index)

seismic_1st_mar_18_31_dec_18.isna().sum()

FECHA - HORA UTC    0
LAT                 0
LONG                0
PROF.               0
MAGNITUD            0
ERR-LAT             0
ERR-LONG            0
ERR-PROF            0
MUNICIPIO           0
ESTADO              0
DEPARTAMENTO        0
dtype: int64

In [70]:
for i in seismic_1st_jan_93_31_dec_99.columns.to_list():
    print(i)
print(seismic_1st_jan_93_31_dec_99.shape)
print("\n")
for i in seismic_1st_mar_18_31_dec_18.columns.to_list():
    print(i)
print(seismic_1st_mar_18_31_dec_18.shape)

LAT
LONG
PROF.
MAGNITUD
DEPARTAMENTO
MUNICIPIO
ERR-LAT
ERR-LONG
ERR-PROF
ESTADO
FECHA - HORA UTC
(19307, 11)


FECHA - HORA UTC
LAT
LONG
PROF.
MAGNITUD
ERR-LAT
ERR-LONG
ERR-PROF
MUNICIPIO
ESTADO
DEPARTAMENTO
(16673, 11)


In [71]:
seismic_merged = pd.concat([seismic_1st_jan_93_31_dec_99, seismic_1st_mar_18_31_dec_18])

In [72]:
for i in seismic_merged.columns.to_list():
    print(i)
print(seismic_merged.shape)

LAT
LONG
PROF.
MAGNITUD
DEPARTAMENTO
MUNICIPIO
ERR-LAT
ERR-LONG
ERR-PROF
ESTADO
FECHA - HORA UTC
(35980, 11)


In [73]:
seismic_merged.iloc[1]

LAT                                6.79
LONG                            -73.053
PROF.                             152.5
MAGNITUD                            3.0
DEPARTAMENTO                  SANTANDER
MUNICIPIO                    LOS_SANTOS
ERR-LAT                            10.3
ERR-LONG                            9.8
ERR-PROF                            8.9
ESTADO                         Revisado
FECHA - HORA UTC    1993-06-01 03:08:28
Name: 1, dtype: object

In [74]:
print("NaN:\n")
print(seismic_merged.isna().sum(), '\n')

print(seismic_1st_mar_18_31_dec_18.isna().sum())

NaN:

LAT                 0
LONG                0
PROF.               0
MAGNITUD            0
DEPARTAMENTO        0
MUNICIPIO           0
ERR-LAT             0
ERR-LONG            0
ERR-PROF            0
ESTADO              0
FECHA - HORA UTC    0
dtype: int64 

FECHA - HORA UTC    0
LAT                 0
LONG                0
PROF.               0
MAGNITUD            0
ERR-LAT             0
ERR-LONG            0
ERR-PROF            0
MUNICIPIO           0
ESTADO              0
DEPARTAMENTO        0
dtype: int64


In [75]:
profile = ProfileReport(seismic_merged, title="Seismic from 1st Jan. 1993 to 31st Dec. 2018")
profile.to_file("report_1st_jan_93_31st_dec_18.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  return _cramers_corrected_stat(pd.crosstab(col_1, col_2), correction=True)
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'cannot reindex on an axis with duplicate labels')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]