<a href="https://colab.research.google.com/github/sergioGarcia91/BucaramangaSeismicNest_ML/blob/main/ML_SismosNido_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The consultation was made in the Seismicity Catalog of the Servicio Geológico Colombiano: http://bdrsnc.sgc.gov.co/paginas1/catalogo/index.php

Characteristics of the quadrant:

| | Min | Max |
|--------|-------|------|
| Longitud | -73.4 | -72.8 |
| Latitud | 6.5 | 7.1 |
| Año | 1994 | 2023 |

Subsequently, the events from January and February 2024 were used to re-evaluate the models, because for the month of December the models had indicated the possible occurrence of seismic events.


# Start

In [None]:
# Mount Google Drive at /content/drive; the catalog and figure paths used
# below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip3 install contextily

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gp
import contextily as cx # For the basemap in GeoPandas
import xyzservices.providers as xyz # To choose the basemap
import time

from scipy.spatial import distance
from sklearn.cluster import DBSCAN
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression # For the Logistic Regression model
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay # To evaluate the model
from sklearn.neural_network import MLPClassifier # For the Neural Network
from joblib import dump, load # Save the model
from datetime import datetime, timedelta

# Load data

In [None]:
# Drive folders: yearly catalog workbooks (input) and figures (output).
pathDatos = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/Catalogos/'
pathSaveFiguras = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/Figuras/'

# Seismic Catalog: June 1, 1993 - February 28, 2018.
# One workbook per year from 1994 to 2017, plus the first part of 2018.
excelFechas_01 = [str(anio) for anio in range(1994, 2018)] + ['2018a']

# Seismic Catalog: March 1, 2018 to the present (2023).
# The remainder of 2018 followed by one workbook per year.
excelFechas_02 = ['2018b'] + [str(anio) for anio in range(2019, 2024)]

In [None]:
# Column layout shared by both catalogs once they are harmonised.
columnasInteres = ['FECHA', 'HORA_UTC', 'LATITUD (grados)', 'LONGITUD (grados)',
                   'PROFUNDIDAD (Km)', 'MAGNITUD Ml', 'ERROR LATITUD (Km)',
                   'ERROR LONGITUD (Km)', 'ERROR PROFUNDIDAD (Km)']

# First catalog: every yearly workbook is read and concatenated in a single
# call (growing a DataFrame inside a loop re-copies all rows each iteration).
df = pd.concat([pd.read_excel(pathDatos + year + '.xlsx', decimal=',')
                for year in excelFechas_01],
               ignore_index=True)

# The columns of interest are selected.
df = df[columnasInteres]

# Second catalog: same approach. A fresh 0..n-1 index keeps the column
# assignment below free of duplicated index labels.
df2 = pd.concat([pd.read_excel(pathDatos + year + '.xlsx', decimal=',')
                 for year in excelFechas_02],
                ignore_index=True)

# A Split is performed on the Date-Time of the second catalog so
# that it matches the format of the first catalog.
df2[['FECHA', 'HORA_UTC']] = df2['FECHA - HORA UTC'].str.split(' ', expand=True)

# Only the relevant ones are selected.
df2 = df2[['FECHA', 'HORA_UTC', 'LATITUD ()', 'LONGITUD ()', 'PROF. (Km)',
           'MAGNITUD', 'ERROR LATITUD (Km)', 'ERROR LONGITUD (Km)', 'ERROR PROFUNDIDAD (Km)']]

# The columns of the second catalog are renamed to the first catalog's names.
df2.columns = columnasInteres

df = pd.concat([df, df2])

# Remove possible duplicates; ignore_index=True already renumbers the rows,
# so no separate reset_index call is needed.
df.drop_duplicates(inplace=True,
                   ignore_index=True)

del(df2) # The second catalog is deleted.

df

In [None]:
# Column dtypes and non-null counts of the merged catalog.
df.info()

In [None]:
# Summary statistics of the numeric columns, rounded to 3 decimals.
np.round(df.describe(), 3)

# Graphs

## Histograms

In [None]:
# Depth histogram of the whole catalog: 10 km wide bins and a logarithmic
# count axis so the contrasts between depths are easier to see.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df['PROFUNDIDAD (Km)'], bins=np.arange(0, 510, 10))

ax.set_xlabel('Depth [km]')
ax.set_yscale('log')
ax.set_ylabel('Count')

ax.set_xticks(np.arange(0, 510, 50))
ax.set_xlim(-10, 510)

ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('Cantidad de eventos en profundidad 1994-2023')

fig.savefig(pathSaveFiguras + 'histEventos_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# List the column names available in the catalog.
df.columns

In [None]:
# Magnitude histogram of the whole catalog (0.5-wide bins, log counts).
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df['MAGNITUD Ml'], bins=np.arange(0, 8, 0.5))

ax.set_xlabel('Magnitude')
ax.set_yscale('log')
ax.set_ylabel('Count')

ax.set_xticks(np.arange(0, 8, 0.5))
ax.set_xlim(-1, 8)

ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('Cantidad de eventos por Magnitud 1994-2023')

fig.savefig(pathSaveFiguras + 'histMagnitudEventos_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)
plt.show()

In [None]:
# Latitude histogram across the study quadrant (0.025-degree bins, log counts).
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df['LATITUD (grados)'], bins=np.arange(6.5, 7.11, 0.025))

ax.set_xlabel('Latitude [degrees]')
ax.set_yscale('log')
ax.set_ylabel('Count')

ax.set_xticks(np.arange(6.5, 7.11, 0.05))

ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('Cantidad de eventos en Latitud 1994-2023')

fig.savefig(pathSaveFiguras + 'histLatitudEventos_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# Longitude histogram across the study quadrant (0.025-degree bins, log counts).
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df['LONGITUD (grados)'], bins=np.arange(-73.4, -72.79, 0.025))

ax.set_xlabel('Longitude [degrees]')
ax.set_yscale('log')
ax.set_ylabel('Count')

ax.set_xticks(np.arange(-73.4, -72.79, 0.05))

ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('Cantidad de eventos en Longitud')

fig.savefig(pathSaveFiguras + 'histLongitudEventos_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

## Scatter plots

In [None]:
# West-East cross-section: longitude against depth, with the depth axis
# inverted so that deeper events plot lower.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(df['LONGITUD (grados)'], df['PROFUNDIDAD (Km)'], s=5, c='r')

ax.set_xlabel('Longitude [degrees]')
ax.set_ylabel('Depth [km]')

ax.set_yticks(np.arange(0, 460, 50))
ax.set_ylim(-10, 480)
ax.invert_yaxis()

ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('Corte W-E 1994-2023')

fig.savefig(pathSaveFiguras + 'CorteWE_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# South-North cross-section: latitude against depth, with the depth axis
# inverted so that deeper events plot lower.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(df['LATITUD (grados)'], df['PROFUNDIDAD (Km)'], s=5, c='r')

ax.set_xlabel('Latitude [degrees]')
ax.set_ylabel('Depth [km]')

ax.set_yticks(np.arange(0, 460, 50))
ax.set_ylim(-10, 480)
ax.invert_yaxis()

ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('Corte S-N 1994-2023')

fig.savefig(pathSaveFiguras + 'CorteSN_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# Plan (map) view of the epicentres: longitude on X, latitude on Y.
plt.figure(figsize=(7, 5))
plt.scatter(df['LONGITUD (grados)'],
            df['LATITUD (grados)'],
            s= 5,
            c='r')

# The axis labels must match the plotted columns (longitude vs. latitude).
plt.xlabel('Longitude [degrees]')
plt.ylabel('Latitude [degrees]')
plt.axis('equal') # Same scale on both axes so the map is not distorted.

plt.grid(color='grey', ls='--', alpha=0.5)
plt.title('Vista en Planta 1994-2023')

plt.savefig((pathSaveFiguras + 'CortePlanta_1994_2023.png'),
            format='png', dpi=300, bbox_inches = 'tight',pad_inches=0.25)
plt.show()

## Time vs. Magnitude (Ml)

In [None]:
# Join the date and UTC time strings and parse the result into a single
# datetime column.
fechaHora = df['FECHA'] + ' ' + df['HORA_UTC']
df['Date-Time'] = pd.to_datetime(fechaHora, yearfirst=True)
del(fechaHora)
df.info()

### Save/Load df

In [None]:
# Persist the merged catalog as CSV (without the index) so later sessions
# can skip re-reading the Excel workbooks.
df.to_csv(pathDatos+'df.csv', index=False)

In [None]:
# Reload the merged catalog from CSV. 'Date-Time' is converted back to
# datetime explicitly after loading.
df = pd.read_csv(pathDatos+'df.csv')
df['Date-Time'] = pd.to_datetime(df['Date-Time'], yearfirst=True)
df.info()

In [None]:
# Column names after the CSV round-trip.
df.columns

In [None]:
# Magnitude against time for the whole catalog, with the three magnitude
# quartiles drawn as horizontal reference lines.
fig, ax = plt.subplots(figsize=(12, 5))

ax.scatter(df['Date-Time'], df['MAGNITUD Ml'],
           s=1, label='Sismos', alpha=0.5)

Q1 = df['MAGNITUD Ml'].quantile(0.25)
Q2 = df['MAGNITUD Ml'].quantile(0.5)
Q3 = df['MAGNITUD Ml'].quantile(0.75)

# Current X limits, so the quartile lines span the full plotted time range.
left, right = ax.get_xlim()

for etiqueta, cuartil, tono in (('Q1', Q1, 'b'), ('Q2', Q2, 'y'), ('Q3', Q3, 'm')):
  ax.hlines(y=cuartil, xmin=left, xmax=right,
            ls='-', color=tono, label=f'{etiqueta}={cuartil}')

ax.grid(ls='--', color='k', alpha=0.5)
ax.set_xlim(left, right)
ax.set_ylim(0, 7)

ax.set_xlabel('Date - UTC')
ax.set_ylabel('Ml')
ax.legend()

ax.set_title('1994-2023')

fig.savefig(pathSaveFiguras + 'tiempo_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

# Depth >= 50 km

In [None]:
# Keep only events at 50 km depth or deeper and renumber the rows.
esProfundo = df['PROFUNDIDAD (Km)'] >= 50
df_nido = df.loc[esProfundo].reset_index(drop=True)
df_nido

The original number of events was 146408; by keeping only those at depths of 50 km or greater, a DataFrame with 145722 events is obtained. Approximately 0.5% (686 events) were removed.

In [None]:
# Depth histogram restricted to the >= 50 km subset (10 km bins, log counts).
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df_nido['PROFUNDIDAD (Km)'], bins=np.arange(0, 510, 10))

ax.set_xlabel('Depth [km]')
ax.set_yscale('log')
ax.set_ylabel('Count')

ax.set_xticks(np.arange(0, 510, 50))
ax.set_xlim(-10, 510)

ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('Cantidad de eventos en profundidad >50 km 1994-2023')

fig.savefig(pathSaveFiguras + 'histEventos_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# Magnitude histogram restricted to the >= 50 km subset (0.25-wide bins).
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df_nido['MAGNITUD Ml'], bins=np.arange(0, 8, 0.25))

ax.set_xlabel('Magnitude')
ax.set_yscale('log')
ax.set_ylabel('Count')

ax.set_xticks(np.arange(0, 8, 0.5))
ax.set_xlim(-1, 8)

ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('Cantidad de eventos por Magnitud >50 km 1994-2023')

fig.savefig(pathSaveFiguras + 'histMagnitud_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# Magnitude against time for the >= 50 km subset, with its magnitude
# quartiles drawn as horizontal reference lines.
fig, ax = plt.subplots(figsize=(12, 5))

ax.scatter(df_nido['Date-Time'], df_nido['MAGNITUD Ml'],
           s=1, label='Sismos', alpha=0.5)

Q1 = df_nido['MAGNITUD Ml'].quantile(0.25)
Q2 = df_nido['MAGNITUD Ml'].quantile(0.5)
Q3 = df_nido['MAGNITUD Ml'].quantile(0.75)

# Current X limits, so the quartile lines span the full plotted time range.
left, right = ax.get_xlim()

for etiqueta, cuartil, tono in (('Q1', Q1, 'b'), ('Q2', Q2, 'y'), ('Q3', Q3, 'm')):
  ax.hlines(y=cuartil, xmin=left, xmax=right,
            ls='-', color=tono, label=f'{etiqueta}={cuartil}')

ax.grid(ls='--', color='k', alpha=0.5)
ax.set_xlim(left, right)
ax.set_ylim(0, 7)

ax.set_xlabel('Date - UTC')
ax.set_ylabel('Ml')
ax.legend()
ax.set_title('Eventos profundidades >50 km 1994-2023')

fig.savefig(pathSaveFiguras + 'tiempo_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

Since there seems to be insufficient information about earthquakes with magnitudes lower than 2.5 before 2008, only events with magnitudes of 2.5 and above will be considered.

# Ml >= 2.5

In [None]:
# From the deep subset, keep only events with Ml of 2.5 or more and
# renumber the rows.
esMagnitudInteres = df_nido['MAGNITUD Ml'] >= 2.5
df_interes = df_nido.loc[esMagnitudInteres].reset_index(drop=True)
df_interes

Out of the 145722 events at depths of 50 km or greater, only 31822 events have magnitudes greater than or equal to 2.5. A total of 113900 events were removed, which corresponds to approximately 78% of the total events at these depths.

In [None]:
# Magnitude histogram of the Ml >= 2.5, depth >= 50 km subset.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df_interes['MAGNITUD Ml'], bins=np.arange(0, 8, 0.25))

ax.set_xlabel('Magnitude')
ax.set_yscale('log')
ax.set_ylabel('Count')

ax.set_xticks(np.arange(0, 8, 0.5))
ax.set_xlim(-1, 8)

ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('Cantidad de eventos por Magnitud >=2.5, >50 km 1994-2023')

fig.savefig(pathSaveFiguras + 'histMagnitud_ml2.5_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# Magnitude against time for the Ml >= 2.5, depth >= 50 km subset, with its
# magnitude quartiles drawn as horizontal reference lines.
fig, ax = plt.subplots(figsize=(12, 5))

ax.scatter(df_interes['Date-Time'], df_interes['MAGNITUD Ml'],
           s=1, label='Sismos', alpha=0.5)

Q1 = df_interes['MAGNITUD Ml'].quantile(0.25)
Q2 = df_interes['MAGNITUD Ml'].quantile(0.5)
Q3 = df_interes['MAGNITUD Ml'].quantile(0.75)

# Current X limits, so the quartile lines span the full plotted time range.
left, right = ax.get_xlim()

for etiqueta, cuartil, tono in (('Q1', Q1, 'b'), ('Q2', Q2, 'y'), ('Q3', Q3, 'm')):
  ax.hlines(y=cuartil, xmin=left, xmax=right,
            ls='-', color=tono, label=f'{etiqueta}={cuartil}')

ax.grid(ls='--', color='k', alpha=0.5)
ax.set_xlim(left, right)
ax.set_ylim(2, 7)

ax.set_xlabel('Date - UTC')
ax.set_ylabel('Ml')
ax.legend()
ax.set_title('Eventos Magnitud >= 2.5, >50 km 1994-2023')

fig.savefig(pathSaveFiguras + 'tiempo_ml2.5_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# One epicentre map per calendar year (1994-2023) over an OpenTopoMap
# basemap; the yearly event count is collected in eventsPerYear.
years = np.arange(1994, 2024)
eventsPerYear = []

for year in years:
  delAnio = df_interes['Date-Time'].dt.year == year
  df_filtrado = df_interes[delAnio]
  nE = len(df_filtrado)
  eventsPerYear.append(nE)

  fig, ax = plt.subplots(figsize=(8, 7))
  ax.scatter(df_filtrado['LONGITUD (grados)'],
             df_filtrado['LATITUD (grados)'],
             s=5, c='r')

  ax.set_xlabel('Longitude [degrees]')
  ax.set_ylabel('Latitude [degrees]')

  # Fixed extent of the study quadrant so every yearly map is comparable.
  ax.set_ylim(6.5, 7.1)
  ax.set_xlim(-73.4, -72.8)

  cx.add_basemap(ax=ax,
                 crs='epsg:4326',
                 source=xyz.OpenTopoMap,
                 reset_extent=True)

  ax.grid(color='grey', ls='--', alpha=0.5)
  ax.set_title(f'Año {year}: {nE} eventos')

  fig.savefig(pathSaveFiguras + f'mapa_{year}.png',
              format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

  plt.show()
  print('\n')

# Drop the loop temporaries.
del(nE, df_filtrado, delAnio)

In [None]:
# Text table: one "year: count" line per year.
print('Año: Eventos')
for posicion, anio in enumerate(years):
  print(f'{anio}: {eventsPerYear[posicion]}')

In [None]:
# Yearly number of events (Ml >= 2.5, depth >= 50 km): dashed line plus markers.
fig, ax = plt.subplots(figsize=(10, 5))

ax.plot(years, eventsPerYear, ls='--')
ax.scatter(years, eventsPerYear, c='r', s=10)

ax.set_xlabel('Año')
ax.set_ylabel('Cantidad')

ax.set_xticks(np.arange(1994, 2025, 2))

ax.set_title('Eventos por año Ml >=2.5, >50 km')
ax.grid(color='grey', ls='--', alpha=0.5)

fig.savefig(pathSaveFiguras + 'eventosPorYear_ml2.5_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

- In the maps, there is an alignment of seismic events during the years 1994 and 1995. What could be the explanation?

- Apparently, over time, fewer seismic events have been recorded. Since 2008, this reduction seems to have been significant.

# Por día

In [None]:
# Daily event counts per magnitude subgroup.
#
# Each subgroup is a half-open interval [ml, ml + 0.5); the last one
# ('5.5-') is open-ended. The counts are built in one vectorised pass
# (pd.cut + pd.crosstab) instead of filtering the whole catalog once per day
# and per subgroup.
etiquetasMl = ['2.5-3.0', '3.0-3.5', '3.5-4.0', '4.0-4.5',
               '4.5-5.0', '5.0-5.5', '5.5-']
limitesMl = [2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, np.inf]

# It seems that not every day there is a record of events at depths greater
# than 50 km and magnitudes greater than 2.5, so the full calendar is built
# explicitly and days without events are filled with zeros.
inicio = np.datetime64('1994-01-01')
fin = np.datetime64('2024-01-01')
array_fechas = np.arange(inicio, fin, dtype='datetime64[D]')
dias = array_fechas.astype(str).tolist() # 'YYYY-MM-DD', same text as str(date)

# right=False reproduces the ">= lower and < upper" comparison.
subgrupoMl = pd.cut(df_interes['MAGNITUD Ml'],
                    bins=limitesMl,
                    labels=etiquetasMl,
                    right=False)

# Rows are matched on the 'FECHA' text, exactly as the day-by-day filter did.
conteos = pd.crosstab(df_interes['FECHA'], subgrupoMl)
conteos.columns = conteos.columns.astype(str)
conteos = conteos.reindex(index=dias, columns=etiquetasMl, fill_value=0)

dict_Dias = {'Dia': dias}
for etiqueta in etiquetasMl:
  dict_Dias[etiqueta] = conteos[etiqueta].tolist()



In [None]:
# One row per day with the count of events in each magnitude subgroup.
df_Dias = pd.DataFrame.from_dict(dict_Dias)
# 'Dia' holds text, so the row total is restricted to the numeric count
# columns; without numeric_only the sum fails on recent pandas versions.
df_Dias['Total dia'] = df_Dias.sum(axis=1, numeric_only=True)
df_Dias

In [None]:
# Sanity check: the daily totals should add up to the number of events in
# the Ml >= 2.5 subset. It should sum to 31822.
df_Dias['Total dia'].sum()

Considering only events at depths of 50 km or greater and magnitudes of 2.5 or greater, some days have no recorded events.

## Save/Load CSV Days

In [None]:
# Persist the per-day count table as CSV (without the index).
df_Dias.to_csv(pathDatos+'df_Dias.csv', index=False)

In [None]:
# Reload the per-day table and add 'Fecha', a datetime version of the text
# column 'Dia', for use as the time axis in plots.
df_Dias = pd.read_csv(pathDatos+'df_Dias.csv')
df_Dias['Fecha'] = pd.to_datetime(df_Dias['Dia'], yearfirst=True)
df_Dias

In [None]:
# Daily counts of every magnitude subgroup against time; days with zero
# events in a subgroup are not drawn.
fig, ax = plt.subplots(figsize=(15, 7))

# Columns between the first one and the last two, i.e. the subgroup counts.
for subgrupo in df_Dias.columns[1:-2]:
  conEventos = df_Dias[subgrupo] > 0

  ax.scatter(df_Dias['Fecha'][conEventos],
             df_Dias[subgrupo][conEventos],
             s=5,
             label=subgrupo)

ax.grid(ls='--', color='k', alpha=0.5)

ax.set_xlabel('Date - UTC')
ax.set_ylabel('Cantidad')
ax.legend()

plt.show()

# Semivariogram over Time

A "temporal" semivariance analysis will be performed, comparing the number of events per day at time lags of up to one year. The comparison is carried out separately for seven subgroups of seismic events, defined by magnitude starting at 2.5 in intervals of 0.5.

In [None]:
# Temporal semivariogram of the daily counts, one curve per magnitude subgroup.
#
# For a lag of h days the semivariance is the mean, over every pair of days
# exactly h days apart, of (count[t + h] - count[t])**2 / 2.
# The pairs are formed by shifting the whole array by h rows, so each lag
# only compares days separated by that lag.
columnasMl = ['2.5-3.0', '3.0-3.5', '3.5-4.0', '4.0-4.5',
              '4.5-5.0', '5.0-5.5', '5.5-']

lags = np.arange(1, 366, 10) #A year or 365 days will be considered.

# Rows are consecutive days, columns are the magnitude subgroups.
conteosDias = df_Dias[columnasMl].to_numpy(dtype=float)

semiVariograma = []
for day in lags:
  # Difference between every day and the day located `day` rows later.
  diferencias = conteosDias[day:] - conteosDias[:-day]
  # Averaged over all available pairs at this lag, per subgroup.
  semiVariograma.append((diferencias**2).mean(axis=0) / 2)

# Shape: (number of lags, number of subgroups).
semiVariograma = np.array(semiVariograma)

In [None]:
# Tabulate the semivariogram: the lag in days followed by one column per
# magnitude subgroup.
columnasSemiVar = ['Lag dias', '2.5-3.0', '3.0-3.5', '3.5-4.0',
                   '4.0-4.5', '4.5-5.0', '5.0-5.5', '5.5-']
df_semiVar = pd.DataFrame(np.column_stack((lags, semiVariograma)),
                          columns=columnasSemiVar)
df_semiVar

## Save/Load CSV Semivariance

In [None]:
# Persist the semivariogram table as CSV (without the index).
df_semiVar.to_csv(pathDatos+'df_semiVar.csv', index=False)

In [None]:
# Reload the semivariogram table from CSV.
df_semiVar = pd.read_csv(pathDatos+'df_semiVar.csv')
df_semiVar

## Plot semivariogram

In [None]:
# Column names of the semivariogram table.
df_semiVar.columns

In [None]:
# Variance of each count column. df_Dias also carries the text column 'Dia'
# (and 'Fecha' after reloading), which have no variance, so the computation
# is restricted to the numeric columns.
df_Dias.var(numeric_only=True)

In [None]:
# One semivariogram figure per magnitude subgroup, with the variance of that
# subgroup's daily counts drawn as a horizontal reference line.
lagDias = df_semiVar['Lag dias']
for subgrupo in df_semiVar.columns[1:]:
  fig, ax = plt.subplots(figsize=(10, 5))
  ax.plot(lagDias, df_semiVar[subgrupo], ls='--')
  ax.scatter(lagDias, df_semiVar[subgrupo], c='k', s=10)
  ax.axhline(y=df_Dias[subgrupo].var(), color='r', linestyle='--', label='Varianza')

  ax.legend()
  ax.set_title('Subgrupo Ml: ' + subgrupo)
  ax.set_xlabel('Lag [días]')
  ax.set_ylabel('Semivarianza')
  ax.grid(color='grey', ls='--', alpha=0.5)

  fig.savefig(pathSaveFiguras + f'semiVar_{subgrupo}.png',
              format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

  plt.show()
  print('\n')

In [None]:
# All subgroup semivariograms overlaid on a single figure.
fig, ax = plt.subplots(figsize=(10, 5))
lagDias = df_semiVar['Lag dias']
for subgrupo in df_semiVar.columns[1:]:
  ax.plot(lagDias, df_semiVar[subgrupo], ls='--', label=subgrupo)
  ax.scatter(lagDias, df_semiVar[subgrupo], s=10)

ax.legend()
ax.set_xlabel('Lag [días]')
ax.set_ylabel('Semivarianza')
ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('1994-2023')

fig.savefig(pathSaveFiguras + 'semivariogramas_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()


In [None]:
# Column names of the per-day table.
df_Dias.columns

In [None]:
# Daily counts against time, one figure per magnitude subgroup (columns 1..7),
# with the subgroup variance drawn as a horizontal line.
for subgrupo in df_Dias.columns[1:8]:
  fig, ax = plt.subplots(figsize=(10, 5))
  ax.scatter(df_Dias['Fecha'], df_Dias[subgrupo], c='b', s=3)

  varianzaML = np.round(df_Dias[subgrupo].var(), 3)
  ax.axhline(y=varianzaML, color='r', linestyle='--', label=f'Varianza: {varianzaML}')

  ax.legend()
  ax.set_title('Subgrupo Ml: ' + subgrupo)
  ax.set_xlabel('Fecha - UTC')
  ax.set_ylabel('Cantidad eventos')
  ax.grid(color='grey', ls='--', alpha=0.5)

  plt.show()
  print('\n')

In [None]:
# Summary statistics of the per-day table, rounded to 2 decimals.
np.round(df_Dias.describe(), 2)

The results of the temporal semivariogram show that lower magnitudes exhibit higher semivariance, which is understandable given that these are the more frequent events. In comparison, higher-magnitude events, which are less frequent, show lower semivariance, as generally few or no events are recorded, especially those with magnitudes greater than 5.

By reviewing the scatter plot and the variance of each subgroup, part of the previous conclusion can be corroborated. Additionally, from the `3.5-4.0` subgroup onward, it is observed that 75% of the data are 0, indicating that on the majority of days there is no record of earthquakes with those magnitudes.

# Distance

Considering that each day is represented as a vector, with the coordinates or axes corresponding to each of the columns representing the number of events by magnitude subgroup, the distance between all the days will be calculated to assess how close they may be to each other and determine if there are similarities between the events.

In [None]:
# Day vectors: columns 1..7 of df_Dias (the seven magnitude-subgroup counts)
# as a NumPy array with one row per day.
vectoresDias = df_Dias.iloc[:,1:8].to_numpy()
vectoresDias

In [None]:
# Pairwise Euclidean distance between every pair of day vectors
# (square matrix, days x days).
distanciaDias = distance.cdist(vectoresDias, vectoresDias, 'euclidean')
distanciaDias.shape # The distance matrix between all the days.

In [None]:
# The diagonal is zero because it represents the distance of that day to itself,
# and it is symmetric, so we are only interested in the upper or lower triangle
# of the matrix, excluding the diagonal.

# Preview of the top-left 5 x 5 corner of the matrix.
np.round(distanciaDias[0:5,0:5], 2) # Only view 5 data points out of the 10957.

In [None]:
# We will look for the indices of the numbers located above the diagonal.
# Note: The diagonal will not be considered.

# np.triu_indices returns a (row indices, column indices) pair of arrays.
iu = np.triu_indices(distanciaDias.shape[0],
                     1) # k=0 to start from the diagonal; k=1 to move 1 position up.
iu

In [None]:
# We pass the indices to select only the upper part.
# The result is a 1-D array with one distance per unordered pair of days.
distDias_vector = distanciaDias[iu]
distDias_vector

In [None]:
# Basic statistics of the pairwise distances.
for rotulo, estadistico in ((' Min.: ', np.min),
                            (' Max.: ', np.max),
                            (' Media: ', np.mean),
                            (' Mediana: ', np.median)):
  print(rotulo, estadistico(distDias_vector))

In [None]:
# 14 evenly spaced values from 0 to 13 (i.e. unit steps), shown for reference
# before choosing the histogram bins.
np.linspace(0, 13, 14)

In [None]:
# Histogram of the pairwise distances between day vectors (linear counts).
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(distDias_vector, bins=np.linspace(0, 13, 14*5))

ax.set_xlabel('Distancia')
ax.set_ylabel('Count')

ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('Distancia entre los vectores Días 1994-2023')

fig.savefig(pathSaveFiguras + 'distancia_ml2.5_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# Same distance histogram with a logarithmic count axis.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(distDias_vector, bins=np.linspace(0, 13, 14*5))

ax.set_xlabel('Distancia')
ax.set_yscale('log')
ax.set_ylabel('Count')

ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('Distancia entre los vectores Días 1994-2023')

fig.savefig(pathSaveFiguras + 'distanciaLog_ml2.5_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# Box plot of the pairwise distances between day vectors.
fig, ax = plt.subplots(figsize=(5, 5))
ax.boxplot(distDias_vector)

ax.grid(color='grey', ls='--', alpha=0.5)
ax.set_title('Distancia entre los vectores Días 1994-2023')

fig.savefig(pathSaveFiguras + 'boxplot_ml2.5_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# Quartiles, 90th percentile and the 1.5 * IQR outlier fences of the
# pairwise distances.
Q1, Q2, Q3, P90 = np.quantile(distDias_vector, [0.25, 0.50, 0.75, 0.90])
RIC = Q3-Q1 # Interquartile range.
minOutliers = Q1 - 1.5*RIC
maxOutliers = Q3 + 1.5*RIC
# Count the distances above the upper fence inside NumPy; the builtin sum()
# would walk the array element by element in Python.
countOutliers = np.count_nonzero(distDias_vector > maxOutliers)
porcentajeOutliers = np.round(countOutliers*100/len(distDias_vector), 2)
print('Min. Outliers: ', minOutliers)
print('Q1: ', Q1)
print('Q2: ', Q2)
print('Q3: ', Q3)
print('P90: ', P90)
print('Max. Outliers: ', maxOutliers)
print(f'Cantidad Outliers: {countOutliers} ({porcentajeOutliers}%)')

## DBSCAN

Considering the information from the quartiles and what is observed in the histograms, the distance values will be adjusted and their behavior evaluated. Later, the days will be visualized as a function of time.

In [None]:
# Shape and contents of the array that DBSCAN will receive.
print(vectoresDias.shape)
vectoresDias # The day vectors to cluster

In [None]:
# Fit DBSCAN on the day vectors and report how long the fit took.
tinicio = time.perf_counter()

distanciaMax = 0.9 # Passed to DBSCAN as eps.
eventos = 100 # Passed to DBSCAN as min_samples.

modelo_DBSCAN = DBSCAN(eps=distanciaMax, min_samples=eventos, metric='euclidean').fit(vectoresDias)

tfinal = time.perf_counter()
tiempo = tfinal - tinicio # Elapsed seconds.

print('Segundos empleados: ',tiempo)

In [None]:
# Cluster info: how many days received each DBSCAN label (-1 is noise).
clusters = Counter(modelo_DBSCAN.labels_)
print(clusters)
# First days labelled as noise.
print(df_Dias[modelo_DBSCAN.labels_ == -1].head())
# The noise label is discounted only when it is actually present, so the
# cluster count stays right even if DBSCAN flags no outliers.
nClusters = len(clusters) - (1 if -1 in clusters else 0)
print('Total clusters = {}'.format(nClusters))
print('Total clusters events = ', len(df_Dias[modelo_DBSCAN.labels_ != -1]))

In [None]:
# Total events per day against time, coloured by DBSCAN cluster, with the
# noise days (label -1) highlighted in red.

# Take the outliers
outliers_df2 = df_Dias[modelo_DBSCAN.labels_ == -1]
# The rest of the data
cluster_df2 = df_Dias[modelo_DBSCAN.labels_ != -1]

# Colors for the clusters
colors2 = modelo_DBSCAN.labels_
color_cluster2 = colors2[colors2 != -1]

porcentajeOutliers = np.round((len(outliers_df2) / len(modelo_DBSCAN.labels_)) *100, 2)

# Number of real clusters: distinct labels other than the noise label.
nClusters = np.count_nonzero(np.unique(colors2) != -1)

fig, ax = plt.subplots(figsize=(15,7))

ax.scatter(outliers_df2['Fecha'], outliers_df2['Total dia'],
           s=3, c='r', label=f'{len(outliers_df2)} Outliers ({porcentajeOutliers}%)', alpha=0.7)

ax.scatter(cluster_df2['Fecha'], cluster_df2['Total dia'],
           s=3, c=color_cluster2, alpha=0.5)

ax.set_ylabel('Evento totales en el dia')
ax.set_xlabel('Fecha - UTC')
plt.title(f'Clusters: {nClusters}')

# The legend has to exist before saving, otherwise the PNG is written
# without it.
plt.legend()

# NOTE(review): the file name says 'DBSACAN' while the sibling figures use
# 'DBSCAN'; kept as is so existing references to the file keep working.
plt.savefig((pathSaveFiguras + 'DBSACAN_ml2.5_50km_1994_2023.png'),
            format='png', dpi=300, bbox_inches = 'tight',pad_inches=0.25)

plt.show()

In [None]:
# Same time plot as before, drawing only the clustered days (noise omitted).
colors2 = modelo_DBSCAN.labels_
esRuido = colors2 == -1

outliers_df2 = df_Dias[esRuido]
cluster_df2 = df_Dias[~esRuido]
color_cluster2 = colors2[~esRuido]

fig, ax = plt.subplots(figsize=(15, 7))

ax.scatter(cluster_df2['Fecha'], cluster_df2['Total dia'],
           s=3, c=color_cluster2, alpha=0.5)

ax.set_ylabel('Evento totales en el dia')
ax.set_xlabel('Fecha - UTC')
ax.set_title(f'Clusters: {len(clusters)-1}, sin Outliers')

fig.savefig(pathSaveFiguras + 'DBSCAN_sinOutliers_ml2.5_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# DBSCAN labels found (the Counter keys).
clusters.keys()

In [None]:
# Number of days assigned to each label (the Counter values).
clusters.values()

In [None]:
# Bar chart with the number of days assigned to each DBSCAN label.
fig, ax = plt.subplots(figsize=(10, 5))

ax.bar(clusters.keys(), clusters.values())

ax.set_xlabel('Cluster')
ax.set_ylabel('Count')

ax.grid(color='grey', ls='--', alpha=0.5)

ax.set_title('Eventos por Cluster')

fig.savefig(pathSaveFiguras + 'DBSCAN_hist_ml2.5_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

Apparently, with `eps=0.9` and `min_samples=100`, the clusters correspond to days with a maximum of 5 events, generating a total of 21 clusters without including the outliers.

## PCA

A principal component analysis (PCA) will be performed to examine how the 7 axes or magnitude subgroups defined for each day vector are affected, and to determine if there is any relationship between them.

In [None]:
# Fit a PCA on the day vectors keeping all 7 components, so no dimension is
# discarded at this stage.
modelo_PCA = PCA(n_components=7) # That preserves the 7 components
modelo_PCA.fit(vectoresDias)

In [None]:
# Percentage of variance explained for each of the 7 components
# (the ratios are fractions, hence the factor of 100).
modelo_PCA.explained_variance_ratio_ * 100

The first three components appear to account for approximately 60%, 20% and 9%, respectively.

In [None]:
# We transform to obtain the new 7 axes
# (one row per day, one column per principal component).
vectoresDias_PCA = modelo_PCA.transform(vectoresDias)
np.round(vectoresDias_PCA[0:5,:], 2) # Preview of the first 5 days.

A scatter graph will be made considering only the first two axes.

In [None]:
# Day vectors projected on the first two principal components.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(vectoresDias_PCA[:, 0], vectoresDias_PCA[:, 1], s=5)

ax.set_xlabel('PC-1')
ax.set_ylabel('PC-2')

ax.grid(color='grey', ls='--', alpha=0.5)

ax.set_title('PCA')

plt.show()

In [None]:
# 2-D histogram of PC-1 against PC-2; cmin=1 leaves empty bins blank.
fig, ax = plt.subplots(figsize=(7, 5))
h = ax.hist2d(vectoresDias_PCA[:, 0],
              vectoresDias_PCA[:, 1],
              bins=60,
              cmap='viridis',
              cmin=1)

# h[3] is the image returned by hist2d; it drives the colour bar.
fig.colorbar(h[3], ax=ax)

ax.set_xlabel('PC-1')
ax.set_ylabel('PC-2')
ax.set_title('Hist-2D PCA')

ax.grid(color='grey', ls='--', alpha=0.5)

fig.savefig(pathSaveFiguras + 'PCA_01_02_ml2.5_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# 2-D histogram of PC-1 against PC-3; cmin=1 leaves empty bins blank.
fig, ax = plt.subplots(figsize=(7, 5))
h = ax.hist2d(vectoresDias_PCA[:, 0],
              vectoresDias_PCA[:, 2],
              bins=60,
              cmap='viridis',
              cmin=1)

# h[3] is the image returned by hist2d; it drives the colour bar.
fig.colorbar(h[3], ax=ax)

ax.set_xlabel('PC-1')
ax.set_ylabel('PC-3')
ax.set_title('Hist-2D PCA')

ax.grid(color='grey', ls='--', alpha=0.5)

fig.savefig(pathSaveFiguras + 'PCA_01_03_ml2.5_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:
# 2-D histogram of PC-2 against PC-3; cmin=1 leaves empty bins blank.
fig, ax = plt.subplots(figsize=(7, 5))
h = ax.hist2d(vectoresDias_PCA[:, 1],
              vectoresDias_PCA[:, 2],
              bins=60,
              cmap='viridis',
              cmin=1)

# h[3] is the image returned by hist2d; it drives the colour bar.
fig.colorbar(h[3], ax=ax)

ax.set_xlabel('PC-2')
ax.set_ylabel('PC-3')
ax.set_title('Hist-2D PCA')

ax.grid(color='grey', ls='--', alpha=0.5)

fig.savefig(pathSaveFiguras + 'PCA_02_03_ml2.5_50km_1994_2023.png',
            format='png', dpi=300, bbox_inches='tight', pad_inches=0.25)

plt.show()

### DBSCAN-PCA

In [None]:
# Cumulative percentage of variance explained by the first three components.
sum(modelo_PCA.explained_variance_ratio_ [0:3]*100)

Considering that the first three principal components best represent the variance of the day vectors, a DBSCAN model will be generated with these components.

In [None]:
# First three days (rows) in PCA space, all components.
np.round(vectoresDias_PCA[0:3], 2)

In [None]:
# All days, first three principal components (the input to DBSCAN below).
np.round(vectoresDias_PCA[:,0:3], 2)

In [None]:
# Fit DBSCAN on the first three principal components of the day vectors and
# report how long the fit took.
tinicio = time.perf_counter()

distanciaMax = 0.9 # Passed to DBSCAN as eps.
eventos = 500 # Passed to DBSCAN as min_samples.

modelo_DBSCAN_PCA = DBSCAN(eps=distanciaMax, min_samples=eventos, metric='euclidean').fit(vectoresDias_PCA[:,0:3])

tfinal = time.perf_counter()
tiempo = tfinal - tinicio # Elapsed seconds.

print('Segundos empleados: ',tiempo)

In [None]:
# Cluster summary for the PCA-based DBSCAN model.
etiquetas_PCA = modelo_DBSCAN_PCA.labels_
clusters_PCA = Counter(etiquetas_PCA)  # label -> number of days; -1 is DBSCAN noise
es_ruido_PCA = etiquetas_PCA == -1

print(clusters_PCA)
print(df_Dias[es_ruido_PCA].head())  # first days flagged as noise

# Discount the noise label only when it is actually present: the previous
# `len(clusters_PCA)-1` under-counted by one whenever DBSCAN found no outliers.
n_clusters_PCA = len(clusters_PCA) - (1 if -1 in clusters_PCA else 0)
print('Total clusters = {}'.format(n_clusters_PCA))
print('Total clusters events = ', len(df_Dias[~es_ruido_PCA]))

In [None]:
# Day vectors on the PC-1 / PC-2 plane, coloured by their DBSCAN label.
etiquetas_color = modelo_DBSCAN_PCA.labels_
archivo_figura = pathSaveFiguras + 'PCA_DBSCAN_01_02_ml2.5_50km_1994_2023.png'

plt.figure(figsize=(7, 5))
plt.scatter(vectoresDias_PCA[:, 0], vectoresDias_PCA[:, 1],
            s=5, c=etiquetas_color)

plt.title('PCA-DBSCAN')
plt.xlabel('PC-1')
plt.ylabel('PC-2')
plt.grid(color='grey', ls='--', alpha=0.5)

plt.savefig(archivo_figura, format='png', dpi=300,
            bbox_inches='tight', pad_inches=0.25)

plt.show()

In [None]:

# Daily event totals through time, coloured by PCA-DBSCAN cluster, with the
# noise days (label -1) overlaid in red.
colors2_PCA = modelo_DBSCAN_PCA.labels_
es_outlier_PCA = colors2_PCA == -1

outliers_df2_PCA = df_Dias[es_outlier_PCA]
cluster_df2_PCA = df_Dias[~es_outlier_PCA]
color_cluster2_PCA = colors2_PCA[~es_outlier_PCA]

porcentajeOutliers_PCA = np.round((len(outliers_df2_PCA) / len(colors2_PCA)) *100, 2)

# Number of real clusters, computed from the labels so the title neither
# depends on another cell nor miscounts when there is no noise label.
n_clusters_PCA = len(set(colors2_PCA)) - (1 if es_outlier_PCA.any() else 0)

fig, ax = plt.subplots(figsize=(15,7))

ax.scatter(outliers_df2_PCA['Fecha'], outliers_df2_PCA['Total dia'],
           s=3, c='r', label=f'{len(outliers_df2_PCA)} Outliers ({porcentajeOutliers_PCA}%)', alpha=0.3)

ax.scatter(cluster_df2_PCA['Fecha'], cluster_df2_PCA['Total dia'],
           s=3, c=color_cluster2_PCA, alpha=0.5)

ax.set_ylabel('Evento totales en el dia')
ax.set_xlabel('Fecha - UTC')
plt.title(f'PCA-DBSCAN Clusters: {n_clusters_PCA}')

# The legend must exist before saving; it used to be added after savefig,
# so the PNG on disk was written without it.
ax.legend()

plt.savefig((pathSaveFiguras + 'tiempo_PCA_DBSCAN_01_02_ml2.5_50km_1994_2023.png'),
            format='png', dpi=300, bbox_inches = 'tight',pad_inches=0.25)

plt.show()

In [None]:

# Same time plot as the previous cell, but showing only the clustered days
# (DBSCAN noise, label -1, is left out).
colors2_PCA = modelo_DBSCAN_PCA.labels_
es_outlier_PCA = colors2_PCA == -1

outliers_df2_PCA = df_Dias[es_outlier_PCA]
cluster_df2_PCA = df_Dias[~es_outlier_PCA]
color_cluster2_PCA = colors2_PCA[~es_outlier_PCA]

porcentajeOutliers_PCA = np.round((len(outliers_df2_PCA) / len(colors2_PCA)) *100, 2)

# Number of real clusters, computed from the labels so the title neither
# depends on another cell nor miscounts when there is no noise label.
n_clusters_PCA = len(set(colors2_PCA)) - (1 if es_outlier_PCA.any() else 0)

fig, ax = plt.subplots(figsize=(15,7))

ax.scatter(cluster_df2_PCA['Fecha'], cluster_df2_PCA['Total dia'],
           s=3, c=color_cluster2_PCA, alpha=1)

ax.set_ylabel('Evento totales en el dia')
ax.set_xlabel('Fecha - UTC')
plt.title(f'PCA-DBSCAN Clusters: {n_clusters_PCA}, sin Outliers')

plt.savefig((pathSaveFiguras + 'tiempo_PCA_DBSCAN_sinOutliers_01_02_ml2.5_50km_1994_2023.png'),
            format='png', dpi=300, bbox_inches = 'tight',pad_inches=0.25)

plt.show()

Apparently, the results may be similar between both DBSCAN models.

# Logistic Regression Model

A logistic regression model will be created in which we will try to predict the occurrence of events with magnitudes equal to or greater than 4.5. To do this, the 30 vectors of previous days will be taken into account to evaluate the possibility of an event of this magnitude occurring in the next 6 days.

---
Example:
Today is January 30, so the input data would be the events recorded from January 1 to 30 (up to the current date). The model will then calculate the probability of an event with magnitude >= 4.5 occurring during the period from January 31 to February 5 (a total of 6 days).

In [None]:
# Column-wise totals of df_Dias (one total per column).
df_Dias.sum(axis=0)

In [None]:
# 'Y' = number of events per day with magnitude >= 4.5, obtained by adding
# the three highest magnitude bins.
columnas_objetivo = ['4.5-5.0', '5.0-5.5', '5.5-']
conteo_objetivo = df_Dias[columnas_objetivo].sum(axis=1)
df_Dias['Y'] = conteo_objetivo

# Rounded descriptive statistics of the table, including the new column.
np.round(df_Dias.describe())

In [None]:
# Show the first 10 rows to check the new 'Y' column.
df_Dias.head(10)

In [None]:
# Sum of the daily 'Y' counts, i.e. the number of events with M >= 4.5
# (author's run: 497 earthquakes).
df_Dias['Y'].sum() # A total of 497 earthquakes have been recorded with M>=4.5

In [None]:
# Display the per-day table.
df_Dias

In [None]:
# Slicing check: every row except the last two.
df_Dias[0:-2]

In [None]:
# Number of daily rows; it bounds the sliding-window loop in the next section.
len(df_Dias)

## Create y/X

In [None]:
# Build the supervised data set with a sliding window over df_Dias:
#   X row  -> the 7 magnitude-bin counts of `dias_Considerar` consecutive days,
#             flattened (30 days x 7 bins = 210 features)
#   y row  -> 1 if at least one M >= 4.5 event ('Y' > 0) occurs in the 6 days
#             that follow the window, else 0
dias_Considerar = 30
columnas_magnitud = ['2.5-3.0', '3.0-3.5', '3.5-4.0', '4.0-4.5',
                     '4.5-5.0', '5.0-5.5', '5.5-']
conteos_diarios = df_Dias[columnas_magnitud].to_numpy()
eventos_Y = df_Dias['Y']

X = []
y = []

# `dias` is the index of the last day inside the input window; the loop stops
# 6 rows before the end so the 6-day target window is always complete.
for dias in range(dias_Considerar-1, len(df_Dias)-6):
  primer_dia = dias - (dias_Considerar-1)
  ventana = conteos_diarios[primer_dia:dias+1]
  X.append(ventana.reshape(1, -1).tolist()[0])

  # SD variant (SD30 / SD60): the target starts the day AFTER the window,
  # so it shares no information with the inputs.
  target = eventos_Y.iloc[dias+1:dias+7].sum()

  # CD variant (CD30 / CD60): the target also includes day `dias`, which is
  # the last day of the input window.  The high-magnitude bins of that day
  # are already among the features, so part of the answer is visible in X;
  # that overlap is why the CD models appear to "predict better".
  #target = df_Dias['Y'].iloc[dias:dias+7].sum()

  y.append(1 if target > 0 else 0)

y = np.reshape(np.array(y), (-1,1))
X = np.array(X)

In [None]:
# y is a column vector: one label per window.
y.shape

In [None]:
# One row per window, dias_Considerar * 7 feature columns.
X.shape

In [None]:
# Single table with the features first and the label as the last column.
matriz_X_y = np.concatenate((X, y), axis=1)
df_RegLog = pd.DataFrame(matriz_X_y)
df_RegLog

## Save/Load CSV Reg Log

In [None]:
# Persist the feature/label table (no index column) next to the catalogues.
df_RegLog.to_csv(pathDatos+'df_RegLog.csv', index=False)

In [None]:
# Reload the saved table; later cells select columns by position (iloc),
# with the label in the last column.
df_RegLog = pd.read_csv(pathDatos+'df_RegLog.csv')
df_RegLog

## Split

The model appears to improve when only earthquakes since 2008 are taken into account, but it is still not fully adequate for prediction, as it only manages to get it right in about 11 out of 70 cases. This is similar to the results obtained when using all data before 2023.

In [None]:
# Chronological split: windows whose last day falls before 2023 are used for
# training, windows ending in 2023 are held out for prediction.
# The slice [dias_Considerar-1:-6] reproduces the range of the loop that
# built X/y, so each date lines up with one row of df_RegLog.
anio_ventana = df_Dias['Fecha'].iloc[dias_Considerar-1:-6].dt.year

filtro2022 = (anio_ventana < 2023).to_numpy()

# Alternative tried by the author: train only with 2008-2022.
# It did not seem to improve the results.
#l2008 = df_Dias['Fecha'].iloc[dias_Considerar-1:-6].dt.year >= 2008
#l2023 = df_Dias['Fecha'].iloc[dias_Considerar-1:-6].dt.year < 2023
#filtro2022 = (l2008 & l2023).to_numpy()

filtro2023 = (anio_ventana == 2023).to_numpy()

# Features are every column but the last; the label is the last column.
caracteristicas_RegLog = df_RegLog.iloc[:,:-1]
etiquetas_RegLog = df_RegLog.iloc[:,-1]

y2022 = etiquetas_RegLog[filtro2022].to_numpy()
X2022 = caracteristicas_RegLog[filtro2022].to_numpy()

y2023 = etiquetas_RegLog[filtro2023].to_numpy()
X2023 = caracteristicas_RegLog[filtro2023].to_numpy()


### Save Numpy D30/D60

In [None]:
# Disabled on purpose: everything below is a string literal, not executed code.
# Turning it back into code exports the current split as text files named
# '<prefix>_{y,X}{2022,2023}.txt'.  The prefix is hard-coded to 'SD'; it has to
# be changed to 'CD' by hand when the target was built including the last
# input day (see the "Create y/X" cell).
'''
prefijo = pathDatos + 'SD' + str(dias_Considerar)
print(prefijo)
np.savetxt(f'{prefijo}_y2022.txt', y2022)
np.savetxt(f'{prefijo}_X2022.txt', X2022)
np.savetxt(f'{prefijo}_y2023.txt', y2023)
np.savetxt(f'{prefijo}_X2023.txt', X2023)
'''

In [None]:
# Training labels (windows ending before 2023).
y2022

In [None]:
# Training features (windows ending before 2023).
X2022

In [None]:
# Hold-out labels (windows ending in 2023).
y2023

In [None]:
# Hold-out features (windows ending in 2023).
X2023

## Training

In [None]:
# Create the logistic-regression model, allowing up to 400 solver iterations.
modelo_RegLog = LogisticRegression(max_iter=400)

# Train on the pre-2023 windows.  The features are raw daily counts (no scaling
# is applied in this notebook).  Author's note: the solver still seems to hit
# the iteration limit, i.e. it may stop before converging.
modelo_RegLog.fit(X2022, y2022) # It seems that the iteration limit has been reached

In [None]:
# Optional export of the fitted logistic regression; switched off by default.
saveRegLogo = False
if saveRegLogo:
  pathSave = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/ModelosMLP_Class/'

  # Accuracy on the training set; it becomes part of the file name.
  scoreRegLog = modelo_RegLog.score(X2022, y2022)
  print(scoreRegLog)

  # NOTE: the 'SD30' prefix is fixed; it does not follow dias_Considerar.
  Name = f'SD30_RegLog_scr{round(scoreRegLog, 3)}.joblib'
  dump(modelo_RegLog, pathSave+Name)
  print(Name)

In [None]:
# Folder with the saved models.  Uncomment the two lines below to replace the
# model trained in this session with a previously saved one.
pathSave = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/ModelosMLP_Class/'
#Name = 'SD30_RegLog_scr0.788.joblib'
#modelo_RegLog = load(pathSave+Name)
modelo_RegLog

## Predict

In [None]:
# Predicted class (0 or 1) for every training window.
pred_RegLog = modelo_RegLog.predict(X2022)
pred_RegLog

In [None]:
# Probability of each class for every training window (one column per class,
# in the order of modelo_RegLog.classes_).
modelo_RegLog.predict_proba(X2022)

In [None]:
# Class order used by predict_proba: index 1 holds the class of interest (label 1).
modelo_RegLog.classes_ # in position 1 is the one of interest

In [None]:
# Keep only the probability of class 1 for the training windows.
pred_RegLog_prob = modelo_RegLog.predict_proba(X2022)[:,1]
pred_RegLog_prob

In [None]:
# Logistic regression on the training period: predicted probability of class 1
# per window (colour scale) and, in red at a fixed height of 50, the windows
# whose label is 1.
nMin = dias_Considerar -1
fechas_entrenamiento = df_Dias['Fecha'].iloc[nMin:-6][filtro2022]
probabilidad_pct = pred_RegLog_prob*100
con_evento = y2022 >= 1

fig, ax1 = plt.subplots(figsize=(15,5))

ax1.scatter(fechas_entrenamiento, probabilidad_pct,
            s=5, c=probabilidad_pct, cmap='YlGnBu', vmin=0, vmax=100)
ax1.scatter(fechas_entrenamiento[con_evento], y2022[con_evento] * 50,
            s=5, c='r')

ax1.set_xlabel('Fecha - UTC')
ax1.set_ylabel('Probabilidad [%]')
ax1.grid(ls='--', color='grey')

plt.suptitle('1994-2022')

plt.show()

In [None]:
# Confusion-matrix counts of the logistic regression on the training period.
# Row = true class, column = predicted class.
(tn, fp), (fn, tp) = confusion_matrix(y2022, pred_RegLog)

print('1994-2022')
for descripcion, conteo in (("True Negative:", tn),
                            ("False Positive:", fp),
                            ("False Negative:", fn),
                            ("True Positive:", tp)):
  print(descripcion, conteo)

In [None]:
# Same counts as a 2x2 array (rows: true class, columns: predicted class).
confusion_matrix(y2022, pred_RegLog)

In [None]:
# Confusion matrix of the logistic regression on the training period, as a figure.
matriz_entrenamiento = confusion_matrix(y2022, pred_RegLog)
disp = ConfusionMatrixDisplay(confusion_matrix=matriz_entrenamiento,
                              display_labels=modelo_RegLog.classes_)
disp.plot()

plt.title('1994-2022')
plt.show()

In [None]:
# Confusion-matrix counts of the logistic regression on the 2023 hold-out.
pred_RegLog_2023 = modelo_RegLog.predict(X2023)
(tn, fp), (fn, tp) = confusion_matrix(y2023, pred_RegLog_2023)

print('2023')
for descripcion, conteo in (("True Negative:", tn),
                            ("False Positive:", fp),
                            ("False Negative:", fn),
                            ("True Positive:", tp)):
  print(descripcion, conteo)

In [None]:
# Confusion matrix of the logistic regression on the 2023 hold-out, as a figure.
matriz_2023 = confusion_matrix(y2023, modelo_RegLog.predict(X2023))
disp = ConfusionMatrixDisplay(confusion_matrix=matriz_2023,
                              display_labels=modelo_RegLog.classes_)
disp.plot()

plt.title('2023')
plt.show()

In [None]:
# Probability of class 1 for every 2023 window.
pred_RegLog_prob2023 = modelo_RegLog.predict_proba(X2023)[:,1]
pred_RegLog_prob2023

In [None]:
# Logistic regression on 2023: predicted probability of class 1 per window
# (colour scale) and, in red at a fixed height of 50, the windows labelled 1.
fechas_2023 = df_Dias['Fecha'].iloc[nMin:-6][filtro2023]
probabilidad_pct_2023 = pred_RegLog_prob2023*100
con_evento_2023 = y2023 >= 1

fig, ax1 = plt.subplots(figsize=(15,5))

ax1.scatter(fechas_2023, probabilidad_pct_2023,
            s=5, c=probabilidad_pct_2023, cmap='YlGnBu', vmin=0, vmax=100)
ax1.scatter(fechas_2023[con_evento_2023], y2023[con_evento_2023] * 50,
            s=5, c='r')

ax1.set_xlabel('Fecha - UTC')
ax1.set_ylabel('Probabilidad [%]')
ax1.grid(ls='--', color='grey')

plt.suptitle('2023')

plt.show()

In [None]:
# 2023 overview for the logistic regression: catalogue magnitudes (blue),
# windows predicted as class 1 (green, drawn at Ml = 7) and windows whose
# label is 1 (red, drawn at Ml = 4.4).  The two marker heights are only
# plotting positions, not magnitudes.
sismos_2023 = (df['Date-Time'].dt.year == 2023).to_numpy()
fechas_2023 = df_Dias['Fecha'].iloc[nMin:-6][filtro2023]
pred_RegLog_2023 = modelo_RegLog.predict(X2023)
alerta_2023 = pred_RegLog_2023 >= 1
con_evento_2023 = y2023 >= 1

fig, ax1 = plt.subplots(figsize=(15,5))
ax1.scatter(df['Date-Time'][sismos_2023], df['MAGNITUD Ml'][sismos_2023],
            s=5, label='Sismos', alpha=0.5)

ax1.scatter(fechas_2023[alerta_2023], pred_RegLog_2023[alerta_2023]*7,
            s=10, c='g', label='Predichos Reg Log')

ax1.scatter(fechas_2023[con_evento_2023], y2023[con_evento_2023] * 4.4,
            s=5, c='r', label='Datos Y')

#ax1.set_ylim(-10, 110)
ax1.set_xlabel('Fecha - UTC')
ax1.set_ylabel('Ml')
ax1.grid(ls='--', color='grey')

plt.suptitle('2023')
plt.legend(loc=8)

plt.show()

# MLP Classifier Model

In [None]:
# (training windows, input features) - the second value is the MLP input size.
X2022.shape

## Training

- With hidden layers (210, 5) and without early_stopping, the model overfits and reaches a score of 1.0.
- With hidden layers (5, 5) and without early_stopping, the model overfits and reaches an average score of about 0.80; it only manages to predict 2 events for 2023.
- With hidden layers (5, 5) and with early_stopping, the model still seems to overfit and reaches an average score of about 0.70; it fails to predict any event for 2023.

With early_stopping enabled, part of the training data is set aside as a validation set, so it is better to keep using early_stopping.
```python
validation_fraction : float, default=0.1
The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True.
```

- In tests on January 22, 2024, a score of 0.90 was achieved and it managed to predict 45 of the events of 2023, hl of 1000, 5 ... with early_stopping ... and considered the data from Day 30 for Y ... this model was unfortunately not saved.

Models that do not predict correctly for 2023:
- SD30_ces_a_hl_210_5_0_scr0.789
- SD30_ces_a_hl_1000_5_0_scr0.8
- SD30_ces_a_hl_5_5_0_scr0.774
- SD30_ces_a_hl_1000_5_7_scr0.837
- CD30_ces_b_hl_5_5_0_scr0.738

Models that do predict correctly for 2023:
- SD30_ses_a_hl_210_5_0_scr1.0 ... 15 events
- SD30_ses_a_hl_5_5_1_scr0.816 ... 2 events
- CD30_ces_a_hl_5_5_6_scr0.804 ... 11 events
- CD30_ces_a_hl_5_5_3_scr0.805 ... 10 events
- CD30_ces_a_hl_5_5_0_scr0.805 ... 10 events
- CD30_ces_b_hl_210_5_0_scr0.842 ... 11 events
- CD30_ces_c_hl_210_5_0_scr0.81 ... 8 events
- CD30_ces_c_hl_210_5_0_scr0.942 ... 17 events
- CD30_ces_b_hl_1000_5_3_scr0.939 ... 13 events
- CD30_ces_c_hl_1000_5_2_scr0.901 ... 13
- CD30_ces_d_hl_1000_5_5_scr0.908 ... 16
- CD30_ces_e_hl_1000_5_3_scr0.932 ... 18
- CD30_ces_h_hl_1000_5_4_scr0.955 ... 16
- CD30_ces_i_hl_1000_5_2_scr0.912 ... 11
- CD30_ces_j_hl_1000_5_3_scr0.974 ... 18
- CD30_ces_k_hl_1000_5_8_scr0.908 ... 17
- CD30_ces_k_hl_1000_5_0_scr0.9 ... 15
- CD30_ces_l_hl_1000_5_2_scr0.908 ... 17

Solver= 'sgd'
- CD30_ces_f_hl_1000_5_8_scr0.95 ... 16
- CD30_ces_f_hl_1000_5_0_scr0.815 ... 6

Models with prior 60 days of information will be considered:
- CD60_ces_l_hl_1000_5_0_scr0.938.joblib ... 12
- CD60_ces_l_hl_1000_5_0_scr0.938 ... 9
- CD60_ces_a_hl_1000_5_1_scr0.874 ... 11
- CD60_ces_a_hl_1000_5_0_scr0.882...13
- CD60_ces_b_hl_1000_5_1_scr0.965 ... 14
- CD60_ces_b_hl_1000_5_0_scr0.929 ... 12

'adam' vs 'sgd' ... https://medium.com/geekculture/a-2021-guide-to-improving-cnns-optimizers-adam-vs-sgd-495848ac6008

In [None]:
# Random-restart search for an MLP classifier.
#
# Architecture: 210 inputs (30 days x 7 magnitude bins) and two hidden layers.
# Only the first hidden layer is varied; the second one stays at 5 units
# (3 principal components suggested by the PCA, plus 2 for some freedom).
hl = [2000, 5] # only use 2 layers

# The loop below runs for c = 0 .. intentos (inclusive).
intentos = 5
c = 0

# File-name prefix of the saved model:
#   SD30 -> target built WITHOUT day 30 of the window
#   CD30 -> target built WITH day 30 of the window (better-looking results)
#   ses  -> trained without early_stopping;  ces -> with early_stopping
#   the trailing letters only tell repeated runs apart
letras = 'SD30_ses_ae'
pathSave = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/ModelosMLP_Class/'

# Minimum training score for a run to be considered.
# Reference values seen by the author: SD30 averages about .77 (so .80 is
# interesting), CD30 goes above .80, and hl 1000, 5 can reach .90.
scoreInteres = 0.7

while c <= intentos:
  print('\n ', c)

  # This configuration trains WITHOUT early_stopping (hence 'ses' in `letras`)
  # and with no fixed random_state, so every attempt starts differently.
  modelo_MLPClass = MLPClassifier(hidden_layer_sizes=tuple(hl),
                                activation='relu', #identity, logistic, tanh, relu
                                early_stopping=False, solver='adam', max_iter=30)#, random_state= 0)

  # Train and time the fit.
  tinicio = time.perf_counter()
  modelo_MLPClass.fit(X2022, y2022)
  tfinal = time.perf_counter()
  tiempo = tfinal - tinicio

  print('Segundos empleados: ',tiempo)

  # Accuracy on the training data itself.
  scoreMLPClass = modelo_MLPClass.score(X2022, y2022)
  print(scoreMLPClass)

  # Alternative filter used in some runs:
  #   (scoreMLPClass >= scoreInteres) & (scoreMLPClass < 0.91)
  if scoreMLPClass >= scoreInteres:
    Name = f'{letras}_hl_{hl[0]}_{hl[1]}_{c}_scr{round(scoreMLPClass, 3)}.joblib'

    # True positives on 2023.  NOTE(review): the 2023 hold-out decides which
    # model is kept, so the 2023 results of a saved model are not an
    # independent test.
    tp_2023 = confusion_matrix(y2023, modelo_MLPClass.predict(X2023))[1,1]
    print('tp 2023', tp_2023)

    # Keep the first model that hits at least 15 true positives and stop.
    if tp_2023 >=15:
      dump(modelo_MLPClass, pathSave+Name)
      print(Name)
      break

  if c == intentos:
    print('Nada ...')
  c += 1

## Predict

In [None]:
# Load a previously saved MLP classifier; this replaces whatever model the
# training loop left in `modelo_MLPClass`.
pathSave = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/ModelosMLP_Class/'
Name = 'SD30_ses_a_hl_210_5_0_scr1.0.joblib'
modelo_MLPClass = load(pathSave+Name)
modelo_MLPClass

In [None]:
# Training loss of the loaded MLP, one value per iteration.
curva_perdida = modelo_MLPClass.loss_curve_
n_iteraciones = len(curva_perdida)
print(n_iteraciones, '\n')

plt.figure()
plt.plot(np.arange(n_iteraciones), curva_perdida)

plt.xlabel('Iteraciones')
plt.show()

In [None]:
# Class order used by the MLP's predict_proba columns.
modelo_MLPClass.classes_

In [None]:

# MLP predictions on the training period: class labels and probability of
# the class at index 1.
predichosMLPClass = modelo_MLPClass.predict(X2022)
predProb_MLPClass = modelo_MLPClass.predict_proba(X2022)[:,1]

In [None]:
# Confusion matrix of the MLP classifier on the training period (1994-2022).
# The axis labels are taken from the MLP itself: they used to come from
# `modelo_RegLog`, which is a different model and may not even exist when only
# the MLP section of the notebook is run.
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y2022, predichosMLPClass),
                              display_labels=modelo_MLPClass.classes_)


disp.plot()

plt.title('1994-2022 MLP Class')
plt.show()

In [None]:
# MLP on the training period: predicted probability per window (colour scale)
# and, in red at a fixed height of 50, the windows whose label is 1.
fechas_entrenamiento = df_Dias['Fecha'].iloc[nMin:-6][filtro2022]
probabilidad_pct = predProb_MLPClass*100
con_evento = y2022 >= 1

fig, ax1 = plt.subplots(figsize=(15,5))

ax1.scatter(fechas_entrenamiento, probabilidad_pct,
            s=5, c=probabilidad_pct, cmap='YlGnBu', vmin=0, vmax=100)
ax1.scatter(fechas_entrenamiento[con_evento], y2022[con_evento] * 50,
            s=5, c='r')

ax1.set_ylim(-10, 110)
ax1.set_xlabel('Fecha - UTC')
ax1.set_ylabel('Probabilidad [%]')
ax1.grid(ls='--', color='grey')

plt.suptitle('1994-2022 MLP Class')

plt.show()

In [None]:
#predecimos
predichosMLPClass2023 = modelo_MLPClass.predict(X2023)
predProb_MLPClass2023 = modelo_MLPClass.predict_proba(X2023)[:,1]

In [None]:
# Confusion matrix of the MLP classifier on the 2023 hold-out.
# The axis labels are taken from the MLP itself: they used to come from
# `modelo_RegLog`, which is a different model and may not even exist when only
# the MLP section of the notebook is run.
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y2023, modelo_MLPClass.predict(X2023)),
                              display_labels=modelo_MLPClass.classes_)


disp.plot()

plt.title('2023 MLP Class')
plt.show()

In [None]:
# MLP on 2023: predicted probability per window (colour scale) and, in red at
# a fixed height of 50, the windows whose label is 1.
fechas_2023 = df_Dias['Fecha'].iloc[nMin:-6][filtro2023]
probabilidad_pct_2023 = predProb_MLPClass2023*100
con_evento_2023 = y2023 >= 1

fig, ax1 = plt.subplots(figsize=(15,5))

ax1.scatter(fechas_2023, probabilidad_pct_2023,
            s=5, c=probabilidad_pct_2023, cmap='YlGnBu', vmin=0, vmax=100)
ax1.scatter(fechas_2023[con_evento_2023], y2023[con_evento_2023] * 50,
            s=5, c='r')

ax1.set_ylim(-10, 110)
ax1.set_xlabel('Fecha - UTC')
ax1.set_ylabel('Probabilidad [%]')
ax1.grid(ls='--', color='grey')

plt.suptitle('2023 MLP Class')

plt.show()

In [None]:
# 2023 overview for the MLP: catalogue magnitudes (blue), windows predicted as
# class 1 (green, drawn at Ml = 7) and windows whose label is 1 (red, drawn at
# Ml = 4.4).  The two marker heights are only plotting positions.
sismos_2023 = (df['Date-Time'].dt.year == 2023).to_numpy()
fechas_2023 = df_Dias['Fecha'].iloc[nMin:-6][filtro2023]
alerta_2023 = predichosMLPClass2023 >= 1
con_evento_2023 = y2023 >= 1

fig, ax1 = plt.subplots(figsize=(15,5))
ax1.scatter(df['Date-Time'][sismos_2023], df['MAGNITUD Ml'][sismos_2023],
            s=5, label='Sismos', alpha=0.5)

ax1.scatter(fechas_2023[alerta_2023], predichosMLPClass2023[alerta_2023]*7,
            s=10, c='g', label='Predichos MLP Class')

ax1.scatter(fechas_2023[con_evento_2023], y2023[con_evento_2023] * 4.4,
            s=5, c='r', label='Datos Y')

#ax1.set_ylim(-10, 110)
ax1.set_xlabel('Fecha - UTC')
ax1.set_ylabel('Ml')
ax1.grid(ls='--', color='grey')

plt.suptitle('2023')
plt.legend(loc=8)

plt.show()

# Compare Models

## Model List

In [None]:
# Raw listing of the saved-models folder, captured from the shell `ls` command.
listaModelos = !ls /content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/ModelosMLP_Class
listaModelos

In [None]:
import os  # imported here because only this cell needs it

pathModelos = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/ModelosMLP_Class/'

# Read the folder directly instead of parsing the column layout printed by
# `!ls`: splitting that text on tabs kept at most two names per line and
# produced empty names when two tabs were adjacent.  Hidden entries are
# skipped, as `ls` does, and the names are sorted so the order is reproducible.
# The result stays a 1-D NumPy array of file names, as before.
listaModelos = np.array(sorted(nombre for nombre in os.listdir(pathModelos)
                               if not nombre.startswith('.')))
listaModelos

## Load CSVs

In [None]:
pathDF = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/Catalogos/'

# Full earthquake catalogue (one row per event); 'Date-Time' is parsed to
# datetimes because later cells filter it by year.
df = pd.read_csv(pathDF+'df.csv')
df['Date-Time'] = pd.to_datetime(df['Date-Time'], yearfirst=True)
df.info()

In [None]:
# DataFrame by days
df_Dias = pd.read_csv(pathDatos+'df_Dias.csv')
df_Dias['Fecha'] = pd.to_datetime(df_Dias['Dia'], yearfirst=True)
df_Dias

In [None]:
# Window length in days (30 or 60) and the matching 2023 mask over the window
# end dates (same [dias_Considerar-1:-6] range used when X/y were built).
dias_Considerar = 30 # 30 or 60
nMin = dias_Considerar - 1
filtro2023 = (df_Dias['Fecha'].iloc[dias_Considerar-1:-6].dt.year == 2023).to_numpy()

In [None]:
# Quick check: load the saved SD30 features for 2023 and look at their shape.
X2023 = np.loadtxt(pathDatos+'SD30_X2023.txt')
X2023.shape

In [None]:
# For every saved model (*.joblib): load the data set that matches its name,
# then produce and save two figures -
#   1) training period 1994-2022: predicted probability (+ loss curve for MLPs)
#   2) 2023: predicted probability, and catalogue events vs predicted windows
for modelo in listaModelos:
  if 'joblib' not in modelo:
    continue

  # The file name encodes the data set: window length (D30 / D60) and target
  # variant (SD = without the last window day, CD = with it).  When the name
  # matches nothing, the arrays loaded for a previous model are reused.
  for ventana in (30, 60):
    if f'D{ventana}' not in modelo:
      continue
    dias_Considerar = ventana
    for variante in ('SD', 'CD'):
      if variante in modelo:
        prefijo_datos = pathDatos + f'{variante}{ventana}'
        X2023 = np.loadtxt(prefijo_datos + '_X2023.txt')
        y2023 = np.loadtxt(prefijo_datos + '_y2023.txt')
        X2022 = np.loadtxt(prefijo_datos + '_X2022.txt')
        y2022 = np.loadtxt(prefijo_datos + '_y2022.txt')

  modelo_P = load(pathModelos+modelo)
  nombre_modelo = modelo[:-7]  # file name without the '.joblib' suffix

  # 2023 hold-out
  predichosModelo = modelo_P.predict(X2023)
  predProb_Modelo = modelo_P.predict_proba(X2023)[:,1]
  # training period
  #predichosModelo_2022 = modelo_P.predict(X2022)
  predProb_Modelo_2022 = modelo_P.predict_proba(X2022)[:,1]

  # Dates of the last day of every window, split by period.
  nMin = dias_Considerar - 1
  anio_ventana = df_Dias['Fecha'].iloc[dias_Considerar-1:-6].dt.year
  filtro2023 = (anio_ventana == 2023).to_numpy()
  filtro2022 = (anio_ventana < 2023).to_numpy()
  fechas_2022 = df_Dias['Fecha'].iloc[nMin:-6][filtro2022]
  fechas_2023 = df_Dias['Fecha'].iloc[nMin:-6][filtro2023]

  # ---- Figure 1: training period 1994-2022 ----
  fig = plt.figure(figsize=(15,5), constrained_layout=True)
  gs = fig.add_gridspec(1,3)

  # Models with 'RegLog' in their name get the full width; the others share
  # the figure with a loss-curve panel on the right.
  es_RegLog = 'RegLog' in modelo
  if es_RegLog:
    ax1 = fig.add_subplot(gs[0, 0:3])
  else:
    ax1 = fig.add_subplot(gs[0, 0:2])
    ax2 = fig.add_subplot(gs[0, 2])

  prob_2022_pct = predProb_Modelo_2022*100
  con_evento_2022 = y2022 >= 1
  ax1.scatter(fechas_2022, prob_2022_pct,
              s=5, c=prob_2022_pct, cmap='YlGnBu', vmin=0, vmax=100)
  ax1.scatter(fechas_2022[con_evento_2022], y2022[con_evento_2022] * 50,
              s=5, c='r')
  ax1.set_ylim(-10, 110)
  ax1.set_ylabel('Probabilidad [%]')
  ax1.set_xlabel('Fecha - UTC')
  ax1.grid(ls='--', color='grey')

  if not es_RegLog:
    ax2.plot(np.arange(len(modelo_P.loss_curve_)),
             modelo_P.loss_curve_)
    ax2.grid(ls='--', color='grey')
    ax2.set_xlabel('Iteraciones')
    ax2.set_ylabel('Loss')

  plt.suptitle(f'Entrenamiento 1994-2022 Modelo: {nombre_modelo}')
  plt.savefig((pathSaveFiguras + f'1994_2022_{nombre_modelo}.png'),
              format='png', dpi=300, bbox_inches = 'tight',pad_inches=0.25)

  plt.show()
  print('\n')

  # ---- Figure 2: 2023 ----
  fig, ax = plt.subplots(2,1, figsize=(15,10), sharex=True)
  fig.subplots_adjust(hspace=0.1)

  # Top panel: predicted probability, with the windows labelled 1 in red.
  prob_2023_pct = predProb_Modelo*100
  con_evento_2023 = y2023 >= 1
  ax[0].scatter(fechas_2023, prob_2023_pct,
                s=5, c=prob_2023_pct, cmap='YlGnBu', vmin=0, vmax=100)
  ax[0].scatter(fechas_2023[con_evento_2023], y2023[con_evento_2023] * 50,
                s=5, c='r')

  ax[0].set_ylim(-10, 110)
  ax[0].set_ylabel('Probabilidad [%]')
  #ax[0].set_xlabel('Fecha - UTC')
  ax[0].grid(ls='--', color='grey')
  ax[0].set_title(f'Predicción 2023 Modelo: {nombre_modelo}')

  # Bottom panel: catalogue events of 2023 against the windows predicted as
  # class 1 (green, drawn at Ml = 7) and the windows labelled 1 (red, at 4.4).
  sismos_2023 = (df['Date-Time'].dt.year == 2023).to_numpy()
  magnitudes_2023 = df['MAGNITUD Ml'][sismos_2023]
  event4_5 = (magnitudes_2023 >= 4.5).sum()  # number of Ml >= 4.5 events in 2023
  ax[1].scatter(df['Date-Time'][sismos_2023], magnitudes_2023,
                s=5,
                label=f'Sismos, {event4_5} eventos Ml >=4.5 ',
                alpha=0.5)

  alerta_2023 = predichosModelo >= 1
  ax[1].scatter(fechas_2023[alerta_2023], predichosModelo[alerta_2023]*7,
                s=10, c='g',
                label=f'Predichos: {sum(predichosModelo)}')

  ax[1].scatter(fechas_2023[con_evento_2023], y2023[con_evento_2023] * 4.4,
                s=5, c='r', label='Datos Y')

  ax[1].set_ylim(-0.5, 7.5)
  ax[1].set_ylabel('Ml')
  ax[1].set_xlabel('Fecha - UTC')
  ax[1].grid(ls='--', color='grey')

  plt.legend(loc=8)

  plt.savefig((pathSaveFiguras + f'2023_{nombre_modelo}.png'),
              format='png', dpi=300, bbox_inches = 'tight',pad_inches=0.25)

  plt.show()

  print('\n')

### 2022

In [None]:
# Same comparison as the previous cell, restricted to the year 2022 (the last
# year of the training period).  One figure per saved model.
for modelo in listaModelos:
  if 'joblib' not in modelo:
    continue

  # The file name encodes the data set: window length (D30 / D60) and target
  # variant (SD = without the last window day, CD = with it).  When the name
  # matches nothing, the arrays loaded for a previous model are reused.
  for ventana in (30, 60):
    if f'D{ventana}' not in modelo:
      continue
    dias_Considerar = ventana
    for variante in ('SD', 'CD'):
      if variante in modelo:
        prefijo_datos = pathDatos + f'{variante}{ventana}'
        X2023 = np.loadtxt(prefijo_datos + '_X2023.txt')
        y2023 = np.loadtxt(prefijo_datos + '_y2023.txt')
        X2022 = np.loadtxt(prefijo_datos + '_X2022.txt')
        y2022 = np.loadtxt(prefijo_datos + '_y2022.txt')

  modelo_P = load(pathModelos+modelo)
  nombre_modelo = modelo[:-7]  # file name without the '.joblib' suffix

  # Year 2022 = the last 365 rows of the training arrays.  This only lines up
  # with the date mask below if the training set ends on 2022-12-31 and holds
  # one row for every day of 2022.
  predichosModelo = modelo_P.predict(X2022)[-365:]
  predProb_Modelo = modelo_P.predict_proba(X2022)[:,1]
  prob_2022_pct = predProb_Modelo[-365:]*100
  y2022 = y2022[-365:]

  # NOTE: inside this cell `filtro2022` selects the year 2022 only, unlike the
  # earlier cells where it meant "before 2023".
  nMin = dias_Considerar - 1
  fechas_ventana = df_Dias['Fecha'].iloc[nMin:-6]
  filtro2022 = (fechas_ventana.dt.year == 2022).to_numpy()
  fechas_2022 = fechas_ventana[filtro2022]
  con_evento_2022 = y2022 >= 1

  fig, ax = plt.subplots(2,1, figsize=(15,10), sharex=True)
  fig.subplots_adjust(hspace=0.1)

  # Top panel: predicted probability, with the windows labelled 1 in red.
  ax[0].scatter(fechas_2022, prob_2022_pct,
                s=5, c=prob_2022_pct, cmap='YlGnBu', vmin=0, vmax=100)
  ax[0].scatter(fechas_2022[con_evento_2022], y2022[con_evento_2022] * 50,
                s=5, c='r')

  ax[0].set_ylim(-10, 110)
  ax[0].set_ylabel('Probabilidad [%]')
  #ax[0].set_xlabel('Fecha - UTC')
  ax[0].grid(ls='--', color='grey')
  ax[0].set_title(f'Predicción 2022 Modelo: {nombre_modelo}')

  # Bottom panel: catalogue events of 2022 against the windows predicted as
  # class 1 (green, drawn at Ml = 7) and the windows labelled 1 (red, at 4.4).
  sismos_2022 = (df['Date-Time'].dt.year == 2022).to_numpy()
  magnitudes_2022 = df['MAGNITUD Ml'][sismos_2022]
  event4_5 = (magnitudes_2022 >= 4.5).sum()  # number of Ml >= 4.5 events in 2022
  ax[1].scatter(df['Date-Time'][sismos_2022], magnitudes_2022,
                s=5,
                label=f'Sismos, {event4_5} eventos Ml >=4.5 ',
                alpha=0.5)

  alerta_2022 = predichosModelo >= 1
  ax[1].scatter(fechas_2022[alerta_2022], predichosModelo[alerta_2022]*7,
                s=10, c='g',
                label=f'Predichos: {sum(predichosModelo)}')

  ax[1].scatter(fechas_2022[con_evento_2022], y2022[con_evento_2022] * 4.4,
                s=5, c='r', label='Datos Y')

  ax[1].set_ylim(-0.5, 7.5)
  ax[1].set_ylabel('Ml')
  ax[1].set_xlabel('Fecha - UTC')
  ax[1].grid(ls='--', color='grey')

  plt.legend(loc=8)

  plt.savefig((pathSaveFiguras + f'2022_{nombre_modelo}.png'),
              format='png', dpi=300, bbox_inches = 'tight',pad_inches=0.25)

  plt.show()

  print('\n')

# 2024


## Start

In [None]:
# Paths
pathDatos = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/Catalogos/'
pathSaveFiguras = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/Figuras/'
pathDF = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/Catalogos/'

## Load catalog 2024

In [None]:
# January-February 2024 catalogue.  decimal=',' tells pandas that numbers in
# the sheet use a decimal comma.  Note that this reuses the name `df`, which
# earlier cells used for the full catalogue.
df = pd.read_excel(pathDatos+'EneroFebrero_2024.xlsx', decimal=',')
df.info()

In [None]:
# Only those greater than 50 km deep
df_2024 = df[df['PROF. (Km)'] >= 50].reset_index(drop=True)
df_2024.info()

In [None]:
# Only magnitudes greater than 2.5
df_2024 = df_2024[df_2024['MAGNITUD'] >= 2.5].reset_index(drop=True)
df_2024.info()

## df_Dias 2024

In [None]:
# First rows of the filtered 2024 events.
df_2024.head()

In [None]:
# Split the combined 'FECHA - HORA UTC' text at the space into a date part
# ('FECHA') and a time part ('HORA_UTC'), then parse the date into 'Fecha'.
df_2024[['FECHA', 'HORA_UTC']] = df_2024['FECHA - HORA UTC'].str.split(' ', expand=True)
df_2024['Fecha'] = pd.to_datetime(df_2024['FECHA'], yearfirst=True)
df_2024.info()

In [None]:
# Last rows, to check the new date columns.
df_2024.tail()

In [None]:
# Full timestamp (date and time) of every filtered 2024 event.
df_2024['Date-Time'] = pd.to_datetime(df_2024['FECHA - HORA UTC'], yearfirst=True)

In [None]:
# Column names available after the transformations above.
df_2024.columns

### Save dfTotalNuevo

In [None]:
# Same date/time columns, this time on `df`, the 2024 catalogue as loaded
# (without the depth and magnitude filters applied to df_2024).
df[['FECHA', 'HORA_UTC']] = df['FECHA - HORA UTC'].str.split(' ', expand=True)
df['Date-Time'] = pd.to_datetime(df['FECHA - HORA UTC'], yearfirst=True)

In [None]:
# Copy of the 2024 catalogue limited to the columns shared with the historical
# catalogue, renamed to the historical column names so both can be concatenated.
columnas_2024 = ['FECHA', 'HORA_UTC', 'LATITUD (°)', 'LONGITUD (°)',
                 'PROF. (Km)', 'MAGNITUD', 'ERROR LATITUD (Km)',
                 'ERROR LONGITUD (Km)', 'ERROR PROFUNDIDAD (Km)', 'Date-Time']
columnas_historico = ['FECHA', 'HORA_UTC', 'LATITUD (grados)', 'LONGITUD (grados)',
                      'PROFUNDIDAD (Km)', 'MAGNITUD Ml', 'ERROR LATITUD (Km)',
                      'ERROR LONGITUD (Km)', 'ERROR PROFUNDIDAD (Km)', 'Date-Time']

df2_ = df[columnas_2024].copy()
df2_.columns = columnas_historico
df2_

In [None]:
# df antiguo total
pathDF = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/Catalogos/'
df = pd.read_csv(pathDF+'df.csv')
#df['Date-Time'] = pd.to_datetime(df['Date-Time'], yearfirst=True)
df.columns

In [None]:
# Append the 2024 events to the historical catalog. ignore_index=True already
# gives the result a fresh 0..n-1 index, so no separate reset_index call is
# needed (the previous one discarded its return value and had no effect).
df_Total_1994_2024 = pd.concat([df, df2_], ignore_index=True)
df_Total_1994_2024

In [None]:
# Write the merged catalog to the catalog folder, without the index column.
df_Total_1994_2024.to_csv(pathDatos+'df_Total_1994_2024.csv', index=False)

### Create df New Days

In [None]:
# Daily event counts per magnitude bin for January-February 2024.
# One list per output column: the day label plus one count per 0.5-wide
# magnitude bin; the last bin ('5.5-') is open-ended.
dict_Dias = {'Dia':[],
             '2.5-3.0':[],
             '3.0-3.5':[],
             '3.5-4.0':[],
             '4.0-4.5':[],
             '4.5-5.0':[],
             '5.0-5.5':[],
             '5.5-':[],}

# Not every day has events deeper than 50 km with magnitude >= 2.5, so the
# calendar is generated explicitly and days without events get zero counts.
inicio = np.datetime64('2024-01-01')
fin = np.datetime64('2024-03-01')  # exclusive: the last day generated is 2024-02-29
array_fechas = np.arange(inicio, fin, dtype='datetime64[D]')

for fecha in array_fechas:
  dia = str(fecha)  # 'YYYY-MM-DD', compared against the string column 'FECHA'
  magnitudes = df_2024.loc[df_2024['FECHA'] == dia, 'MAGNITUD']
  dict_Dias['Dia'].append(dia)

  # Lower bin edges 2.5, 3.0, ..., 5.5 (all exactly representable as floats).
  for ml in np.arange(2.5, 6.0, 0.5):
    if ml == 5.5:
      # Open-ended top bin.
      dict_Dias['5.5-'].append(int((magnitudes >= 5.5).sum()))
    else:
      # Half-open bin [ml, ml + 0.5); the key matches the labels above.
      enBin = (magnitudes >= ml) & (magnitudes < (ml+0.5))
      strDict = f'{ml}-{ml+0.5}'
      dict_Dias[strDict].append(int(enBin.sum()))

df_Dias2024 = pd.DataFrame.from_dict(dict_Dias)
# numeric_only=True keeps the string column 'Dia' out of the row sum; without
# it, recent pandas versions try to add strings to integers and fail.
df_Dias2024['Total dia'] = df_Dias2024.sum(axis=1, numeric_only=True)
df_Dias2024

## Load old df_Dias

In [None]:
# Load the previously saved daily-count table so the 2024 days can be
# appended to it.
df_Dias = pd.read_csv(pathDatos+'df_Dias.csv')
df_Dias

In [None]:
# Append the 2024 daily counts to the historical ones. ignore_index=True
# already renumbers the rows, so no separate reset_index call is needed (the
# previous one discarded its return value and had no effect).
df_Dias_nuevo = pd.concat([df_Dias, df_Dias2024], ignore_index=True)
# Parse the day label into a datetime column used for date filtering.
df_Dias_nuevo['Fecha'] = pd.to_datetime(df_Dias_nuevo['Dia'], yearfirst=True)
df_Dias_nuevo

In [None]:
# Check column dtypes and non-null counts of the merged daily table.
df_Dias_nuevo.info()

### Save CSV New Days

In [None]:
# Save the merged daily table; the explicit column list fixes which columns
# are written and in what order.
df_Dias_nuevo[['Dia', '2.5-3.0', '3.0-3.5', '3.5-4.0', '4.0-4.5', '4.5-5.0', '5.0-5.5',
               '5.5-', 'Total dia', 'Fecha']].to_csv(pathDatos+'df_Dias_1994_2024.csv', index=False)

In [None]:
# 'Y' is the number of events per day in the three highest bins, i.e. with
# magnitude of 4.5 or more; it is the basis for the classification target.
df_Dias_nuevo['Y'] = df_Dias_nuevo[['4.5-5.0', '5.0-5.5', '5.5-']].sum(axis=1)
# Summary statistics rounded for readability.
np.round(df_Dias_nuevo.describe())

## Create x/y 2023-2024

In [None]:
# Build the feature matrix X and binary target y for 2023-2024.
# Each sample is a window of `dias_Considerar` consecutive days of per-bin
# counts, flattened to one row; its label is 1 when at least one event with
# Ml >= 4.5 (column 'Y') occurs in the 6 days after the window.
dias_Considerar = 60
# Start early enough that the first complete window ends on 2023-01-01.
fechaInicio = datetime(2023,1,1) - timedelta(days = dias_Considerar-1)
df_Dias = df_Dias_nuevo[df_Dias_nuevo['Fecha'] >= fechaInicio].copy()

columnas_bins = ['2.5-3.0', '3.0-3.5', '3.5-4.0', '4.0-4.5', '4.5-5.0', '5.0-5.5', '5.5-']
conteos_bins = df_Dias[columnas_bins]
serie_Y = df_Dias['Y']
largo_ventana = dias_Considerar-1

X = []
y = []
# `ultimo` is the position of the last day inside the input window; the loop
# stops 6 rows before the end so every sample has a full 6-day target window.
for ultimo in range(largo_ventana, len(df_Dias)-6):
  ventana = conteos_bins.iloc[ultimo-largo_ventana:ultimo+1].to_numpy()
  X.append(np.reshape(ventana, (1,-1)).tolist()[0])

  # "SD" variant (SD30 / SD60): the target window excludes the last day of
  # the input window. Original author's note: models trained on targets
  # built this way did not predict, for reasons not yet understood.
  eventos_siguientes = serie_Y.iloc[ultimo+1:ultimo+7].sum()

  # "CD" variant (CD30 / CD60): the target window includes the last day of
  # the input window. Original author's note: the models predict better with
  # it, for reasons not yet understood. To use it, replace the line above by:
  #eventos_siguientes = serie_Y.iloc[ultimo:ultimo+7].sum()

  y.append(1 if eventos_siguientes > 0 else 0)

y = np.reshape(np.array(y), (-1,1))
X = np.array(X)

In [None]:
# Target array shape: (number of samples, 1).
y.shape

In [None]:
# Feature array shape: (number of samples, dias_Considerar * 7 bins).
X.shape

In [None]:
# Put features and label side by side in one table; the label is the last
# column.
df_2023_2024 = pd.DataFrame(np.concatenate((X,y), axis=1))
df_2023_2024

### Save/Load df_2023_2024

In [None]:
# Save the 2023-2024 samples (features + label) without the index column.
df_2023_2024.to_csv(pathDatos+'df_2023_2024.csv', index=False)

In [None]:
# Reload the saved samples so the following cells can start from the file.
df_2023_2024 = pd.read_csv(pathDatos+'df_2023_2024.csv')
df_2023_2024.head()

In [None]:
# The last column is the label; every column before it is a feature.
y2024 = df_2023_2024.iloc[:,-1].to_numpy()
X2024 = df_2023_2024.iloc[:,:-1].to_numpy()

In [None]:
# File prefix = catalog folder + target variant + window length, e.g.
# '.../SD60'. The 'SD' tag is hard-coded here.
# NOTE(review): it must match the target definition used when X/y were built
# (target window excluding the last input day); change it to 'CD' by hand if
# the alternative target is enabled.
prefijo = pathDatos + 'SD' + str(dias_Considerar)
print(prefijo)
# Plain-text copies of the label and feature arrays.
np.savetxt(f'{prefijo}_y2024.txt', y2024)
np.savetxt(f'{prefijo}_X2024.txt', X2024)

## Compare 2024

In [None]:
# IPython shell escape: capture the output of `ls` on the saved-models folder
# and display it. Only works inside IPython/Jupyter/Colab.
listaModelos = !ls /content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/ModelosMLP_Class
listaModelos

In [None]:
import os

pathModelos = '/content/drive/MyDrive/Manuscritos_Investigacion/ML_SismosNidoBucaramanga/ModelosMLP_Class/'
# List the saved-model folder directly instead of parsing the text output of
# `ls`: the previous tab-splitting kept at most two names per output line and
# relied on an IPython-only shell escape. Hidden entries are skipped, as `ls`
# does, and names are sorted so the processing order is reproducible.
listaModelos = np.array(sorted(nombre for nombre in os.listdir(pathModelos)
                               if not nombre.startswith('.')))
listaModelos

### Load df Total

In [None]:
# Reload the merged 1994-2024 event catalog and parse its timestamp column
# so the `.dt` accessors used by the plots work.
df = pd.read_csv(pathDF+'df_Total_1994_2024.csv')
df['Date-Time'] = pd.to_datetime(df['Date-Time'], yearfirst=True)
df.info()

### Load df Days Total

In [None]:
# DataFrame by days: reload the merged 1994-2024 daily counts and rebuild the
# datetime column 'Fecha' from the day label (the CSV stores dates as text).
df_Dias = pd.read_csv(pathDatos+'df_Dias_1994_2024.csv')
df_Dias['Fecha'] = pd.to_datetime(df_Dias['Dia'], yearfirst=True)
df_Dias

## Plots

In [None]:
# Evaluate every saved classifier on the 2023-2024 samples and save one
# two-panel figure per model:
#   top    - predicted probability per day, with the true positive days in red;
#   bottom - catalog magnitudes, predicted positive days and true positive days.

# Catalog events from 2023 onwards; identical for every model, so computed
# once outside the loop.
mascara_eventos = (df['Date-Time'].dt.year >= 2023).to_numpy()
fechas_eventos = df['Date-Time'][mascara_eventos]
magnitudes_eventos = df['MAGNITUD Ml'][mascara_eventos]
event4_5 = sum(magnitudes_eventos >= 4.5)

for modelo in listaModelos:
  # Only saved model files are processed.
  if 'joblib' not in modelo:
    continue

  # Window length encoded in the file name. When both tags are present 'D60'
  # takes precedence.
  if 'D60' in modelo:
    dias_Considerar = 60
  elif 'D30' in modelo:
    dias_Considerar = 30
  else:
    # Without a recognised tag there is no matching input file; skip instead
    # of silently reusing the data loaded for the previous model.
    print(f'Skipping {modelo}: no D30/D60 tag in the file name')
    continue

  # Target variant encoded in the file name. When both tags are present 'CD'
  # takes precedence.
  if 'CD' in modelo:
    variante = 'CD'
  elif 'SD' in modelo:
    variante = 'SD'
  else:
    print(f'Skipping {modelo}: no SD/CD tag in the file name')
    continue

  prefijo = f'{pathDatos}{variante}{dias_Considerar}'
  X2023 = np.loadtxt(f'{prefijo}_X2024.txt')
  y2023 = np.loadtxt(f'{prefijo}_y2024.txt')

  # Dates of the last day of each input window: the first complete window
  # ends at row nMin and the last 6 rows have no full target window. Only
  # windows ending in 2023 or later are kept.
  nMin = dias_Considerar - 1
  fechas_ventanas = df_Dias['Fecha'].iloc[nMin:-6]
  filtro2023 = (fechas_ventanas.dt.year >= 2023).to_numpy()
  fechas_2023 = fechas_ventanas[filtro2023]

  modelo_P = load(pathModelos+modelo)
  predichosModelo = modelo_P.predict(X2023)
  predProb_Modelo = modelo_P.predict_proba(X2023)[:,1]

  # Figures 2023-2024
  fig, ax = plt.subplots(2,1, figsize=(15,10), sharex=True)
  fig.subplots_adjust(hspace=0.1)

  # Diagnostic output: the number of dates and of predictions must agree,
  # otherwise the scatter calls below fail.
  print(fechas_2023.shape)
  print(len(predProb_Modelo))

  # Top panel: probability of the positive class, in percent.
  ax[0].scatter(fechas_2023,
              predProb_Modelo*100,
              s= 5,
              c=predProb_Modelo*100,
              cmap='YlGnBu',
              vmin=0,
              vmax=100)

  # True positive days drawn at a fixed height of 50 %.
  positivos = y2023 >= 1
  ax[0].scatter(fechas_2023[positivos],
              y2023[positivos] * 50,
              s= 5,
              c='r')

  ax[0].set_ylim(-10, 110)
  ax[0].set_ylabel('Probabilidad [%]')
  ax[0].grid(ls='--', color='grey')
  ax[0].set_title(f'Predicción 2023 Modelo: {modelo[:-7]}')

  # Bottom panel: catalog events.
  ax[1].scatter(fechas_eventos,
              magnitudes_eventos,
              s=5,
              label=f'Sismos, {event4_5} eventos Ml >=4.5 ',
              alpha=0.5)

  # Days predicted as positive, drawn at a fixed height of 7.
  predichos = predichosModelo >= 1
  ax[1].scatter(fechas_2023[predichos],
              predichosModelo[predichos]*7,
              s= 10,
              c='g',
              label=f'Predichos: {sum(predichosModelo)}')

  # True positive days, drawn at a fixed height of 4.4.
  ax[1].scatter(fechas_2023[positivos],
              y2023[positivos] * 4.4,
              s= 5,
              c='r',
              label='Datos Y')

  ax[1].set_ylim(-0.5, 7.5)
  ax[1].set_ylabel('Ml')
  ax[1].set_xlabel('Fecha - UTC')
  ax[1].grid(ls='--', color='grey')

  # Legend on the bottom panel (loc=8 is 'lower center').
  ax[1].legend(loc=8)

  # modelo[:-7] drops the 7-character '.joblib' suffix.
  plt.savefig((pathSaveFiguras + f'2024_{modelo[:-7]}.png'),
              format='png', dpi=300, bbox_inches = 'tight',pad_inches=0.25)

  plt.show()

  print('\n')

# End