# 1. Clusterización K-Means

**Objetivo:** hacer una clusterización de sismos históricos globales.

El Centro Nacional de Información de Sismos (NEIC) determina la ubicación y magnitud de los sismos más importantes que han ocurrido en el mundo y disemina esta información inmediatamente a agencias nacionales e internacionales.

https://www.kaggle.com/usgs/earthquake-database

**Información de las características**
This dataset includes a record of the date, time, location, depth, magnitude, and source of every earthquake with a reported magnitude 5.5 or higher since 1965.

* 0 Date
* 1 Time
* 2 Location
* 3 Depth
* 4 Magnitude
* 5 Source of the earthquake


**Número de instancias:** 23412

# 2. Autenticación de drive

In [None]:
# Mount Google Drive in the Colab runtime under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 3. Importando librerías

In [None]:
import pandas as ___
import os
import matplotlib.pyplot as ___
import seaborn as ___
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from sklearn.cluster import KMeans

In [None]:
!pip install -U kaleido
!pip install plotly>=4.0.0
!wget https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage -O /usr/local/bin/orca
!chmod +x /usr/local/bin/orca
!apt-get install xvfb libgtk2.0-0 libgconf-2-4

# 4. Lectura del archivo y visualización de los datos

In [None]:
# Folder on the mounted Drive and the CSV file name
# (presumably joined into the full file path in the next cell).
path = r'/content/drive/Shareddrives/Data Science para Geociencias/6. Métodos de ML/6.5 Clusterización'
name = 'NEIC_Earthquakes.csv'

In [None]:
# Build the CSV path with os.path.join, load it into a DataFrame and preview
# the first rows.
# NOTE(review): the ____ blanks look like exercise placeholders (presumably
# `path`, `name` and `sis_path`) — fill them in before running.
sis_path = os.path.join(____, ____)
sismosdf = pd.read_csv(____)
sismosdf.head()

Observemos que tenemos muchos campos con NaN. ¿Eliminamos las filas con NaN o las columnas?
Visualizaremos qué tan vacío está el dataframe con un mapa de calor.

In [None]:
# Heatmap of the boolean isnull() mask, drawn on a wide 25x8 figure.
# NOTE(review): the ____ blank looks like an exercise placeholder
# (presumably sismosdf).
fig, ax = plt.subplots(figsize=(25,8))
sns.heatmap(____.isnull(), ax=ax, cmap="magma")
plt.grid()
plt.show()

In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
sismosdf.shape

# 5. Limpieza de los datos

Eliminar columnas con mayoría de campos indefinidos

In [None]:
# Drop the listed columns in place (axis=1 selects columns).
# NOTE(review): the _____ blank looks like an exercise placeholder
# (presumably sismosdf).
_____.drop(['Depth Error', 'Depth Seismic Stations', 'Magnitude Error',
               'Magnitude Seismic Stations', 'Azimuthal Gap', 'Horizontal Distance',
               'Horizontal Error', 'Root Mean Square', 'ID', 'Source',
               'Location Source', 'Magnitude Source', 'Status'], axis=1, inplace=True)

In [None]:
# Print the shape after dropping columns and preview the first rows.
# NOTE(review): the ____ blank looks like an exercise placeholder
# (presumably sismosdf).
print(sismosdf.shape)
____.head()

Modificación de columna de Fecha

In [None]:
# Rewrite the first column (position 0) of every row as 'YYYY-MM-DD'.
# - try branch: values written as 'MM/DD/YYYY'.
# - except branch: values with no '/', for which or_time[1] raises IndexError.
#   They are parsed as '<YYYY>-<MM>-<DD>T<time>.<fraction>', and the time part
#   also overwrites the second column (position 1).
# Only IndexError is caught (instead of a bare except) so unrelated errors,
# e.g. a NameError from an unfilled ____ blank, surface directly instead of
# being misrouted into the second parser.
# NOTE(review): the ____ blanks look like exercise placeholders (presumably
# sismosdf).
for i in range(sismosdf.shape[0]):
  try:
    or_time = ____.iloc[i,0].split('/')
    year = or_time[-1]
    month = or_time[0]
    day = or_time[1]
    ____.iloc[i,0] = year+'-'+month+'-'+day
  except IndexError:
    or_time = sismosdf.iloc[i,0].split('-')
    year = or_time[0]
    month = or_time[1]
    day = or_time[-1].split('T')
    sismosdf.iloc[i,0] = year+'-'+month+'-'+day[0]
    time = day[1].split('.')
    sismosdf.iloc[i,1] = time[0]

In [None]:
# Preview the first two rows after the date rewrite.
sismosdf.head(2)

In [None]:
# Merge the Date and Time text columns into one datetime column, 'Fecha',
# then drop the two source columns.
sismosdf.loc[:,'Fecha'] = pd.to_datetime(sismosdf.Date.astype(str)+' '+sismosdf.Time.astype(str))
sismosdf.drop(['Date','Time'], axis=1, inplace=True)
# Reorder the columns with 'Fecha' first. .copy() makes the result an
# independent frame, so later in-place operations on it (e.g. dropna) do not
# raise SettingWithCopyWarning.
sismosdf = sismosdf[['Fecha', 'Latitude', 'Longitude', 'Type', 'Depth', 'Magnitude',
                     'Magnitude Type']].copy()
sismosdf.head(2)

Eliminación de instancias (filas) con valores indefinidos

In [None]:
# Drop, in place, every row (axis=0) that contains a missing value.
# NOTE(review): the ____ blank looks like an exercise placeholder
# (presumably sismosdf).
____.dropna(axis=0, inplace=True)

In [None]:
# (rows, columns) at this point of the cleaning.
sismosdf.shape

# 6. Exploración de los datos de sismos históricos

#### Revisando las columnas: Type, Magnitude Type

##### Columna Magnitude Type

In [None]:
# Distinct values present in the 'Magnitude Type' column.
sismosdf['Magnitude Type'].unique()

In [None]:
# Bar chart of how many rows carry each 'Magnitude Type' value.
# NOTE(review): the ____ blank looks like an exercise placeholder
# (presumably sismosdf).
____['Magnitude Type'].value_counts().plot(kind='bar', figsize=(15,8), grid=True, color='darkorange')

##### Columna de magnitud

In [None]:
# Count plot with one bar per distinct 'Magnitude' value, on a wide figure.
# NOTE(review): the ____ blank looks like an exercise placeholder
# (presumably sismosdf).
fig, ax = plt.subplots(figsize=(25,5))
sns.countplot(ax=ax, x=____['Magnitude'], color='teal')
plt.show()

##### Columna Type

In [None]:
# Bar chart of how many rows carry each 'Type' value.
# NOTE(review): the _____ blank looks like an exercise placeholder
# (presumably sismosdf).
_____['Type'].value_counts().plot(kind='bar', figsize=(10,5), grid=True, color='turquoise')

In [None]:
# Remove the rows whose Type is 'Nuclear Explosion', 'Explosion' or
# 'Rock Burst', one filter at a time.
# NOTE(review): the blanks look like exercise placeholders (presumably
# sismosdf).
sismosdf = sismosdf[sismosdf.Type != 'Nuclear Explosion']
sismosdf = _____[sismosdf.Type != 'Explosion']
# .copy() makes the filtered result an independent frame, so adding columns to
# it later does not raise SettingWithCopyWarning.
______ = sismosdf[sismosdf.Type != 'Rock Burst'].copy()

In [None]:
# Summary statistics of the frame.
# NOTE(review): the ____ blank looks like an exercise placeholder
# (presumably sismosdf).
____.describe()

In [None]:
# World map of the events: position from Longitude/Latitude, hover text from
# Depth, marker colour from Magnitude. The figure is exported to a PNG and then
# displayed.
# NOTE(review): the _____ blanks look like exercise placeholders (presumably
# sismosdf).
fig = go.Figure(data=go.Scattergeo(
        lon = _____['Longitude'],
        lat = _____['Latitude'],
        text = sismosdf['Depth'],
        marker = dict(
        color = sismosdf['Magnitude'],
        colorscale = 'Rainbow',
        reversescale = True,
        opacity = 0.7,
        size = 2,
        colorbar = dict(
            # Nested title.side replaces the legacy flat `titleside`
            # attribute, which newer Plotly releases no longer accept.
            title = dict(side = "right"),
            outlinecolor = "rgba(68, 68, 68, 0)",
            ticks = "outside",
            showticksuffix = "last",
            dtick = 0.1
        )
    )
        ))
# NOTE(review): the trace above never sets `mode` explicitly, so this selector
# may not match it and the size=5 update may be a no-op — confirm.
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
fig.update_layout(
    margin=dict(l=0, r=0, b=0, t=25),
    title = 'Sismos históricos',
    # Optional, disabled: geo_scope='south america',
    )
fig.write_image("earthquake_magnitude.png")
fig.show()

In [None]:
# Add year ('Y') and month ('M') columns taken from 'Fecha', index the frame by
# (Y, M), and print the distinct values of the first index level.
# NOTE(review): the blanks look like exercise placeholders (presumably
# sismosdf).
____['Y'] = sismosdf['Fecha'].dt.year
sismosdf['M'] = _____['Fecha'].dt.month
______ = sismosdf.set_index(['Y', 'M'])
index_list = sismosdf.index.levels[0].tolist()
print(index_list)

In [None]:
def _year_frame(year):
    """Build the animation frame that holds every event of one index value.

    Args:
        year: a value from the first level of the sismosdf index.

    Returns:
        A dict with the frame 'name' and a one-trace 'data' list
        (a scattermapbox trace).
    """
    # Select the rows once instead of repeating .xs(year) for every attribute.
    events = sismosdf.xs(year)
    magnitude = events['Magnitude']
    return {
        'name': 'frame_{}'.format(year),
        'data': [{
            'type': 'scattermapbox',
            'lat': events['Latitude'],
            'lon': events['Longitude'],
            'marker': go.scattermapbox.Marker(
                size=(magnitude-3.7)**3,
                color=magnitude,
                showscale=True,
                colorscale='Rainbow',
                # Nested title.text / title.side replaces the legacy flat
                # `titleside` attribute, which newer Plotly releases reject.
                colorbar={'title': {'text': 'Magnitude', 'side': 'top'}, 'thickness': 4, 'ticksuffix': ' Mgn'}
                ),
            # Columns: magnitude, depth, remaining index level of the selection.
            'customdata': np.stack((
                magnitude,
                events['Depth'],
                pd.Series(events.index)),
                axis=-1
                ),
            'hovertemplate': "<extra></extra><em> Magnitude  %{customdata[0]}<br>Depth  %{customdata[1]}",
        }],
    }


# One frame per value in index_list.
frames = [_year_frame(year) for year in index_list]

# One slider step per frame; each step animates to the frame with the same name.
sliders = [{
            'transition':{'duration': 0},
            'x':0.08,
            'len':0.88,
            'currentvalue':{'font':{'size':15}, 'prefix':'Año ', 'visible':True, 'xanchor':'center'},
            'steps':[
                {
                    'label':i,
                    'method':'animate',
                    'args':[
                        ['frame_{}'.format(i)],
                        {'mode':'immediate', 'frame':{'duration':1000, 'redraw': True}, 'transition':{'duration':500, }}
                      ],
                } for i in index_list]
        }]


# First frame, used as the initial data of the figure
data = frames[0]['data']

# Attach the sliders to the layout
# NOTE(review): the Mapbox access token is hard-coded below; consider loading
# it from an environment variable instead of committing it.
layout = go.Layout(
    sliders=sliders,
    margin=dict(l=0, r=0, b=0, t=25),
    title = 'Sismos históricos',
    mapbox={
        'accesstoken':'pk.eyJ1IjoiY2xhdWNvdCIsImEiOiJja2h4MTIxd2UwMzNzMnlvNzVycXN2dW14In0.J_KSzOY3YGDzozRdHvMJgQ',
        'center':{"lat": 0, "lon": 0},
        'zoom':1,
        'style':'light',
    }
)
fig = go.Figure(data=data, layout=layout, frames=frames)
# Distinct file name so this export does not overwrite the
# "earthquake_magnitude.png" written for the earlier Scattergeo figure.
fig.write_image("earthquake_magnitude_by_year.png")
fig.show()

# 7. Creación del modelo de clusterización

In [None]:
# Scatter plot of Magnitude (x) against Depth (y) before clustering.
# NOTE(review): the ____ blank looks like an exercise placeholder
# (presumably sismosdf).
fig = px.scatter(data_frame=____,
                    x='Magnitude',
                    y='Depth',
                    template='seaborn',
                    title='Magnitud vs Profundidad',
                    color_discrete_sequence=["darksalmon"],
                    width=1000,
                    height=500
)
# tight layout
fig.update_layout(margin=dict(l=0, r=0, b=0, t=25))
# Enlarge the markers of traces whose mode is 'markers'.
fig.update_traces(marker=dict(size=8),
                  selector=dict(mode='markers'))
pio.show(fig)

#### Determinar el número de clusters adecuados

In [None]:
# Feature matrix with two columns per event: Magnitude and Depth.
X = np.array(sismosdf[['Magnitude', 'Depth']])

# Fit one KMeans model for each candidate cluster count (1 to 9) and collect
# the value returned by its score(X) method.
Nc = range(1, 10)
kmeans = [KMeans(n_clusters=i) for i in Nc]
score = [model.fit(X).score(X) for model in kmeans]

# Elbow chart: score against number of clusters.
fig=go.Figure(layout=go.Layout(
        title=go.layout.Title(text="Gráfica del codo")
    ))
fig.update_xaxes(title_text='Número de clusters')
fig.update_yaxes(title_text='Score')
fig.add_scatter(x=pd.Series(Nc), y=pd.Series(score), mode='lines')
fig.update_layout(width=500, height=400, margin=dict(l=0, r=0, b=0, t=25))
pio.show(fig)

#### Ajuste de clusterización y cálculo de centroides

In [None]:
# Fit the final KMeans model on X and print its cluster centres
# (one row per cluster; columns follow X: Magnitude, Depth).
# NOTE(review): the ____ blank looks like an exercise placeholder, presumably
# the cluster count read off the elbow chart.
kmeans = KMeans(n_clusters=____ ).fit(X)
centroids = kmeans.cluster_centers_
print(centroids)

#### Cálculo de clusters

In [None]:
# Cluster id predicted by the fitted model for every row of X, followed by the
# distinct ids that actually occur.
labels = kmeans.predict(X)
distinct_labels = pd.Series(labels).unique()
print(distinct_labels)

#### Creación de nuevo dataframe con etiquetas de clusters

In [None]:
# Store the predicted cluster id of each row in a new 'Tipo' column.
# NOTE(review): the ____ blank looks like an exercise placeholder
# (presumably sismosdf).
sismosdf['Tipo'] = labels
____.head()

In [None]:
# Relabel the numeric cluster ids in 'Tipo' as letters: 0 -> 'A', 1 -> 'B', ...
# A single nested-dict replace covers ids 0-25, so the relabelling still works
# when more than three clusters are chosen; ids 0-2 map exactly as before.
# NOTE(review): the _____ blank looks like an exercise placeholder
# (presumably sismosdf).
cluster_letters = {cluster_id: chr(ord('A') + cluster_id) for cluster_id in range(26)}
sismosdf.replace({'Tipo': cluster_letters}, inplace=True)
_____.head()

#### Visualización de los clusters

In [None]:
# Scatter plot of Magnitude against Depth coloured by cluster ('Tipo'), with
# the cluster centres overlaid as large magenta stars.
# NOTE(review): the ____ blank looks like an exercise placeholder
# (presumably sismosdf).
fig = px.scatter(data_frame=____,
                    x='Magnitude',
                    y='Depth',
                    color='Tipo',
                    # Fixed typo "turquosie": it is not a valid colour name.
                    color_discrete_sequence=["red", "blue", "green","darksalmon","turquoise"],
                    template='seaborn',
                    title='Clusterización de sismos',
                    width=1000,
                    height=500)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=25))
fig.update_traces(marker=dict(size=6),
                  selector=dict(mode='markers'))
# Centroid columns follow X: column 0 is Magnitude, column 1 is Depth.
fig.add_scatter(x=pd.Series(centroids[:,0]), y=pd.Series(centroids[:,1]), mode='markers', marker = dict(size = 20, color = 'magenta', symbol = 'star'))
pio.show(fig)