In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import plot_tree

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.ensemble import ExtraTreesClassifier

from imblearn.over_sampling import RandomOverSampler

In [2]:
# Consulta general del dataframe
def display_all(df):
    with pd.option_context("display.max_rows",1000 ,  "display.max_columns", 1000): 
        display(df)

In [3]:
# Función para identificar el porcentaje de valores nulos por columna
def percentage_nulls(df):
    number_nulls = pd.DataFrame(df.isnull().sum(),columns=['Total'])
    number_nulls['% nulls'] = round((number_nulls['Total'] / df.shape[0])*100,1)
    
    return number_nulls

In [4]:
#Etiquetas para las categorías de decesos, lesionados y daños
categorias = [0,1,2,3,4]
etiquetasM = ['Ninguno','entre 1 y 50 decesos','de 51 a 100 decesos','de 101 a 1000 decesos','mas de 1000 decesos']
etiquetasL = ['Ningun lesionado','entre 1 y 50 lesionados','de 51 a 100 lesionados','de 101 a 1000 lesionados','mas de 1000 lesionados']
etiquetasD = ['Nada','menos de $1 millón','de $1 a $5 millones','de $5 a $25 millones','mas de $25 millones']
df_etiquetas_cat = pd.DataFrame()
df_etiquetas_cat['Categoria'] = categorias
df_etiquetas_cat['Descesos'] = etiquetasM
df_etiquetas_cat['Lesionados'] = etiquetasL
df_etiquetas_cat['Danios'] = etiquetasD

In [5]:
# Función para generar contador de ciudades afectadas por el temblor
def contar_ciudades(location):
    paises = location.split(';')
    ciudades = [c.split(':') for c in paises]
    contador = 0 
    if len(ciudades) == 1:
        contador = len(ciudades[0][-1].split(','))
    else:
        for c in ciudades:
            contador += len(c[-1].split(','))
    
    return contador

In [6]:
# df_earthquakes_work.rename(columns = {'Focal Depth (km)':'Depth','Mag':'Magnitud','MMI Int':'MMI','Death Description':'Death_Cat','Injuries Description':'Injuries_Cat','Damage Description':'Damage_Cat'}, inplace=True)
# Funcion que renombra columnas
def rename_columns(df):
    columns_array = {'Mo':'Month','Dy':'Day','Focal Depth (km)':'Depth','Mag':'Magnitud','MMI Int':'MMI','Death Description':'Death_Cat','Injuries Description':'Injuries_Cat','Damage Description':'Damage_Cat','Houses Destroyed Description':'Houses_Cat'}
    df.rename(columns=columns_array, inplace = True)
    return df


In [7]:
# Función para rellenar NULOS de las columnas categoricas y la profundidad con la moda
def fill_nulls(df):
    # Se imputan valores con el promedio por Region 
    columnasNull = ['Depth','Magnitud','Density','IDH']
    for col in columnasNull:
        region = df[df[col].isnull()]['Region'].array
        for reg in region:
            val = df[df['Region'] == reg][col].mean()
            df.loc[(df['Region'] == reg) & (df[col].isnull()), col] = val

    # data[data['Income($)'].notna()]
    df = df[(df['Death_Cat'].notna()) & (df['Injuries_Cat'].notna()) & (df['Damage_Cat'])]
    # df['Death_Cat'].fillna(0, inplace=True)
    # df['Injuries_Cat'].fillna(0, inplace=True)
    # df['Damage_Cat'].fillna(0, inplace=True)

    if df['Region'].isnull().sum() > 0:
        country = df[df['Region'].isnull()]['Country'].unique()
        for cnt in country:
            df['Region'].fillna(df[df['Country'] == cnt]['Region'].max(), inplace = True)
    
    df[['Month','Day','Region','Death_Cat', 'Injuries_Cat', 'Damage_Cat']] = df[['Month','Day','Region','Death_Cat', 'Injuries_Cat', 'Damage_Cat']].astype('int64')
    return df



In [8]:
# Seleccion de columnas y registros
def filtra_df(df, axo=1930):
    df = df[(df['Year'] >= axo) & (df['Magnitud'] > 0)][['Year', 'Month', 'Day', 'Country', 'Region', 'Location Name', 'Latitude', 'Longitude', 'Depth', 'Magnitud', 'Death_Cat', 'Injuries_Cat', 'Damage_Cat','Density','IDH']]
    return df

In [9]:
# Generar columna con contador de ciudades afectadas por el sismo
def categoriza_ciudad(df):
    df['Ciudades'] = df['Location Name'].apply(contar_ciudades)
    df.drop(columns='Location Name', inplace=True)
    return df

In [10]:
# Preparar los conjuntos para entrenar los modelos
#'Latitude','Longitude', 'Ciudades'
def dataset_train(df):
    df = df[df['Death_Cat']>0][['Year','Country','Latitude','Longitude','Region','Depth','Magnitud','Death_Cat','Injuries_Cat','Damage_Cat','Density','IDH']]
    return df

In [11]:
def dataframe_transfor(df):
    #Renombrar columnas
    df = rename_columns(df)
    # Filtra y selecciona columnas
    df = filtra_df(df, 1960)
    # Crear columna con numero de Ciudades Afectadas
    df = categoriza_ciudad(df)
    #Tratamiento de NULOS;
    df = fill_nulls(df)
    # df.to_csv('..\Output\earthquaqkes_clean.csv')
    return df


In [12]:
# Asigna los valores de densidad de poblacion e indice de desarrollo humano por pais y año
def asigna_sociodemo(df, df_sd):
    df['Density'] = 0
    df['IDH'] = 0

    for i in df_sd.index:
        axo = df_sd['Year'][i]
        country = df_sd['Country'][i]
        density = df_sd['Density'][i]
        idh = df_sd['IDH'][i]
        
        df.loc[(df['Year'] == axo) & (df['Country'] == country), 'Density'] = density
        df.loc[(df['Year'] == axo) & (df['Country'] == country), 'IDH'] = idh
    # df = df[df['Density'] > 0]
    return df

In [None]:
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(8, 10))

sns.distplot(
    df_earthquakes.Deaths,
    hist    = False,
    rug     = True,
    color   = "blue",
    kde_kws = {'shade': True, 'linewidth': 1},
    ax      = axes[0]
)
axes[0].set_title("Distribución original", fontsize = 'medium')
axes[0].set_xlabel('Decesos', fontsize='small') 
axes[0].tick_params(labelsize = 6)

sns.distplot(
    np.sqrt(df_earthquakes.Deaths),
    hist    = False,
    rug     = True,
    color   = "blue",
    kde_kws = {'shade': True, 'linewidth': 1},
    ax      = axes[1]
)
axes[1].set_title("Transformación raíz cuadrada", fontsize = 'medium')
axes[1].set_xlabel('sqrt(Decesos)', fontsize='small') 
axes[1].tick_params(labelsize = 6)

sns.distplot(
    np.log(df_earthquakes.Deaths),
    hist    = False,
    rug     = True,
    color   = "blue",
    kde_kws = {'shade': True, 'linewidth': 1},
    ax      = axes[2]
)
axes[2].set_title("Transformación logarítmica", fontsize = 'medium')
axes[2].set_xlabel('log(Decesos)', fontsize='small') 
axes[2].tick_params(labelsize = 6)

fig.tight_layout()

In [None]:
# Gráfico de distribución para cada variable numérica
# ==============================================================================
# Ajustar número de subplots en función del número de columnas
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(9, 5))
axes = axes.flat
columnas_numeric = df_earthquakes.select_dtypes(include=['float64', 'int']).columns
columnas_numeric = columnas_numeric.drop(['Mo', 'Dy', 'MMI Int', 'Deaths', 'Death Description','Injuries Description', 'Damage Description','Houses Destroyed', 'Houses Destroyed Description', 'Houses Damaged','Houses Damaged Description'])
columnas_numeric

for i, colum in enumerate(columnas_numeric):
    sns.histplot(
        data    = df_earthquakes,
        x       = colum,
        stat    = "count",
        kde     = True,
        color   = (list(plt.rcParams['axes.prop_cycle'])*2)[i]["color"],
        line_kws= {'linewidth': 2},
        alpha   = 0.3,
        ax      = axes[i]
    )
    axes[i].set_title(colum, fontsize = 7, fontweight = "bold")
    axes[i].tick_params(labelsize = 6)
    axes[i].set_xlabel("")
    
    
fig.tight_layout()
plt.subplots_adjust(top = 0.9)
fig.suptitle('Distribución variables numéricas', fontsize = 10, fontweight = "bold");

In [13]:
# Extraccion datos fuente
df_earthquakes = pd.read_csv('..\Dataset\earthquakes-1500-2021.csv')
df_sociodemo = pd.read_csv('..\Dataset\socio_demografico.csv')
# Asocia valores de densidad de poblacion e idh
df_earthquakes = asigna_sociodemo(df_earthquakes, df_sociodemo)
# Transformación 
df_earthquakes = dataframe_transfor(df_earthquakes)
# Se generan datos para cada target
df_earthquakes_train = dataset_train(df_earthquakes)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_earthquakes_train.drop(columns=['Death_Cat']).corr(), annot=True)

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_earthquakes_train.drop(columns=['Injuries_Cat']).corr(), annot=True)

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_earthquakes_train.drop(columns=['Damage_Cat']).corr(), annot=True)

# Proceso de modelado Decesos

In [18]:
# Utilizando Arboles de decisión para validar si da mejores resultados que RegressionLogistics
def modelo_dt_deahts(df):
    # Separar los conjuntos (Target y variables predictoras)
    X = df.drop(columns = ['Death_Cat','Injuries_Cat','Damage_Cat','Country','Latitude','Longitude'])
    y = df['Death_Cat']

    #Balanceando los conjuntos
    ros = RandomOverSampler()
    X_ros, y = ros.fit_resample(X, y)

    #Escalando datos
    scaler = StandardScaler()
    X_t = scaler.fit_transform(X_ros)

    #Creando set de entrenamiento y prueba
    X_train, X_test, y_train, y_test = train_test_split(X_t, y, test_size = 0.2, shuffle=True, random_state=50) 
    # asdf
    modelo_DT = DecisionTreeClassifier(max_depth = 8, criterion = 'gini', random_state = 123)
    modelo_DT.fit(X_train, y_train)
    return modelo_DT, X_test, y_test

In [20]:
modelo_deaths, X_test, y_test = modelo_dt_deahts(df_earthquakes_train)
y_predict_deaths = modelo_deaths.predict(X_test)
df_pred_Deaths_DT = pd.DataFrame({'Actual':y_test,'Predicted':y_predict_deaths})
# df_pred_Deaths_DT.head()
precision = round(modelo_deaths.score(X_test, y_test) *100,1)
precision

75.8

In [21]:
print(classification_report(y_test, y_predict_deaths))

              precision    recall  f1-score   support

           1       1.00      0.61      0.76       136
           2       0.75      0.83      0.79       104
           3       0.70      0.74      0.72       127
           4       0.69      0.88      0.77       121

    accuracy                           0.76       488
   macro avg       0.78      0.77      0.76       488
weighted avg       0.79      0.76      0.76       488



In [27]:
df_deaths_grf = pd.DataFrame()
df_deaths_grf['Test'] = y_test
df_deaths_grf['Pred'] = y_predict_deaths
df_deaths_grf.reset_index(drop=True)
df_deaths_grf.head()
df_deaths_grf.to_csv('../Output/earthquaks_death_predicted.csv')

# Modelado para Lesionados

In [28]:
def modelo_dt_injuries(df):
    # Separar los conjuntos (Target y variables predictoras)
    X = df.drop(columns = ['Injuries_Cat','Damage_Cat','Country','Latitude','Longitude'])
    y = df['Injuries_Cat']
    
    #Balanceo de datos
    ros = RandomOverSampler()
    X_ros, y = ros.fit_resample(X, y)

    #Escalando datos
    scaler = StandardScaler()
    X_t = scaler.fit_transform(X_ros)

    #Creando set de entrenamiento y prueba
    X_train, X_test, y_train, y_test = train_test_split(X_t, y, test_size = 0.2, shuffle=True, random_state=50) 
    # asdf
    modelo = DecisionTreeClassifier(criterion = 'gini', random_state = 123)
    modelo.fit(X_train, y_train)

    return modelo, X_test, y_test 



In [29]:
modelo_injuries, X_test, y_test = modelo_dt_injuries(df_earthquakes_train) #modelo_dt_injuries(df_earthquakes_injuries)
y_predict_injuries = modelo_injuries.predict(X_test)
df_pred_injuries_DT = pd.DataFrame({'Actual':y_test,'Predicted':y_predict_injuries})
precision = round(modelo_injuries.score(X_test, y_test) *100,1)
precision

75.9

In [30]:
print(classification_report(y_test, y_predict_injuries))

              precision    recall  f1-score   support

           1       0.68      0.61      0.64        75
           2       0.78      0.91      0.84        56
           3       0.61      0.56      0.58        61
           4       0.94      0.98      0.96        65

    accuracy                           0.76       257
   macro avg       0.75      0.77      0.76       257
weighted avg       0.75      0.76      0.75       257



In [31]:
df_injuries_grf = pd.DataFrame()
df_injuries_grf['Test'] = y_test
df_injuries_grf['Pred'] = y_predict_injuries
df_injuries_grf.reset_index(drop=True)
df_injuries_grf.head()
df_injuries_grf.to_csv('../Output/earthquaks_injuries_predicted.csv')

# Proceso de Modelado Daños

In [32]:
df = df_earthquakes_train.copy()

# Separar los conjuntos (Target y variables predictoras)
X = df.drop(columns = ['Damage_Cat','Country','Latitude','Longitude'])
y = df['Damage_Cat']

# #Balanceo de datos
ros = RandomOverSampler()
X_ros, y = ros.fit_resample(X, y)

# #Escalando datos
scaler = StandardScaler()
X_t = scaler.fit_transform(X_ros)

# #Creando set de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_t, y, test_size = 0.2, shuffle=True, random_state=50) 

# #Entrenamiento
# modelo = DecisionTreeClassifier(criterion = 'gini', min_samples_split=20, min_samples_leaf=5, random_state = 30)
#n_estimators=100, criterion='gini', max_depth=3, min_samples_split=4, min_samples_leaf=2,bootstrap=True
modelo_damage = RandomForestClassifier(n_estimators=600, criterion='gini')
# modelo = LogisticRegression(solver='newton-cg',penalty='l2', multi_class='ovr')
# modelo = SVC(decision_function_shape='ovo')
# modelo = LinearSVC()

modelo_damage.fit(X_train, y_train)

RandomForestClassifier(n_estimators=600)

In [33]:
y_pred = modelo_damage.predict(X_test)
# df_earthquakes_train.head()

matriz = confusion_matrix(y_test, y_pred)
print(matriz)
print('\n')
exactitud = accuracy_score(y_test, y_pred, normalize=True)
print(round(exactitud*100,1), '%')


[[62  6  4  0]
 [ 9 29 12  3]
 [ 2  5 34  7]
 [ 0  4  7 45]]


74.2 %


In [34]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.85      0.86      0.86        72
           2       0.66      0.55      0.60        53
           3       0.60      0.71      0.65        48
           4       0.82      0.80      0.81        56

    accuracy                           0.74       229
   macro avg       0.73      0.73      0.73       229
weighted avg       0.74      0.74      0.74       229



In [35]:
df_damage_grf = pd.DataFrame()
df_damage_grf['Test'] = y_test
df_damage_grf['Pred'] = y_pred
df_damage_grf.reset_index(drop=True)
df_damage_grf.head()
df_damage_grf.to_csv('../Output/earthquaks_damage_predicted.csv')

In [41]:
df_prueba_gpo = pd.DataFrame(df_damage_grf['Test'].value_counts().reset_index())
# df_prueba_gpo = pd.DataFrame(df_damage_grf['Pred'].value_counts())
df_prueba_gpo.head()

Unnamed: 0,index,Test
0,1,72
1,4,56
2,2,53
3,3,48


In [None]:
## Dataset para modelado final PRODUCTIVO
df_eq_modelado = df_earthquakes.drop(columns=['Month','Day','Death_Cat','Injuries_Cat','Damage_Cat',]).copy()

# Predicción de Muertos para el Dataset FINAL

In [None]:
#Transformar los datos
scaler = StandardScaler()
X_final = scaler.fit_transform(df_eq_modelado[['Year','Region','Depth','Magnitud','Density','IDH']])

# Obtener la prediccion de los Decesos con la información de los lesionados
predict_deaths = modelo_deaths.predict(X_final)
df_eq_modelado['Death_Cat'] = predict_deaths
df_eq_modelado.head(3)
# df_final.to_csv('../Output/earthquaks_model_deaths.csv', index=False)

# Predicción de Lesionados para el dataset Final

In [None]:
#Transformar los datos
scaler = StandardScaler()
X_final = scaler.fit_transform(df_eq_modelado[['Year','Region','Depth','Magnitud','Density','IDH','Death_Cat']])

# Obtener la prediccion de los lesionados con la información 
predict_injuries = modelo_injuries.predict(X_final)
df_eq_modelado['Injuries_Cat'] = predict_injuries
df_eq_modelado.head(3)
# df_final.to_csv('../Output/earthquaks_model_injuries.csv', index=False)

# Predicción de Daños para el dataset Final

In [None]:
#Transformar los datos
scaler = StandardScaler()
X_final = scaler.fit_transform(df_eq_modelado[['Year','Region','Depth','Magnitud','Density','IDH','Death_Cat','Injuries_Cat']])

# Obtener la prediccion de los Decesos con la información de los lesionados
predict_damage = modelo_damage.predict(X_final)
df_eq_modelado['Damage_Cat'] = predict_damage
df_eq_modelado.head(3)

# Genera los grupos para la densidad de población
# ALPHA para que los extremos no coincidan con los valores de los datos
alpha = 1
bins = np.linspace(df_eq_modelado.Density.min()- alpha,df_eq_modelado.Density.max() + alpha,5)
i = np.digitize(df_eq_modelado.Density,bins)
df_eq_modelado['Density_Gpo'] = i


In [None]:
# Agrega descripción de etiquetas
for i in df_etiquetas_cat.index:
    df_eq_modelado.loc[df_eq_modelado['Death_Cat'] == df_etiquetas_cat['Categoria'][i], 'Death_Descripcion'] = df_etiquetas_cat['Descesos'][i]#######
    df_eq_modelado.loc[df_eq_modelado['Injuries_Cat'] == df_etiquetas_cat['Categoria'][i], 'Injurie_Descripcion'] = df_etiquetas_cat['Lesionados'][i]
    df_eq_modelado.loc[df_eq_modelado['Damage_Cat'] == df_etiquetas_cat['Categoria'][i], 'Damage_Descripcion'] = df_etiquetas_cat['Danios'][i]

# Se genera el archivo para realizar las visualizaciones en Power BI
df_eq_modelado.to_csv('../Output/earthquaks_modelado_did.csv', index=False)

In [None]:
df_idh_bajo_death_1 = df_eq_modelado[(df_eq_modelado.Year >= 2010) & (df_eq_modelado['IDH'] >= 0.56) & (df_eq_modelado['IDH'] < 0.68)  & (df_eq_modelado['Death_Cat'] == 1)]['IDH'].count()
df_idh_medio_death_1 = df_eq_modelado[(df_eq_modelado.Year >= 2010) & (df_eq_modelado['IDH'] >= 0.68) & (df_eq_modelado['IDH'] < 0.80) & (df_eq_modelado['Death_Cat'] == 1)]['IDH'].count()
df_idh_alto_death_1 = df_eq_modelado[(df_eq_modelado.Year >= 2010) & (df_eq_modelado['IDH'] >= 0.80) & (df_eq_modelado['Death_Cat'] == 1)]['IDH'].count()

In [None]:
bins

In [None]:
# Genera los grupos para la densidad de población
# ALPHA para que los extremos no coincidan con los valores de los datos
alpha = 1
bins = np.linspace(df_final.Density.min()- alpha,df_final.Density.max() + alpha,5)
i = np.digitize(df_final.Density,bins)
df_final['Density_Gpo'] = i

# Agrega descripción de etiquetas
for i in df_etiquetas_cat.index:
    df_final.loc[df_final['Death_Cat'] == df_etiquetas_cat['Categoria'][i], 'Death_Descripcion'] = df_etiquetas_cat['Descesos'][i]#######
    df_final.loc[df_final['Injuries_Cat'] == df_etiquetas_cat['Categoria'][i], 'Injurie_Descripcion'] = df_etiquetas_cat['Lesionados'][i]
    df_final.loc[df_final['Damage_Cat'] == df_etiquetas_cat['Categoria'][i], 'Damage_Descripcion'] = df_etiquetas_cat['Danios'][i]

# Se genera el archivo para realizar las visualizaciones en Power BI
df_final.to_csv('../Output/earthquaks_final_model.csv', index=False)


# Fin modelado de Decesos, Lesionados y Daños materiales

In [None]:
# Se crea catalogo de regiones para agregar al dataset final
df_cat_regions = df_earthquakes.groupby(['Country','Region']).agg({'Location Name':'count'}).reset_index()

In [None]:
# Leer archivo para predicción de dataset final
df_final = pd.read_csv('..\Output\earthquaqkes_all_clean.csv')
df_final = df_final[['Year', 'Date','Latitude','Longitude','Country','Depth','Magnitud' ]]
#Pegar los datos de densidad de población e indice de desarrollo
df_final = asigna_sociodemo(df_final, df_sociodemo)


In [None]:
# Asociar catalogo de regiones con el dataset final
df_final['Region'] = 0

for i in df_cat_regions.index:
    df_final.loc[(df_final['Country'] == df_cat_regions['Country'][i]), 'Region'] = df_cat_regions['Region'][i]

In [None]:
# Agrega descripción de etiquetas
for i in df_etiquetas_cat.index:
    df_final.loc[df_final['Death_Cat'] == df_etiquetas_cat['Categoria'][i], 'Death_Descripcion'] = df_etiquetas_cat['Descesos'][i]#######
    df_final.loc[df_final['Injuries_Cat'] == df_etiquetas_cat['Categoria'][i], 'Injurie_Descripcion'] = df_etiquetas_cat['Lesionados'][i]
    df_final.loc[df_final['Damage_Cat'] == df_etiquetas_cat['Categoria'][i], 'Damage_Descripcion'] = df_etiquetas_cat['Danios'][i]

# Se genera el archivo para realizar las visualizaciones en Power BI
df_final.to_csv('../Output/earthquaks_final_model.csv', index=False)

In [None]:
# Obtener el TOP 10 de más temblores por año
df_top_earthquakesBy_YearCountry = df_final[(df_final['Country'] != 'NO_COUNTRY') & (df_final['Density'] > 0)].groupby(['Year','Country']).agg({'Depth':'count'}).reset_index()
df_top_earthquakesBy_YearCountry.rename(columns={'Depth':'Earthquakes'}, inplace=True)

# Se arma el Top 10 de paises con más temblores por año
years = list(df_top_earthquakesBy_YearCountry.Year.unique())
df_Top = pd.DataFrame()
for y in years:
    df_Top = df_Top.append(df_top_earthquakesBy_YearCountry[df_top_earthquakesBy_YearCountry['Year'] == y].nlargest(10,'Earthquakes'), ignore_index=True)

# Genera el archivo para Power BI
df_Top.to_csv('../Output/earthquaks_top_ten.csv', index=False)

In [None]:
modelo_injuries, X_test, y_test = modelo_dt_injuries(df_earthquakes_train) #modelo_dt_injuries(df_earthquakes_injuries)
y_predict_injuries = modelo_injuries.predict(X_test)
df_pred_injuries_DT = pd.DataFrame({'Actual':y_test,'Predicted':y_predict_injuries})
precision = round(modelo_injuries.score(X_test, y_test) *100,1)
precision

In [None]:
df_earthquakes_injuries.head(1)
# Region	Depth	Magnitud	Injuries_Cat	Ciudades
# Year	Month	Day	Country Latitude	Longitude Death_Cat	Injuries_Cat	Damage_Cat	Density	Ciudades

In [None]:
df_earthquakes_damage.Damage_Cat.value_counts()

In [None]:
modelo.score(X_test, y_test)
# modelo.n_classes_

In [None]:
from collections import Counter
Counter(y).items()

In [None]:
# Podado de arbol para identificar el mejor nivel por validación cruzada
# ------------------------------------------------------------------------------
# Valores de ccp_alpha evaluados
param_grid = {'ccp_alpha':np.linspace(0, 5, 10)}

# Búsqueda por validación cruzada
grid = GridSearchCV(
        # El árbol se crece al máximo posible antes de aplicar el pruning
        estimator = DecisionTreeClassifier(
                            criterion         = 'entropy',
                            min_samples_split = 20,
                            min_samples_leaf  = 5,
                            random_state      = 30),
        param_grid = param_grid,
        scoring    = 'accuracy',
        cv         = 10,
        refit      = True,
        return_train_score = True)

grid.fit(X_train, y_train)

# fig, ax = plt.subplots(figsize=(6, 3.84))
# scores = pd.DataFrame(grid.cv_results_)
# scores.plot(x='param_ccp_alpha', y='mean_train_score', yerr='std_train_score', ax=ax)
# scores.plot(x='param_ccp_alpha', y='mean_test_score', yerr='std_test_score', ax=ax)
# ax.set_title("Error de validacion cruzada vs hiperparámetro ccp_alpha");

In [None]:
grid.best_params_

In [None]:
modelo_final_injuries = grid.best_estimator_
print(f"Profundidad del árbol: {modelo_final_injuries.get_depth()}")
print(f"Número de nodos terminales: {modelo_final_injuries.get_n_leaves()}")

In [None]:
predicciones = modelo_final_injuries.predict(X = X_test)
accuracy = accuracy_score(y_true= y_test, y_pred= predicciones,normalize = True)
print(f"El accuracy de test es: {100 * accuracy} %")

In [None]:
labels = X.columns.to_list()
fig, ax = plt.subplots(figsize=(20, 10))

print(f"Profundidad del árbol: {modelo.get_depth()}")
print(f"Número de nodos terminales: {modelo.get_n_leaves()}")

plot = plot_tree( decision_tree = modelo, feature_names = labels, class_names = 'deaths_category', filled = True, 
impurity= False, fontsize= 7, ax = ax)

In [None]:
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
importancia_predictores = pd.DataFrame({'predictor': labels,'importancia': modelo_final.feature_importances_})
importancia_predictores.sort_values('importancia', ascending=False)

In [None]:
# Selección de características mediante Arbol de clasificación (Grafica)
tree_classifier = ExtraTreesClassifier(n_estimators=100, criterion='gini')
tree_classifier.fit(X_t,y)
index = np.flipud(np.argsort(tree_classifier.feature_importances_))
score = tree_classifier.feature_importances_[index]

fig,ax = plt.subplots(figsize=(20,8))
plt.bar(range(len(index)),score,align='center')
plt.xticks(range(len(index)),index)
plt.title('importances of features')
plt.show()

In [None]:
y_predict_DT = modelo_DT.predict(X_test)
df_pred_Deaths_DT = pd.DataFrame({'Actual':y_test,'Predicted':y_predict_DT})
df_pred_Deaths_DT['Validacion'] = abs(df_pred_Deaths_DT['Actual'] - df_pred_Deaths_DT['Predicted'])
correctos = df_pred_Deaths_DT[df_pred_Deaths_DT['Validacion'] == 0]['Validacion'].count()
total = df_pred_Deaths_DT.shape[0]
incorrectos = total - correctos
porcentaje = (correctos / total) * 100
print(f'Correctos: {correctos}')
print(f'Incorrectos: {incorrectos}')
print(f'Precision: {porcentaje}')


In [None]:
confusion_matrix(y_test, y_predict_DT)

In [None]:
accuracy_DT = accuracy_score(y_true=y_test, y_pred=y_predict_DT, normalize=True)
print(f'El accuracy test es de: {accuracy_DT * 100} %')