## Earthquake Type Prediction

### Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

### Loading Dataset

In [None]:
# Read the raw CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv('../input/earthquake-database/database.csv')

In [None]:
# Preview the first five rows of the data.
df.head()

In [None]:
# Shape of the data as (number of rows, number of columns).
df.shape

In [None]:
# Print each column's dtype and non-null count.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

##### As we can see, there are a lot of missing values; we need to handle them before model building.

### Data Preprocessing

In [None]:
def preprocess_input(df):
    """Clean the raw earthquake frame and derive the date/time features.

    The input frame is not modified; all work happens on a new frame.

    Steps, in order:
      1. Drop the 'ID' column.
      2. Drop every column in which more than 66% of the values are missing.
      3. Fill missing 'Root Mean Square' values with the column mean and
         missing 'Magnitude Type' values with the column mode.
      4. Parse 'Date' and 'Time' as UTC timestamps, derive integer 'Year',
         'Month' and 'Hour' columns, then drop 'Date' and 'Time'.
      5. Encode 'Status' as 0 ('Automatic') / 1 ('Reviewed').

    Args:
        df: DataFrame containing at least the 'ID', 'Root Mean Square',
            'Magnitude Type', 'Date', 'Time' and 'Status' columns.

    Returns:
        A new DataFrame with the cleaned and derived columns.

    Raises:
        KeyError: if one of the required columns is missing.
        ValueError: if 'Status' holds a value other than 'Automatic' or
            'Reviewed' (the unmapped NaN cannot be cast to int).
    """
    # `drop` returns a new frame, so the caller's data is left untouched.
    # 'ID' is an identifier and carries no predictive signal.
    df = df.drop(columns='ID')

    # Columns that are more than 2/3 empty are dropped rather than imputed.
    missing_counts = df.isna().sum()
    sparse_columns = missing_counts[missing_counts > 0.66 * df.shape[0]].index
    df = df.drop(columns=sparse_columns)

    # Impute the remaining gaps: mean for the numeric RMS column, mode for
    # the categorical magnitude type.
    rms_mean = df['Root Mean Square'].mean()
    df['Root Mean Square'] = df['Root Mean Square'].fillna(rms_mean)
    most_common_type = df['Magnitude Type'].mode()[0]
    df['Magnitude Type'] = df['Magnitude Type'].fillna(most_common_type)
    df = df.reset_index(drop=True)

    # Parse the timestamps once and read the components through the `.dt`
    # accessor. The builtin `int` replaces the `np.int` alias, which was
    # removed in NumPy 1.24.
    dates = pd.to_datetime(df['Date'], utc=True)
    times = pd.to_datetime(df['Time'], utc=True)
    df['Year'] = dates.dt.year.astype(int)
    df['Month'] = dates.dt.month.astype(int)
    df['Hour'] = times.dt.hour.astype(int)

    # The raw date/time columns are no longer needed once the parts exist.
    df = df.drop(columns=['Date', 'Time'])

    # Encode the target so it is numerical.
    status_codes = {'Automatic': 0, 'Reviewed': 1}
    df['Status'] = df['Status'].map(status_codes).astype('int')

    return df

In [None]:
# Run the cleaning / feature-extraction step; `df` is rebound to the returned frame.
df = preprocess_input(df)

In [None]:
# Re-check the per-column missing-value counts after preprocessing.
df.isnull().sum()

In [None]:
# Object-dtype columns are the categorical features to one-hot encode.
categorical_columns = [name for name, kind in df.dtypes.items() if kind == 'object']

# Every other column except the 'Status' target is a numerical feature to standardize.
numerical_columns = [
    name for name, kind in df.dtypes.items()
    if not (kind == 'object' or name == 'Status')
]

print('Numerical Features are : ',numerical_columns)
print('Categorical Features are : ',categorical_columns)

### OneHotEncoding for Categorical Features

In [None]:
# one hot encoding for categorical features 
def onehot_encoder(df, cols):
    """One-hot encode the given columns and return a new frame.

    Each column in ``cols`` is replaced by its dummy columns (the first
    category is dropped), appended after the remaining columns in the
    order the columns are listed. The input frame is not modified.

    The dummies are created as ``uint8`` explicitly. pandas >= 2.0 would
    otherwise return boolean columns, which turn a mixed float/bool frame
    into an object array when it is converted to NumPy.

    NOTE: no prefix is passed to ``get_dummies``, so the dummy columns are
    named after the category values alone; two encoded columns that share
    a category value produce duplicate column names.

    Args:
        df: source DataFrame.
        cols: iterable of column names to encode.

    Returns:
        A new DataFrame with the listed columns replaced by dummy columns.

    Raises:
        KeyError: if a name in ``cols`` is not a column of ``df``.
    """
    cols = list(cols)
    encoded = [
        pd.get_dummies(df[col], drop_first=True, dtype='uint8') for col in cols
    ]
    remaining = df.drop(columns=cols)
    # A single concat avoids rebuilding the frame once per encoded column.
    return pd.concat([remaining, *encoded], axis=1)

In [None]:
# Replace every categorical column with its one-hot dummy columns.
df = onehot_encoder(df,categorical_columns)

### Scaling Numerical Features

In [None]:
# Standardize the numerical feature columns and write the scaled values back into `df`.
# NOTE(review): the scaler is fitted on the full dataset here, before the
# train/test split further down, so test rows influence the scaling statistics.
# Consider fitting on the training split only.
sc = StandardScaler()
df[numerical_columns] = sc.fit_transform(df[numerical_columns])

In [None]:
# Preview the frame after encoding and scaling.
df.head()

### Separate dependent and independent variables

In [None]:
# Target: the encoded 'Status' column; features: every remaining column.
y = df['Status']
X = df.drop(columns=['Status'])

### Train-Test Split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

### Model Building

In [None]:
# Feed-forward binary classifier declared as a list of layers:
#   - the input width follows the number of feature columns in X_train,
#   - two ReLU hidden layers with 32 and then 64 units,
#   - a single sigmoid output unit producing a value in 0-1, where 0 stands
#     for class 'Automatic' and 1 for class 'Reviewed'.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(64, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

#### Specifying the optimizer and compiling the model

In [None]:
# Early stopping: halt training once validation AUC has not improved for
# 5 epochs and restore the weights from the best epoch.
monitor = EarlyStopping(monitor='val_auc', patience=5,
                        verbose=1, mode='max', restore_best_weights=True)

# Checkpoint: write the model to disk each time validation AUC reaches a new maximum.
# NOTE(review): newer Keras releases appear to restrict the accepted checkpoint
# file extensions; confirm that '.hdf5' is accepted by the installed version.
filepath = './best_weights.hdf5'
checkpoint = ModelCheckpoint(filepath,
                             monitor='val_auc',
                             mode='max',
                             save_best_only=True,
                             verbose=1)

# Learning-rate reduction on plateau, using the callback's default settings.
reduceLR = ReduceLROnPlateau()
callback_list = [monitor, checkpoint, reduceLR]

# Compile the model. `learning_rate` is the supported keyword; `lr` is a
# deprecated alias that newer Keras versions reject.
# The metric is named 'auc' so that the callbacks above can monitor 'val_auc'.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')]
)

###  Model Fitting

In [None]:
# Train for up to 25 epochs in batches of 32, holding out 20% of the training
# data for validation. The callbacks may stop training early, save checkpoints
# and lower the learning rate. The returned History object keeps the per-epoch
# metrics used for the plots below.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=25,
    callbacks=callback_list,
    verbose=1
)

### Result

In [None]:
# Per-epoch curves recorded during training.
train_loss, val_loss = history.history['loss'], history.history['val_loss']
train_auc, val_auc = history.history['auc'], history.history['val_auc']

plt.figure(figsize=(20, 6))

# Left panel: training vs. validation loss.
plt.subplot(1, 2, 1)
plt.plot(train_loss, label="Training Loss")
plt.plot(val_loss, label="Validation Loss")
plt.title("Model Loss")
plt.legend()

# Right panel: training vs. validation AUC.
plt.subplot(1, 2, 2)
plt.plot(train_auc, label="Training AUC")
plt.plot(val_auc, label="Validation AUC")
plt.title("Model AUC")
plt.legend()

plt.show()

### Model Evaluation

In [None]:
# Score the trained model on the held-out test split (reports the loss and the compiled AUC metric).
model.evaluate(X_test, y_test)

### Model Prediction

In [None]:
# Predict on the test split and threshold the sigmoid outputs at 0.5:
# True where the output is above 0.5, False otherwise.
y_pred = model.predict(X_test) > 0.5

### Result Visualization

In [None]:
# Confusion matrix of true test labels (first argument) against predicted labels,
# drawn as an annotated heatmap with integer cell counts.
cf_matrix = confusion_matrix(y_test,y_pred)
sns.heatmap(cf_matrix,annot=True,fmt="d")
plt.title('Confusion Matrix', fontsize = 23)
plt.show()

In [None]:
# Class distribution of the true test labels, for comparison with the confusion matrix.
y_test.value_counts()