In [None]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed. The fully qualified key is
# required: the short form "max_columns" also matches
# "styler.render.max_columns" on recent pandas and raises OptionError.
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the earthquake database CSV; the path is relative to the notebook's
# working directory.
data = pd.read_csv("../input/earthquake-database/database.csv")

In [None]:
# Show the freshly loaded frame as the cell output.
data

In [None]:
# Names of all columns whose dtype is not "object"; this list selects the
# columns used for the correlation heatmap.
column_dtypes = data.dtypes
numeric_columns = [name for name in data.columns if column_dtypes[name] != "object"]

In [None]:
# Show which columns were classified as non-object.
numeric_columns

In [None]:
# Pairwise correlations between the non-object columns, drawn as an
# annotated heatmap on an explicitly created 12x8 figure.
fig, ax = plt.subplots(figsize=(12, 8))
corr_mat = data[numeric_columns].corr()
sns.heatmap(corr_mat, annot=True, cmap="Blues", ax=ax)
plt.show()

In [None]:
# Remove the ID column before modelling (presumably a record identifier with
# no predictive value - confirm against the dataset description).
# errors="ignore" keeps this cell idempotent: re-running it after ID is
# already gone no longer raises KeyError.
data = data.drop(columns="ID", errors="ignore")

In [None]:
# Print column dtypes and non-null counts; the missing-value handling in
# preprocess_inputs is based on these counts.
data.info() 

In [None]:
def preprocess_inputs(df, missing_threshold=0.3, train_size=0.75, random_state=42):
    """Clean the earthquake table and return standardized train/test splits.

    The input frame is copied, never modified. Processing steps:

    1. Drop every column whose share of missing values exceeds
       ``missing_threshold``.
    2. Fill missing "Root Mean Square" values with the column mean (only when
       that column survived step 1), then drop rows that still contain any
       missing value.
    3. Drop rows whose "Date" has a "Z" in its last four characters, then
       derive integer "Month" / "Year" from "Date" and "Hour" from "Time".
       The code reads the first two and last four characters of "Date" and
       the first two of "Time", i.e. it assumes MM/DD/YYYY and HH:... text.
    4. Encode "Status" as 1 ("Automatic") / 0 ("Reviewed").
    5. One-hot encode "Type", "Magnitude Type", "Source", "Location Source"
       and "Magnitude Source". Indicator columns are prefixed with the source
       column name ("Source_US", "Location Source_US", ...) so that a value
       shared by several columns cannot produce duplicate column labels.
    6. Split into train/test, then standardize the features with a scaler
       fitted on the training rows only, so no test-set statistics leak into
       training.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing at least the columns named above.
    missing_threshold : float, default 0.3
        Columns with a larger fraction of missing values are dropped.
    train_size : float, default 0.75
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized feature matrices; their index matches y_train / y_test.
    y_train, y_test : pandas.Series
        Integer-encoded "Status" target.

    Raises
    ------
    ValueError
        If "Status" contains a label other than "Automatic" or "Reviewed".
    """
    df = df.copy()

    # Drop sparse columns. This inspects the argument's own columns instead of
    # the notebook-level `data`, so the function works on any frame passed in.
    missing_share = df.isna().mean()
    df = df.drop(columns=missing_share[missing_share > missing_threshold].index)

    # Mean-impute Root Mean Square, unless it was just dropped as too sparse.
    if "Root Mean Square" in df.columns:
        df["Root Mean Square"] = df["Root Mean Square"].fillna(df["Root Mean Square"].mean())

    # Drop every row that still contains a missing value in any column.
    df = df.dropna(axis=0).reset_index(drop=True)

    # Discard rows whose Date tail contains "Z" (presumably ISO-style
    # timestamps) before any integer conversion is attempted.
    has_z_suffix = df["Date"].str[-4:].str.contains("Z", regex=False)
    df = df[~has_z_suffix].reset_index(drop=True)

    # Date/time features. The builtin int replaces the np.int alias, which was
    # removed in NumPy 1.24.
    df["Month"] = df["Date"].str[:2].astype(int)
    df["Year"] = df["Date"].str[-4:].astype(int)
    df["Hour"] = df["Time"].str[:2].astype(int)
    df = df.drop(columns=["Date", "Time"])

    # Binary-encode the target; an unexpected label fails loudly here rather
    # than surfacing later as a mixed-type target.
    status_codes = {"Automatic": 1, "Reviewed": 0}
    encoded_status = df["Status"].map(status_codes)
    if encoded_status.isna().any():
        raise ValueError("Unexpected value in 'Status' column; expected 'Automatic' or 'Reviewed'")
    df["Status"] = encoded_status.astype(int)

    # One-hot encoding with per-column prefixes; float indicators keep the
    # feature matrix purely numeric for the scaler.
    categorical_columns = ["Type", "Magnitude Type", "Source", "Location Source", "Magnitude Source"]
    df = pd.get_dummies(df, columns=categorical_columns, dtype=float)

    y = df["Status"]
    X = df.drop(columns="Status")

    # Split first, then fit the scaler on the training rows only.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    return X_train, X_test, y_train, y_test

In [None]:
# Run the preprocessing function on the loaded table to obtain the train/test
# feature matrices and targets.
X_train, X_test, y_train, y_test= preprocess_inputs(data)

In [None]:
# Inspect the training feature matrix returned by preprocess_inputs.
X_train

In [None]:
# Inspect the test feature matrix returned by preprocess_inputs.
X_test

In [None]:
# Inspect the training target (the encoded Status column).
y_train

In [None]:
# Inspect the test target (the encoded Status column).
y_test

In [None]:
# Training

# A fixed random_state makes the forest's bootstrap sampling and feature
# sub-sampling repeatable, so the score below is the same on every
# Restart & Run All.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
print("RF trained.")

In [None]:
# Results

# Mean accuracy of the fitted classifier on the held-out test split.
model.score(X_test, y_test)