In [1]:
import os

In [2]:
print(os.cpu_count())

36


In [1]:
#Basic packages
import numpy as np
import pandas as pd
from sklearn.utils import shuffle #to shuffle data

#Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
import xgboost as xgb

#Hyperparameter tuning of Models and train test split
from sklearn.model_selection import train_test_split, GridSearchCV

#Evaluation of model
from sklearn.metrics import mean_squared_error

#For neural nets
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras import metrics
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

#to check the time for each model
import time




In [2]:
data = pd.read_csv("./final_draft.csv")

In [3]:
data

Unnamed: 0,time,latitude,longitude,depth,mag,magtype,nst,gap,dmin,rms,...,magtype_ms,magtype_mw,magtype_mwb,magtype_mwc,magtype_mwr,magtype_mww,distance_to_tokyo,distance_to_osaka,distance_to_kyoto,seismic_energy
0,-2039928659,25.967,143.332,15.000,9.197486,mw,22.0,134.0,2.044,0.37,...,0,1,0,0,0,0,1132.512196,1224.909473,1237.072915,4.926980
1,-2038069220,33.715,131.759,60.000,6.910000,mw,32.0,29.0,1.788,1.12,...,0,1,0,0,0,0,758.801537,361.643937,395.810290,6.027314
2,-2035093153,38.417,144.119,15.000,7.412194,mw,34.0,38.0,0.905,1.36,...,0,1,0,0,0,0,496.588240,874.563628,835.874573,4.711177
3,-2017908626,34.175,138.025,300.000,9.145070,mw,89.0,169.0,2.881,0.65,...,0,1,0,0,0,0,226.746186,238.891320,226.883283,7.916997
4,-2015019979,34.633,141.424,15.000,6.250000,mw,46.0,15.0,1.153,0.86,...,0,1,0,0,0,0,196.585337,542.749335,519.061708,4.540632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22118,1696800102,29.782,139.938,10.000,6.825116,mb,32.0,131.0,6.900,0.52,...,0,0,0,0,0,0,655.548644,686.419697,699.986233,4.223194
22119,1696819895,40.252,142.393,50.461,6.762525,mb,18.0,179.0,1.146,0.27,...,0,0,0,0,0,0,559.205166,866.672841,824.289528,5.832597
22120,1697191192,41.080,140.160,172.478,5.000000,mww,90.0,28.0,0.745,0.97,...,0,0,0,0,0,1,599.761180,818.419515,775.824429,6.759708
22121,1697250565,41.487,142.116,61.485,9.528641,mb,65.0,127.0,0.941,0.74,...,0,0,0,0,0,0,677.259322,950.754114,907.929360,6.373095


In [4]:
data['time'] = pd.to_datetime(data['time']).astype("int64") / 10**9
features = ['latitude', 'longitude', 'depth', 'time', 'seismic_energy']
target = 'mag'
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [7]:
rf_model = RandomForestRegressor()
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
start_time = time.time()
grid_search = GridSearchCV(rf_model, param_grid, cv = 5, scoring='neg_mean_squared_error', n_jobs = -1)
grid_search.fit(X_train, y_train)
end_time = time.time()
print("Best Hyperparameters:")
print(grid_search.best_params_)
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on Test Set: {mse}")
print(f"Time taken for Grid Search and Training: {end_time - start_time} seconds")

Best Hyperparameters:
{'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Mean Squared Error on Test Set: 0.008646318714239709
Time taken for Grid Search and Training: 248.92595672607422 seconds


In [8]:
start_time = time.time()
lr_model = LinearRegression()

lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, lr_pred)
end_time = time.time()
print(f"Mean Squared Error on Test Set: {lr_mse}")
print(f"Time taken for Grid Search and Training: {end_time - start_time} seconds")

Mean Squared Error on Test Set: 2.421625956621727
Time taken for Grid Search and Training: 0.020879745483398438 seconds


In [9]:
#DecisionTreeRegressor
start_time = time.time()
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
dt_mse = mean_squared_error(y_test, dt_pred)
end_time = time.time()
print(f"Mean Squared Error on Test Set: {dt_mse}")
print(f"Time taken for Grid Search and Training: {end_time - start_time} seconds")

Mean Squared Error on Test Set: 0.023090381339033655
Time taken for Grid Search and Training: 0.3101773262023926 seconds


In [10]:
# Gradient Boosting Regressor
gb_model = GradientBoostingRegressor()
param_grid_gb = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
start_time = time.time()
grid_search_gb = GridSearchCV(gb_model, param_grid_gb, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_gb.fit(X_train, y_train)
end_time = time.time()
best_gb_model = grid_search_gb.best_estimator_
gb_pred = best_gb_model.predict(X_test)
gb_mse = mean_squared_error(y_test, gb_pred)
print("Best Hyperparameters:")
print(grid_search_gb.best_params_)
print(f"Mean Squared Error on Test Set: {gb_mse}")
print(f"Time taken for Grid Search and Training: {end_time - start_time} seconds")

Best Hyperparameters:
{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}
Mean Squared Error on Test Set: 0.00898795249309743
Time taken for Grid Search and Training: 80.0874433517456 seconds


In [None]:
svr_model = SVR()
param_grid_svr = {
    'C': [0.1, 1, 10],
    'epsilon': [0.01, 0.1, 0.2],
    'kernel': ['linear', 'rbf']
}
start_time = time.time()
grid_search_svr = GridSearchCV(svr_model, param_grid_svr, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_svr.fit(X_train, y_train)
end_time = time.time()
best_svr_model = grid_search_svr.best_estimator_
svr_pred = best_svr_model.predict(X_test)
svr_mse = mean_squared_error(y_test, svr_pred)
print("Best Hyperparameters:")
print(grid_search_svr.best_params_)
print(f"Mean Squared Error on Test Set: {svr_mse}")
print(f"Time taken for Grid Search and Training: {end_time - start_time} seconds")

In [5]:
nn_model = Sequential()
nn_model.add(Dense(50, input_dim=len(features), activation='relu'))
nn_model.add(Dense(50, activation='relu'))

nn_model.add(Dense(50, activation='sigmoid'))
nn_model.add(Dense(1, activation='linear'))
nn_model.compile(loss='mean_squared_error', optimizer='adam')

start_time = time.time()
nn_model.fit(X_train, y_train, epochs=10, batch_size=1, verbose=2)
end_time = time.time()

nn_pred = nn_model.predict(X_test)
nn_mse = mean_squared_error(y_test, nn_pred)

print(f"Neural Network (using TensorFlow) MSE: {nn_mse}")
print(f"Time taken for training: {end_time - start_time} seconds")



Epoch 1/10

17698/17698 - 17s - loss: 3.1157 - 17s/epoch - 976us/step
Epoch 2/10
17698/17698 - 16s - loss: 3.0950 - 16s/epoch - 925us/step
Epoch 3/10
17698/17698 - 16s - loss: 3.0917 - 16s/epoch - 927us/step
Epoch 4/10
17698/17698 - 16s - loss: 3.0894 - 16s/epoch - 925us/step
Epoch 5/10
17698/17698 - 16s - loss: 3.0915 - 16s/epoch - 931us/step
Epoch 6/10
17698/17698 - 16s - loss: 3.0934 - 16s/epoch - 925us/step
Epoch 7/10
17698/17698 - 16s - loss: 3.0927 - 16s/epoch - 931us/step
Epoch 8/10
17698/17698 - 16s - loss: 3.0886 - 16s/epoch - 927us/step
Epoch 9/10
17698/17698 - 16s - loss: 3.0871 - 16s/epoch - 931us/step
Epoch 10/10
17698/17698 - 16s - loss: 3.0946 - 16s/epoch - 923us/step
Neural Network (using TensorFlow) MSE: 3.085545726043548
Time taken for training: 165.09291887283325 seconds
