In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import f1_score
np.random.seed(33)

In [12]:
df=pd.read_csv("earthquake_data.csv")
df

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
0,"M 7.0 - 18 km SW of Malango, Solomon Islands",7.0,22-11-2022 02:03,8,7,green,1,768,us,117,0.509,17.0,mww,14.000,-9.7963,159.596,"Malango, Solomon Islands",Oceania,Solomon Islands
1,"M 6.9 - 204 km SW of Bengkulu, Indonesia",6.9,18-11-2022 13:37,4,4,green,0,735,us,99,2.229,34.0,mww,25.000,-4.9559,100.738,"Bengkulu, Indonesia",,
2,M 7.0 -,7.0,12-11-2022 07:09,3,3,green,1,755,us,147,3.125,18.0,mww,579.000,-20.0508,-178.346,,Oceania,Fiji
3,"M 7.3 - 205 km ESE of Neiafu, Tonga",7.3,11-11-2022 10:48,5,5,green,1,833,us,149,1.865,21.0,mww,37.000,-19.2918,-172.129,"Neiafu, Tonga",,
4,M 6.6 -,6.6,09-11-2022 10:14,0,2,green,1,670,us,131,4.998,27.0,mww,624.464,-25.5948,178.278,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,"M 7.7 - 28 km SSW of Puerto El Triunfo, El Sal...",7.7,13-01-2001 17:33,0,8,,0,912,us,427,0.000,0.0,mwc,60.000,13.0490,-88.660,"Puerto El Triunfo, El Salvador",,
778,"M 6.9 - 47 km S of Old Harbor, Alaska",6.9,10-01-2001 16:02,5,7,,0,745,ak,0,0.000,0.0,mw,36.400,56.7744,-153.281,"Old Harbor, Alaska",North America,
779,"M 7.1 - 16 km NE of Port-Olry, Vanuatu",7.1,09-01-2001 16:49,0,7,,0,776,us,372,0.000,0.0,mwb,103.000,-14.9280,167.170,"Port-Olry, Vanuatu",,Vanuatu
780,"M 6.8 - Mindanao, Philippines",6.8,01-01-2001 08:54,0,5,,0,711,us,64,0.000,0.0,mwc,33.000,6.6310,126.899,"Mindanao, Philippines",,


In [13]:
df["alert"] = df["alert"].fillna("red")
df.isnull().sum()

title          0
magnitude      0
date_time      0
cdi            0
mmi            0
alert          0
tsunami        0
sig            0
net            0
nst            0
dmin           0
gap            0
magType        0
depth          0
latitude       0
longitude      0
location       5
continent    576
country      298
dtype: int64

In [14]:
df.drop(["title"], axis = 1, inplace = True)
df.drop(["location"], axis = 1, inplace = True)
df.drop(["continent"], axis = 1, inplace = True)  #since a lot of entries are nan values
df.head()

Unnamed: 0,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,country
0,7.0,22-11-2022 02:03,8,7,green,1,768,us,117,0.509,17.0,mww,14.0,-9.7963,159.596,Solomon Islands
1,6.9,18-11-2022 13:37,4,4,green,0,735,us,99,2.229,34.0,mww,25.0,-4.9559,100.738,
2,7.0,12-11-2022 07:09,3,3,green,1,755,us,147,3.125,18.0,mww,579.0,-20.0508,-178.346,Fiji
3,7.3,11-11-2022 10:48,5,5,green,1,833,us,149,1.865,21.0,mww,37.0,-19.2918,-172.129,
4,6.6,09-11-2022 10:14,0,2,green,1,670,us,131,4.998,27.0,mww,624.464,-25.5948,178.278,


In [15]:
df["date_time"] = pd.to_datetime(df["date_time"])
df["date_time"] = pd.DatetimeIndex(df["date_time"]).month

In [16]:
# Encode target labels with value between 0 and n_classes - 1

from sklearn.preprocessing import LabelEncoder
country_le = LabelEncoder()
alert_le = LabelEncoder()
magtype_le = LabelEncoder()
net_le = LabelEncoder()
df["country"] = country_le.fit_transform(df["alert"])
df["alert"] = alert_le.fit_transform(df["alert"])
df["magType"] = magtype_le.fit_transform(df["magType"])
df["net"] = net_le.fit_transform(df["net"])
df

Unnamed: 0,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,country
0,7.0,11,8,7,0,1,768,9,117,0.509,17.0,8,14.000,-9.7963,159.596,0
1,6.9,11,4,4,0,0,735,9,99,2.229,34.0,8,25.000,-4.9559,100.738,0
2,7.0,12,3,3,0,1,755,9,147,3.125,18.0,8,579.000,-20.0508,-178.346,0
3,7.3,11,5,5,0,1,833,9,149,1.865,21.0,8,37.000,-19.2918,-172.129,0
4,6.6,9,0,2,0,1,670,9,131,4.998,27.0,8,624.464,-25.5948,178.278,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,7.7,1,0,8,2,0,912,9,427,0.000,0.0,7,60.000,13.0490,-88.660,2
778,6.9,10,5,7,2,0,745,0,0,0.000,0.0,5,36.400,56.7744,-153.281,2
779,7.1,9,0,7,2,0,776,9,372,0.000,0.0,6,103.000,-14.9280,167.170,2
780,6.8,1,0,5,2,0,711,9,64,0.000,0.0,7,33.000,6.6310,126.899,2


In [17]:
# See if there is a class imbalance
df["tsunami"].value_counts()

0    478
1    304
Name: tsunami, dtype: int64

In [22]:
features = df.iloc[:, [0,1,2,3,4,6,7,8,9,10,11,12,13,14, 15]] # Parameters
target = df.iloc[:, [5]] #Target

In [25]:
# Using SMOTE (Synthetic Minority Oversampling Technique) to balance the classes
# by randomly adding minority samples

from imblearn.over_sampling import SMOTE
smote_obj = SMOTE()
features_data, target_data = smote_obj.fit_resample(features, target)

In [27]:
from sklearn.preprocessing import StandardScaler
standardScalerObj = StandardScaler()
features_scaled = standardScalerObj.fit_transform(features_data)
features_scaled

array([[ 0.14830521,  1.25677293,  1.15679928, ..., -0.47344547,
         0.92060313, -0.97898442],
       [-0.08236512,  1.25677293, -0.13062809, ..., -0.29587223,
         0.43200314, -0.97898442],
       [ 0.14830521,  1.55654212, -0.45248493, ..., -0.84963851,
        -1.88476674, -0.97898442],
       ...,
       [ 0.27288906,  0.35746535, -0.77434178, ..., -0.61685442,
         0.93265752, -0.97898442],
       [ 0.84031623,  0.65723455,  0.19122875, ..., -0.05394939,
         0.66180019, -0.97898442],
       [-0.47115952, -0.54184222,  0.19122875, ..., -0.27110804,
        -1.05645949, -0.97898442]])

In [28]:
# Splitting data into train and test bins

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_scaled, target_data, random_state=12, test_size=0.2)

In [29]:
# Using Logistic Regression Model

from sklearn.linear_model import LogisticRegression
logisticRegressionModel = LogisticRegression()
logisticRegressionModel.fit(x_train,y_train)

In [32]:
from sklearn.metrics import accuracy_score

y_pred = logisticRegressionModel.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy from the Logistic Regression Model is, ", accuracy)

Accuracy from the Logistic Regression Model is,  82.8125


In [33]:
# Using support vector machines (SVMs)

from sklearn.svm import SVC
SVM_model = SVC(kernel="linear", random_state = 4)
SVM_model.fit(x_train,y_train)

In [34]:
y_pred = SVM_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy from Support Vector Machines is, ", accuracy)

Accuracy from Support Vector Machines is,  83.33333333333334


In [35]:
# Using Gaussian Naive Bayes Classifier

from sklearn.naive_bayes import GaussianNB
naiveBayesModel = GaussianNB()
naiveBayesModel.fit(x_train,y_train)

In [36]:
y_pred = naiveBayesModel.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy from Naive Bayes Model is, ", accuracy)

Accuracy from Naive Bayes Model is,  80.72916666666666


In [37]:
# Using Decision Trees

from sklearn.tree import DecisionTreeClassifier
decisionTreeClassifierModel = DecisionTreeClassifier()
decisionTreeClassifierModel.fit(x_train, y_train)

In [38]:
y_pred = decisionTreeClassifierModel.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy from Decision Tree Classifier is, ", accuracy)

Accuracy from Decision Tree Classifier is,  89.0625


In [39]:
# Creating an emsemble model out of the models used

from sklearn.ensemble import VotingClassifier
estimator_models = [("Logistic Regression", logisticRegressionModel),("SVM", SVM_model),("Decision Tree Classifier", decisionTreeClassifierModel)]
ensembleModel = VotingClassifier(estimators = estimator_models)
ensembleModel.fit(x_train,y_train)

In [40]:
y_pred = ensembleModel.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy from the emsemble model is, ", accuracy)

Accuracy from the emsemble model is,  83.33333333333334


In [41]:
# Using K - Fold Cross Validation, so that there is no bias
# induced as a result of splitting data, usually used on small datasets

from sklearn.model_selection import KFold
KFoldObj = KFold()
KFoldObj.split(x_train,y_train)
KFoldObj

KFold(n_splits=5, random_state=None, shuffle=False)

In [42]:
# Utility to get predictions in a cross validation setting

from sklearn.model_selection import cross_val_predict
cross_pred = cross_val_predict(ensembleModel, x_test, y_test, cv = KFoldObj)

In [44]:
# Utility to get accuracy scores in a cross validation setting

from sklearn.model_selection import cross_val_score
cross_score = cross_val_score(ensembleModel, x_train, y_train, cv = KFoldObj)
accuracy = cross_score.mean()*100
print("Accuracy from the emsemble model in a K Fold cross validation setting is, ", accuracy)

Accuracy from the emsemble model in a K Fold cross validation setting is,  83.25077399380805
