In [1]:
# %% [markdown]
# # Exercise 1: Earthquake Data Analysis
# ## Manual Exploration of Data

# %% Import libraries
import pandas as pd
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# %% Load data
# Relative path to the raw earthquake CSV; it resolves against the kernel's
# current working directory.
FILEPATH = '../Data/raw/earthquake_data.csv'

# Read the file with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=FILEPATH)

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
0,"M 7.0 - 18 km SW of Malango, Solomon Islands",7.0,22-11-2022 02:03,8,7,green,1,768,us,117,0.509,17.0,mww,14.0,-9.7963,159.596,"Malango, Solomon Islands",Oceania,Solomon Islands
1,"M 6.9 - 204 km SW of Bengkulu, Indonesia",6.9,18-11-2022 13:37,4,4,green,0,735,us,99,2.229,34.0,mww,25.0,-4.9559,100.738,"Bengkulu, Indonesia",,
2,M 7.0 -,7.0,12-11-2022 07:09,3,3,green,1,755,us,147,3.125,18.0,mww,579.0,-20.0508,-178.346,,Oceania,Fiji
3,"M 7.3 - 205 km ESE of Neiafu, Tonga",7.3,11-11-2022 10:48,5,5,green,1,833,us,149,1.865,21.0,mww,37.0,-19.2918,-172.129,"Neiafu, Tonga",,
4,M 6.6 -,6.6,09-11-2022 10:14,0,2,green,1,670,us,131,4.998,27.0,mww,624.464,-25.5948,178.278,,,


In [3]:
# %% Check column names
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never rendered, so the column listing would otherwise be lost.
print(df.columns.tolist())

# %% Display information about data
# info() prints its summary (dtypes and non-null counts) itself.
df.info()

# %% Drop specified columns
# In the recorded run these are the free-text `title` plus every column that
# info() reports with missing values (alert, location, continent, country).
COLUMNS_TO_DROP = ['title', 'continent', 'alert', 'location', 'country']

# `df` is rebound here, so on a second run of this cell the columns are
# already gone; errors="ignore" keeps the cell re-runnable instead of raising
# KeyError.
df = df.drop(columns=COLUMNS_TO_DROP, errors="ignore")
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      782 non-null    object 
 1   magnitude  782 non-null    float64
 2   date_time  782 non-null    object 
 3   cdi        782 non-null    int64  
 4   mmi        782 non-null    int64  
 5   alert      415 non-null    object 
 6   tsunami    782 non-null    int64  
 7   sig        782 non-null    int64  
 8   net        782 non-null    object 
 9   nst        782 non-null    int64  
 10  dmin       782 non-null    float64
 11  gap        782 non-null    float64
 12  magType    782 non-null    object 
 13  depth      782 non-null    float64
 14  latitude   782 non-null    float64
 15  longitude  782 non-null    float64
 16  location   777 non-null    object 
 17  continent  206 non-null    object 
 18  country    484 non-null    object 
dtypes: float64(6), int64(5), object(8)
memory usage: 1

Unnamed: 0,magnitude,date_time,cdi,mmi,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude
0,7.0,22-11-2022 02:03,8,7,1,768,us,117,0.509,17.0,mww,14.0,-9.7963,159.596
1,6.9,18-11-2022 13:37,4,4,0,735,us,99,2.229,34.0,mww,25.0,-4.9559,100.738
2,7.0,12-11-2022 07:09,3,3,1,755,us,147,3.125,18.0,mww,579.0,-20.0508,-178.346
3,7.3,11-11-2022 10:48,5,5,1,833,us,149,1.865,21.0,mww,37.0,-19.2918,-172.129
4,6.6,09-11-2022 10:14,0,2,1,670,us,131,4.998,27.0,mww,624.464,-25.5948,178.278


In [4]:
# %% Check for missing values
# Printed explicitly: only the last expression of a cell is rendered, so a
# bare `df.isnull().sum()` here would produce no visible output.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# %% Drop rows with missing values
# Report how many rows were removed so the effect of dropna() is visible.
# In the recorded run the row count stays at 782, i.e. nothing is dropped.
rows_before = len(df)
df = df.dropna()
print(f"Dropped {rows_before - len(df)} rows containing missing values")

# %% Check new shape
df.shape

(782, 14)

In [5]:
# %% Descriptive statistics for numeric features
# Show floats with two decimals in every pandas display from here on.
pd.set_option('display.float_format', '{:.2f}'.format)

# Labels of all numeric columns; the histogram cell further down reuses them.
numeric_features = df.select_dtypes(include="number").columns

# Count, mean, std, min, quartiles and max for each numeric column.
df.loc[:, numeric_features].describe()


Unnamed: 0,magnitude,cdi,mmi,tsunami,sig,nst,dmin,gap,depth,latitude,longitude
count,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0
mean,6.94,4.33,5.96,0.39,870.11,230.25,1.33,25.04,75.88,3.54,52.61
std,0.45,3.17,1.46,0.49,322.47,250.19,2.22,24.23,137.28,27.3,117.9
min,6.5,0.0,1.0,0.0,650.0,0.0,0.0,0.0,2.7,-61.85,-179.97
25%,6.6,0.0,5.0,0.0,691.0,0.0,0.0,14.62,14.0,-14.6,-71.67
50%,6.8,5.0,6.0,0.0,754.0,140.0,0.0,20.0,26.3,-2.57,109.43
75%,7.1,7.0,7.0,1.0,909.75,445.0,1.86,30.0,49.75,24.65,148.94
max,9.1,9.0,9.0,1.0,2910.0,934.0,17.65,239.0,670.81,71.63,179.66


In [6]:
# %% Generate the profiling report
# Profile an independent copy so the working frame `df` is not the object
# handed to the profiler.
dfy = df.copy()

# Build the report object and write it out as a standalone HTML file in the
# current working directory.
report_title = "Profiling Report - Earthquake Data"
report_path = "earthquake_data_profiling.html"
profile = ProfileReport(dfy, title=report_title)
profile.to_file(report_path)

Summarize dataset: 100%|██████████| 123/123 [00:25<00:00,  4.85it/s, Completed]                   
Generate report structure: 100%|██████████| 1/1 [00:08<00:00,  8.63s/it]
Render HTML: 100%|██████████| 1/1 [00:06<00:00,  6.26s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 13.73it/s]


In [7]:
# %% [markdown]
# ## Outliers Detection

# %% Visualize distributions
# One histogram per numeric column on a 12x10 inch figure.
# NOTE(review): the recorded output shows a warning pointing at plt.show();
# possibly the active matplotlib backend is non-interactive — confirm that
# the histograms actually render.
numeric_frame = df[numeric_features]
numeric_frame.hist(figsize=(12, 10))
plt.show()

# %% Example: Univariate outlier detection for a feature, e.g., "magnitude"
from pyod.models.mad import MAD

# Single-column frame (2-D) holding the feature under inspection; swap the
# column name to screen a different feature.
magnitude_frame = df[["magnitude"]]

# Fit the detector, then label the same rows it was fitted on. The threshold
# is a tunable cut-off.
mad = MAD(threshold=3.0)
mad.fit(magnitude_frame)
outliers = mad.predict(magnitude_frame)

# Keep only the rows labelled 0 (not flagged as outliers).
is_inlier = outliers == 0
df_no_outliers = df[is_inlier]


  plt.show()


In [8]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Input columns and the binary target used for the split.
FEATURES = [
    'magnitude', 'date_time', 'cdi', 'mmi', 'sig', 'net', 'nst',
    'dmin', 'gap', 'magType', 'depth', 'latitude', 'longitude',
]
LABEL = 'tsunami'

# Feature matrix and target vector, taken from the outlier-filtered frame.
X = df_no_outliers.loc[:, FEATURES]
y = df_no_outliers.loc[:, LABEL]

# 75/25 split with a fixed seed; stratifying on y keeps the class proportions
# the same in both partitions.
split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Show partition sizes and the class balance of each partition.
train_balance = y_train.value_counts(normalize=True)
test_balance = y_test.value_counts(normalize=True)
X_train.shape, X_test.shape, train_balance, test_balance


((535, 13),
 (179, 13),
 tsunami
 0   0.61
 1   0.39
 Name: proportion, dtype: float64,
 tsunami
 0   0.61
 1   0.39
 Name: proportion, dtype: float64)

In [9]:
# %% [markdown]
# ## Profile Comparison of Train and Test Sets

def _with_label(features, labels):
    """Return the FEATURES columns of ``features`` plus a ``label`` column."""
    return pd.DataFrame(features, columns=FEATURES).assign(label=labels)


# Re-attach the target to each partition so the profiler sees it as a column.
dfy_train = _with_label(X_train, y_train)
dfy_test = _with_label(X_test, y_test)

# One profile per partition.
profile_train = ProfileReport(dfy_train, title="Profiling Report - Train Set")
profile_test = ProfileReport(dfy_test, title="Profiling Report - Test Set")

# Generate comparison report
# The train-vs-test comparison is written out as a standalone HTML file.
comparison_path = "train_test_comparison_earthquake.html"
comparison_report = profile_train.compare(profile_test)
comparison_report.to_file(comparison_path)

Summarize dataset: 100%|██████████| 123/123 [00:24<00:00,  5.00it/s, Completed]                   
Summarize dataset: 100%|██████████| 123/123 [00:23<00:00,  5.24it/s, Completed]                   
Generate report structure: 100%|██████████| 1/1 [00:12<00:00, 12.99s/it]
Render HTML: 100%|██████████| 1/1 [00:06<00:00,  6.97s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 14.68it/s]
