In [1]:
import pandas as pd


file_path = '/content/drive/MyDrive/earthquakes.csv'
earthquake_data = pd.read_csv(file_path)
columns_to_drop = [
    'id', 'url', 'detailUrl', 'types', 'what3words', 'locationDetails'
]
cleaned_data = earthquake_data.drop(columns=columns_to_drop)


for column in cleaned_data.columns:
    if cleaned_data[column].isnull().any():
        if cleaned_data[column].dtype == 'object':
            mode_value = cleaned_data[column].mode()[0]
            cleaned_data[column].fillna(mode_value, inplace=True)
        else:
            mean_value = cleaned_data[column].mean()
            cleaned_data[column].fillna(mean_value, inplace=True)


cleaned_data['datetime'] = pd.to_datetime(cleaned_data['date']) + pd.to_timedelta(cleaned_data['time'], unit='ms')
cleaned_data = cleaned_data.drop(columns=['date', 'time'])


cleaned_data.reset_index(drop=True, inplace=True)
print(cleaned_data.head(10))


   magnitude        type                                       title  \
0       4.80  earthquake         M 4.8 - 33 km WSW of Ackerly, Texas   
1       5.10  earthquake         M 5.1 - 34 km WSW of Ackerly, Texas   
2       3.70  earthquake                M 3.7 - 6 km N of Malibu, CA   
3       3.90  earthquake  M 3.9 - 58 km S of Whites City, New Mexico   
4       4.10  earthquake  M 4.1 - 60 km S of Whites City, New Mexico   
5       3.23  earthquake                M 3.2 - 6 km N of Malibu, CA   
6       3.43  earthquake                M 3.4 - 5 km N of Malibu, CA   
7       3.44  earthquake                M 3.4 - 7 km N of Malibu, CA   
8       4.69  earthquake                M 4.7 - 6 km N of Malibu, CA   
9       3.50  earthquake             M 3.5 - 7 km NW of Hiko, Nevada   

         updated   felt  cdi  mmi  alert    status  tsunami  ...  \
0  1726583895255   1893    6    5  green  reviewed        0  ...   
1  1726672002991   2042    6    5  green  reviewed        0  ...   
2  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_data[column].fillna(mode_value, inplace=True)  # Replace with mode
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_data[column].fillna(mean_value, inplace=True)  # Replace with mean


In [4]:
import pandas as pd
columns = ['depth', 'latitude', 'longitude']

for column in columns:
    Q1 = cleaned_data[column].quantile(0.25)
    Q3 = cleaned_data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    mean_value = cleaned_data[column].mean()


    cleaned_data[column] = cleaned_data[column].apply(lambda x: mean_value if x < lower_bound or x > upper_bound else x)
cleaned_data[columns].describe()

Unnamed: 0,depth,latitude,longitude
count,1137.0,1137.0,1137.0
mean,19.458511,31.990423,-3.930635
std,16.507155,9.677785,118.043697
min,-0.25,4.2633,-179.807
25%,7.55,27.308909,-104.452
50%,10.0,31.657,-68.682
75%,34.723,37.1959,126.628
max,73.0,57.1827,179.972


In [5]:
categorical_columns = cleaned_data.select_dtypes(include=['object']).columns.tolist()
cleaned_data = pd.get_dummies(cleaned_data, columns=categorical_columns, drop_first=True)
cleaned_data = cleaned_data.astype(int)
print(cleaned_data.info())
print(cleaned_data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1137 entries, 0 to 1136
Columns: 3552 entries, magnitude to locality_دهستان گهره
dtypes: int64(3552)
memory usage: 30.8 MB
None
   magnitude        updated  felt  cdi  mmi  tsunami   sig  nst  dmin  rms  \
0          4  1726583895255  1893    6    5        0   994   37     0    0   
1          5  1726672002991  2042    6    5        0  1040   24     0    0   
2          3  1726637414586  1580    4    4        0   591  135     0    0   
3          3  1726584426218     5    3    4        0   236   38     0    0   
4          4  1726334616179     4    3    4        0   260   28     0    0   

   ...  locality_Wushi  locality_Yakutat  locality_Yamgan  locality_Yucaipa  \
0  ...               0                 0                0                 0   
1  ...               0                 0                0                 0   
2  ...               0                 0                0                 0   
3  ...               0               

In [7]:
from sklearn.model_selection import train_test_split
X = cleaned_data.drop(columns=['magnitude'])  # Features
y = cleaned_data['magnitude']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score
X = cleaned_data.drop(columns=['magnitude'])
y = cleaned_data['magnitude']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)
print(f'XGBoost Mean Absolute Error: {xgb_mae}')
print(f'XGBoost R-squared: {xgb_r2}')

XGBoost Mean Absolute Error: 0.027594237996820817
XGBoost R-squared: 0.9860325455665588


In [10]:
import pickle
with open('earthquake_magnitude_xgboost2_model.pkl', 'wb') as file:
    pickle.dump(xgb_model, file)