In [2]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Step 2: Load the dataset
file_path = 'filtered_earthquake_data.csv'  # Adjust the path if needed
data = pd.read_csv(file_path)

# Step 3: Select relevant features
# Features: Include a wider range of columns from the dataset for better model accuracy
X = data[['source_depth_km', 'source_distance_km', 'receiver_latitude', 
          'receiver_longitude', 'receiver_elevation_m', 'source_latitude', 
          'source_longitude', 'source_horizontal_uncertainty_km', 'p_weight', 
          's_weight', 'p_travel_sec', 's_arrival_sample', 'back_azimuth_deg']]

# Step 4: Define the target variable
y = data['source_magnitude']  # Assuming we are predicting 'source_magnitude'

# Step 5: Handle missing values
X.fillna(0, inplace=True) 

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Create and train the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust n_estimators for a bigger or smaller forest
rf_regressor.fit(X_train, y_train)

# Step 8: Make predictions
y_pred = rf_regressor.predict(X_test)

# Step 9: Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Step 10: Print the evaluation metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"R² Score: {r2}")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)  # Option to fill other missing values with 0


Mean Squared Error (MSE): 0.1385918101995335
R² Score: 0.784377619541546


In [1]:
tree_depths = [estimator.tree_.max_depth for estimator in rf_regressor.estimators_]
print(f"Depth of each tree: {tree_depths}")
print(f"Average tree depth: {sum(tree_depths) / len(tree_depths)}")

NameError: name 'rf_regressor' is not defined

In [3]:
# Step 1: Import necessary libraries for Linear Regression
from sklearn.linear_model import LinearRegression

# Step 2: Create a Linear Regression model
lr_model = LinearRegression()

# Step 3: Train the model
lr_model.fit(X_train, y_train)

# Step 4: Make predictions
y_pred_lr = lr_model.predict(X_test)

# Step 5: Evaluate the model
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Step 6: Print the results
print(f"Linear Regression MSE: {mse_lr}")
print(f"Linear Regression R² Score: {r2_lr}")


Linear Regression MSE: 0.33460283301064486
Linear Regression R² Score: 0.4794219134736384


In [4]:
# Step 1: Import necessary libraries for Decision Tree
from sklearn.tree import DecisionTreeRegressor

# Step 2: Create a Decision Tree Regressor model
dt_regressor = DecisionTreeRegressor(random_state=42)

# Step 3: Train the model
dt_regressor.fit(X_train, y_train)

# Step 4: Make predictions
y_pred_dt = dt_regressor.predict(X_test)

# Step 5: Evaluate the model
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

# Step 6: Print the results
print(f"Decision Tree MSE: {mse_dt}")
print(f"Decision Tree R² Score: {r2_dt}")


Decision Tree MSE: 0.26614606258341983
Decision Tree R² Score: 0.5859275704584539


In [6]:
# Create a DataFrame to store model results
comparison_df = pd.DataFrame({
    'Model': ['Linear Regression', 'Decision Tree', 'Random Forest'],
    'MSE': [mse_lr, mse_dt, mse],
    'R² Score': [r2_lr, r2_dt, r2]
})

print(comparison_df)


               Model       MSE  R² Score
0  Linear Regression  0.334603  0.479422
1      Decision Tree  0.266146  0.585928
2      Random Forest  0.138592  0.784378


In [7]:
pip install xgboost lightgbm catboost


Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
     -------------------------------------- 124.9/124.9 MB 1.4 MB/s eta 0:00:00
Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-win_amd64.whl (1.4 MB)
     ---------------------------------------- 1.4/1.4 MB 3.3 MB/s eta 0:00:00
Collecting catboost
  Downloading catboost-1.2.7-cp39-cp39-win_amd64.whl (101.8 MB)
     -------------------------------------- 101.8/101.8 MB 1.4 MB/s eta 0:00:00
Collecting graphviz
  Downloading graphviz-0.20.3-py3-none-any.whl (47 kB)
     ---------------------------------------- 47.1/47.1 kB 1.2 MB/s eta 0:00:00
Installing collected packages: graphviz, xgboost, lightgbm, catboost
Successfully installed catboost-1.2.7 graphviz-0.20.3 lightgbm-4.5.0 xgboost-2.1.1
Note: you may need to restart the kernel to use updated packages.


In [8]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Step 2: Load the dataset
file_path = 'filtered_earthquake_data.csv'  # Adjust the path if needed
data = pd.read_csv(file_path)

# Step 3: Select relevant features
X = data[['source_depth_km', 'source_distance_km', 'receiver_latitude', 
          'receiver_longitude', 'receiver_elevation_m', 'source_latitude', 
          'source_longitude', 'source_horizontal_uncertainty_km', 'p_weight', 
          's_weight', 'p_travel_sec', 's_arrival_sample', 'back_azimuth_deg']]

# Step 4: Define the target variable
y = data['source_magnitude']  # Assuming we are predicting 'source_magnitude'

# Step 5: Handle missing values
X.fillna(0, inplace=True)  # Fill other missing values with 0

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Create and train the models

# XGBoost Regressor
xgb_regressor = XGBRegressor(n_estimators=100, random_state=42)
xgb_regressor.fit(X_train, y_train)
y_pred_xgb = xgb_regressor.predict(X_test)

# LightGBM Regressor
lgbm_regressor = LGBMRegressor(n_estimators=100, random_state=42)
lgbm_regressor.fit(X_train, y_train)
y_pred_lgbm = lgbm_regressor.predict(X_test)

# CatBoost Regressor
cat_regressor = CatBoostRegressor(iterations=100, random_state=42, verbose=0)  # verbose=0 to suppress output
cat_regressor.fit(X_train, y_train)
y_pred_cat = cat_regressor.predict(X_test)

# Step 8: Evaluate the models
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
r2_lgbm = r2_score(y_test, y_pred_lgbm)

mse_cat = mean_squared_error(y_test, y_pred_cat)
r2_cat = r2_score(y_test, y_pred_cat)

# Step 9: Print the evaluation metrics
comparison_df = pd.DataFrame({
    'Model': ['XGBoost', 'LightGBM', 'CatBoost'],
    'MSE': [mse_xgb, mse_lgbm, mse_cat],
    'R² Score': [r2_xgb, r2_lgbm, r2_cat]
})

print(comparison_df)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)  # Fill other missing values with 0


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010332 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2978
[LightGBM] [Info] Number of data points in the train set: 296689, number of used features: 13
[LightGBM] [Info] Start training from score 1.458754
      Model       MSE  R² Score
0   XGBoost  0.156662  0.756265
1  LightGBM  0.170578  0.734613
2  CatBoost  0.168288  0.738176


In [1]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Step 2: Load the dataset
file_path = 'filtered_earthquake_data.csv'  # Adjust the path if needed
data = pd.read_csv(file_path)

# Step 3: Select relevant features
X = data[['source_depth_km', 'source_distance_km', 'receiver_latitude', 
          'receiver_longitude', 'receiver_elevation_m', 'source_latitude', 
          'source_longitude', 'source_horizontal_uncertainty_km', 'p_weight', 
          's_weight', 'p_travel_sec', 's_arrival_sample', 'back_azimuth_deg']]

# Step 4: Define the target variable
y = data['source_magnitude']

# Step 5: Handle missing values
X.fillna(0, inplace=True)

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Create and train the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Step 8: Make predictions
y_pred = rf_regressor.predict(X_test)

# Step 9: Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Step 10: Print the evaluation metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"R² Score: {r2}")

# Step 11: Calculate and print the depth of each tree
tree_depths = [estimator.tree_.max_depth for estimator in rf_regressor.estimators_]
print(f"Depth of each tree: {tree_depths}")
print(f"Average tree depth: {sum(tree_depths) / len(tree_depths)}")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)


Mean Squared Error (MSE): 0.1385918101995335
R² Score: 0.784377619541546
Depth of each tree: [50, 52, 53, 53, 56, 54, 55, 52, 52, 51, 52, 56, 49, 55, 51, 52, 55, 56, 53, 59, 53, 51, 54, 54, 52, 51, 53, 55, 51, 57, 56, 56, 54, 52, 57, 54, 52, 50, 54, 49, 53, 59, 55, 59, 53, 51, 51, 50, 52, 56, 55, 64, 52, 55, 51, 51, 54, 54, 55, 54, 51, 52, 58, 55, 50, 51, 56, 59, 55, 55, 53, 51, 53, 53, 51, 53, 51, 57, 51, 62, 58, 53, 51, 53, 52, 52, 50, 52, 56, 52, 65, 55, 51, 56, 53, 51, 53, 56, 55, 48]
Average tree depth: 53.64


In [2]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Step 2: Load the dataset
file_path = 'filtered_earthquake_data.csv'  # Adjust the path if needed
data = pd.read_csv(file_path)

# Step 3: Select relevant features
X = data[['source_depth_km', 'source_distance_km', 'receiver_latitude', 
          'receiver_longitude', 'receiver_elevation_m', 'source_latitude', 
          'source_longitude', 'source_horizontal_uncertainty_km', 'p_weight', 
          's_weight', 'p_travel_sec', 's_arrival_sample', 'back_azimuth_deg']]

# Step 4: Define the target variable
y = data['source_magnitude']  # Assuming we are predicting 'source_magnitude'

# Step 5: Handle missing values
X.fillna(0, inplace=True) 

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Create and train the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust n_estimators for a bigger or smaller forest
rf_regressor.fit(X_train, y_train)

# Step 8: Make predictions on training and test data
y_train_pred = rf_regressor.predict(X_train)
y_test_pred = rf_regressor.predict(X_test)

# Step 9: Evaluate the model on training data
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Step 10: Evaluate the model on test data
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Step 11: Print the evaluation metrics
print("Training Data Evaluation:")
print(f"Mean Squared Error (MSE): {train_mse}")
print(f"R² Score: {train_r2}")

print("\nTest Data Evaluation:")
print(f"Mean Squared Error (MSE): {test_mse}")
print(f"R² Score: {test_r2}")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)


Training Data Evaluation:
Mean Squared Error (MSE): 0.019276659725537503
R² Score: 0.9698824900943062

Test Data Evaluation:
Mean Squared Error (MSE): 0.1385918101995335
R² Score: 0.784377619541546
