In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt

# Mock earthquake catalogue (same structure as the earlier analysis),
# written one record per row and then pivoted into per-column lists.
column_names = (
    'latitude', 'longitude', 'depth', 'magnitude', 'nst', 'gap', 'dmin', 'rms',
)
records = (
    (29.9671, 82.1727, 17.214, 4.4, 28, 83, 3.476, 0.75),
    (28.6732, 84.3555, 10.0, 4.2, 26, 163, 1.192, 0.55),
    (22.7449, 101.1232, 13.156, 4.7, 100, 57, 4.729, 0.56),
    (25.1089, 96.4115, 10.0, 4.5, 31, 120, 4.121, 0.51),
    (23.0704, 95.9626, 10.0, 4.4, 26, 114, 2.094, 0.9),
    (25.1793, 94.6614, 10.0, 4.2, 47, 101, 3.719, 0.65),
    (27.1518, 100.5409, 10.0, 4.0, 23, 93, 9.18, 0.52),
    (28.1507, 100.5776, 10.0, 4.3, 35, 82, 8.219, 0.34),
    (29.9216, 87.0502, 10.0, 4.5, 53, 77, 2.623, 0.58),
    (29.5132, 98.7281, 14.458, 4.6, 64, 76, 7.666, 0.4),
)

# Transpose the records so `data` maps each column name to its list of values.
data = {
    column: list(values)
    for column, values in zip(column_names, zip(*records))
}

# Tabular view of the mock data, one column per key of `data`.
df = pd.DataFrame(data)

# Normalize data
# Standardized copy of the whole frame, kept for exploratory statistics only
# (the correlation heatmap). It is deliberately NOT used for model training
# or evaluation, because its statistics are computed from every row.
df_scaled = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns)

# Correlation Matrix
corr_matrix = df_scaled.corr()

# Split data into train and test sets
# The raw rows are split BEFORE any scaler is fitted so that held-out rows
# never contribute to the means/variances used to transform the training data.
X = df.drop(columns=['magnitude'])
y = df['magnitude']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaler: fitted on training rows only, then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# Target scaler: the target stays standardized (as before), but its statistics
# now come from the training targets only.
target_scaler = StandardScaler().fit(y_train.to_frame())
y_train = pd.Series(
    target_scaler.transform(y_train.to_frame()).ravel(),
    index=y_train.index,
    name=y.name,
)
y_test = pd.Series(
    target_scaler.transform(y_test.to_frame()).ravel(),
    index=y_test.index,
    name=y.name,
)

# Initialize models
# Maps a display name to an unfitted regressor. The tree/ensemble estimators
# are randomized, so they are seeded with the same value used for the
# train/test split; without a seed their metrics change on every run.
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'KNN': KNeighborsRegressor(),
    'Extra Trees': ExtraTreesRegressor(random_state=42),
    'SVM': SVR(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Bagging': BaggingRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

# Per-model evaluation scores, keyed by the model's display name.
results = {}

# Train every regressor on the training split, then score it on the test split.
for name in models:
    # fit() returns the estimator itself, so `model` is the fitted instance
    # stored in `models`.
    model = models[name].fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    results[name] = dict(MSE=mse, R2=r2)

# Feature importance from Extra Trees
# Reuse the Extra Trees estimator that was already fitted and scored in the
# evaluation loop. Refitting a second, independently randomized forest would
# report importances for a different model than the one whose MSE/R2 appear
# in `results`.
extra_trees = models['Extra Trees']
# One importance value per column of X_train, in column order.
feature_importances = extra_trees.feature_importances_

# Draw the correlation matrix as an annotated heatmap.
heatmap_options = dict(annot=True, cmap='coolwarm', linewidths=0.5)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()

# Output the results and feature importance
# The importances are ordered like the feature matrix columns, so label them
# with X.columns. Slicing df_scaled.columns[:-1] would wrongly include the
# target ('magnitude') and drop the last feature ('rms').
results, feature_importances, X.columns


({'Linear Regression': {'MSE': 4.332875885065431, 'R2': -6.625861557715164},
  'Lasso': {'MSE': 0.6036931818181814, 'R2': -0.0625},
  'Ridge': {'MSE': 1.0225493099206173, 'R2': -0.7996867854602878},
  'KNN': {'MSE': 0.25252525252525265, 'R2': 0.555555555555555},
  'Extra Trees': {'MSE': 0.9124368686868689, 'R2': -0.6058888888888905},
  'SVM': {'MSE': 1.0427467017526293, 'R2': -0.8352341950846289},
  'Random Forest': {'MSE': 0.49042929292929316, 'R2': 0.13684444444444344},
  'Bagging': {'MSE': 0.6565656565656565, 'R2': -0.1555555555555561},
  'Gradient Boosting': {'MSE': 0.8174882620035825, 'R2': -0.4387793411263061}},
 array([0.05238846, 0.0845332 , 0.14962622, 0.36561802, 0.16312272,
        0.15496228, 0.0297491 ]),
 Index(['latitude', 'longitude', 'depth', 'magnitude', 'nst', 'gap', 'dmin'], dtype='object'))