In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [7]:
# Read the dataset CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../Data/df_normalized_encoded.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,semi_major_axis,eccentricity,inclination,longitude_node,perihelion,perihelion_distance,aphelion_distance,orbital_period,data_arc,n_obs_used,absolute_magnitude,diameter,albedo,min_orbit_intersection,mean_motion,mean_anomaly,near_earth_object_N,near_earth_object_Y,physical_hazardous_asteroid_N,physical_hazardous_asteroid_Y
0,-0.50763,0.249189,-0.373376,-1.188765,-1.43335,-0.762226,-0.302052,-0.147092,8.133003,3.468336,-4.631043,9.155872,1.037744,-0.763128,0.700901,1.230759,0.077199,-0.077199,0.03832,-0.03832
1,-0.514063,-1.010012,-0.785863,-1.01374,-0.206433,-0.274327,-0.4987,-0.14876,8.054713,4.426078,-3.952145,6.852597,1.046813,-0.333389,0.714139,0.061231,0.077199,-0.077199,0.03832,-0.03832
2,-0.715834,-0.780444,-0.101824,0.162283,-0.472614,-0.645042,-0.592504,-0.200239,7.783764,3.327287,-4.211499,8.294674,0.638693,-0.687464,1.163836,0.327977,0.077199,-0.077199,0.03832,-0.03832
3,-0.869666,-1.045657,-1.381633,-0.09908,0.667505,-0.765284,-0.726954,-0.238404,6.846989,3.889743,-3.372412,4.87532,0.194296,-0.788116,1.558003,0.431769,0.077199,-0.077199,0.03832,-0.03832
4,-0.29341,-1.454459,-0.5676,-1.274602,-0.482708,0.224671,-0.432387,-0.090671,7.273796,3.154894,-4.585274,8.851156,0.928912,0.207375,0.293646,-1.605449,0.077199,-0.077199,0.03832,-0.03832


In [8]:
# Subset selection: 'diameter' (used as the regression target by the model
# cells) plus the four columns used as predictors.
features = [
    'diameter',
    'data_arc',
    'absolute_magnitude',
    'albedo',
    'min_orbit_intersection',
]

# Restrict the working frame to just those columns.
df = df.loc[:, features]

In [9]:
# Pairwise Pearson correlation between the selected columns.
df.corr(method='pearson')

Unnamed: 0,diameter,data_arc,absolute_magnitude,albedo,min_orbit_intersection
diameter,1.0,0.491908,-0.73667,-0.200817,0.479004
data_arc,0.491908,1.0,-0.627962,0.297693,-0.053604
absolute_magnitude,-0.73667,-0.627962,1.0,-0.272602,-0.378228
albedo,-0.200817,0.297693,-0.272602,1.0,-0.285879
min_orbit_intersection,0.479004,-0.053604,-0.378228,-0.285879,1.0


# MLP

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Split data into training and testing sets: 'diameter' is the target, every
# other selected column is a predictor, and 20% of the rows are held out.
# NOTE(review): this split uses random_state=42 while the other model cells use
# random_state=123, so the held-out rows differ between models.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('diameter', axis=1), df['diameter'], test_size=0.2, random_state=42)

# Fit multilayer perceptron model (two hidden layers of 100 and 50 units)
mlp = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)

# Make predictions on test set
y_pred = mlp.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
# RMSE is reported as well so the error is on the same scale as the RMSE
# printed by the other model cells.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Adjusted R^2: R^2 penalised for the number of predictors p, given n test rows
n = len(y_test)
p = X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)

print("Mean Squared Error:", mse)
print("RMSE:", rmse)
print("R^2 Score:", r2)
print("Adjusted R^2 Score:", adj_r2)


Mean Squared Error: 0.027309886239456054
R^2 Score: 0.9733817299907619
Adjusted R^2 Score: 0.9733777869814653


# XGB

In [11]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Split data into training and testing sets ('diameter' is the target;
# 20% of the rows are held out for evaluation)
X = df.drop('diameter', axis=1)
y = df['diameter']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Fit XGBoost model with the library's default hyperparameters
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Calculate R-squared and RMSE.
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the old call raises TypeError on current releases.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate adjusted R-squared (n test rows, p predictors)
n = X_test.shape[0]
p = X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

# Print model performance metrics
print("R-squared: ", r2)
print("Adjusted R-squared: ", adj_r2)
print("RMSE: ", rmse)


R-squared:  0.9727286858334205
Adjusted R-squared:  0.9727246460875898
RMSE:  0.16371130694832972


# Gradient Boosting

In [12]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Split data into training and testing sets ('diameter' is the target;
# 20% of the rows are held out for evaluation)
X = df.drop('diameter', axis=1)
y = df['diameter']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Fit Gradient Boosting Regression model with scikit-learn's default hyperparameters
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = gb_model.predict(X_test)

# Calculate R-squared, adjusted R-squared, and RMSE.
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the old call raises TypeError on current releases.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
n = len(y_test)   # number of held-out rows
p = X.shape[1]    # number of predictors
adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)

# Print model performance metrics
print("R-squared: ", r2)
print("Adjusted R-squared: ", adj_r2)
print("RMSE: ", rmse)


R-squared:  0.9716832329993251
Adjusted R-squared:  0.9716790383888003
RMSE:  0.16681975383846637


# Linear Regression

In [13]:
from sklearn.linear_model import LinearRegression

# Split data into training and testing sets ('diameter' is the target;
# 20% of the rows are held out for evaluation)
X = df.drop('diameter', axis=1)
y = df['diameter']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Fit Linear Regression model (ordinary least squares)
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test)

# Calculate R-squared, adjusted R-squared, and RMSE.
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the old call raises TypeError on current releases.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
n = len(y_test)   # number of held-out rows
p = X.shape[1]    # number of predictors
adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)

# Print model performance metrics
print("R-squared: ", r2)
print("Adjusted R-squared: ", adj_r2)
print("RMSE: ", rmse)


R-squared:  0.7272718439647792
Adjusted R-squared:  0.7272314442823683
RMSE:  0.51771485962695
