In [6]:
import pandas as pd
import numpy as np
import time
import pickle
import joblib
import tempfile
import os
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [4]:
# Load the catalogue, report its (rows, columns) shape and preview the first rows.
catalog_path = "../data/noalpha.csv"
df = pd.read_csv(catalog_path)
print(df.shape)
df.head()

(5762, 17)


Unnamed: 0.1,Unnamed: 0,ID,RAdeg,DEdeg,e_RAdeg,e_DEdeg,RApeak,DEpeak,Sint,e_Sint,Speak,e_Speak,rmspeak,e_rmspeak,thetamaj,thetamin,PA
0,0,J022143.11-041344.6,35.42963,-4.22905,2.59,2.65,35.43002,-4.2294,469.39263,0.01424,301.11002,0.00914,0.02774,0.02761,24.06,14.99,136.56
1,1,J022255.74-051817.5,35.73225,-5.30485,2.29,2.07,35.73219,-5.3048,269.69099,0.0188,232.57525,0.01621,0.02984,0.02973,23.57,21.31,95.65
2,2,J022632.54-051328.8,36.63557,-5.22467,2.13,1.98,36.63563,-5.22469,71.48008,0.00953,68.15745,0.00909,0.0185,0.01847,20.27,17.55,60.05
3,3,J022915.86-044216.7,37.31609,-4.70464,3.94,3.38,37.31561,-4.70498,272.04369,0.04092,153.31504,0.02306,0.05453,0.05453,32.7,18.35,53.51
4,4,J021640.74-044404.4,34.16974,-4.73456,2.06,2.08,34.1698,-4.73445,60.58129,0.00945,52.00014,0.00811,0.02091,0.02088,17.02,16.45,34.36


In [25]:
# Remove rows with missing values and the `ID` / `Unnamed: 0` columns, which
# are not used as features.
# The frame is rebound rather than mutated in place, and already-removed
# columns are ignored, so re-running this cell does not raise a KeyError.
df = df.dropna().drop(columns=["ID", "Unnamed: 0"], errors="ignore")


In [26]:
# Pull out the two regression targets; every remaining column is a predictor.
target_columns = ['thetamin', 'thetamaj']
y_min, y_maj = (df[column] for column in target_columns)
X = df.drop(columns=target_columns)


In [27]:
# Split the features and BOTH targets in a single call so the train/test rows
# are guaranteed to line up, instead of relying on two independent calls that
# only agree because they share the same random_state.
(X_train, X_test,
 y_train_min, y_test_min,
 y_train_maj, y_test_maj) = train_test_split(
    X, y_min, y_maj, test_size=0.2, random_state=42
)

# Scaling: the scaler's statistics are fitted on the training rows only and
# then re-used to transform the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
# Candidate single-output regressors, keyed by display name.
# Every estimator that draws random numbers is seeded so the reported metrics
# are reproducible from one run to the next (same seed as the train/test split).
RANDOM_STATE = 42

regressors = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "RandomForest": RandomForestRegressor(random_state=RANDOM_STATE),
    "ExtraTrees": ExtraTreesRegressor(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostRegressor(random_state=RANDOM_STATE),
    "KNeighbors": KNeighborsRegressor(),
    "SVR": SVR(),
    "XGBoost": XGBRegressor(verbosity=0, random_state=RANDOM_STATE),
    "LightGBM": LGBMRegressor(verbose=-1, random_state=RANDOM_STATE),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=RANDOM_STATE)
}

In [29]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted regressor on held-out data and gather model metadata.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test
        Feature matrix passed to ``model.predict``.
    y_test
        Ground-truth targets the predictions are compared against.

    Returns
    -------
    dict
        ``MSE``, ``RMSE``, ``MAE`` and ``R2`` computed on the test data, plus:

        * ``Num_Params`` - ``coef_.size`` when the model has coefficients,
          otherwise the number of fitted sub-estimators (``estimators_``),
          otherwise the number of feature importances, otherwise ``"N/A"``.
        * ``Learning_Rate`` - the model's ``learning_rate`` attribute or ``"N/A"``.
        * ``Model_Size_KB`` - size of the joblib-serialised model in KiB, or
          ``"N/A"`` if it could not be serialised.
    """
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)

    metrics = {
        "MSE": mse,
        "RMSE": np.sqrt(mse),  # Manually compute RMSE
        "MAE": mean_absolute_error(y_test, preds),
        "R2": r2_score(y_test, preds)
    }

    # Number of parameters (if applicable). This stays best-effort: any failure
    # while probing the model degrades to "N/A" instead of aborting the run.
    try:
        if hasattr(model, 'coef_'):
            metrics["Num_Params"] = model.coef_.size
        elif hasattr(model, 'estimators_'):  # Ensemble methods
            # Guard on the attribute that is actually read; checking
            # `n_estimators_` skipped ensembles that only expose `estimators_`.
            metrics["Num_Params"] = len(model.estimators_)
        elif hasattr(model, 'feature_importances_'):
            metrics["Num_Params"] = len(model.feature_importances_)
        else:
            metrics["Num_Params"] = "N/A"
    except Exception:
        metrics["Num_Params"] = "N/A"

    # Learning rate
    metrics["Learning_Rate"] = getattr(model, 'learning_rate', "N/A")

    # Model size in KB. The model is dumped into a private temporary directory
    # that is removed even if serialisation fails, so no stray .pkl is left
    # behind and no file is written by path while another handle holds it open.
    try:
        with tempfile.TemporaryDirectory() as tmp_dir:
            dump_path = os.path.join(tmp_dir, "model.pkl")
            joblib.dump(model, dump_path)
            metrics["Model_Size_KB"] = os.path.getsize(dump_path) / 1024
    except Exception:
        metrics["Model_Size_KB"] = "N/A"

    return metrics

In [30]:
# Fit every regressor once per target on the scaled features and print each
# metric returned by evaluate_model.
# NOTE: y_train / y_test remain bound to the last target (thetamaj) once this
# loop has finished.
target_splits = (
    ("thetamin", y_train_min, y_test_min),
    ("thetamaj", y_train_maj, y_test_maj),
)
for target_name, y_train, y_test in target_splits:
    banner = target_name.upper()
    print(f"\n--- Regression Results for {banner} ---")
    for name in regressors:
        model = regressors[name]
        model.fit(X_train_scaled, y_train)
        results = evaluate_model(model, X_test_scaled, y_test)
        print(f"\n{name}:")
        for metric in results:
            value = results[metric]
            print(f"{metric}: {value}")



--- Regression Results for THETAMIN ---

LinearRegression:
MSE: 7.893106722943778
RMSE: 2.8094673379385955
MAE: 1.6405918223616116
R2: 0.380611587848257
Num_Params: 13
Learning_Rate: N/A
Model_Size_KB: 0.75

Ridge:
MSE: 7.3901979791036565
RMSE: 2.7184918574650276
MAE: 1.6418430952256247
R2: 0.42007587728943496
Num_Params: 13
Learning_Rate: N/A
Model_Size_KB: 0.640625

Lasso:
MSE: 7.696460208962492
RMSE: 2.774249485710053
MAE: 2.2233706653747918
R2: 0.39604284658139854
Num_Params: 13
Learning_Rate: N/A
Model_Size_KB: 0.7177734375

ElasticNet:
MSE: 7.200857801225314
RMSE: 2.6834414100600954
MAE: 2.1585988375000382
R2: 0.43493379271476407
Num_Params: 13
Learning_Rate: N/A
Model_Size_KB: 0.7177734375

DecisionTree:
MSE: 2.8306920208152646
RMSE: 1.6824660533916471
MAE: 0.9591587163920207
R2: 0.7778697415851648
Num_Params: 13
Learning_Rate: N/A
Model_Size_KB: 642.7197265625

RandomForest:
MSE: 1.6459247058022546
RMSE: 1.28293597104542
MAE: 0.6748826539462274
R2: 0.8708408835921622
Num_Param




LightGBM:
MSE: 1.188967289253782
RMSE: 1.0903977665300777
MAE: 0.5894723260606833
R2: 0.9066992773263041
Num_Params: N/A
Learning_Rate: 0.1
Model_Size_KB: 282.376953125

CatBoost:
MSE: 1.1961116735614472
RMSE: 1.0936689049074437
MAE: 0.501926604311518
R2: 0.9061386427108791
Num_Params: 13
Learning_Rate: N/A
Model_Size_KB: 1095.447265625

--- Regression Results for THETAMAJ ---

LinearRegression:
MSE: 5.0427160062615375
RMSE: 2.245599253264379
MAE: 1.2375884323577602
R2: 0.9225992454729103
Num_Params: 13
Learning_Rate: N/A
Model_Size_KB: 0.75

Ridge:
MSE: 5.069782316594853
RMSE: 2.2516177110235325
MAE: 1.239743100743123
R2: 0.9221838041037232
Num_Params: 13
Learning_Rate: N/A
Model_Size_KB: 0.640625

Lasso:
MSE: 5.624780839978562
RMSE: 2.371662041686918
MAE: 1.4536264808616017
R2: 0.9136651200418051
Num_Params: 13
Learning_Rate: N/A
Model_Size_KB: 0.7177734375

ElasticNet:
MSE: 8.190194023407729
RMSE: 2.8618515026827875
MAE: 1.694367912354795
R2: 0.8742885388850231
Num_Params: 13
Learn




CatBoost:
MSE: 4.236860046640526
RMSE: 2.0583634389097876
MAE: 0.6662868097336125
R2: 0.9349683456239735
Num_Params: 13
Learning_Rate: N/A
Model_Size_KB: 1095.462890625


DEEP LEARNING

In [None]:
def ann(input_dim):
    """Build and compile a small fully connected regression network.

    Architecture: Dense(128, relu) -> Dropout(0.2) -> Dense(64, relu) -> Dense(1),
    compiled with Adam (learning rate 0.001) and mean-squared-error loss.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    network = Sequential()
    network.add(Dense(128, activation='relu', input_shape=(input_dim,)))
    network.add(Dropout(0.2))
    network.add(Dense(64, activation='relu'))
    network.add(Dense(1))

    optimizer = Adam(learning_rate=0.001)
    network.compile(optimizer=optimizer, loss='mse')
    return network

In [None]:
# Conv1D expects 3-D input, so append a trailing axis of length 1:
# (n_samples, n_features) -> (n_samples, n_features, 1).
X_train_cnn = np.expand_dims(X_train.values, axis=-1)
X_test_cnn = np.expand_dims(X_test.values, axis=-1)

In [None]:
def cnn(input_shape):
    """Build and compile a small 1-D convolutional regression network.

    Architecture: Conv1D(64, kernel_size=2, relu) -> Dropout(0.2) -> Flatten
    -> Dense(64, relu) -> Dense(1), compiled with Adam (learning rate 0.001)
    and mean-squared-error loss.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample as passed to the first Conv1D layer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    network = Sequential()
    network.add(Conv1D(64, kernel_size=2, activation='relu', input_shape=input_shape))
    network.add(Dropout(0.2))
    network.add(Flatten())
    network.add(Dense(64, activation='relu'))
    network.add(Dense(1))

    optimizer = Adam(learning_rate=0.001)
    network.compile(optimizer=optimizer, loss='mse')
    return network

In [None]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test, epochs=100, batch_size=32):
    """Train a compiled Keras model and report test metrics plus its size.

    Training holds out 20% of the training data for validation
    (``validation_split=0.2``) and uses early stopping with a patience of 10
    epochs, restoring the best weights seen.

    Parameters
    ----------
    model
        Compiled model exposing ``fit``, ``predict``, ``save`` and ``count_params``.
    X_train, y_train
        Training features and targets.
    X_test, y_test
        Held-out features and targets used for the reported metrics.
    epochs : int
        Maximum number of training epochs.
    batch_size : int
        Mini-batch size passed to ``model.fit``.

    Returns
    -------
    dict
        ``MSE``, ``RMSE``, ``MAE``, ``R2``, ``Num_Params`` and ``Model_Size_MB``
        (size of the HDF5-serialised model in MiB).
    """
    early_stop = EarlyStopping(patience=10, restore_best_weights=True)

    # TensorFlow automatically uses the GPU if available — no need for device scope
    model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stop],
        verbose=0
    )

    preds = model.predict(X_test).flatten()
    mse = mean_squared_error(y_test, preds)

    # Measure the serialised size inside a private temporary directory. This
    # avoids clobbering (or leaving behind) a 'temp_model.h5' in the working
    # directory, and the file is cleaned up even if an error occurs.
    with tempfile.TemporaryDirectory() as tmp_dir:
        saved_path = os.path.join(tmp_dir, 'temp_model.h5')
        model.save(saved_path, save_format='h5')
        model_size = os.path.getsize(saved_path) / (1024 ** 2)

    metrics = {
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "MAE": mean_absolute_error(y_test, preds),
        "R2": r2_score(y_test, preds),
        "Num_Params": model.count_params(),
        "Model_Size_MB": model_size
    }
    return metrics

In [40]:
def _print_metrics(title, metrics):
    """Print a title line, then one 'name: value' line per metric (floats to 4 d.p.)."""
    print(title)
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")


# The thetamaj split is named explicitly instead of relying on whatever
# `y_train` / `y_test` happened to be bound to by an earlier loop, so this
# cell gives the same answer regardless of which cells ran before it.

# Train and evaluate ANN
ann_model = ann(X_train.shape[1])
ann_metrics = train_and_evaluate(ann_model, X_train, y_train_maj, X_test, y_test_maj)
_print_metrics("🔷 ANN Metrics:", ann_metrics)

print("\n" + "="*50 + "\n")

# Train and evaluate CNN
cnn_model = cnn((X_train.shape[1], 1))
cnn_metrics = train_and_evaluate(cnn_model, X_train_cnn, y_train_maj, X_test_cnn, y_test_maj)
_print_metrics("🔶 CNN Metrics:", cnn_metrics)

Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op StatelessRandomGetKeyCounter in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op StatelessRandomUniformV2 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Sub in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:GPU:0
Executing o

🔷 ANN Metrics:
MSE: 4.3298
RMSE: 2.0808
MAE: 1.2182
R2: 0.9335
Num_Params: 10113
Model_Size_MB: 0.1484

🔶 CNN Metrics:
MSE: 4.5255
RMSE: 2.1273
MAE: 1.1798
R2: 0.9305
Num_Params: 49473
Model_Size_MB: 0.5978


In [11]:
# Predictor columns and the pair of targets for joint (multi-output) regression.
features = [
    'RAdeg', 'DEdeg', 'e_RAdeg', 'e_DEdeg', 'RApeak', 'DEpeak',
    'Sint', 'e_Sint', 'Speak', 'e_Speak', 'rmspeak', 'e_rmspeak', 'PA',
]
target = ['thetamaj', 'thetamin']

# NOTE: this rebinds X, X_train, X_test, y_train and y_test (used by earlier
# cells) to plain NumPy arrays; y now has one column per target.
X = df.loc[:, features].to_numpy()
y = df.loc[:, target].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# ==== Define Models ====
# Estimators that draw random numbers are seeded (same seed as the train/test
# split) so the printed metrics are reproducible across runs.
RANDOM_STATE = 42

base_models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "DecisionTree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "RandomForest": RandomForestRegressor(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "KNN": KNeighborsRegressor(),
    "XGBoost": XGBRegressor(verbosity=0, random_state=RANDOM_STATE),
    "LightGBM": LGBMRegressor(verbose=-1, random_state=RANDOM_STATE)
}

# ==== Multi-output Wrapper for models that need it ====
# Models named in `native_multi_output` are used as-is; every other model is
# wrapped in MultiOutputRegressor.
native_multi_output = ["DecisionTree", "RandomForest", "KNN", "CatBoost"]
wrapped_models = {}
for name, model in base_models.items():
    if name not in native_multi_output:
        model = MultiOutputRegressor(model)
    wrapped_models[name] = model
# Fit each (possibly wrapped) model on both targets at once, then report the
# test metrics and the size of the joblib-serialised model.
for name, model in wrapped_models.items():
    # Fit the model
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Metrics
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    # Save model temporarily to get size. A private temporary directory is
    # used so the dump is never removed while a handle to it is still open
    # (which fails on Windows) and is cleaned up even if serialisation raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        model_path = os.path.join(tmp_dir, "model.pkl")
        joblib.dump(model, model_path)
        size_mb = os.path.getsize(model_path) / (1024 * 1024)  # Convert bytes to MB

    # Output
    print(f"\nModel: {name}")
    print(f"R² Score: {r2:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"Model Size: {size_mb:.4f} MB")


Model: LinearRegression
R² Score: 0.6516
MSE: 6.4679
RMSE: 2.5432
MAE: 1.4391
Model Size: 0.0012 MB

Model: Ridge
R² Score: 0.6815
MSE: 6.0258
RMSE: 2.4548
MAE: 1.5226
Model Size: 0.0010 MB

Model: Lasso
R² Score: 0.6744
MSE: 6.3587
RMSE: 2.5217
MAE: 1.7947
Model Size: 0.0011 MB

Model: DecisionTree
R² Score: 0.7943
MSE: 6.8387
RMSE: 2.6151
MAE: 1.0964
Model Size: 0.7040 MB

Model: RandomForest
R² Score: 0.8944
MSE: 3.0498
RMSE: 1.7464
MAE: 0.7626
Model Size: 44.4561 MB

Model: GradientBoosting
R² Score: 0.8995
MSE: 2.7438
RMSE: 1.6564
MAE: 0.8236
Model Size: 0.2582 MB

Model: KNN
R² Score: 0.6459
MSE: 13.3936
RMSE: 3.6597
MAE: 1.9350
Model Size: 1.0794 MB

Model: XGBoost
R² Score: 0.9093
MSE: 3.2371
RMSE: 1.7992
MAE: 0.6793
Model Size: 0.7840 MB

Model: LightGBM
R² Score: 0.9188
MSE: 2.9718
RMSE: 1.7239
MAE: 0.6705
Model Size: 0.5465 MB


