In [4]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler 
from sklearn.impute import KNNImputer
from sklearn.neighbors import NearestNeighbors 
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler 
import warnings
warnings.filterwarnings('ignore') 

In [5]:
# --- Fill missing P_TEMP_SURF from P_TEMP_EQUIL with a gradient-boosted regressor ---
dt = pd.read_csv('exoplanet_dataset.csv')

# The regressor's only predictor is P_TEMP_EQUIL, so only rows where it is
# present take part; rows without it keep whatever P_TEMP_SURF they had.
has_equil = dt['P_TEMP_EQUIL'].notna()
has_surf = dt['P_TEMP_SURF'].notna()
train_mask = has_equil & has_surf
predict_mask = has_equil & ~has_surf

# Train on the rows where both temperatures are known.
train_data = dt.loc[train_mask, ['P_TEMP_EQUIL', 'P_TEMP_SURF']]
x = train_data[['P_TEMP_EQUIL']]
y = train_data['P_TEMP_SURF']
model = XGBRegressor(n_estimators=1000, learning_rate=0.01, max_depth=7, random_state=42)
model.fit(x, y)

# Write the predictions directly into `dt` through a boolean mask. This avoids
# assigning into a filtered copy and the concat/sort round-trips that were
# only needed to stitch that copy back together.
if predict_mask.any():
    x_test = dt.loc[predict_mask, ['P_TEMP_EQUIL']]
    dt.loc[predict_mask, 'P_TEMP_SURF'] = model.predict(x_test)

In [6]:
# --- Fill missing P_ECCENTRICITY with a small dense regression network ---

# errors='ignore' keeps this cell re-runnable after the columns are already gone.
dt = dt.drop(
    ['P_INCLINATION', 'P_OMEGA', 'S_NAME_HD', 'S_NAME_HIP', 'S_TYPE'],
    axis=1,
    errors='ignore',
)

# Predictors used by the network.
columns = [
    "P_MASS",
    "P_PERIASTRON",
    "P_APASTRON",
    "P_DISTANCE_EFF",
    "S_TIDAL_LOCK",
    "P_HILL_SPHERE",
]

# Only rows with every predictor present can be used for training or prediction.
features_complete = dt[columns].notna().all(axis=1)
target_known = dt['P_ECCENTRICITY'].notna()
train_mask = features_complete & target_known
predict_mask = features_complete & ~target_known

x = dt.loc[train_mask, columns]
y = dt.loc[train_mask, 'P_ECCENTRICITY']

# The scaler is fitted on all labelled rows (before the train/validation split),
# and the same fitted scaler is reused for the rows to predict.
scaler = StandardScaler()
x = scaler.fit_transform(x)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

model = Sequential([
    Dense(64, activation='relu', input_shape=(x.shape[1],)),
    Dense(32, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1),
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

# NOTE(review): no TensorFlow seed is set in the visible cells, so the imputed
# values will presumably differ from run to run -- confirm whether that matters.
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=50,
    batch_size=8,
    verbose=1,
)

# Skip prediction when nothing is missing (scaling an empty frame would raise).
if predict_mask.any():
    x_predict = scaler.transform(dt.loc[predict_mask, columns])
    # The single-unit output layer yields an (n, 1) array; flatten it so it is
    # written as a plain 1-D column, directly into `dt` rather than into a copy.
    predicted_values = model.predict(x_predict).ravel()
    dt.loc[predict_mask, 'P_ECCENTRICITY'] = predicted_values

# Keep only the columns used by the remaining cells.
dt = dt[[
    'P_NAME', 'P_MASS', 'P_RADIUS', 'P_PERIOD', 'P_SEMI_MAJOR_AXIS',
    'P_ECCENTRICITY', 'S_DISTANCE', 'S_TEMPERATURE', 'S_MASS', 'S_RADIUS',
    'P_ESCAPE', 'P_POTENTIAL', 'P_GRAVITY', 'P_DENSITY', 'P_HILL_SPHERE',
    'P_DISTANCE', 'P_PERIASTRON', 'P_APASTRON', 'P_DISTANCE_EFF', 'P_FLUX',
    'P_TEMP_EQUIL', 'P_TEMP_SURF', 'P_TYPE', 'S_TYPE_TEMP', 'S_LUMINOSITY',
    'S_SNOW_LINE', 'S_ABIO_ZONE', 'S_TIDAL_LOCK', 'P_HABZONE_OPT',
    'P_HABZONE_CON', 'P_TYPE_TEMP', 'P_HABITABLE', 'S_LOG_G',
]]

Epoch 1/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - loss: 0.0284 - mae: 0.0903 - val_loss: 0.0170 - val_mae: 0.0822
Epoch 2/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.0154 - mae: 0.0775 - val_loss: 0.0174 - val_mae: 0.0741
Epoch 3/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0151 - mae: 0.0749 - val_loss: 0.0125 - val_mae: 0.0679
Epoch 4/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0109 - mae: 0.0661 - val_loss: 0.0109 - val_mae: 0.0613
Epoch 5/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0119 - mae: 0.0670 - val_loss: 0.0109 - val_mae: 0.0623
Epoch 6/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0101 - mae: 0.0633 - val_loss: 0.0107 - val_mae: 0.0604
Epoch 7/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - 

In [7]:
# --- KNN-impute every remaining gap in the numeric columns ---
numerical_features = dt.select_dtypes(include=['float64', 'int64']).columns

# Work on a separate frame in which infinities count as missing values.
data = dt[numerical_features].replace([np.inf, -np.inf], np.nan)

# Standardise first so that neighbour distances are not dominated by
# columns with large magnitudes.
k = 5
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=k, weights='distance')
scaled_data = scaler.fit_transform(data)
scaled_data_imputed = imputer.fit_transform(scaled_data)

# NOTE(review): this neighbour index is fitted but not queried by any cell
# visible here -- confirm whether it is still needed.
kd_tree = NearestNeighbors(n_neighbors=k, algorithm='kd_tree')
kd_tree.fit(scaled_data_imputed)

# Undo the standardisation and write the completed values back into `dt`.
imputed_data_rescaled = scaler.inverse_transform(scaled_data_imputed)
dt[numerical_features] = imputed_data_rescaled

In [8]:
# Predictor sets for the planet and star temperature-type labels.
p_type_features = ['P_TEMP_EQUIL', 'P_FLUX', 'P_RADIUS', 'P_GRAVITY']
s_type_features = ['S_TEMPERATURE', 'S_LUMINOSITY', 'S_RADIUS', 'S_MASS']

# Mean-fill each predictor group; the imputer is re-fitted per group.
imputer = SimpleImputer(strategy='mean')
for feature_group in (p_type_features, s_type_features):
    dt[feature_group] = imputer.fit_transform(dt[feature_group])

# Missing labels become the literal class 'Unknown' before integer encoding.
label_encoder_p, label_encoder_s = LabelEncoder(), LabelEncoder()
planet_labels = dt['P_TYPE_TEMP'].fillna('Unknown')
star_labels = dt['S_TYPE_TEMP'].fillna('Unknown')
dt['P_TYPE_TEMP_ENC'] = label_encoder_p.fit_transform(planet_labels)
dt['S_TYPE_TEMP_ENC'] = label_encoder_s.fit_transform(star_labels)


def impute_with_model(dt, target_col, features, label_encoder):
    """Replace the encoded 'Unknown' class in ``target_col`` with classifier predictions.

    ``target_col`` holds integer codes produced by ``label_encoder``, so the
    placeholder rows are located through the code that ``label_encoder``
    assigned to the label ``'Unknown'``. Comparing the integer column with the
    string ``'Unknown'`` never matches, which would leave every placeholder
    in place.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame containing ``features`` and ``target_col``; modified in place.
    target_col : str
        Name of the integer-encoded label column to complete.
    features : list of str
        Columns used as classifier inputs.
    label_encoder : fitted encoder exposing ``classes_``
        The encoder that produced ``target_col``.

    Returns
    -------
    pandas.DataFrame
        The same ``dt`` object. It is returned unchanged when the encoder has
        no 'Unknown' class, when no row carries it, or when every row carries
        it (nothing to train on).
    """
    classes = list(label_encoder.classes_)
    if 'Unknown' not in classes:
        return dt

    # The position within classes_ is the integer code used by the encoder.
    unknown_code = classes.index('Unknown')
    unknown_mask = dt[target_col] == unknown_code
    if not unknown_mask.any() or unknown_mask.all():
        return dt

    x_train = dt.loc[~unknown_mask, features]
    y_train = dt.loc[~unknown_mask, target_col]
    x_predict = dt.loc[unknown_mask, features]

    # Seeded so that repeated runs fill in the same labels.
    clf = RandomForestClassifier(random_state=42)
    clf.fit(x_train, y_train)

    dt.loc[unknown_mask, target_col] = clf.predict(x_predict)
    return dt


# Pass each encoded label column through impute_with_model with its own
# predictor set and encoder.
for encoded_col, feature_set, encoder in (
    ('P_TYPE_TEMP_ENC', p_type_features, label_encoder_p),
    ('S_TYPE_TEMP_ENC', s_type_features, label_encoder_s),
):
    dt = impute_with_model(dt, encoded_col, feature_set, encoder)

# Decode the integer codes back into their string labels.
for label_col, encoded_col, encoder in (
    ('P_TYPE_TEMP', 'P_TYPE_TEMP_ENC', label_encoder_p),
    ('S_TYPE_TEMP', 'S_TYPE_TEMP_ENC', label_encoder_s),
):
    dt[label_col] = encoder.inverse_transform(dt[encoded_col])

# The helper columns are no longer needed once the labels are decoded.
dt.drop(columns=['P_TYPE_TEMP_ENC', 'S_TYPE_TEMP_ENC'], inplace=True)

In [9]:
# Predictors used for the planet-type (P_TYPE) label.
p_type_features = ['P_TEMP_EQUIL', 'P_FLUX', 'P_RADIUS', 'P_GRAVITY']

# Mean-fill any gaps left in the predictors.
imputer = SimpleImputer(strategy='mean')
filled_features = imputer.fit_transform(dt[p_type_features])
dt[p_type_features] = filled_features

# Missing P_TYPE labels become the literal class 'Unknown' before encoding.
# The helper column name is shared with the later statements of this cell.
label_encoder_p = LabelEncoder()
planet_types = dt['P_TYPE'].fillna('Unknown')
dt['P_TYPE_TEMP_ENC'] = label_encoder_p.fit_transform(planet_types)


def impute_with_model(dt, target_col, features, label_encoder):
    """Replace the encoded 'Unknown' class in ``target_col`` with classifier predictions.

    ``target_col`` holds integer codes produced by ``label_encoder``, so the
    placeholder rows are located through the code that ``label_encoder``
    assigned to the label ``'Unknown'``. Comparing the integer column with the
    string ``'Unknown'`` never matches, which would leave every placeholder
    in place.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame containing ``features`` and ``target_col``; modified in place.
    target_col : str
        Name of the integer-encoded label column to complete.
    features : list of str
        Columns used as classifier inputs.
    label_encoder : fitted encoder exposing ``classes_``
        The encoder that produced ``target_col``.

    Returns
    -------
    pandas.DataFrame
        The same ``dt`` object. It is returned unchanged when the encoder has
        no 'Unknown' class, when no row carries it, or when every row carries
        it (nothing to train on).
    """
    classes = list(label_encoder.classes_)
    if 'Unknown' not in classes:
        return dt

    # The position within classes_ is the integer code used by the encoder.
    unknown_code = classes.index('Unknown')
    unknown_mask = dt[target_col] == unknown_code
    if not unknown_mask.any() or unknown_mask.all():
        return dt

    x_train = dt.loc[~unknown_mask, features]
    y_train = dt.loc[~unknown_mask, target_col]
    x_predict = dt.loc[unknown_mask, features]

    # Seeded so that repeated runs fill in the same labels.
    clf = RandomForestClassifier(random_state=42)
    clf.fit(x_train, y_train)

    dt.loc[unknown_mask, target_col] = clf.predict(x_predict)
    return dt


# Pass the encoded P_TYPE column through impute_with_model, then decode it.
encoded_col = 'P_TYPE_TEMP_ENC'
dt = impute_with_model(dt, encoded_col, p_type_features, label_encoder_p)

# Map the integer codes back to their string labels.
decoded_types = label_encoder_p.inverse_transform(dt[encoded_col])
dt['P_TYPE'] = decoded_types

# The helper column is no longer needed once P_TYPE is decoded.
dt.drop(columns=[encoded_col], inplace=True)

In [11]:
def compute_atmospheric_retention(row):
    """Score a planet's atmospheric retention relative to Earth, capped at 100.

    Reads ``row['P_ESCAPE']`` and ``row['P_TEMP_SURF']`` and forms the ratio
    ``(P_ESCAPE / 11.2) * (288.0 / P_TEMP_SURF)``. A ratio of 1 or more scores
    100; anything smaller scores ``100 * ratio``.

    Returns NaN when either input is NaN or the surface temperature is 0.

    NOTE(review): the reference values assume P_ESCAPE is in km/s and
    P_TEMP_SURF in K -- confirm against the dataset's column units.
    """
    EARTH_ESCAPE = 11.2  # km/s
    EARTH_TEMP = 288.0   # K

    escape_velocity = row['P_ESCAPE']
    surface_temp = row['P_TEMP_SURF']

    # Guard clause: missing inputs, or a zero temperature (division by zero).
    inputs_missing = np.isnan(escape_velocity) or np.isnan(surface_temp)
    if inputs_missing or surface_temp == 0:
        return np.nan

    # Earth itself gives (11.2 / 11.2) * (288 / 288) = 1.
    relative_retention = (escape_velocity / EARTH_ESCAPE) * (EARTH_TEMP / surface_temp)

    # Piecewise-linear mapping with a ceiling at 100.
    if relative_retention >= 1:
        return 100
    return 100 * relative_retention

# Revised formula for Long-Term Stability:
def compute_long_term_stability(row):
    """Score long-term stability relative to Earth, capped at 100.

    Reads ``row['P_ECCENTRICITY']``, ``row['S_LUMINOSITY']`` and
    ``row['P_SEMI_MAJOR_AXIS']`` and forms the ratio
    ``(1 - e) * (1.0 / S_LUMINOSITY) * (P_SEMI_MAJOR_AXIS / 1.0)``.
    A ratio of 1 or more scores 100; anything smaller scores ``100 * ratio``.

    Returns NaN when any input is NaN or the luminosity is 0.
    """
    EARTH_LUM = 1.0  # solar luminosity
    EARTH_SMA = 1.0  # AU

    eccentricity = row['P_ECCENTRICITY']
    luminosity = row['S_LUMINOSITY']
    semi_major_axis = row['P_SEMI_MAJOR_AXIS']

    # Guard clause: missing inputs, or a zero luminosity (division by zero).
    inputs_missing = (
        np.isnan(eccentricity) or np.isnan(luminosity) or np.isnan(semi_major_axis)
    )
    if inputs_missing or luminosity == 0:
        return np.nan

    # Earth itself gives (1 - 0) * (1 / 1) * (1 / 1) = 1.
    relative_stability = (
        (1 - eccentricity) * (EARTH_LUM / luminosity) * (semi_major_axis / EARTH_SMA)
    )

    # Piecewise-linear mapping with a ceiling at 100.
    if relative_stability >= 1:
        return 100
    return 100 * relative_stability

# Apply these functions to create new columns:
# Evaluate each row-wise score function and store the result as a new column.
for score_column, score_function in (
    ('Atmospheric_Retention', compute_atmospheric_retention),
    ('Long_Term_Stability', compute_long_term_stability),
):
    dt[score_column] = dt.apply(score_function, axis=1)
def compute_esi(row):
    """Compute an Earth Similarity Index for one row, scaled to 0-100.

    Reads ``row['P_RADIUS']``, ``row['P_DENSITY']``, ``row['P_ESCAPE']`` and
    ``row['P_TEMP_SURF']``. Each quantity is turned into a weighted similarity
    ``(1 - |x - x_earth| / (x + x_earth)) ** w``. Radius and density are
    combined into an interior term and escape velocity and temperature into a
    surface term (geometric mean of each pair), and the result is 100 times
    the geometric mean of those two terms.

    Returns NaN when any input is NaN or negative. A negative input would make
    the similarity base negative or greater than 1, producing complex or
    out-of-range scores, so it is treated as invalid data.

    NOTE(review): the reference values assume P_RADIUS is in Earth radii,
    P_DENSITY in g/cm^3, P_ESCAPE in km/s and P_TEMP_SURF in K -- confirm
    against the dataset's column units.
    """
    # Reference Earth values:
    earth_radius = 1.0    # Earth radii
    earth_density = 5.51  # g/cm^3
    earth_escape = 11.2   # km/s
    earth_temp = 288.0    # K

    # Exponent weights applied to each similarity term.
    w_radius = 0.57
    w_density = 1.07
    w_escape = 0.70
    w_temp = 5.58

    def esi_component(x, x_earth, w):
        """Weighted similarity of ``x`` to ``x_earth``; NaN for negative ``x``."""
        # Negative values are invalid here: they would push the base outside
        # [0, 1], and a fractional power of a negative base is not real.
        if x < 0:
            return np.nan
        # Avoid division by zero; if x and x_earth are both 0, similarity is 1.
        if (x + x_earth) == 0:
            return 1.0
        return (1 - abs(x - x_earth) / (x + x_earth)) ** w

    esi_radius = esi_component(row['P_RADIUS'], earth_radius, w_radius)
    esi_density = esi_component(row['P_DENSITY'], earth_density, w_density)
    esi_escape = esi_component(row['P_ESCAPE'], earth_escape, w_escape)
    esi_temp = esi_component(row['P_TEMP_SURF'], earth_temp, w_temp)

    # Interior and surface terms are geometric means of their two components.
    esi_interior = np.sqrt(esi_radius * esi_density)
    esi_surface = np.sqrt(esi_escape * esi_temp)

    # Overall ESI is the geometric mean of the two terms, scaled to 0-100.
    esi = np.sqrt(esi_interior * esi_surface)
    return 100 * esi
# Score every row with compute_esi, store the result, then write the frame
# to CSV without the index column.
esi_scores = dt.apply(compute_esi, axis=1)
dt['ESI'] = esi_scores
dt.to_csv("processed_dataset.csv", index=False)