In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score
)

from imblearn.over_sampling import SMOTE


In [2]:
# Read the pre-cleaned dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="cleaned_data.csv")

# Preview the first five rows as the cell's rich output.
df.head(n=5)


Unnamed: 0,P_STATUS,P_MASS,P_MASS_ERROR_MIN,P_MASS_ERROR_MAX,P_RADIUS,P_RADIUS_ERROR_MIN,P_RADIUS_ERROR_MAX,P_YEAR,P_PERIOD,P_PERIOD_ERROR_MIN,...,S_CONSTELLATION_ENG_Toucan,S_CONSTELLATION_ENG_Triangle,S_CONSTELLATION_ENG_Twins,S_CONSTELLATION_ENG_Unicorn,S_CONSTELLATION_ENG_Virgin,S_CONSTELLATION_ENG_Water Carrier,S_CONSTELLATION_ENG_Water Snake,S_CONSTELLATION_ENG_Whale,S_CONSTELLATION_ENG_Winged Horse,S_CONSTELLATION_ENG_Wolf
0,3,273.33208,-24.154928,25.108412,2.33168,-0.23541,0.32509,2011,93.36204,-0.002274,...,False,False,False,False,False,False,False,False,False,False
1,3,273.33208,-24.154928,25.108412,2.33168,-0.23541,0.32509,2011,93.36204,-0.002274,...,False,False,False,False,False,False,False,False,False,False
2,3,273.33208,-24.154928,25.108412,2.33168,-0.23541,0.32509,2011,93.36204,-0.002274,...,False,False,False,False,False,False,False,False,False,False
3,3,273.33208,-24.154928,25.108412,2.33168,-0.23541,0.32509,2011,93.36204,-0.002274,...,False,False,False,False,False,False,False,False,False,False
4,3,273.33208,-24.154928,25.108412,2.33168,-0.23541,0.32509,2011,93.36204,-0.002274,...,False,False,False,False,False,False,False,False,False,False


In [3]:
# Target vector: the P_HABITABLE class label of every row.
y = df.loc[:, 'P_HABITABLE']

# Feature matrix: every remaining column (drop returns a new frame; df is untouched).
X = df.drop('P_HABITABLE', axis=1)

# Show how many rows fall in each class.
class_counts = y.value_counts()
print(class_counts)


P_HABITABLE
0    3993
2      34
1      21
Name: count, dtype: int64


In [4]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report the (rows, columns) shape of each feature split.
for split_label, split_frame in (("Train", X_train), ("Test", X_test)):
    print(f"{split_label} shape:", split_frame.shape)


Train shape: (3238, 15932)
Test shape: (810, 15932)


In [5]:
# Oversample the training split only; the test split is never resampled.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling, for comparison.
print("Before SMOTE:", y_train.value_counts(), sep="\n")
print("\nAfter SMOTE:", y_train_smote.value_counts(), sep="\n")


Before SMOTE:
P_HABITABLE
0    3194
2      27
1      17
Name: count, dtype: int64

After SMOTE:
P_HABITABLE
0    3194
2    3194
1    3194
Name: count, dtype: int64


In [6]:
# Settings shared by every candidate classifier: balanced class weights and a
# fixed seed so repeated runs give the same fitted models.
_shared_params = dict(class_weight='balanced', random_state=42)

# Candidate classifiers keyed by display name; insertion order is the order
# in which later cells iterate over them.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, **_shared_params),
    # probability=True makes predict_proba available on the SVM.
    'Linear SVM': SVC(kernel='linear', probability=True, **_shared_params),
    'Random Forest': RandomForestClassifier(n_estimators=200, **_shared_params),
}


In [None]:
# Fit every candidate model on the SMOTE-resampled training data and evaluate it
# on the untouched test split.
#
# `results` maps model name -> dict of summary metrics. It is filled inside the
# loop so a later cell can build a comparison table and sort it by the "F1" key.
results = {}

for name, model in models.items():
    print(f"\n{'='*10} {name} (SMOTE) {'='*10}")

    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_test)

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, digits=4))

    # Weighted averages follow class support, so with this class imbalance they
    # are dominated by class 0; the macro F1 is stored as well for an
    # unweighted view across the three classes.
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    macro_f1 = f1_score(y_test, y_pred, average='macro')

    print(f"Weighted F1-score: {f1:.4f}")
    print(f"Weighted Precision: {precision:.4f}")
    print(f"Weighted Recall: {recall:.4f}")

    # ROC-AUC (one-vs-rest). This must run inside the loop so it is computed for
    # every model, not only the last one. Models without predict_proba keep NaN.
    roc_auc = np.nan
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)
        roc_auc = roc_auc_score(
            y_test,
            y_proba,
            multi_class='ovr',
            average='weighted'
        )
        print(f"ROC-AUC Score (OvR): {roc_auc:.4f}")

    # Record this model's metrics for the comparison table.
    results[name] = {
        "F1": f1,
        "Precision": precision,
        "Recall": recall,
        "Macro F1": macro_f1,
        "ROC-AUC": roc_auc,
    }






STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion Matrix:
[[762   8  29]
 [  0   4   0]
 [  1   4   2]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9987    0.9537    0.9757       799
           1     0.2500    1.0000    0.4000         4
           2     0.0645    0.2857    0.1053         7

    accuracy                         0.9481       810
   macro avg     0.4377    0.7465    0.4936       810
weighted avg     0.9869    0.9481    0.9653       810

Weighted F1-score: 0.9653
Weighted Precision: 0.9869
Weighted Recall: 0.9481



In [None]:
# Turn the per-model metrics dict into a table with one row per model.
results_df = pd.DataFrame(results).transpose()

# Display the models ordered from highest to lowest "F1".
results_df.sort_values("F1", ascending=False)


In [None]:
# Use the random forest from the models dict as the model to inspect.
best_model = models['Random Forest']

# Pair each importance value with its feature column name, largest first.
importances = best_model.feature_importances_
feature_importance = pd.Series(data=importances, index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)

# Show the ten highest-ranked features.
feature_importance.iloc[:10]


In [None]:
# Horizontal bar chart of the ten highest-ranked features.
top_features = feature_importance.head(10)
top_features.plot(title='Top 10 Important Features', kind='barh')


In [None]:
# Score every planet with the model's predicted probability of class 1.
# The probability column is looked up through `classes_` rather than assumed to
# sit at position 1, so a different label ordering cannot silently select the
# wrong class (a missing label 1 raises ValueError instead).
# NOTE(review): X contains every row, including those the model was trained on,
# so scores for training rows are likely optimistic — confirm this is intended.
class_1_column = list(best_model.classes_).index(1)
df['habitability_score'] = best_model.predict_proba(X)[:, class_1_column]

# Rank planets from the highest to the lowest score.
ranked_planets = df.sort_values(
    by='habitability_score',
    ascending=False
)

ranked_planets.head(10)


In [None]:
# Write the ranked table to a CSV file, leaving out the DataFrame index.
ranked_planets.to_csv(path_or_buf="ranked_exoplanets_by_habitability.csv", index=False)


In [None]:
# ===============================
# Visualization of Baseline Models
# ===============================
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Ensure inline plotting
%matplotlib inline

# List of models to plot
model_names = ['Logistic Regression', 'Linear SVM', 'Random Forest']

for name in model_names:
    model = models[name]
    y_pred = model.predict(X_test)
    
    # -------------------------------
    # 1. Confusion Matrix
    # -------------------------------
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{name} - Confusion Matrix (Baseline)")
    plt.xlabel("Predicted Class")
    plt.ylabel("Actual Class")
    plt.show()
    
    # -------------------------------
    # 2. Scatter Plot: Actual vs Predicted
    # -------------------------------
    plt.figure(figsize=(8,5))
    plt.scatter(range(len(y_test)), y_test, c='red', label='Actual', alpha=0.6)
    plt.scatter(range(len(y_pred)), y_pred, c='green', marker='x', label='Predicted', alpha=0.6)
    plt.title(f"{name} - Actual vs Predicted Classes")
    plt.xlabel("Planet Index")
    plt.ylabel("Class (0=Non-Habitable, 1=Conservative, 2=Optimistic)")
    plt.legend()
    plt.show()
