In [1]:
import sys
print(sys.executable)  # just to confirm your current Python env

# Install packages if needed
!pip install scikit-learn==1.4.2 imbalanced-learn==0.12.4 matplotlib seaborn pandas


C:\Users\Raji_kabilan\miniconda3\envs\exoplanet\python.exe


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE

# ===============================
# 1. Load preprocessed data
# ===============================
use_cols = ['P_RADIUS', 'P_MASS', 'P_PERIOD', 'P_HABITABLE']
df = pd.read_csv("cleaned_data.csv", usecols=use_cols)

# Features and target
X = df[['P_RADIUS', 'P_MASS', 'P_PERIOD']]
y = df['P_HABITABLE']

# ===============================
# 2. Scale Features
# ===============================
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# ===============================
# 3. Train-Test Split
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)


In [3]:

# ===============================
# 4. Baseline Models (Without SMOTE)
# ===============================
models = {
    'LogisticRegression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    'LinearSVM': SVC(class_weight='balanced', probability=True, random_state=42),
    'RandomForest': RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
}

print("=== Baseline Models (No SMOTE) ===")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(f"\nModel: {name}")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


=== Baseline Models (No SMOTE) ===

Model: LogisticRegression
              precision    recall  f1-score   support

           0       0.99      0.52      0.69       799
           1       0.00      0.25      0.01         4
           2       0.02      0.43      0.04         7

    accuracy                           0.52       810
   macro avg       0.34      0.40      0.24       810
weighted avg       0.97      0.52      0.68       810

Confusion Matrix:
[[419 223 157]
 [  3   1   0]
 [  2   2   3]]

Model: LinearSVM
              precision    recall  f1-score   support

           0       1.00      0.50      0.67       799
           1       0.00      0.25      0.01         4
           2       0.04      1.00      0.07         7

    accuracy                           0.51       810
   macro avg       0.35      0.58      0.25       810
weighted avg       0.98      0.51      0.66       810

Confusion Matrix:
[[403 207 189]
 [  2   1   1]
 [  0   0   7]]

Model: RandomForest
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:

# ===============================
# 5. Apply SMOTE on Training Data
# ===============================
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

print("\nBefore SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_sm.value_counts())


Before SMOTE: P_HABITABLE
0    3194
2      27
1      17
Name: count, dtype: int64
After SMOTE: P_HABITABLE
0    3194
2    3194
1    3194
Name: count, dtype: int64


In [5]:
# ===============================
# 6. Models with SMOTE
# ===============================
print("\n=== Models Trained with SMOTE ===")
for name, model in models.items():
    model.fit(X_train_sm, y_train_sm)
    y_pred_sm = model.predict(X_test)
    
    print(f"\nModel: {name}")
    print(classification_report(y_test, y_pred_sm))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred_sm))



=== Models Trained with SMOTE ===

Model: LogisticRegression
              precision    recall  f1-score   support

           0       0.99      0.52      0.68       799
           1       0.00      0.25      0.01         4
           2       0.02      0.43      0.04         7

    accuracy                           0.51       810
   macro avg       0.34      0.40      0.24       810
weighted avg       0.97      0.51      0.67       810

Confusion Matrix:
[[412 232 155]
 [  3   1   0]
 [  2   2   3]]

Model: LinearSVM
              precision    recall  f1-score   support

           0       1.00      0.46      0.63       799
           1       0.01      0.50      0.02         4
           2       0.04      1.00      0.07         7

    accuracy                           0.46       810
   macro avg       0.35      0.65      0.24       810
weighted avg       0.98      0.46      0.62       810

Confusion Matrix:
[[364 246 189]
 [  1   2   1]
 [  0   0   7]]

Model: RandomForest
         

In [6]:

# ===============================
# 7. Optional: Cross-Validation (Stratified K-Fold)
# ===============================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, model in models.items():
    scores = cross_val_score(model, X_train_sm, y_train_sm, cv=skf, scoring='f1_weighted')
    print(f"\n{name} - CV Weighted F1: {np.mean(scores):.4f} ± {np.std(scores):.4f}")


LogisticRegression - CV Weighted F1: 0.4573 ± 0.0115

LinearSVM - CV Weighted F1: 0.7134 ± 0.0144

RandomForest - CV Weighted F1: 0.9059 ± 0.0014
