In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# === 1. Load dataset ===
df = pd.read_csv(r"C:\Users\sohie\OneDrive\Desktop\NASA Space Apps 2025\New folder\SET 4.csv")

# === 2. Define features and label ===
features = [
    "ra", "dec", "pl_orbper", "pl_trandurh", "pl_trandep",
    "pl_rade", "pl_insol", "pl_eqt", "st_teff", "st_rad"
]
label_col = "tfopwg_disp"

# Keep only necessary columns
df = df[features + [label_col]]

# Drop rows with missing labels
df = df.dropna(subset=[label_col])

# Map labels: Candidate=1, False Positive=0
y = df[label_col].map({"CANDIDATE": 1, "FALSE POSITIVE": 0})
X = df[features]

# Fill missing values with median
X = X.fillna(X.median())

# === 3. Train/test split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# === 4. Scale features ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === 5. Train model ===
clf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced")
clf.fit(X_train_scaled, y_train)

# === 6. Evaluate ===
y_pred = clf.predict(X_test_scaled)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["FALSE POSITIVE", "CANDIDATE"]))

# === 7. Save model and scaler ===
joblib.dump(clf, "exoplanet_model.joblib")
joblib.dump(scaler, "exoplanet_scaler.joblib")
joblib.dump(features, "exoplanet_features.joblib")

print("\n✅ Model, scaler, and feature list saved.")



Confusion Matrix:
[[ 811  274]
 [ 147 1562]]

Classification Report:
                precision    recall  f1-score   support

FALSE POSITIVE       0.85      0.75      0.79      1085
     CANDIDATE       0.85      0.91      0.88      1709

      accuracy                           0.85      2794
     macro avg       0.85      0.83      0.84      2794
  weighted avg       0.85      0.85      0.85      2794


✅ Model, scaler, and feature list saved.


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# === 1. Load dataset ===
df = pd.read_csv(r"C:\Users\sohie\OneDrive\Desktop\NASA Space Apps 2025\New folder\SET 4.csv")

# === 2. Define features and label ===
features = [
    "ra", "dec", "pl_orbper", "pl_trandurh", "pl_trandep",
    "pl_rade", "pl_insol", "pl_eqt", "st_teff", "st_rad"
]
label_col = "tfopwg_disp"

# Keep only necessary columns
df = df[features + [label_col]]

# Drop rows with missing labels
df = df.dropna(subset=[label_col])

# Map labels: Candidate=1, False Positive=0
y = df[label_col].map({"CANDIDATE": 1, "FALSE POSITIVE": 0})
X = df[features]

# --- MODIFICATION START ---

# Calculate the medians of the training data features.
# We save this to fill missing values in the prediction script.
X_medians = X.median()

# Fill missing values with the calculated medians
X = X.fillna(X_medians)

# --- MODIFICATION END ---

# === 3. Train/test split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# === 4. Scale features ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === 5. Train model ===
clf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced")
clf.fit(X_train_scaled, y_train)

# === 6. Evaluate ===
y_pred = clf.predict(X_test_scaled)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["FALSE POSITIVE", "CANDIDATE"]))

# === 7. Save model and scaler ===
joblib.dump(clf, "exoplanet_model.joblib")
joblib.dump(scaler, "exoplanet_scaler.joblib")
joblib.dump(features, "exoplanet_features.joblib")
# NEW: Save the medians file
joblib.dump(X_medians, "exoplanet_medians.joblib")

print("\n✅ Model, scaler, feature list, and MEDIANS saved.")



Confusion Matrix:
[[ 811  274]
 [ 147 1562]]

Classification Report:
                precision    recall  f1-score   support

FALSE POSITIVE       0.85      0.75      0.79      1085
     CANDIDATE       0.85      0.91      0.88      1709

      accuracy                           0.85      2794
     macro avg       0.85      0.83      0.84      2794
  weighted avg       0.85      0.85      0.85      2794


✅ Model, scaler, feature list, and MEDIANS saved.
