In [1]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Location of the K2 table. The original author's local path is kept as the
# default so existing runs behave the same; set the K2_DATA_CSV environment
# variable to point the notebook at the file on another machine.
DEFAULT_K2_CSV = r"E:\Nasa Space App 2025\Hunting_For_Exoplanets_With_AI_Nasa2025\DataSet\K2\k2_data.csv"
file_path = os.environ.get("K2_DATA_CSV", DEFAULT_K2_CSV)
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"K2 table not found at {file_path!r}; "
        "set the K2_DATA_CSV environment variable to its location."
    )

# Lines starting with '#' are treated as comments and skipped by the parser.
df = pd.read_csv(file_path, comment="#")
df.head()

Unnamed: 0,pl_name,hostname,default_flag,disposition,disp_refname,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,...,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2,rowupdate,pl_pubdate,releasedate
0,BD+20 594 b,BD+20 594,0,CONFIRMED,Espinoza et al. 2016,1.0,1.0,Transit,2016.0,K2,...,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249,2018-04-25,2018-03,2018-02-15
1,BD+20 594 b,BD+20 594,0,CONFIRMED,Espinoza et al. 2016,1.0,1.0,Transit,2016.0,K2,...,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249,2018-04-25,2016-10,2016-07-28
2,BD+20 594 b,BD+20 594,1,CONFIRMED,Espinoza et al. 2016,1.0,1.0,Transit,2016.0,K2,...,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249,2018-04-25,2017-03,2018-04-26
3,EPIC 201111557.01,EPIC 201111557,0,CANDIDATE,Livingston et al. 2018,1.0,0.0,Transit,2018.0,K2,...,-0.046,9.22,0.019,-0.019,11.3995,0.001307,-0.001307,2018-02-15,2018-03,2018-02-15
4,EPIC 201111557.01,EPIC 201111557,1,CANDIDATE,Livingston et al. 2018,1.0,0.0,Transit,2018.0,K2,...,-0.046,9.22,0.019,-0.019,11.3995,0.001307,-0.001307,2018-08-02,2018-08,2018-08-02


In [None]:
# Per-column count of missing values in the raw table.
df.isna().sum()

pl_name            0
hostname           0
default_flag       0
disposition        0
disp_refname       0
                  ..
sy_gaiamagerr1    79
sy_gaiamagerr2    79
rowupdate         23
pl_pubdate        23
releasedate       23
Length: 94, dtype: int64

In [381]:
# Only the last expression of a cell is displayed, so the shape is printed
# explicitly instead of being evaluated and discarded.
print("Shape:", df.shape)
df.tail()

Unnamed: 0,pl_name,hostname,default_flag,disposition,disp_refname,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,...,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2,rowupdate,pl_pubdate,releasedate
3987,WASP-85 A b,WASP-85 A,0,CONFIRMED,Mo&,,,,,,...,,,,,,,,,,
3988,Wolf 503 b,Wolf 503,0,CONFIRMED,Peterson et al. 2018,1.0,1.0,Transit,2018.0,K2,...,-0.03,7.617,0.023,-0.023,9.89816,0.000337,-0.000337,2018-09-04,2018-11,2018-09-06
3989,Wolf 503 b,Wolf 503,0,CONFIRMED,Peterson et al. 2018,1.0,1.0,Transit,2018.0,K2,...,-0.03,7.617,0.023,-0.023,9.89816,0.000337,-0.000337,2022-05-23,2021-12,2022-05-23
3990,Wolf 503 b,Wolf 503,0,CONFIRMED,Peterson et al. 2018,1.0,1.0,Transit,2018.0,K2,...,-0.03,7.617,0.023,-0.023,9.89816,0.000337,-0.000337,2025-09-17,2017-07,2025-08-28
3991,Wolf 503 b,Wolf 503,1,CONFIRMED,Peterson et al. 2018,1.0,1.0,Transit,2018.0,K2,...,-0.03,7.617,0.023,-0.023,9.89816,0.000337,-0.000337,2023-04-17,2023-04,2023-04-17


In [382]:
# Name of the label column and the class vocabulary used by the model.
# The position of a label in CLASS_ORDER is its integer class id.
# Labels that are not listed here (the raw table also contains REFUTED) have no
# id; the target-encoding cell maps them to NaN and drops those rows.
TARGET_COL = "disposition"
CLASS_ORDER = ["FALSE POSITIVE", "CANDIDATE", "CONFIRMED"]
ID_TO_CLASS = dict(enumerate(CLASS_ORDER))
CLASS_TO_ID = {label: class_id for class_id, label in ID_TO_CLASS.items()}

In [383]:
# Quick overview of the raw table: size and label distribution (NaN included).
disposition_counts = df[TARGET_COL].value_counts(dropna=False)
print("Raw shape:", df.shape)
print("Columns:", df.shape[1])
print("\nDisposition counts:\n", disposition_counts)

Raw shape: (3992, 94)
Columns: 94

Disposition counts:
 disposition
CONFIRMED         2308
CANDIDATE         1369
FALSE POSITIVE     293
REFUTED             22
Name: count, dtype: int64


In [384]:
# Full list of column names (the DataFrame repr elides the middle ones).
print(list(df.columns))

['pl_name', 'hostname', 'default_flag', 'disposition', 'disp_refname', 'sy_snum', 'sy_pnum', 'discoverymethod', 'disc_year', 'disc_facility', 'soltype', 'pl_controv_flag', 'pl_refname', 'pl_orbper', 'pl_orbpererr1', 'pl_orbpererr2', 'pl_orbperlim', 'pl_orbsmax', 'pl_orbsmaxerr1', 'pl_orbsmaxerr2', 'pl_orbsmaxlim', 'pl_rade', 'pl_radeerr1', 'pl_radeerr2', 'pl_radelim', 'pl_radj', 'pl_radjerr1', 'pl_radjerr2', 'pl_radjlim', 'pl_bmasse', 'pl_bmasseerr1', 'pl_bmasseerr2', 'pl_bmasselim', 'pl_bmassj', 'pl_bmassjerr1', 'pl_bmassjerr2', 'pl_bmassjlim', 'pl_bmassprov', 'pl_orbeccen', 'pl_orbeccenerr1', 'pl_orbeccenerr2', 'pl_orbeccenlim', 'pl_insol', 'pl_insolerr1', 'pl_insolerr2', 'pl_insollim', 'pl_eqt', 'pl_eqterr1', 'pl_eqterr2', 'pl_eqtlim', 'ttv_flag', 'st_refname', 'st_spectype', 'st_teff', 'st_tefferr1', 'st_tefferr2', 'st_tefflim', 'st_rad', 'st_raderr1', 'st_raderr2', 'st_radlim', 'st_mass', 'st_masserr1', 'st_masserr2', 'st_masslim', 'st_met', 'st_meterr1', 'st_meterr2', 'st_metlim'

In [385]:
# One "name-->dtype" line per column.
for col, col_dtype in df.dtypes.items():
    print(f"{col}-->{col_dtype}")

pl_name-->object
hostname-->object
default_flag-->int64
disposition-->object
disp_refname-->object
sy_snum-->float64
sy_pnum-->float64
discoverymethod-->object
disc_year-->float64
disc_facility-->object
soltype-->object
pl_controv_flag-->float64
pl_refname-->object
pl_orbper-->float64
pl_orbpererr1-->float64
pl_orbpererr2-->float64
pl_orbperlim-->float64
pl_orbsmax-->float64
pl_orbsmaxerr1-->float64
pl_orbsmaxerr2-->float64
pl_orbsmaxlim-->float64
pl_rade-->float64
pl_radeerr1-->float64
pl_radeerr2-->float64
pl_radelim-->float64
pl_radj-->float64
pl_radjerr1-->float64
pl_radjerr2-->float64
pl_radjlim-->float64
pl_bmasse-->float64
pl_bmasseerr1-->float64
pl_bmasseerr2-->float64
pl_bmasselim-->float64
pl_bmassj-->float64
pl_bmassjerr1-->float64
pl_bmassjerr2-->float64
pl_bmassjlim-->float64
pl_bmassprov-->object
pl_orbeccen-->float64
pl_orbeccenerr1-->float64
pl_orbeccenerr2-->float64
pl_orbeccenlim-->float64
pl_insol-->float64
pl_insolerr1-->float64
pl_insolerr2-->float64
pl_insollim-->

In [386]:
# Columns removed before modelling. Names that are absent from the table are
# skipped, so re-running this cell is harmless.
DROP = [
    # names and reference strings
    "pl_name", "hostname", "disp_refname", "pl_refname", "st_refname", "sy_refname",
    # date columns
    "pl_pubdate", "releasedate", "rowupdate",
    # string-formatted coordinates
    "rastr", "decstr",
    # discovery descriptors
    "discoverymethod", "disc_facility",
    # flags and remaining metadata
    "default_flag", "pl_controv_flag", "ttv_flag", "soltype", "disc_year",
]
present_drop_cols = [name for name in DROP if name in df.columns]
df.drop(columns=present_drop_cols, inplace=True)

In [387]:
# Last rows after the metadata columns were dropped.
df.tail(5)

Unnamed: 0,disposition,sy_snum,sy_pnum,pl_orbper,pl_orbpererr1,pl_orbpererr2,pl_orbperlim,pl_orbsmax,pl_orbsmaxerr1,pl_orbsmaxerr2,...,sy_disterr2,sy_vmag,sy_vmagerr1,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2
3987,CONFIRMED,,,,,,,,,,...,,,,,,,,,,
3988,CONFIRMED,1.0,1.0,6.00118,8e-05,-0.00011,0.0,0.0571,0.002,-0.002,...,-0.0961,10.27,0.03,-0.03,7.617,0.023,-0.023,9.89816,0.000337,-0.000337
3989,CONFIRMED,1.0,1.0,6.00127,2.1e-05,-2.1e-05,0.0,0.05706,0.00055,-0.00055,...,-0.0961,10.27,0.03,-0.03,7.617,0.023,-0.023,9.89816,0.000337,-0.000337
3990,CONFIRMED,1.0,1.0,,,,,,,,...,-0.0961,10.27,0.03,-0.03,7.617,0.023,-0.023,9.89816,0.000337,-0.000337
3991,CONFIRMED,1.0,1.0,6.00127,2.1e-05,-2.1e-05,0.0,0.05712,0.00063,-0.00045,...,-0.0961,10.27,0.03,-0.03,7.617,0.023,-0.023,9.89816,0.000337,-0.000337


In [388]:
# Encode the target as integer class ids.
# Labels missing from CLASS_TO_ID map to NaN and those rows are removed from
# both the frame and the target.

if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in data")

encoded_target = df[TARGET_COL].map(CLASS_TO_ID)
keep = encoded_target.notna()
df = df.loc[keep].copy()
y = encoded_target.loc[keep].astype(int)

In [389]:
def coalesce_units(frame, prefer, fallback, factor):
    """Return the `prefer` column, with gaps filled from `fallback * factor`.

    A column name that is not present in `frame` is treated as an all-NaN
    column. Rows where both columns are missing stay NaN.
    """
    def _column_or_nan(name):
        # Missing columns behave like a column full of NaN on the same index.
        if name in frame.columns:
            return frame[name]
        return pd.Series(np.nan, index=frame.index)

    primary = _column_or_nan(prefer)
    secondary = _column_or_nan(fallback)

    merged = primary.copy()
    fill_mask = merged.isna() & secondary.notna()
    merged[fill_mask] = secondary[fill_mask] * factor
    return merged

def log1p_safe(s):
    """Return log(1 + s) after raising negative values to 0; NaN stays NaN."""
    non_negative = s.clip(lower=0)
    return np.log1p(non_negative)

def spec_first_letter(s):
    """Return the upper-cased first non-blank character of each value in `s`.

    Missing inputs (NaN/None) and empty or whitespace-only strings yield NaN.
    Missingness is taken from the input itself rather than from the letter "N"
    of a stringified "nan"/"None", so genuine values starting with "N" are kept.
    """
    was_missing = s.isna()
    # .str[0] already yields NaN for strings that are empty after stripping.
    letters = s.astype(str).str.strip().str.upper().str[0]
    return letters.mask(was_missing, np.nan)

# 4) Unified physical features
#   - radius: pl_rade where present, otherwise pl_radj * 11.21
#   - mass:   pl_bmasse where present, otherwise pl_bmassj * 317.8
df["pl_radius_earth"] = coalesce_units(df, "pl_rade", "pl_radj", 11.21)
df["pl_mass_earth"] = coalesce_units(df, "pl_bmasse", "pl_bmassj", 317.8)

# Mass over radius cubed; infinities (e.g. from a zero radius) become NaN.
density_proxy = df["pl_mass_earth"].div(df["pl_radius_earth"].pow(3))
density_proxy = density_proxy.replace([np.inf, -np.inf], np.nan)
df["pl_density_proxy"] = density_proxy

# 5) log1p copies of selected columns, stored next to the raw values
skewed_cols = ["pl_orbper", "pl_orbsmax", "sy_dist", "pl_eqt", "pl_insol"]
for c in (name for name in skewed_cols if name in df.columns):
    df[f"log_{c}"] = log1p_safe(df[c])

# 6) Magnitude differences, built only when both inputs exist
if {"sy_vmag", "sy_kmag"}.issubset(df.columns):
    df["color_v_minus_k"] = df["sy_vmag"] - df["sy_kmag"]
if {"sy_gaiamag", "sy_vmag"}.issubset(df.columns):
    df["color_g_minus_v"] = df["sy_gaiamag"] - df["sy_vmag"]

# 7) First letter of the stellar spectral type as a compact class
if "st_spectype" in df.columns:
    df["st_specclass"] = spec_first_letter(df["st_spectype"])

In [390]:
# Frame and target must have the same number of rows after filtering.
(df.shape, y.shape)

((3970, 87), (3970,))

In [391]:
# Missing-value report: absolute count and percentage of rows, per column.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)
missing_df = pd.DataFrame({"missing": missing, "missing_%": missing_pct})
# to_string() prints every row instead of the truncated default repr.
print(missing_df.to_string())



                  missing  missing_%
disposition             0   0.000000
sy_snum                17   0.428212
sy_pnum                17   0.428212
pl_orbper              67   1.687657
pl_orbpererr1         947  23.853904
pl_orbpererr2         947  23.853904
pl_orbperlim           67   1.687657
pl_orbsmax           3159  79.571788
pl_orbsmaxerr1       3166  79.748111
pl_orbsmaxerr2       3166  79.748111
pl_orbsmaxlim        3159  79.571788
pl_rade               839  21.133501
pl_radeerr1          1121  28.236776
pl_radeerr2          1121  28.236776
pl_radelim            839  21.133501
pl_radj               839  21.133501
pl_radjerr1          1121  28.236776
pl_radjerr2          1121  28.236776
pl_radjlim            839  21.133501
pl_bmasse            3542  89.219144
pl_bmasseerr1        3584  90.277078
pl_bmasseerr2        3584  90.277078
pl_bmasselim         3542  89.219144
pl_bmassj            3542  89.219144
pl_bmassjerr1        3584  90.277078
pl_bmassjerr2        3584  90.277078
p

In [392]:
# Size of the missing-value report: one row per column of df, two metrics.
missing_df.shape

(87, 2)

In [393]:
# 8) Final feature list
# Candidate columns are filtered against df, so a column that is absent from
# the table is skipped instead of raising a KeyError.
# NOTE(review): sy_pnum carries ~0.48 of the tree's feature importance in the
# run recorded below and is 0 for the CANDIDATE rows shown in df.head(); it may
# leak the label — confirm before trusting the validation scores.
raw_numeric_candidates = [
    "pl_orbper", "pl_orbsmax", "pl_orbeccen",
    "pl_rade", "pl_radj", "pl_bmasse", "pl_bmassj",
    "pl_insol", "pl_eqt",
    "st_teff", "st_rad", "st_mass", "st_met", "st_logg",
    "sy_snum", "sy_pnum", "ra", "dec", "sy_dist",
    "sy_vmag", "sy_kmag", "sy_gaiamag",
]
engineered_numeric_candidates = [
    "pl_radius_earth", "pl_mass_earth", "pl_density_proxy",
    "log_pl_orbper", "log_pl_orbsmax", "log_sy_dist", "log_pl_eqt", "log_pl_insol",
    "color_v_minus_k", "color_g_minus_v",
]
numeric_cols = [
    name
    for name in raw_numeric_candidates + engineered_numeric_candidates
    if name in df.columns
]

categorical_cols = [name for name in ["st_specclass"] if name in df.columns]
X = df.loc[:, numeric_cols + categorical_cols].copy()

In [394]:
# Summary of the design matrix; only the first 12 numeric names are listed.
numeric_preview = numeric_cols[:12]
print("X shape:", X.shape, "| y shape:", y.shape)
print("Numeric cols ({}): {}".format(len(numeric_cols), numeric_preview))
print("Categorical cols:", categorical_cols)

X shape: (3970, 33) | y shape: (3970,)
Numeric cols (32): ['pl_orbper', 'pl_orbsmax', 'pl_orbeccen', 'pl_rade', 'pl_radj', 'pl_bmasse', 'pl_bmassj', 'pl_insol', 'pl_eqt', 'st_teff', 'st_rad', 'st_mass']
Categorical cols: ['st_specclass']


In [395]:
# 9) Write the model-ready table and its column metadata to disk.
#    The CSV holds the features plus the target as its original string label;
#    the JSON records which columns are numeric and which are categorical.
import json

CLEAN_PATH = "k2_processed_features.csv"
clean = X.assign(**{TARGET_COL: y.map(ID_TO_CLASS)})
clean.to_csv(CLEAN_PATH, index=False)

feature_meta = {"numeric": numeric_cols, "categorical": categorical_cols}
with open("k2_feature_meta.json", "w") as f:
    json.dump(feature_meta, f, indent=2)

print(f"\nSaved processed dataset → {CLEAN_PATH}")


Saved processed dataset → k2_processed_features.csv


In [396]:
# Sanity check: (rows, feature columns) of the design matrix.
X.shape

(3970, 33)

In [397]:
# Re-derive the column groups from dtypes so this cell does not depend on the
# lists built earlier: object columns are categorical, everything else numeric.
numeric_cols, categorical_cols = [], []
for col_name in X.columns:
    bucket = categorical_cols if X[col_name].dtype == "object" else numeric_cols
    bucket.append(col_name)

# Stratified 80/20 split with a fixed seed.
# NOTE(review): the raw table holds several rows per planet (see df.head()) and
# pl_name is dropped before this point, so rows of one planet can land on both
# sides of the split; validation scores may be optimistic — consider a grouped
# split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [398]:
# Numeric branch: median imputation followed by robust centring and scaling.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler(with_centering=True, with_scaling=True)),
]
num_tf = Pipeline(numeric_steps)

# Categorical branch: most-frequent imputation, then dense one-hot encoding.
# Categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
cat_tf = Pipeline(categorical_steps)

# Columns that belong to neither list are dropped; output feature names are
# not prefixed with the transformer name.
pre = ColumnTransformer(
    [
        ("num", num_tf, numeric_cols),
        ("cat", cat_tf, categorical_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)


In [399]:

# Classifier selection. Exactly one estimator is built:
#   USE_XGB -> XGBoost, else USE_RF -> random forest, else a decision tree.
# RandomForestClassifier and DecisionTreeClassifier come from the import cell
# at the top of the notebook.
USE_XGB = False
USE_RF = False

if USE_XGB:
    # Imported lazily: xgboost is an optional dependency and the default path
    # must not fail on machines where it is not installed.
    from xgboost import XGBClassifier

    clf = XGBClassifier(
        n_estimators=600,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        objective="multi:softprob",
        eval_metric="mlogloss",
        tree_method="hist",
        random_state=42
    )
elif USE_RF:
    clf = RandomForestClassifier(
        n_estimators=600,
        max_depth=None,
        min_samples_leaf=2,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    )
else:  # Default to Decision Tree
    clf = DecisionTreeClassifier(
        max_depth=None,  # Or set a value for better generalization
        min_samples_split=2,
        class_weight="balanced",
        random_state=42
    )



In [400]:
# Sanity check: (rows, feature columns) of the training split.
X_train.shape

(3176, 33)

In [401]:
# Chain preprocessing and classifier so both are fitted on the training split
# only and are applied together at predict time.
pipeline_steps = [("pre", pre), ("clf", clf)]
pipe = Pipeline(steps=pipeline_steps)
pipe.fit(X_train, y_train)


0,1,2
,steps,"[('pre', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [402]:
# Per-class precision/recall/F1 on the validation split.
preds = pipe.predict(X_val)

# Pass the class ids explicitly so target_names always lines up with
# CLASS_ORDER, even if a class is absent from y_val and preds.
report_labels = list(range(len(CLASS_ORDER)))
print("\n=== VALIDATION REPORT ===")
print(
    classification_report(
        y_val,
        preds,
        labels=report_labels,
        target_names=CLASS_ORDER,
        digits=4,
    )
)


=== VALIDATION REPORT ===
                precision    recall  f1-score   support

FALSE POSITIVE     0.6441    0.6441    0.6441        59
     CANDIDATE     0.8986    0.9051    0.9018       274
     CONFIRMED     0.9891    0.9848    0.9870       461

      accuracy                         0.9320       794
     macro avg     0.8439    0.8447    0.8443       794
  weighted avg     0.9322    0.9320    0.9321       794



In [403]:
# Confusion matrix on the validation split.
# The label list is pinned so the matrix is always len(CLASS_ORDER) square and
# matches the index/columns below, even if a class is absent from the split.
matrix_labels = list(range(len(CLASS_ORDER)))
cm = confusion_matrix(y_val, preds, labels=matrix_labels)
print("\n=== CONFUSION MATRIX ===")
# Rows are true classes, columns are predicted classes.
print(pd.DataFrame(cm, index=CLASS_ORDER, columns=CLASS_ORDER))


=== CONFUSION MATRIX ===
                FALSE POSITIVE  CANDIDATE  CONFIRMED
FALSE POSITIVE              38         21          0
CANDIDATE                   21        248          5
CONFIRMED                    0          7        454


In [404]:
# Best-effort feature-importance table: any failure is reported and the cell
# carries on to saving the model.
try:
    fitted_pre = pipe.named_steps["pre"]
    feat_names = fitted_pre.get_feature_names_out()
    model = pipe.named_steps["clf"]
    if hasattr(model, "feature_importances_"):
        fi = pd.DataFrame(
            {"feature": feat_names, "importance": model.feature_importances_}
        )
        fi = fi.sort_values("importance", ascending=False)
        print("\nTop 30 features:\n", fi.head(30).to_string(index=False))
except Exception as err:
    print("\nFeature importance unavailable:", err)

# Persist the fitted pipeline (preprocessing + classifier) as one artifact.
MODEL_PATH = "k2_model_pipeline.joblib"
joblib.dump(pipe, MODEL_PATH)
print(f"\nSaved model → {MODEL_PATH}")


Top 30 features:
         feature   importance
        sy_pnum 4.790203e-01
            dec 1.108478e-01
             ra 5.877660e-02
        st_teff 4.758317e-02
pl_radius_earth 4.308790e-02
      pl_orbper 2.895834e-02
        sy_kmag 2.873590e-02
  log_pl_orbper 2.522069e-02
     sy_gaiamag 2.510301e-02
        sy_dist 2.074178e-02
        st_mass 1.779467e-02
         st_rad 1.679498e-02
    log_sy_dist 1.666883e-02
color_g_minus_v 1.588004e-02
        pl_radj 1.415634e-02
        st_logg 8.616949e-03
        sy_vmag 6.200601e-03
        pl_rade 5.884722e-03
         st_met 5.667763e-03
color_v_minus_k 5.482424e-03
 log_pl_orbsmax 5.207186e-03
     pl_orbsmax 4.511404e-03
   log_pl_insol 3.351074e-03
         pl_eqt 2.512681e-03
 st_specclass_F 1.510497e-03
        sy_snum 8.491248e-04
       pl_insol 8.351939e-04
     log_pl_eqt 2.524600e-18
    pl_orbeccen 0.000000e+00
      pl_bmasse 0.000000e+00

Saved model → k2_model_pipeline.joblib


In [405]:
# ==== CELL 4: PREDICT (batch & manual) ====
# Relies on names defined earlier in the notebook: np, TARGET_COL,
# coalesce_units and log1p_safe.
import joblib, pandas as pd

pipe = joblib.load("k2_model_pipeline.joblib")
CLASS_ORDER = ["FALSE POSITIVE", "CANDIDATE", "CONFIRMED"]
ID_TO_CLASS = {i:c for i,c in enumerate(CLASS_ORDER)}

# A) Batch on processed CSV
# The CSV holds every processed row (training rows included), so this is a
# smoke test of the saved pipeline, not an evaluation.
dfp = pd.read_csv("k2_processed_features.csv")
X = dfp.drop(columns=[TARGET_COL])
y_true = dfp[TARGET_COL]

sample = X.sample(5, random_state=7)
pred_ids = pipe.predict(sample)
pred_probs = pipe.predict_proba(sample)

print("Batch predictions (5 rows):")
for i in range(len(sample)):
    name = ID_TO_CLASS[int(pred_ids[i])]
    # predict_proba columns follow pipe.classes_, so map through it instead of
    # assuming column j is class id j.
    probs = {ID_TO_CLASS[int(c)]: float(p) for c, p in zip(pipe.classes_, pred_probs[i])}
    print(f"- Row {i}: {name} | {probs}")

# B) Manual single-row (edit values as you wish; leave missing as None)
manual = {
    "pl_orbper": 2.30, "pl_orbsmax": 0.0,
    "pl_rade": 1.31, "pl_radj": 0.12, 
    "pl_bmasse": 0, "pl_bmassj": None,
    "pl_insol": 0, "pl_eqt": 0.0,
    "st_teff": 4720.0, "st_rad": 0.71, "st_mass": 0.0, "st_met": -0.6, "st_logg": 4.5,
    "sy_dist": 97.17, "sy_vmag": 12.3, "sy_kmag": 10.1, "sy_gaiamag": 12.0,
    "ra": 183.84, "dec": -6.26, "sy_snum": 1.0, "sy_pnum": 0,
    "st_specclass": None
}
manual_df = pd.DataFrame([manual])

# Turn None into NaN for every numeric feature so the arithmetic below is
# well-defined (a column holding only None would otherwise be object dtype).
for col in manual_df.columns:
    if col in X.columns and pd.api.types.is_numeric_dtype(X[col]):
        manual_df[col] = pd.to_numeric(manual_df[col], errors="coerce")

# Derive the engineered features exactly as in the training cells. Without
# this step they would be all-NaN here and silently replaced by the training
# medians, i.e. the model would score a different planet than the one entered.
manual_df["pl_radius_earth"] = coalesce_units(manual_df, "pl_rade", "pl_radj", 11.21)
manual_df["pl_mass_earth"] = coalesce_units(manual_df, "pl_bmasse", "pl_bmassj", 317.8)
manual_df["pl_density_proxy"] = (
    manual_df["pl_mass_earth"] / (manual_df["pl_radius_earth"] ** 3)
).replace([np.inf, -np.inf], np.nan)
for raw_col in ["pl_orbper", "pl_orbsmax", "sy_dist", "pl_eqt", "pl_insol"]:
    if raw_col in manual_df.columns:
        manual_df[f"log_{raw_col}"] = log1p_safe(manual_df[raw_col])
if all(c in manual_df.columns for c in ["sy_vmag", "sy_kmag"]):
    manual_df["color_v_minus_k"] = manual_df["sy_vmag"] - manual_df["sy_kmag"]
if all(c in manual_df.columns for c in ["sy_gaiamag", "sy_vmag"]):
    manual_df["color_g_minus_v"] = manual_df["sy_gaiamag"] - manual_df["sy_vmag"]

# Ensure schema: columns still missing become NaN, extras are dropped, and the
# order matches the training features.
manual_df = manual_df.reindex(columns=X.columns)

pred = pipe.predict(manual_df)[0]
proba = pipe.predict_proba(manual_df)[0]
print("\nManual prediction:")
print("Predicted:", ID_TO_CLASS[int(pred)])
print("Probabilities:", {ID_TO_CLASS[int(c)]: float(p) for c, p in zip(pipe.classes_, proba)})


Batch predictions (5 rows):
- Row 0: CANDIDATE | {'FALSE POSITIVE': 0.0, 'CANDIDATE': 1.0, 'CONFIRMED': 0.0}
- Row 1: CANDIDATE | {'FALSE POSITIVE': 0.0, 'CANDIDATE': 1.0, 'CONFIRMED': 0.0}
- Row 2: CONFIRMED | {'FALSE POSITIVE': 0.0, 'CANDIDATE': 0.0, 'CONFIRMED': 1.0}
- Row 3: CANDIDATE | {'FALSE POSITIVE': 0.0, 'CANDIDATE': 1.0, 'CONFIRMED': 0.0}
- Row 4: CONFIRMED | {'FALSE POSITIVE': 0.0, 'CANDIDATE': 0.0, 'CONFIRMED': 1.0}

Manual prediction:
Predicted: CANDIDATE
Probabilities: {'FALSE POSITIVE': 0.0, 'CANDIDATE': 1.0, 'CONFIRMED': 0.0}


In [406]:
# ==== CELL 4: PREDICT (batch & manual) ====
# Relies on names defined earlier in the notebook: np, TARGET_COL,
# coalesce_units and log1p_safe.
import joblib, pandas as pd

pipe = joblib.load("k2_model_pipeline.joblib")
CLASS_ORDER = ["FALSE POSITIVE", "CANDIDATE", "CONFIRMED"]
ID_TO_CLASS = {i:c for i,c in enumerate(CLASS_ORDER)}

# A) Batch on processed CSV
# The CSV holds every processed row (training rows included), so this is a
# smoke test of the saved pipeline, not an evaluation.
dfp = pd.read_csv("k2_processed_features.csv")
X = dfp.drop(columns=[TARGET_COL])
y_true = dfp[TARGET_COL]

sample = X.sample(5, random_state=7)
pred_ids = pipe.predict(sample)
pred_probs = pipe.predict_proba(sample)

print("Batch predictions (5 rows):")
for i in range(len(sample)):
    name = ID_TO_CLASS[int(pred_ids[i])]
    # predict_proba columns follow pipe.classes_, so map through it instead of
    # assuming column j is class id j.
    probs = {ID_TO_CLASS[int(c)]: float(p) for c, p in zip(pipe.classes_, pred_probs[i])}
    print(f"- Row {i}: {name} | {probs}")

# B) Manual single-row (edit values as you wish; leave missing as None)
manual = {
    "pl_orbper": 12.34, "pl_orbsmax": 0.09,
    "pl_rade": 2.1, "pl_radj": None, 
    "pl_bmasse": 8.0, "pl_bmassj": None,
    "pl_insol": 120.0, "pl_eqt": 900.0,
    "st_teff": 5500.0, "st_rad": 0.9, "st_mass": 0.95, "st_met": 0.0, "st_logg": 4.5,
    "sy_dist": 300.0, "sy_vmag": 12.3, "sy_kmag": 10.1, "sy_gaiamag": 12.0,
    "ra": 123.4, "dec": -12.3, "sy_snum": 1.0, "sy_pnum": 1.0,
    "st_specclass": "G"
}
manual_df = pd.DataFrame([manual])

# Turn None into NaN for every numeric feature so the arithmetic below is
# well-defined (a column holding only None would otherwise be object dtype).
for col in manual_df.columns:
    if col in X.columns and pd.api.types.is_numeric_dtype(X[col]):
        manual_df[col] = pd.to_numeric(manual_df[col], errors="coerce")

# Derive the engineered features exactly as in the training cells. Without
# this step they would be all-NaN here and silently replaced by the training
# medians, i.e. the model would score a different planet than the one entered.
manual_df["pl_radius_earth"] = coalesce_units(manual_df, "pl_rade", "pl_radj", 11.21)
manual_df["pl_mass_earth"] = coalesce_units(manual_df, "pl_bmasse", "pl_bmassj", 317.8)
manual_df["pl_density_proxy"] = (
    manual_df["pl_mass_earth"] / (manual_df["pl_radius_earth"] ** 3)
).replace([np.inf, -np.inf], np.nan)
for raw_col in ["pl_orbper", "pl_orbsmax", "sy_dist", "pl_eqt", "pl_insol"]:
    if raw_col in manual_df.columns:
        manual_df[f"log_{raw_col}"] = log1p_safe(manual_df[raw_col])
if all(c in manual_df.columns for c in ["sy_vmag", "sy_kmag"]):
    manual_df["color_v_minus_k"] = manual_df["sy_vmag"] - manual_df["sy_kmag"]
if all(c in manual_df.columns for c in ["sy_gaiamag", "sy_vmag"]):
    manual_df["color_g_minus_v"] = manual_df["sy_gaiamag"] - manual_df["sy_vmag"]

# Ensure schema: columns still missing become NaN, extras are dropped, and the
# order matches the training features.
manual_df = manual_df.reindex(columns=X.columns)

pred = pipe.predict(manual_df)[0]
proba = pipe.predict_proba(manual_df)[0]
print("\nManual prediction:")
print("Predicted:", ID_TO_CLASS[int(pred)])
print("Probabilities:", {ID_TO_CLASS[int(c)]: float(p) for c, p in zip(pipe.classes_, proba)})


Batch predictions (5 rows):
- Row 0: CANDIDATE | {'FALSE POSITIVE': 0.0, 'CANDIDATE': 1.0, 'CONFIRMED': 0.0}
- Row 1: CANDIDATE | {'FALSE POSITIVE': 0.0, 'CANDIDATE': 1.0, 'CONFIRMED': 0.0}
- Row 2: CONFIRMED | {'FALSE POSITIVE': 0.0, 'CANDIDATE': 0.0, 'CONFIRMED': 1.0}
- Row 3: CANDIDATE | {'FALSE POSITIVE': 0.0, 'CANDIDATE': 1.0, 'CONFIRMED': 0.0}
- Row 4: CONFIRMED | {'FALSE POSITIVE': 0.0, 'CANDIDATE': 0.0, 'CONFIRMED': 1.0}

Manual prediction:
Predicted: CONFIRMED
Probabilities: {'FALSE POSITIVE': 0.0, 'CANDIDATE': 0.0, 'CONFIRMED': 1.0}
