In [41]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

koi_file = '../data/cumulative_2025.09.21_17.22.39.csv'
toi_file = '../data/TOI_2025.09.21_17.24.45.csv'
k2_file  = '../data/k2pandc_2025.09.21_17.26.00.csv'

In [42]:
df_koi = pd.read_csv(koi_file)
df_toi = pd.read_csv(toi_file)
df_k2  = pd.read_csv(k2_file)


In [43]:
schema_map = {
    'orbital_period': {'koi':'koi_period','toi':'pl_orbper','k2':'pl_orbper'},
    'transit_duration': {'koi':'koi_duration','toi':'pl_trandurh','k2':'pl_trandur'},
    'transit_depth': {'koi':'koi_depth','toi':'pl_trandep','k2':'pl_trandep'},
    'planet_radius': {'koi':'koi_prad','toi':'pl_rade','k2':'pl_rade'},
    'radius_ratio': {'koi':'koi_ror','toi':None,'k2':'pl_ratror'},
    'stellar_teff': {'koi':'koi_steff','toi':'st_teff','k2':'st_teff'},
    'stellar_radius': {'koi':'koi_srad','toi':'st_rad','k2':'st_rad'},
    'stellar_mass': {'koi':'koi_smass','toi':None,'k2':'st_mass'},
    'insolation_flux': {'koi':'koi_insol','toi':'pl_insol','k2':'pl_insol'},
    'teq': {'koi':'koi_teq','toi':'pl_eqt','k2':'pl_eqt'},
    'label': {'koi':'koi_disposition','toi':'tfopwg_disp','k2':'disposition'}
}

def standardize(df, mission):
    out = {}
    for std_col, mapping in schema_map.items():
        src = mapping.get(mission)
        if src and src in df.columns:
            out[std_col] = df[src]
        else:
            out[std_col] = pd.Series([None]*len(df))
    res = pd.DataFrame(out)
    res['mission'] = mission
    return res

std_koi = standardize(df_koi, 'koi')
std_toi = standardize(df_toi, 'toi')
std_k2  = standardize(df_k2, 'k2')
unified = pd.concat([std_koi, std_toi, std_k2], ignore_index=True)
unified['mission'] = unified['mission'].map({'koi':'Kepler','toi':'TESS','k2':'K2'})
print('Unified shape:', unified.shape)
# save snapshot
unified.to_csv('../data/unified_exoplanets_raw_rebuilt_from_notebook.csv', index=False)
unified.head(10)


Unified shape: (21224, 12)


Unnamed: 0,orbital_period,transit_duration,transit_depth,planet_radius,radius_ratio,stellar_teff,stellar_radius,stellar_mass,insolation_flux,teq,label,mission
0,9.488036,2.9575,615.8,2.26,0.022344,5455.0,0.927,0.919,93.59,793.0,CONFIRMED,Kepler
1,54.418383,4.507,874.8,2.83,0.027954,5455.0,0.927,0.919,9.11,443.0,CONFIRMED,Kepler
2,19.89914,1.7822,10829.0,14.6,0.154046,5853.0,0.868,0.961,39.3,638.0,CANDIDATE,Kepler
3,1.736952,2.40641,8079.2,33.46,0.387394,5805.0,0.791,0.836,891.96,1395.0,FALSE POSITIVE,Kepler
4,2.525592,1.6545,603.3,2.75,0.024064,6031.0,1.046,1.095,926.16,1406.0,CONFIRMED,Kepler
5,11.094321,4.5945,1517.5,3.9,0.036779,6046.0,0.972,1.053,114.81,835.0,CONFIRMED,Kepler
6,4.134435,3.1402,686.0,2.77,0.026133,6046.0,0.972,1.053,427.65,1160.0,CONFIRMED,Kepler
7,2.566589,2.429,226.5,1.59,0.014983,6046.0,0.972,1.053,807.74,1360.0,CONFIRMED,Kepler
8,7.36179,5.022,233.7,39.21,0.183387,6227.0,1.958,1.358,767.22,1342.0,FALSE POSITIVE,Kepler
9,16.068647,3.5347,4914.3,5.76,0.062161,5031.0,0.848,0.801,30.75,600.0,CONFIRMED,Kepler


In [44]:
unified['label'].value_counts()

label
FALSE POSITIVE    5132
CONFIRMED         5054
PC                4675
CANDIDATE         3348
FP                1192
CP                 679
KP                 565
APC                459
FA                  98
REFUTED             22
Name: count, dtype: int64

In [45]:
# based on the documentation, the following mappings are applied
#False Positive
#-----------------------
#FALSE POSITIVE
    #FP
    #APC
    #FA
#----------------------
#   CONFIRMED
    #CP
    #KP
#----------------------
#   CANDIDATE
#   PC


In [46]:
def normalize_label(x):
    if pd.isna(x): return None
    txt = str(x).strip().upper()
    if txt in ('CONFIRMED','CP','KP'): return 'Confirmed'
    if txt in ('CANDIDATE','PC'): return 'Candidate'
    if txt in ('FALSE POSITIVE', 'FP', 'APC', 'FA', 'REFUTED'): return 'False Positive'
    return txt.title()

unified['label'] = unified['label'].apply(normalize_label)

In [47]:
unified['label'].value_counts()


label
Candidate         8023
False Positive    6903
Confirmed         6298
Name: count, dtype: int64

In [49]:
def depth_to_ppm(row):
    v = row['transit_depth']
    try:
        vv = float(v)
    except:
        return np.nan
    if row['mission']=='K2':
        return vv * 10000.0
    return vv
unified['transit_depth_ppm'] = unified.apply(depth_to_ppm, axis=1)

In [50]:
unified.shape

(21224, 13)

In [52]:
df = unified.dropna()
df.shape

(9232, 13)

In [53]:
df

Unnamed: 0,orbital_period,transit_duration,transit_depth,planet_radius,radius_ratio,stellar_teff,stellar_radius,stellar_mass,insolation_flux,teq,label,mission,transit_depth_ppm
0,9.488036,2.95750,615.800,2.26,0.022344,5455.0,0.927,0.919,93.59,793.0,Confirmed,Kepler,615.8
1,54.418383,4.50700,874.800,2.83,0.027954,5455.0,0.927,0.919,9.11,443.0,Confirmed,Kepler,874.8
2,19.899140,1.78220,10829.000,14.60,0.154046,5853.0,0.868,0.961,39.30,638.0,Candidate,Kepler,10829.0
3,1.736952,2.40641,8079.200,33.46,0.387394,5805.0,0.791,0.836,891.96,1395.0,False Positive,Kepler,8079.2
4,2.525592,1.65450,603.300,2.75,0.024064,6031.0,1.046,1.095,926.16,1406.0,Confirmed,Kepler,603.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20192,3.471745,2.18000,0.097,2.59,0.030130,4975.0,0.787,0.830,234.31,1088.9,Confirmed,K2,970.0
20194,7.138048,2.52000,0.210,3.53,0.041148,4975.0,0.787,0.830,50.35,741.4,Confirmed,K2,2100.0
20196,10.455820,2.50400,0.100,2.48,0.028896,4975.0,0.787,0.830,24.61,619.9,Confirmed,K2,1000.0
20197,14.762890,2.30000,0.068,1.95,0.022704,4975.0,0.787,0.830,10.50,500.9,Confirmed,K2,680.0


In [54]:
# since drop ratio is too high , we will impute missing values with median
num_cols = ['orbital_period','transit_duration','transit_depth_ppm','planet_radius',
            'radius_ratio','stellar_teff','stellar_radius','stellar_mass','insolation_flux','teq']

for c in num_cols:
    unified[c] = pd.to_numeric(unified[c], errors='coerce')
    unified[c] = unified.groupby('mission')[c].transform(lambda g: g.fillna(g.median()))
    unified[c] = unified[c].fillna(unified[c].median())


In [55]:
df = unified.dropna()
df.shape

(18957, 13)

In [56]:
unified.columns

Index(['orbital_period', 'transit_duration', 'transit_depth', 'planet_radius',
       'radius_ratio', 'stellar_teff', 'stellar_radius', 'stellar_mass',
       'insolation_flux', 'teq', 'label', 'mission', 'transit_depth_ppm'],
      dtype='object')

In [59]:
final_features = ['orbital_period', 'transit_duration', 'transit_depth', 'planet_radius',
       'radius_ratio', 'stellar_teff', 'stellar_radius', 'stellar_mass',
       'insolation_flux', 'teq','transit_depth_ppm']
labeled = unified[unified['label'].notna()].copy()

In [61]:
labeled.shape

(21224, 13)

In [62]:
for c in final_features:
    labeled[c] = pd.to_numeric(labeled[c], errors='coerce')
    labeled[c] = labeled.groupby('mission')[c].transform(lambda g: g.fillna(g.median()))
    labeled[c] = labeled[c].fillna(labeled[c].median())


In [63]:
print('Labeled rows:', len(labeled))

Labeled rows: 21224


In [65]:
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import json, pickle

X = labeled[final_features].values
le = LabelEncoder()
y = le.fit_transform(labeled['label'].astype(str).values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

scaler = RobustScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

models = {
    'RandomForest': RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42, n_jobs=1),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=200, class_weight='balanced', random_state=42, n_jobs=1),
    'HistGradientBoosting': HistGradientBoostingClassifier(random_state=42)
}

# stacking
base_estimators = [('rf', models['RandomForest']), ('et', models['ExtraTrees']), ('hgb', models['HistGradientBoosting'])]
stack = StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression(max_iter=1000), n_jobs=1)
models['Stacking'] = stack

results = {}
for name, clf in models.items():
    print('\nTraining', name)
    try:
        clf.fit(X_train_s, y_train)
        preds = clf.predict(X_test_s)
        acc = accuracy_score(y_test, preds)
        report = classification_report(y_test, preds, target_names=le.classes_, digits=4)
        conf = confusion_matrix(y_test, preds)
        results[name] = {'accuracy': float(acc), 'report': report, 'confusion_matrix': conf.tolist()}
        # Save model and feature importances if available
        with open(f'model_{name.lower()}_full_notebook.pkl','wb') as f:
            pickle.dump({'model': clf, 'label_encoder': le, 'features': final_features, 'scaler': scaler}, f)
        if hasattr(clf, 'feature_importances_'):
            imp = clf.feature_importances_
            feat_imp = pd.DataFrame({'feature': final_features, 'importance': imp}).sort_values('importance', ascending=False)
            #feat_imp.to_csv(BASE / f'feature_importances_{name.lower()}_full_notebook.csv', index=False)
            display(feat_imp.head(10))
    except Exception as e:
        print('Failed', name, e)



Training RandomForest


Unnamed: 0,feature,importance
3,planet_radius,0.12923
2,transit_depth,0.117376
0,orbital_period,0.108895
4,radius_ratio,0.101379
1,transit_duration,0.098351
10,transit_depth_ppm,0.088588
8,insolation_flux,0.083344
6,stellar_radius,0.07527
5,stellar_teff,0.072663
9,teq,0.072582



Training ExtraTrees


Unnamed: 0,feature,importance
2,transit_depth,0.121265
9,teq,0.1116
10,transit_depth_ppm,0.104522
1,transit_duration,0.097858
3,planet_radius,0.08976
0,orbital_period,0.088885
5,stellar_teff,0.082017
6,stellar_radius,0.081882
8,insolation_flux,0.077546
4,radius_ratio,0.075447



Training HistGradientBoosting

Training Stacking


In [66]:
results

{'RandomForest': {'accuracy': 0.7097762073027091,
  'report': '                precision    recall  f1-score   support\n\n     Candidate     0.6406    0.6997    0.6689      1605\n     Confirmed     0.7525    0.7196    0.7357      1259\nFalse Positive     0.7640    0.7125    0.7374      1381\n\n      accuracy                         0.7098      4245\n     macro avg     0.7190    0.7106    0.7140      4245\n  weighted avg     0.7139    0.7098    0.7110      4245\n',
  'confusion_matrix': [[1123, 228, 254], [303, 906, 50], [327, 70, 984]]},
 'ExtraTrees': {'accuracy': 0.7010600706713781,
  'report': '                precision    recall  f1-score   support\n\n     Candidate     0.6400    0.6766    0.6578      1605\n     Confirmed     0.7174    0.7339    0.7256      1259\nFalse Positive     0.7667    0.6995    0.7315      1381\n\n      accuracy                         0.7011      4245\n     macro avg     0.7080    0.7033    0.7050      4245\n  weighted avg     0.7041    0.7011    0.7019    

In [67]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
# --- K-Fold cross-validation (Stratified) ---
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = {}
for name, clf in [('RandomForest', RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42))]:
    print('\nRunning CV for', name)
    scores = cross_val_score(clf, scaler.transform(X), y, cv=kf, scoring='accuracy', n_jobs=1)
    cv_results[name] = {'mean_accuracy': float(scores.mean()), 'std': float(scores.std()), 'fold_scores': scores.tolist()}
    print(f"{name} CV mean acc: {scores.mean():.4f} +- {scores.std():.4f}")

# Save CV results
#with open(BASE / 'cv_results_full_notebook.json','w') as f:
#    json.dump(cv_results, f, indent=2)
#print('Saved CV results to cv_results_full_notebook.json')


Running CV for RandomForest
RandomForest CV mean acc: 0.7090 +- 0.0044
