In [27]:
# Cell 1 — Imports & configuration
import os
import sys
import math
import random
import logging
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
log = logging.getLogger("prep")

# output dir
OUT_DIR = Path("preprocessed_tabular_data")
OUT_DIR.mkdir(exist_ok=True)


In [28]:
local_koi = Path("/tabular_dataset/koi.csv")
local_toi = Path("/tabular_dataset/toi.csv")
local_k2  = Path("/tabular_dataset/k2.csv")

In [29]:
koi_path = local_koi if local_koi.exists() else Path("tabular_dataset/koi.csv")
toi_path = local_toi if local_toi.exists() else Path("tabular_dataset/toi.csv")
k2_path  = local_k2 if local_k2.exists() else Path("tabular_dataset/k2.csv")

In [30]:
log.info(f"Loading KOI from {koi_path}")
koi = pd.read_csv(koi_path, low_memory=False)
log.info(f"Loaded KOI: {koi.shape}")

log.info(f"Loading TOI from {toi_path}")
toi = pd.read_csv(toi_path, low_memory=False)
log.info(f"Loaded TOI: {toi.shape}")

log.info(f"Loading K2 from {k2_path}")
k2 = pd.read_csv(k2_path, low_memory=False)
log.info(f"Loaded K2: {k2.shape}")

2025-10-05 12:23:37,048 INFO Loading KOI from tabular_dataset\koi.csv
2025-10-05 12:23:37,343 INFO Loaded KOI: (9564, 83)
2025-10-05 12:23:37,344 INFO Loading TOI from tabular_dataset\toi.csv
2025-10-05 12:23:37,431 INFO Loaded TOI: (7703, 27)
2025-10-05 12:23:37,433 INFO Loading K2 from tabular_dataset\k2.csv
2025-10-05 12:23:37,592 INFO Loaded K2: (4004, 129)


In [31]:
toi.head()
koi.head()
k2.head()

Unnamed: 0,rowid,pl_name,hostname,pl_letter,k2_name,epic_hostname,epic_candname,hd_name,hip_name,tic_id,...,releasedate,pl_nnotes,k2_campaigns,k2_campaigns_num,st_nphot,st_nrvc,st_nspec,pl_nespec,pl_ntranspec,pl_ndispec
0,1,BD+20 594 b,BD+20 594,b,K2-56 b,EPIC 210848071,EPIC 210848071.01,,,TIC 26123781,...,15-02-2018,1,4,1.0,0,0,0,0,0,0
1,2,BD+20 594 b,BD+20 594,b,K2-56 b,EPIC 210848071,EPIC 210848071.01,,,TIC 26123781,...,28-07-2016,1,4,1.0,0,0,0,0,0,0
2,3,BD+20 594 b,BD+20 594,b,K2-56 b,EPIC 210848071,EPIC 210848071.01,,,TIC 26123781,...,26-04-2018,1,4,1.0,0,0,0,0,0,0
3,4,EPIC 201111557.01,EPIC 201111557,,,EPIC 201111557,EPIC 201111557.01,,,TIC 176942156,...,02-08-2018,0,10,1.0,0,0,0,0,0,0
4,5,EPIC 201111557.01,EPIC 201111557,,,EPIC 201111557,EPIC 201111557.01,,,TIC 176942156,...,15-02-2018,0,10,1.0,0,0,0,0,0,0


In [32]:
# Cell 4 — Quick peek (inspect columns; unify column names later)
def peek(df, name, n=5):
    log.info(f"--- {name} shape {df.shape} ---")
    print(df.columns.tolist()[:40])
    display(df.head(n))

peek(koi, "KOI")
peek(toi, "TOI")
peek(k2,  "K2")


2025-10-05 12:23:40,868 INFO --- KOI shape (9564, 83) ---


['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_disposition', 'koi_vet_stat', 'koi_vet_date', 'koi_pdisposition', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_disp_prov', 'koi_comment', 'koi_period', 'koi_time0bk', 'koi_time0', 'koi_eccen', 'koi_longp', 'koi_impact', 'koi_duration', 'koi_ingress', 'koi_depth', 'koi_ror', 'koi_srho', 'koi_fittype', 'koi_prad', 'koi_sma', 'koi_incl', 'koi_teq', 'koi_insol', 'koi_dor', 'koi_limbdark_mod', 'koi_ldm_coeff4', 'koi_ldm_coeff3', 'koi_ldm_coeff2', 'koi_ldm_coeff1', 'koi_parm_prov', 'koi_max_sngle_ev']


Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_vet_stat,koi_vet_date,koi_pdisposition,koi_score,koi_fpflag_nt,...,koi_fwm_srao,koi_fwm_sdeco,koi_fwm_prao,koi_fwm_pdeco,koi_dicco_mra,koi_dicco_mdec,koi_dicco_msky,koi_dikco_mra,koi_dikco_mdec,koi_dikco_msky
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,Done,16-08-2018,CANDIDATE,1.0,0,...,0.43,0.94,-0.0002,-0.00055,-0.01,0.2,0.2,0.08,0.31,0.32
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,Done,16-08-2018,CANDIDATE,0.969,0,...,-0.63,1.23,0.00066,-0.00105,0.39,0.0,0.39,0.49,0.12,0.5
2,3,10811496,K00753.01,,CANDIDATE,Done,16-08-2018,CANDIDATE,0.0,0,...,-0.021,-0.038,0.0007,0.0006,-0.025,-0.034,0.042,0.002,-0.027,0.027
3,4,10848459,K00754.01,,FALSE POSITIVE,Done,16-08-2018,FALSE POSITIVE,0.0,0,...,-0.111,0.002,0.00302,-0.00142,-0.249,0.147,0.289,-0.257,0.099,0.276
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,Done,16-08-2018,CANDIDATE,1.0,0,...,-0.01,0.23,8e-05,-7e-05,0.03,-0.09,0.1,0.07,0.02,0.07


2025-10-05 12:23:40,906 INFO --- TOI shape (7703, 27) ---


['rowid', 'toi', 'toipfx', 'tid', 'ctoi_alias', 'pl_pnum', 'tfopwg_disp', 'rastr', 'ra', 'decstr', 'dec', 'st_pmra', 'st_pmdec', 'pl_tranmid', 'pl_orbper', 'pl_trandurh', 'pl_trandep', 'pl_rade', 'pl_insol', 'pl_eqt', 'st_tmag', 'st_dist', 'st_teff', 'st_logg', 'st_rad', 'toi_created', 'rowupdate']


Unnamed: 0,rowid,toi,toipfx,tid,ctoi_alias,pl_pnum,tfopwg_disp,rastr,ra,decstr,...,pl_rade,pl_insol,pl_eqt,st_tmag,st_dist,st_teff,st_logg,st_rad,toi_created,rowupdate
0,1,1000.01,1000,50365310,50365310.0,1,FP,07h29m25.85s,112.357708,-12d41m45.46s,...,5.818163,22601.94858,3127.204052,9.604,485.735,10249.0,4.19,2.16986,24-07-2019 15:58,09-09-2024 10:08
1,2,1001.01,1001,88863718,88863720.0,1,PC,08h10m19.31s,122.580465,-05d30m49.87s,...,11.2154,44464.5,4045.0,9.42344,295.862,7070.0,4.03,2.01,24-07-2019 15:58,03-04-2023 14:31
2,3,1002.01,1002,124709665,124709700.0,1,FP,06h58m54.47s,104.726966,-10d34m49.64s,...,23.7529,2860.61,2037.0,9.299501,943.109,8924.0,,5.73,24-07-2019 15:58,11-07-2022 16:02
3,4,1003.01,1003,106997505,106997500.0,1,FP,07h22m14.39s,110.559945,-25d12m25.26s,...,,1177.36,1631.0,9.3003,7728.17,5388.5,4.15,,24-07-2019 15:58,23-02-2022 10:10
4,5,1004.01,1004,238597883,238597900.0,1,FP,08h08m42.77s,122.178195,-48d48m10.12s,...,11.3113,54679.3,4260.0,9.1355,356.437,9219.0,4.14,2.15,24-07-2019 15:58,09-09-2024 10:08


2025-10-05 12:23:40,948 INFO --- K2 shape (4004, 129) ---


['rowid', 'pl_name', 'hostname', 'pl_letter', 'k2_name', 'epic_hostname', 'epic_candname', 'hd_name', 'hip_name', 'tic_id', 'gaia_id', 'default_flag', 'disposition', 'disp_refname', 'sy_snum', 'sy_pnum', 'sy_mnum', 'cb_flag', 'discoverymethod', 'disc_year', 'disc_refname', 'disc_pubdate', 'disc_locale', 'disc_facility', 'disc_telescope', 'disc_instrument', 'rv_flag', 'pul_flag', 'ptv_flag', 'tran_flag', 'ast_flag', 'obm_flag', 'micro_flag', 'etv_flag', 'ima_flag', 'dkin_flag', 'soltype', 'pl_controv_flag', 'pl_refname', 'pl_orbper']


Unnamed: 0,rowid,pl_name,hostname,pl_letter,k2_name,epic_hostname,epic_candname,hd_name,hip_name,tic_id,...,releasedate,pl_nnotes,k2_campaigns,k2_campaigns_num,st_nphot,st_nrvc,st_nspec,pl_nespec,pl_ntranspec,pl_ndispec
0,1,BD+20 594 b,BD+20 594,b,K2-56 b,EPIC 210848071,EPIC 210848071.01,,,TIC 26123781,...,15-02-2018,1,4,1.0,0,0,0,0,0,0
1,2,BD+20 594 b,BD+20 594,b,K2-56 b,EPIC 210848071,EPIC 210848071.01,,,TIC 26123781,...,28-07-2016,1,4,1.0,0,0,0,0,0,0
2,3,BD+20 594 b,BD+20 594,b,K2-56 b,EPIC 210848071,EPIC 210848071.01,,,TIC 26123781,...,26-04-2018,1,4,1.0,0,0,0,0,0,0
3,4,EPIC 201111557.01,EPIC 201111557,,,EPIC 201111557,EPIC 201111557.01,,,TIC 176942156,...,02-08-2018,0,10,1.0,0,0,0,0,0,0
4,5,EPIC 201111557.01,EPIC 201111557,,,EPIC 201111557,EPIC 201111557.01,,,TIC 176942156,...,15-02-2018,0,10,1.0,0,0,0,0,0,0


In [33]:
# Cell 5 — Standardise key columns and rename common fields across catalogs
# We will create a minimal common schema. Add more columns if present in your CSVs.
def safe_rename(df, mapping):
    return df.rename(columns={k:v for k,v in mapping.items() if k in df.columns})

# Example mapping — adjust if column names differ in your files
koi_map = {
    'kepid': 'kepid',  
    'kepoi_name':'object_name',           # Kepler ID         # KOI name
    'koi_period': 'period',
    'koi_duration': 'duration',
    'koi_depth': 'depth',
    #'koi_prad': 'planet_radius',
    #'koi_kepmag': 'stellar_mag',
    #'koi_snr': 'SNR',
    'koi_disposition': 'disposition_kepler',  # label
    #'ra': 'ra',
    #'dec': 'dec'
}
toi_map = {
    'toi': 'toi_id',
    'tid': 'object_name',
    'pl_orbper': 'period',
    'pl_trandurh': 'duration',
    'pl_trandep': 'depth',
    #'planet_radius': 'planet_radius',
    'tfopwg_disp': 'disposition_toi',    # column might be TFOPWG Disposition or similar
    #'SNR': 'SNR',
    #'Tmag': 'stellar_mag',
    #'RA': 'ra',
    #'DEC': 'dec'
}
k2_map = {
    #'pl_name': 'pl_name',
    'pl_name': 'object_name',
    'pl_orbper': 'period',
    'pl_trandur': 'duration',
    'pl_trandep': 'depth',
    #'prad': 'planet_radius',
    'disposition': 'disposition_k2',   # column might be 'Archive Disposition' in different name
    #'SNR': 'SNR',
    #'kepmag': 'stellar_mag',
    #'ra': 'ra',
    #'dec': 'dec'
}

koi = safe_rename(koi, koi_map)
toi = safe_rename(toi, toi_map)
k2  = safe_rename(k2, k2_map)

log.info("Renaming done — sample columns after rename")
print("KOI columns:", [c for c in koi.columns if c in koi_map.values()])
print("TOI columns:", [c for c in toi.columns if c in toi_map.values()])
print("K2 columns:",  [c for c in k2.columns  if c in k2_map.values()])


2025-10-05 12:23:44,006 INFO Renaming done — sample columns after rename


KOI columns: ['kepid', 'object_name', 'disposition_kepler', 'period', 'duration', 'depth']
TOI columns: ['toi_id', 'object_name', 'disposition_toi', 'period', 'duration', 'depth']
K2 columns: ['object_name', 'disposition_k2', 'period', 'depth', 'duration']


In [34]:
# Cell 5 — Standardise key columns and rename common fields across catalogs
# We will create a minimal common schema. Add more columns if present in your CSVs.
def safe_rename(df, mapping):
    return df.rename(columns={k:v for k,v in mapping.items() if k in df.columns})

# Example mapping — adjust if column names differ in your files
koi_map = {
    'kepid': 'kepid',                     # Kepler ID
    'kepoi_name': 'object_name',          # KOI name
    'koi_period': 'period',
    'koi_duration': 'duration',
    'koi_depth': 'depth',
    'koi_prad': 'planet_radius',
    'koi_kepmag': 'stellar_mag',
    'koi_snr': 'SNR',
    'koi_disposition': 'disposition_kepler',  # label
    'ra': 'ra',
    'dec': 'dec',
    'koi_smass': 'stellar_mass',          # Stellar mass in solar masses
    'koi_srad': 'stellar_radius',         # Stellar radius in solar radii
    'koi_steff': 'stellar_teff',          # Effective temperature (K)
    'koi_smet': 'stellar_metallicity'
}
toi_map = {
    'toi': 'toi_id',
    'tid': 'tic_id',
    'object': 'object_name',
    'pl_orbper': 'period',
    'pl_trandurh': 'duration',
    'pl_trandep': 'depth',
    'pl_rade': 'planet_radius',
    'tfopwg_disp': 'disposition_toi',    # column might be TFOPWG Disposition or similar
    'SNR': 'SNR',
    'st_tmag': 'stellar_mag',
    'RA': 'ra',
    'DEC': 'dec',
    'stellar_mass': 'stellar_mass',
    'st_rad': 'stellar_radius',
    'st_teff': 'stellar_teff',
    'stellar_metallicity': 'stellar_metallicity'
}
k2_map = {
    # Planet identifiers
    'pl_name': 'object_name',            # Planet name
    'hostname': 'host_name',             # Star / Host name
    'k2_name': 'kepid',                  # K2 ID / EPIC ID

    # Planet parameters
    'pl_orbper': 'period',               # Orbital period [days]
    'pl_trandur': 'duration',            # Transit duration [hours]
    'pl_trandep': 'depth',               # Transit depth [%]
    'pl_rade': 'planet_radius',          # Planet radius [Earth radius]

    # Disposition / label
    'disposition': 'disposition_k2',     # Archive Disposition

    # Stellar properties
    'st_mass': 'stellar_mass',           # Stellar mass [solar mass]
    'st_rad': 'stellar_radius',          # Stellar radius [solar radius]
    'st_teff': 'stellar_teff',           # Effective temperature [K]
    'st_met': 'stellar_metallicity',     # Metallicity [dex]
    'sy_tmag': 'stellar_mag',            # TESS magnitude
    'st_spectype': 'stellar_sptype',     # Spectral type (optional)

    # Coordinates
    'ra': 'ra',
    'dec': 'dec'
}

koi = safe_rename(koi, koi_map)
toi = safe_rename(toi, toi_map)
k2  = safe_rename(k2, k2_map)

log.info("Renaming done — sample columns after rename")
print("KOI columns:", [c for c in koi.columns if c in koi_map.values()])
print("TOI columns:", [c for c in toi.columns if c in toi_map.values()])
print("K2 columns:",  [c for c in k2.columns  if c in k2_map.values()])


2025-10-05 12:23:53,187 INFO Renaming done — sample columns after rename


KOI columns: ['kepid', 'object_name', 'disposition_kepler', 'period', 'duration', 'depth', 'planet_radius', 'stellar_teff', 'stellar_metallicity', 'stellar_radius', 'stellar_mass', 'ra', 'dec', 'stellar_mag']
TOI columns: ['toi_id', 'object_name', 'disposition_toi', 'ra', 'dec', 'period', 'duration', 'depth', 'planet_radius', 'stellar_mag', 'stellar_teff', 'stellar_radius']
K2 columns: ['object_name', 'host_name', 'kepid', 'disposition_k2', 'period', 'planet_radius', 'depth', 'duration', 'stellar_sptype', 'stellar_teff', 'stellar_radius', 'stellar_mass', 'stellar_metallicity', 'ra', 'dec', 'stellar_mag']


In [35]:
# Cell 6 — Define label-mapping functions for binary classification
# We map various dispositions to a binary 'label' where:
#   1 -> Candidate/Planet (positive)
#   0 -> False Positive (negative)
# Keep a record of mapping choices in a column 'label_source' for traceability.

def map_koi_disposition(vals):
    # Acceptable strings differ per file. Normalize to lowercase.
    if pd.isna(vals):
        return None
    s = str(vals).lower()
    if any(tok in s for tok in ['confirmed', 'candidate', 'planet']):
        return 1
    if any(tok in s for tok in ['false', 'fp', 'false positive', 'not a planet']):
        return 0
    return None

def map_toi_disposition(vals):
    if pd.isna(vals):
        return None
    s = str(vals).lower()
    # TFOPWG dispositions often contain: PC, FP, AP (ambiguous), KP
    if any(tok in s for tok in ['confirmed', 'kp', 'known planet', 'pc', 'planetary candidate']):
        return 1
    if any(tok in s for tok in ['fp', 'false positive', 'not a planet', 'eclipsing']):
        return 0
    if 'apc' in s or 'ambiguous' in s:
        # treat ambiguous as negative for conservative approach, or None to drop
        return 0
    return None

def map_k2_disposition(vals):
    if pd.isna(vals):
        return None
    s = str(vals).lower()
    if any(tok in s for tok in ['confirmed', 'candidate', 'planet']):
        return 1
    if any(tok in s for tok in ['false', 'fp', 'false positive']):
        return 0
    return None

# Apply mapping
if 'disposition_kepler' in koi.columns:
    koi['label'] = koi['disposition_kepler'].apply(map_koi_disposition)
    koi['label_source'] = koi['disposition_kepler']
elif 'koi_disposition' in koi.columns:
    koi['label'] = koi['koi_disposition'].apply(map_koi_disposition)
    koi['label_source'] = koi.get('koi_disposition')

if 'disposition_toi' in toi.columns:
    toi['label'] = toi['disposition_toi'].apply(map_toi_disposition)
    toi['label_source'] = toi['disposition_toi']
elif 'TFOPWG Disposition' in toi.columns:
    toi['label'] = toi['TFOPWG Disposition'].apply(map_toi_disposition)
    toi['label_source'] = toi['TFOPWG Disposition']

if 'disposition_k2' in k2.columns:
    k2['label'] = k2['disposition_k2'].apply(map_k2_disposition)
    k2['label_source'] = k2['disposition_k2']
elif 'disp' in k2.columns:
    k2['label'] = k2['disp'].apply(map_k2_disposition)
    k2['label_source'] = k2['disp']

log.info("Label mapping applied. Sample label distributions:")
print("KOI labels:\n", koi['label'].value_counts(dropna=False))
print("TOI labels:\n", toi['label'].value_counts(dropna=False))
print("K2 labels:\n",  k2['label'].value_counts(dropna=False))


2025-10-05 12:23:58,406 INFO Label mapping applied. Sample label distributions:


KOI labels:
 label
0    4839
1    4725
Name: count, dtype: int64
TOI labels:
 label
1.0    5724
0.0    1197
NaN     782
Name: count, dtype: int64
K2 labels:
 label
1.0    3689
0.0     293
NaN      22
Name: count, dtype: int64


In [44]:
# Cell 7 — Select core features & keep relevant columns
# Choose a set of features that appear across catalogs. Add or remove columns depending on your CSVs.
core_features = [
    'object_name', 'kepid', 'toi_id', 'tic_id', 'period', 'duration', 'depth','stellar_radius', 'stellar_mass',
    'stellar_mag','label', 'label_source'
]
def reduce_df(df, features):
    present = [c for c in features if c in df.columns]
    return df[present].copy()

koi_r = reduce_df(koi, core_features)
toi_r = reduce_df(toi, core_features)
k2_r  = reduce_df(k2, core_features)

log.info("Reduced shapes:")
log.info(f"KOI: {koi_r.shape}, TOI: {toi_r.shape}, K2: {k2_r.shape}")



2025-10-05 12:50:47,492 INFO Reduced shapes:
2025-10-05 12:50:47,493 INFO KOI: (9564, 10), TOI: (7703, 9), K2: (4004, 11)


In [45]:
# Cell 8 — Add provenance column and unify ID column names
koi_r['source'] = 'KOI'
toi_r['source'] = 'TOI'
k2_r['source']  = 'K2'

# unify an 'id' column to merge - prefer kepid -> tic_id -> object_name if none
def unify_id(df):
    if 'kepid' in df.columns:
        df['obj_id'] = df['kepid'].astype(str)
    elif 'tic_id' in df.columns:
        df['obj_id'] = df['tic_id'].astype(str)
    else:
        df['obj_id'] = df['object_name'].astype(str)
    return df

koi_r = unify_id(koi_r)
toi_r = unify_id(toi_r)
k2_r  = unify_id(k2_r)


In [46]:
# Cell 9 — Concatenate datasets
combined = pd.concat([koi_r, toi_r, k2_r], ignore_index=True, sort=False)
log.info(f"Combined shape: {combined.shape}")


2025-10-05 12:50:50,875 INFO Combined shape: (21271, 14)


In [47]:
# Cell 10 — Basic cleaning & deduplication
# 1) Drop rows without label (we only keep rows we can map to 0/1)
before = combined.shape[0]
combined = combined[combined['label'].isin([0,1])].copy()
log.info(f"Dropped unlabeled rows: {before - combined.shape[0]} rows removed")

# 2) Convert numeric columns to numeric types
num_cols = ['period','duration','depth']
for c in num_cols:
    if c in combined.columns:
        combined[c] = pd.to_numeric(combined[c], errors='coerce')

# 3) Deduplicate based on obj_id + source priority (we will keep KOI first, then TOI, then K2)
# But since an object may appear in multiple catalogs, keep one row per unique obj_id preferring KOI -> TOI -> K2.
priority = {'KOI': 0, 'TOI': 1, 'K2': 2}
combined['_src_priority'] = combined['source'].map(priority).fillna(10)
combined.sort_values(['obj_id','_src_priority'], inplace=True)
combined = combined.drop_duplicates(subset='obj_id', keep='first').reset_index(drop=True)
log.info(f"After deduplication: {combined.shape}")


2025-10-05 12:50:52,795 INFO Dropped unlabeled rows: 804 rows removed
2025-10-05 12:50:52,831 INFO After deduplication: (15557, 15)


In [49]:
# Cell 11 — Feature engineering -optional
# Examples:
# - compute log(period)
# - planet radius estimate if missing but depth and stellar_radius present (simple approximation)
import numpy as np

if 'period' in combined.columns:
    combined['log_period'] = np.log10(combined['period'].replace(0, np.nan))



In [51]:
feature_cols = [c for c in [
    'period','duration','depth',
    'stellar_radius','stellar_mass','stellar_mag'
] if c in combined.columns]

log.info("Final features to keep for ML:")
print(feature_cols)

2025-10-05 12:52:39,786 INFO Final features to keep for ML:


['period', 'duration', 'depth', 'stellar_radius', 'stellar_mass', 'stellar_mag']


In [52]:
# Impute missing numeric values with median
impute_values = {}
for c in feature_cols:
    med = combined[c].median()
    impute_values[c] = med
    combined[c] = combined[c].fillna(med)

pd.Series(impute_values).to_csv(OUT_DIR / "impute_values.csv")


In [53]:
# Cell 13 — Optional: scaling and save prepared dataset
X = combined[feature_cols].copy()
y = combined['label'].astype(int).copy()

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Save scaler params (mean/std)
scaler_params = pd.DataFrame({'mean':scaler.mean_, 'scale':scaler.scale_}, index=X.columns)
scaler_params.to_csv(OUT_DIR / "scaler_params.csv")

# Prepare final dataframe for export
final_df = pd.concat([
    combined[['obj_id','object_name','source','label','label_source']].reset_index(drop=True),
    X_scaled.reset_index(drop=True)
], axis=1)

final_df.to_csv(OUT_DIR / "prepared_dataset.csv", index=False)
log.info(f"Wrote prepared dataset to {OUT_DIR / 'prepared_dataset.csv'}; rows: {final_df.shape[0]}")

# also save an unscaled version for debugging
pd.concat([combined[['obj_id','object_name','source','label','label_source']].reset_index(drop=True), X.reset_index(drop=True)], axis=1).to_csv(OUT_DIR / "prepared_dataset_unscaled.csv", index=False)
log.info("Saved unscaled and scaled prepared datasets.")


2025-10-05 12:52:42,476 INFO Wrote prepared dataset to preprocessed_tabular_data\prepared_dataset.csv; rows: 15557
2025-10-05 12:52:42,601 INFO Saved unscaled and scaled prepared datasets.


In [54]:
# Cell 14 — Quick class balance report & train/test split (optional)
print("Label distribution:\n", final_df['label'].value_counts(normalize=False))
# stratified split ready for training (if you want to save train/test)
train_df, test_df = train_test_split(final_df, test_size=0.2, stratify=final_df['label'], random_state=SEED)
train_df.to_csv(OUT_DIR / "train_prepared.csv", index=False)
test_df.to_csv(OUT_DIR / "test_prepared.csv", index=False)
log.info("Wrote train_prepared.csv and test_prepared.csv")


2025-10-05 12:52:43,059 INFO Wrote train_prepared.csv and test_prepared.csv


Label distribution:
 label
1.0    9726
0.0    5831
Name: count, dtype: int64


In [55]:
# Cell 15 — README & provenance notes (auto-generate)
readme = f"""
Prepared dataset generated with notebook.
Rows: {final_df.shape[0]}
Features: {', '.join(final_df.columns.tolist())}

Label mapping logic:
 - KOI: dispositions containing 'confirmed' or 'candidate' -> label 1; 'false'/'fp' -> label 0
 - TOI: TFOPWG dispositions with 'pc','kp','confirmed' -> label 1; 'fp','false' -> label 0; 'apc' ambiguous treated as 0
 - K2: similar logic to KOI

Files written:
 - {OUT_DIR / 'prepared_dataset.csv'} (scaled)
 - {OUT_DIR / 'prepared_dataset_unscaled.csv'}
 - {OUT_DIR / 'train_prepared.csv'}
 - {OUT_DIR / 'test_prepared.csv'}
 - {OUT_DIR / 'impute_values.csv'}
 - {OUT_DIR / 'scaler_params.csv'}

Notes:
 - Adjust label mapping functions in cell 6 if your CSVs use different wording.
 - Check column names (cell 4) if rename mappings need tweaking.
 - Deduplication kept the first occurrence per obj_id with priority KOI -> TOI -> K2.
"""

with open(OUT_DIR / "README_prepared_dataset.txt", "w") as f:
    f.write(readme)

log.info("Generated README_prepared_dataset.txt")
print(readme)


2025-10-05 12:52:44,172 INFO Generated README_prepared_dataset.txt



Prepared dataset generated with notebook.
Rows: 15557
Features: obj_id, object_name, source, label, label_source, period, duration, depth, stellar_radius, stellar_mass, stellar_mag

Label mapping logic:
 - KOI: dispositions containing 'confirmed' or 'candidate' -> label 1; 'false'/'fp' -> label 0
 - TOI: TFOPWG dispositions with 'pc','kp','confirmed' -> label 1; 'fp','false' -> label 0; 'apc' ambiguous treated as 0
 - K2: similar logic to KOI

Files written:
 - preprocessed_tabular_data\prepared_dataset.csv (scaled)
 - preprocessed_tabular_data\prepared_dataset_unscaled.csv
 - preprocessed_tabular_data\train_prepared.csv
 - preprocessed_tabular_data\test_prepared.csv
 - preprocessed_tabular_data\impute_values.csv
 - preprocessed_tabular_data\scaler_params.csv

Notes:
 - Adjust label mapping functions in cell 6 if your CSVs use different wording.
 - Check column names (cell 4) if rename mappings need tweaking.
 - Deduplication kept the first occurrence per obj_id with priority KOI -> T