In [41]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any pandas warnings raised while loading or concatenating the
# catalogs below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Input CSVs, one per catalog, addressed relative to the notebook's working
# directory. The file names appear to embed an export timestamp
# (2025-09-21) — confirm provenance before refreshing the data.
koi_file = '../data/cumulative_2025.09.21_17.22.39.csv'
toi_file = '../data/TOI_2025.09.21_17.24.45.csv'
k2_file  = '../data/k2pandc_2025.09.21_17.26.00.csv'

In [42]:
# Read the three catalog CSVs with default parser settings, in KOI / TOI / K2 order.
df_koi, df_toi, df_k2 = (
    pd.read_csv(csv_path) for csv_path in (koi_file, toi_file, k2_file)
)


In [43]:
# Unified column name -> source column per catalog key ('koi', 'toi', 'k2').
# A value of None means no source column is mapped for that catalog.
schema_map = {
    # Orbit / transit geometry
    'orbital_period':   dict(koi='koi_period',   toi='pl_orbper',   k2='pl_orbper'),
    'transit_duration': dict(koi='koi_duration', toi='pl_trandurh', k2='pl_trandur'),
    'transit_depth':    dict(koi='koi_depth',    toi='pl_trandep',  k2='pl_trandep'),
    # Planet size
    'planet_radius':    dict(koi='koi_prad',     toi='pl_rade',     k2='pl_rade'),
    'radius_ratio':     dict(koi='koi_ror',      toi=None,          k2='pl_ratror'),
    # Host star
    'stellar_teff':     dict(koi='koi_steff',    toi='st_teff',     k2='st_teff'),
    'stellar_radius':   dict(koi='koi_srad',     toi='st_rad',      k2='st_rad'),
    'stellar_mass':     dict(koi='koi_smass',    toi=None,          k2='st_mass'),
    # Irradiation
    'insolation_flux':  dict(koi='koi_insol',    toi='pl_insol',    k2='pl_insol'),
    'teq':              dict(koi='koi_teq',      toi='pl_eqt',      k2='pl_eqt'),
    # Disposition column used as the classification label
    'label':            dict(koi='koi_disposition', toi='tfopwg_disp', k2='disposition'),
}

def standardize(df, mission, schema=None):
    """Project one catalog frame onto the unified column schema.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw catalog table.
    mission : str
        Catalog key used to look up source column names in the schema
        (the keys used by ``schema_map`` are 'koi', 'toi' and 'k2').
    schema : dict, optional
        Mapping ``{unified_column: {mission_key: source_column_or_None}}``.
        Defaults to the notebook-level ``schema_map``.

    Returns
    -------
    pandas.DataFrame
        One column per schema entry, in schema order, plus a trailing
        'mission' column holding ``mission``. The result has exactly the same
        rows and index as ``df``. A unified column whose source is unmapped
        (None / missing key) or absent from ``df`` is filled with NaN.
    """
    column_schema = schema_map if schema is None else schema
    out = {}
    for std_col, mapping in column_schema.items():
        src = mapping.get(mission)
        if src and src in df.columns:
            out[std_col] = df[src]
        else:
            # Build the placeholder on df.index: a default RangeIndex here would
            # make the DataFrame constructor align on the union of indexes and
            # add spurious, misaligned rows whenever df is not 0..n-1 indexed.
            # NaN (not None) keeps numeric columns numeric after concatenation.
            out[std_col] = pd.Series(np.nan, index=df.index)
    res = pd.DataFrame(out)
    res['mission'] = mission
    return res

# Standardize each catalog with its schema key (kept as separate frames so they
# can still be inspected individually).
std_koi, std_toi, std_k2 = (
    standardize(catalog, key)
    for catalog, key in ((df_koi, 'koi'), (df_toi, 'toi'), (df_k2, 'k2'))
)

# Stack the three frames on a fresh 0..n-1 index and swap the schema keys for
# display names in the 'mission' column.
unified = pd.concat([std_koi, std_toi, std_k2], ignore_index=True).assign(
    mission=lambda frame: frame['mission'].map({'koi':'Kepler','toi':'TESS','k2':'K2'})
)
print('Unified shape:', unified.shape)

# Save a snapshot of the unified table, then preview the first rows.
unified.to_csv('../data/unified_exoplanets_raw_rebuilt_from_notebook.csv', index=False)
unified.head(10)


Unified shape: (21224, 12)


Unnamed: 0,orbital_period,transit_duration,transit_depth,planet_radius,radius_ratio,stellar_teff,stellar_radius,stellar_mass,insolation_flux,teq,label,mission
0,9.488036,2.9575,615.8,2.26,0.022344,5455.0,0.927,0.919,93.59,793.0,CONFIRMED,Kepler
1,54.418383,4.507,874.8,2.83,0.027954,5455.0,0.927,0.919,9.11,443.0,CONFIRMED,Kepler
2,19.89914,1.7822,10829.0,14.6,0.154046,5853.0,0.868,0.961,39.3,638.0,CANDIDATE,Kepler
3,1.736952,2.40641,8079.2,33.46,0.387394,5805.0,0.791,0.836,891.96,1395.0,FALSE POSITIVE,Kepler
4,2.525592,1.6545,603.3,2.75,0.024064,6031.0,1.046,1.095,926.16,1406.0,CONFIRMED,Kepler
5,11.094321,4.5945,1517.5,3.9,0.036779,6046.0,0.972,1.053,114.81,835.0,CONFIRMED,Kepler
6,4.134435,3.1402,686.0,2.77,0.026133,6046.0,0.972,1.053,427.65,1160.0,CONFIRMED,Kepler
7,2.566589,2.429,226.5,1.59,0.014983,6046.0,0.972,1.053,807.74,1360.0,CONFIRMED,Kepler
8,7.36179,5.022,233.7,39.21,0.183387,6227.0,1.958,1.358,767.22,1342.0,FALSE POSITIVE,Kepler
9,16.068647,3.5347,4914.3,5.76,0.062161,5031.0,0.848,0.801,30.75,600.0,CONFIRMED,Kepler


In [44]:
# Count rows per distinct raw value in the 'label' column (missing values are
# excluded by value_counts' default).
unified['label'].value_counts()

label
FALSE POSITIVE    5132
CONFIRMED         5054
PC                4675
CANDIDATE         3348
FP                1192
CP                 679
KP                 565
APC                459
FA                  98
REFUTED             22
Name: count, dtype: int64

In [45]:
# Based on the documentation, the raw dispositions are grouped as follows:
#
#   False Positive  <-  FALSE POSITIVE, FP, APC, FA
#   Confirmed       <-  CONFIRMED, CP, KP
#   Candidate       <-  CANDIDATE, PC
#
# Note: REFUTED is not listed above but is also mapped to False Positive
# by normalize_label.


In [46]:
def normalize_label(x):
    """Collapse a raw disposition value into a canonical label.

    Missing values (anything ``pd.isna`` flags) yield ``None``. Otherwise the
    value is stringified, stripped and upper-cased, then looked up in the
    groups below. A string that matches no known code is returned title-cased.
    """
    if pd.isna(x):
        return None

    groups = {
        'Confirmed': ('CONFIRMED','CP','KP'),
        'Candidate': ('CANDIDATE','PC'),
        'False Positive': ('FALSE POSITIVE', 'FP', 'APC', 'FA', 'REFUTED'),
    }
    code = str(x).strip().upper()
    for canonical, raw_codes in groups.items():
        if code in raw_codes:
            return canonical
    return code.title()

# Overwrite the raw dispositions with their canonical labels, element by element.
unified['label'] = unified['label'].map(normalize_label)

In [47]:
# Re-count the 'label' column now that normalize_label has been applied to it.
unified['label'].value_counts()


label
Candidate         8023
False Positive    6903
Confirmed         6298
Name: count, dtype: int64

In [48]:
def depth_to_ppm(row):
    """Return the row's transit depth as a float on a common scale.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'transit_depth' and 'mission'.

    Returns
    -------
    float
        ``float(row['transit_depth'])``, multiplied by 10,000 when the mission
        is 'K2'; ``numpy.nan`` when the value cannot be converted to a float
        (None, non-numeric text, ...). A NaN input stays NaN.
    """
    v = row['transit_depth']
    try:
        vv = float(v)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no usable depth". A bare `except:`
        # would also swallow KeyboardInterrupt/SystemExit and hide real bugs.
        return np.nan
    if row['mission'] == 'K2':
        # Presumably K2 depths are reported in percent (1% = 10,000 ppm) while
        # the other catalogs already use ppm — confirm against the column docs.
        return vv * 10000.0
    return vv
# Row-wise conversion: stores the result in a new 'transit_depth_ppm' column and
# leaves the original 'transit_depth' column untouched.
unified['transit_depth_ppm'] = unified.apply(depth_to_ppm, axis=1)