In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the TESS table; any line starting with '#' is treated as a comment and skipped.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the
# CSV was written with its index -- consider index_col=0 if that column is not a feature.
df = pd.read_csv("TESS.csv", comment="#")

In [3]:
# Preview the first rows to sanity-check the parsed columns and values.
df.head()

Unnamed: 0,toi,tid,tfopwg_disp,rastr,ra,decstr,dec,st_pmra,st_pmraerr1,st_pmraerr2,...,st_logg,st_loggerr1,st_loggerr2,st_logglim,st_rad,st_raderr1,st_raderr2,st_radlim,toi_created,rowupdate
0,1000.01,50365310,FP,07h29m25.85s,112.357708,-12d41m45.46s,-12.69596,-5.964,0.085,-0.085,...,4.19,0.07,-0.07,0,2.16986,0.072573,-0.072573,0,2019-07-24 15:58:33,2024-09-09 10:08:01
1,1001.01,88863718,PC,08h10m19.31s,122.580465,-05d30m49.87s,-5.513852,-4.956,0.102,-0.102,...,4.03,0.09,-0.09,0,2.01,0.09,-0.09,0,2019-07-24 15:58:33,2023-04-03 14:31:04
2,1002.01,124709665,FP,06h58m54.47s,104.726966,-10d34m49.64s,-10.580455,-1.462,0.206,-0.206,...,,,,0,5.73,,,0,2019-07-24 15:58:33,2022-07-11 16:02:02
3,1003.01,106997505,FP,07h22m14.39s,110.559945,-25d12m25.26s,-25.207017,-0.939,0.041,-0.041,...,4.15,1.64,-1.64,0,,,,0,2019-07-24 15:58:33,2022-02-23 10:10:02
4,1004.01,238597883,FP,08h08m42.77s,122.178195,-48d48m10.12s,-48.802811,-4.496,0.069,-0.069,...,4.14,0.07,-0.07,0,2.15,0.06,-0.06,0,2019-07-24 15:58:33,2024-09-09 10:08:01


In [75]:
# Remove identifier, coordinate, timestamp, limit-flag and other columns that are
# not used downstream. errors="ignore" skips names that are absent, so the cell
# can be re-run after the columns are already gone.
df = df.drop(
    columns=[
        # identifiers / coordinates / proper motion
        'toi', 'tid', 'rastr', 'ra', 'decstr', 'st_pmra', 'st_pmdec',
        # planet / star quantities and bookkeeping timestamps
        'pl_tranmid', 'pl_insol', 'pl_eqt', 'st_dist', 'toi_created', 'rowupdate',
        # limit flags and transit duration
        'st_pmralim', 'pl_tranmidlim', 'pl_trandeplim', 'pl_trandurh', 'pl_trandurhlim',
        'st_tefflim', 'st_distlim', 'st_tmaglim', 'st_radlim', 'st_pmdeclim',
        'pl_orbperlim', 'pl_radelim',
        # insolation / equilibrium-temperature errors and limits
        'pl_insolerr1', 'pl_insolerr2', 'pl_insollim',
        'pl_eqterr1', 'pl_eqterr2', 'pl_eqtlim',
        'st_logglim',
    ],
    errors="ignore",
)


In [77]:
# Collapse the six disposition codes in `tfopwg_disp` into three coarse labels.
mapping = dict(
    CP="Planet",
    KP="Planet",
    PC="Candidate",
    APC="Candidate",
    FP="False",
    FA="False",
)

# Codes that are missing from `mapping` (including NaN) become NaN in the new column.
df["disp_3class"] = df["tfopwg_disp"].map(mapping)

# Class balance; value_counts() leaves NaN rows out of the tally by default.
print(df["disp_3class"].value_counts())


disp_3class
Candidate    5139
False        1294
Planet       1266
Name: count, dtype: int64


In [79]:

# Keep only the 64-bit float and integer columns; object columns such as the
# string labels are left out of this view.
numeric_df = df.select_dtypes(include=[np.float64, np.int64])


# Log transformations to stabilize distributions

*Many features (e.g., orbital period) are highly skewed. Log and square-root transforms reduce the influence of extreme values and bring the distributions closer to normal.*

In [88]:
# log1p(x) = log(1 + x): compresses the long right tail of the orbital period.
df["log_orbper"] = df["pl_orbper"].pipe(np.log1p)

# Square root of planet radius: separates small rocky planets from giants and
# can linearize the relation with transit depth.
df["sqrt_pl_rade"] = df["pl_rade"].pipe(np.sqrt)

# Transit depth is scaled by 1e6 before log1p.
# NOTE(review): confirm the units of pl_trandep -- if it is already in ppm the
# extra 1e6 factor only shifts the log values and may be unintended.
df["log_trandep"] = df["pl_trandep"].mul(1e6).pipe(np.log1p)



# Ratios & Normalizations (instead of raw numbers)

In [93]:
# Planet radius / star radius = relative planet size.
df["rel_radius"] = df["pl_rade"].div(df["st_rad"])

# Transit depth / stellar radius = signal relative to star size.
df["depth_norm"] = df["pl_trandep"].div(df["st_rad"])

# Orbital compactness: orbital period / stellar radius, encoding how "close-in"
# the planet is relative to star size.
df["compactness"] = df["pl_orbper"].div(df["st_rad"])


# Principal Component Analysis (PCA)

*PCA builds linear combinations of the features that capture the maximum variance, which is a form of mathematical compression.*

In [86]:
from sklearn.decomposition import PCA

# NOTE(review): `X_scaled` is not defined in any cell shown above, and this cell's
# execution count (86) precedes the feature-engineering cells (88, 93). Make sure
# the scaling cell exists and that Restart Kernel -> Run All reaches this point.
# Assumes X_scaled has one row per row of `df`, in the same order -- TODO confirm.

# Number of principal components to keep as engineered features.
N_PCA_COMPONENTS = 5

# A fixed random_state keeps the projection reproducible when scikit-learn
# selects its randomized SVD solver; the exact solvers ignore it.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=0)
X_pca = pca.fit_transform(X_scaled)

# Store each component as its own column: pca_1, pca_2, ...
# The loop bound comes from the result itself, so it cannot drift out of sync
# with the component count requested above.
for i in range(X_pca.shape[1]):
    df[f"pca_{i+1}"] = X_pca[:, i]


Noise reduction: Instead of keeping 50 noisy features, you keep the top 5 PCs, which explain ~70% of the variance.

Pattern detection: Sometimes clusters separate more cleanly in PC space than in the raw feature space.

Feature compression: PCs can be added as new engineered features alongside physical ones.