In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif


In [2]:
from google.colab import files
uploaded = files.upload()


Saving Kepler_Threshold_Crossing_Events_Table.csv to Kepler_Threshold_Crossing_Events_Table.csv


In [3]:
filename = list(uploaded.keys())[0]

df = pd.read_csv(
    filename,
    sep='|',
    engine='python',
    on_bad_lines='skip'
)

# Clean column names
df.columns = df.columns.str.strip()
df.dropna(how='all', inplace=True)

df.head()


Unnamed: 0,# This file was produced by the NASA Exoplanet Archive http://exoplanetarchive.ipac.caltech.edu
0,# Sun Oct 5 09:40:39 2025
1,#
2,# COLUMN kepid: KepID
3,# COLUMN tce_plnt_num: Planet Number
4,# COLUMN tce_rogue_flag: Rogue Flag


In [8]:
for col in df.columns:
    if df[col].dtype != 'object':
        df[col].fillna(df[col].median(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [10]:
df.columns = df.columns.str.strip()
print(df.columns.tolist())


['# This file was produced by the NASA Exoplanet Archive  http://exoplanetarchive.ipac.caltech.edu']


In [11]:
# Keep only numeric columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Remove ID-like columns if present
drop_like = ['id', 'kepid', 'row', 'index']
numeric_cols = [c for c in numeric_cols if not any(d in c.lower() for d in drop_like)]

numeric_cols[:10], len(numeric_cols)


([], 0)

In [12]:
df['Habitability_Score'] = 0

if 'tce_period' in df.columns:
    df['Habitability_Score'] += (1 / (1 + abs(df['tce_period'] - 365))) * 0.4

if 'tce_insol' in df.columns:
    df['Habitability_Score'] += (1 / (1 + abs(df['tce_insol'] - 1))) * 0.35

if 'tce_steff' in df.columns:
    df['Habitability_Score'] += (1 / (1 + abs(df['tce_steff'] - 5778))) * 0.25

df[['Habitability_Score']].head()


Unnamed: 0,Habitability_Score
0,0
1,0
2,0
3,0
4,0


In [13]:
df['Habitability_Class'] = np.where(
    df['Habitability_Score'] >= df['Habitability_Score'].median(),
    'Habitable',
    'Non-Habitable'
)


In [14]:
corr = df[numeric_cols + ['Habitability_Score']].corr()['Habitability_Score']
corr = corr.sort_values(ascending=False)

corr


Unnamed: 0,Habitability_Score
Habitability_Score,


In [24]:
selected_features = corr[abs(corr) > 0.15].index.tolist()
selected_features.remove('Habitability_Score')
selected_features


['Habitability_Score']

In [25]:
X = df[selected_features]
y = df['Habitability_Class']


In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Training size:", X_train.shape)
print("Testing size:", X_test.shape)


Training size: (27249, 1)
Testing size: (6813, 1)


In [27]:
num_features = X.select_dtypes(include=['float64','int64']).columns
cat_features = X.select_dtypes(include=['object']).columns


In [28]:
numeric_pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k='all'))
])

categorical_pipeline = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])


In [29]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, num_features),
        ('cat', categorical_pipeline, cat_features)
    ]
)


In [30]:
X_train_processed = preprocessor.fit_transform(X_train, y_train)
X_test_processed = preprocessor.transform(X_test)

X_train_processed.shape, X_test_processed.shape


  msb = ssbn / float(dfbn)


((27249, 1), (6813, 1))