In [None]:
#Imports
import pandas as pd

# Dataset 1: location of the first exoplanet CSV
file_path = "/kaggle/input/nasa-exoplanet/nasa_exoplanet.csv"

# Read it into a DataFrame. Text after '#' is treated as a comment and
# malformed rows are skipped instead of raising.
df = pd.read_csv(
    file_path,
    comment="#",
    engine="python",
    on_bad_lines="skip",
)

# Missing-value count per column
df.isna().sum()



In [None]:
# Print the installed scikit-learn version for reference.
import sklearn
print(sklearn.__version__)

In [None]:
# Source columns of interest. Any that are absent from the CSV are skipped
# silently by the membership test below.
needed = [
    "pl_rade",
    "pl_bmasse",
    "pl_dens",
    "pl_eqt",
    "pl_orbper",
    "sy_dist",
    "st_teff",
    "st_lum",
    "st_spectype",
    "st_metfe",
    "pl_orbeccen",
    "pl_orbsmax",
]
df = df.loc[:, [name for name in needed if name in df.columns]]

# Swap the source column codes for readable feature names.
df = df.rename(
    columns=dict(
        pl_rade="radius",
        pl_bmasse="mass",
        pl_eqt="temp",
        pl_orbper="orbital_period",
        sy_dist="distance_star",
        st_teff="star_temp",
        st_spectype="star_type",
        pl_orbeccen="eccentricity",
        pl_orbsmax="semi_major_axis",
    )
)

# Missing-value count per remaining column
df.isna().sum()



In [None]:
# Feature columns (readable names) kept for the rest of the notebook.
selected_cols = [
    "radius",
    "mass",
    "temp",
    "orbital_period",
    "distance_star",
    "star_temp",
    "star_type",
    "eccentricity",
    "semi_major_axis",
]

# Restrict dataset 1 to exactly those columns, in that order.
df = df.loc[:, selected_cols]


In [None]:
# Dataset 2: read with the same parser options as dataset 1.
file2_path = "/kaggle/input/exoplanetsdata1/exoplanetsdata1.csv"
df2 = pd.read_csv(
    file2_path,
    comment="#",
    engine="python",
    on_bad_lines="skip",
)

# Keep every column whose name does not begin with "Unnamed".
is_unnamed = df2.columns.str.contains("^Unnamed")
df2 = df2.loc[:, ~is_unnamed]

# Missing-value count per column
df2.isna().sum()



# Removes all columns whose name starts with "Unnamed" (those are usually junk index columns in CSVs).

# df2.columns.str.contains("^Unnamed") → boolean mask marking which columns are "Unnamed…"

# ~ negates the mask.

# df2.loc[:, mask] → selects all rows (:) and only the columns where the mask is True.

In [None]:
# Give dataset 2 the same readable column names as dataset 1 ...
df2 = df2.rename(
    columns=dict(
        pl_rade="radius",
        pl_bmasse="mass",
        pl_eqt="temp",
        pl_orbper="orbital_period",
        sy_dist="distance_star",
        st_teff="star_temp",
        st_spectype="star_type",
        pl_orbeccen="eccentricity",
        pl_orbsmax="semi_major_axis",
    )
)

# ... and keep only the shared feature columns, in the shared order.
df2 = df2.loc[:, selected_cols]

In [None]:
# Stack both datasets under a fresh 0..n-1 index, then drop exact duplicate rows.
combined_df = pd.concat([df, df2], ignore_index=True).drop_duplicates()

# Rows without a star type get the placeholder label "Unknown".
combined_df = combined_df.assign(
    star_type=combined_df["star_type"].fillna("Unknown")
)


In [None]:
numeric_cols = [
    "radius", "mass", "temp", "orbital_period",
    "distance_star", "star_temp", "eccentricity", "semi_major_axis",
]

# Convert every numeric column in one pass; values that cannot be parsed
# as numbers are coerced to NaN.
combined_df[numeric_cols] = combined_df[numeric_cols].apply(
    pd.to_numeric, errors="coerce"
)

In [None]:
# Build one row mask instead of filtering four times. A row survives when,
# for each checked column, the value is either missing or physically valid.
eccentricity_values = combined_df["eccentricity"]
valid_rows = eccentricity_values.isna() | eccentricity_values.between(0, 1)

# radius, mass and orbital_period must be strictly positive when present.
for positive_col in ("radius", "mass", "orbital_period"):
    column_values = combined_df[positive_col]
    valid_rows &= column_values.isna() | (column_values > 0)

combined_df = combined_df[valid_rows]


In [None]:
# Missing-value count per column after the validity filters
combined_df.isna().sum()

In [None]:
# Snapshot the filtered data; later cells read and copy from df_clean.
df_clean = combined_df.copy()


In [None]:
# Print summary statistics for each selected column.
for col in selected_cols:
    print(df_clean[col].describe())

# The notes below used to live in a stray triple-quoted string, which made
# the cell echo the raw text as its output; they are kept as comments instead.
#
# ## 🔍 Outlier Detection Using Summary Statistics
# ### What we check:
# - Minimum and maximum values
# - Median and quartiles
# - Domain validity (e.g., eccentricity must be 0–1, radius must be < 30 Earth radii)

In [None]:
# (rows, columns) of the cleaned frame
df_clean.shape

In [None]:
import matplotlib.pyplot as plt

numeric_cols = [
    "radius",
    "mass",
    "temp",
    "orbital_period",
    "distance_star",
    "star_temp",
    "eccentricity",
    "semi_major_axis",
]

# One 6x6 figure per numeric column: a vertical box plot of the non-missing
# values, drawn on a logarithmic y-axis.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 6))
    df_clean[col].dropna().plot(kind="box", vert=True, ax=ax)
    ax.set_yscale("log")
    ax.set_title(f"Boxplot of {col}")
    plt.show()

In [None]:
# One-hot encoding of the star type
print("Before encoding:", df_clean.shape)
df_encoded = pd.get_dummies(df_clean, columns=["star_type"], prefix="star")
print("after encoding:", df_encoded.shape)

# Keep only columns holding at least one non-zero value; all-zero columns are
# removed. NaN compares unequal to 0, so a column containing NaN is kept.
df_encoded = df_encoded.loc[:, (df_encoded != 0).any(axis=0)]
print("Shape after cleanup:", df_encoded.shape)

# The preview is the last expression so the notebook actually renders it
# (a bare expression in the middle of a cell is evaluated and discarded),
# and it shows the frame after the cleanup step.
df_encoded.head()


In [None]:
# ---------------------------------------------------
# Feature Engineering Example: Habitability Score Index
# (Analysis-only, NOT used as ML label)
# ---------------------------------------------------

# Work on a copy so columns added for this analysis do not touch df_clean.
df_hab = df_clean.copy()

def normalize(series):
    """Min-max scale a numeric pandas Series to the [0, 1] range.

    Parameters
    ----------
    series : pandas.Series
        Numeric values. NaN entries are skipped by ``min``/``max`` and stay
        NaN in the result.

    Returns
    -------
    pandas.Series
        ``(series - min) / (max - min)`` on the input's index. When the range
        is zero (all non-missing values identical) the division would yield
        NaN for every row, so a neutral 0.5 is returned for every row on the
        same index instead.
    """
    lowest = series.min()
    span = series.max() - lowest
    if span == 0:
        # Constant input: no spread to scale by, so use the neutral midpoint.
        return pd.Series(0.5, index=series.index, name=series.name)
    return (series - lowest) / span

# Component scores: each is 1 minus the min-max-scaled distance from a
# reference value, so the closest observed planet scores highest.
radius_score = 1 - normalize((df_hab["radius"] - 1.0).abs())           # reference radius 1.0 (Earth radius ≈ 1)
temp_score = 1 - normalize((df_hab["temp"] - 288).abs())               # reference temperature 288 (Earth temp ≈ 288 K)
orbit_score = 1 - normalize((df_hab["orbital_period"] - 365).abs())    # reference period 365 (Earth year ≈ 365 days)
ecc_score = 1 - normalize(df_hab["eccentricity"])                      # lower eccentricity scores higher

# Weighted contributions (weights add up to 1.0)
weighted_radius = 0.35 * radius_score
weighted_temp = 0.35 * temp_score
weighted_orbit = 0.20 * orbit_score
weighted_ecc = 0.10 * ecc_score

# Sum the contributions and clamp the result to [0, 1].
df_hab["habitability_score"] = (
    weighted_radius + weighted_temp + weighted_orbit + weighted_ecc
).clip(0, 1)

# Preview
preview_columns = ["radius", "temp", "eccentricity", "habitability_score"]
df_hab[preview_columns].head()


In [None]:
# ---------------------------------------------------
# Feature Engineering Example 2: Stellar Compatibility Index
# (Analysis-only, NOT used as ML label)
# ---------------------------------------------------

# Work on a copy so columns added for this analysis do not touch df_clean.
df_sci = df_clean.copy()

def safe_normalize(series):
    """Min-max scale a numeric pandas Series to [0, 1], tolerating zero range.

    Parameters
    ----------
    series : pandas.Series
        Numeric values. NaN entries are skipped by ``min``/``max``.

    Returns
    -------
    pandas.Series
        ``(series - min) / (max - min)`` on the input's index. If the maximum
        equals the minimum, every row gets the neutral value 0.5.

    Notes
    -----
    The constant-case result is built on ``series.index``. A fresh 0..n-1
    index would not line up with a frame whose index has gaps (for example
    after rows were filtered out), and arithmetic or column assignment would
    then produce NaN or shifted values.
    """
    lowest, highest = series.min(), series.max()
    if highest == lowest:
        return pd.Series(0.5, index=series.index, name=series.name)
    return (series - lowest) / (highest - lowest)

# 1) Stellar temperature suitability (Sun ≈ 5778 K): 1 minus the scaled
#    absolute distance of star_temp from 5778.
df_sci["star_temp_score"] = 1 - safe_normalize(
    abs(df_sci["star_temp"] - 5778)
)

# 2) Spectral type compatibility (domain knowledge)
spectral_weights = {
    "O": 0.1,
    "B": 0.2,
    "A": 0.3,
    "F": 0.6,
    "G": 1.0,
    "K": 0.8,
    "M": 0.6,
    "Unknown": 0.5
}

# Labels that are already keys of spectral_weights (e.g. "G", "Unknown") are
# used as they are. Any other label is reduced to its upper-cased first
# character, so a full designation such as "G2 V" is scored as class "G"
# instead of silently falling through to the 0.5 default.
# NOTE(review): assumes star_type holds spectral designations that start with
# the class letter — confirm against the source data.
star_labels = df_sci["star_type"].astype(str).str.strip()
spectral_class = star_labels.where(
    star_labels.isin(list(spectral_weights)),
    star_labels.str[:1].str.upper(),
)
df_sci["spectral_score"] = spectral_class.map(spectral_weights)

# Classes without a weight still get the neutral 0.5.
df_sci["spectral_score"] = df_sci["spectral_score"].fillna(0.5)

# 3) Final Stellar Compatibility Index: 60% temperature, 40% spectral class
df_sci["stellar_compatibility_index"] = (
      0.6 * df_sci["star_temp_score"]
    + 0.4 * df_sci["spectral_score"]
)

# Ensure valid range
df_sci["stellar_compatibility_index"] = df_sci[
    "stellar_compatibility_index"
].clip(0, 1)

# Preview
df_sci[
    ["star_temp", "star_type", "stellar_compatibility_index"]
].head()


In [None]:
df_cls = df_clean.copy()

# Rule-based label: 1 when radius is within [0.5, 2.0], temp is within
# [180, 320] and eccentricity is below 0.3; otherwise 0. Comparisons against
# NaN evaluate to False, so a row missing any of the three values gets 0.
radius_ok = df_cls["radius"].between(0.5, 2.0)
temp_ok = df_cls["temp"].between(180, 320)
eccentricity_ok = df_cls["eccentricity"].lt(0.3)
df_cls["habitable_class"] = (radius_ok & temp_ok & eccentricity_ok).astype(int)

# Class balance: absolute counts, then percentages
label_column = df_cls["habitable_class"]
print(label_column.value_counts())
print(label_column.value_counts(normalize=True) * 100)


In [None]:
from sklearn.model_selection import train_test_split

# Independent features
features = [
    "radius", "mass", "temp", "orbital_period",
    "distance_star", "star_temp", "eccentricity", "semi_major_axis",
    "star_type",
]

X = df_cls.loc[:, features]
y = df_cls.loc[:, "habitable_class"]  # dependent variable (target)

# 80/20 split with a fixed seed; stratify=y keeps the class distribution the
# same in the train and test parts.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("X shape:", X.shape)
print("y shape:", y.shape)

In [None]:


# Report the shape of each split, in the order train/test features then
# train/test labels.
for split_label, split_part in (
    ("X_train:", X_train),
    ("X_test:", X_test),
    ("y_train:", y_train),
    ("y_test:", y_test),
):
    print(split_label, split_part.shape)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Preprocessing turns the raw feature table into a numeric matrix a model can
# learn from:
#   MinMaxScaler  → scales numbers to the 0–1 range
#   OneHotEncoder → converts text categories to indicator columns
categorical_features = ["star_type"]
numeric_features = [
    "radius", "mass", "temp", "orbital_period",
    "distance_star", "star_temp", "eccentricity", "semi_major_axis",
]

# Numeric columns: fill gaps with the column median, then scale.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fitting are ignored at transform time.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# ColumnTransformer → applies a different pipeline to each column group
# (numeric_pipeline to the numeric columns, categorical_pipeline to the
# categorical columns) and combines the results into one matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Random forest with 200 trees and a fixed seed; class_weight="balanced"
# reweights the classes to counter the label imbalance.
rf_model = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=200,
    random_state=42,
)

# Preprocessing and model chained, so both are fitted from the training data.
rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", rf_model),
    ]
)


In [None]:
# Fit the preprocessing steps and the random forest on the training split.
rf_pipeline.fit(X_train, y_train)


In [None]:
import pandas as pd
import numpy as np

# Pull the two fitted steps out of the pipeline.
fitted_preprocessor = rf_pipeline.named_steps["preprocessor"]
fitted_forest = rf_pipeline.named_steps["model"]

# Output column names of the preprocessor, paired with the forest's
# per-column importances.
feature_names = fitted_preprocessor.get_feature_names_out()
importances = fitted_forest.feature_importances_

importance_df = pd.DataFrame(
    {"feature": feature_names, "importance": importances}
).sort_values("importance", ascending=False)

# Fifteen most important columns
importance_df.head(15)

In [None]:
# Default-threshold class predictions
y_pred = rf_pipeline.predict(X_test)

# Probability of the positive class (second column of predict_proba), turned
# into labels with a custom 0.3 cut-off.
threshold = 0.3
y_proba = rf_pipeline.predict_proba(X_test)[:, 1]
y_pred_custom = (y_proba >= threshold).astype(int)

# Distribution of the predicted probabilities
pd.Series(y_proba).describe()


In [None]:
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Plot the matrix for the thresholded predictions (y_pred_custom), i.e. the
# same predictions the Random Forest accuracy/precision/recall/F1 figures are
# computed from, so the matrix and the printed metrics describe one classifier.
# labels=[0, 1] pins the row/column order to the two display labels.
cm = confusion_matrix(y_test, y_pred_custom, labels=[0, 1])
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["Non-Habitable", "Habitable"],
)
disp.plot(cmap="Blues")
disp.ax_.set_title(f"Random Forest confusion matrix (threshold = {threshold})")
plt.show()

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

# Label-based metrics use the thresholded predictions; ROC-AUC uses the raw
# positive-class probabilities.
accuracy = accuracy_score(y_test, y_pred_custom)
precision = precision_score(y_test, y_pred_custom, zero_division=0)
recall = recall_score(y_test, y_pred_custom)
f1 = f1_score(y_test, y_pred_custom)
roc_auc = roc_auc_score(y_test, y_proba)

print("Random Forest")
for metric_label, metric_value in (
    ("Accuracy :", accuracy),
    ("Precision:", precision),
    ("Recall   :", recall),
    ("F1-score :", f1),
    ("ROC-AUC  :", roc_auc),
):
    print(metric_label, metric_value)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

# Class-weighted logistic regression behind the shared preprocessor.
logreg_model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)
logreg_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", logreg_model),
    ]
)
logreg_pipeline.fit(X_train, y_train)

# Positive-class probabilities, cut at 0.3 to obtain labels.
threshold = 0.3
y_proba_lr = logreg_pipeline.predict_proba(X_test)[:, 1]
y_pred_lr = (y_proba_lr >= threshold).astype(int)

print("Logistic Regression Metrics")
# Label-based metrics are computed and printed one at a time, in order.
for metric_label, scorer, scorer_options in (
    ("Accuracy :", accuracy_score, {}),
    ("Precision:", precision_score, {"zero_division": 0}),
    ("Recall   :", recall_score, {}),
    ("F1-score :", f1_score, {}),
):
    print(metric_label, scorer(y_test, y_pred_lr, **scorer_options))
# ROC-AUC is threshold-free, so it is scored on the probabilities.
print("ROC-AUC  :", roc_auc_score(y_test, y_proba_lr))

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

# Class-weighted RBF SVM; probability=True enables predict_proba.
svm_model = SVC(
    class_weight="balanced",
    kernel="rbf",
    probability=True,
    random_state=42,
)
svm_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", svm_model),
    ]
)
svm_pipeline.fit(X_train, y_train)

# Predict: positive-class probabilities, cut at 0.3 to obtain labels.
threshold = 0.3
y_proba_svm = svm_pipeline.predict_proba(X_test)[:, 1]
y_pred_svm = (y_proba_svm >= threshold).astype(int)

# Evaluate
print("SVM Metrics")
for metric_label, scorer, scorer_options in (
    ("Accuracy :", accuracy_score, {}),
    ("Precision:", precision_score, {"zero_division": 0}),
    ("Recall   :", recall_score, {}),
    ("F1-score :", f1_score, {}),
):
    print(metric_label, scorer(y_test, y_pred_svm, **scorer_options))
# ROC-AUC is threshold-free, so it is scored on the probabilities.
print("ROC-AUC  :", roc_auc_score(y_test, y_proba_svm))

In [None]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
# Ratio of negative (0) to positive (1) training labels, used to weight the
# positive class in XGBoost.
negative_count = y_train.eq(0).sum()
positive_count = y_train.eq(1).sum()
scale_pos_weight = negative_count / positive_count


In [None]:
# Gradient-boosted trees; scale_pos_weight compensates for class imbalance.
# `use_label_encoder` is intentionally not passed: it is deprecated in
# XGBoost, has no effect on training, and recent releases only emit an
# "unused parameter" warning for it.
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric="logloss"
)
xgb_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", xgb_model)
])
xgb_pipeline.fit(X_train, y_train)

# Positive-class probabilities, cut at 0.3 to obtain labels.
y_proba_xgb = xgb_pipeline.predict_proba(X_test)[:, 1]
threshold=0.3
y_pred_xgb=(y_proba_xgb>=threshold).astype(int)
print("XGBoost Metrics")
print("Accuracy :", accuracy_score(y_test, y_pred_xgb))
print("Precision:", precision_score(y_test, y_pred_xgb, zero_division=0))
print("Recall   :", recall_score(y_test, y_pred_xgb))
print("F1-score :", f1_score(y_test, y_pred_xgb))
# ROC-AUC is threshold-free, so it is scored on the probabilities.
print("ROC-AUC  :", roc_auc_score(y_test, y_proba_xgb))


In [None]:
# Score every row of df_cls (train and test rows alike) with the fitted
# random-forest pipeline and keep the positive-class probability.
ranking_probabilities = rf_pipeline.predict_proba(df_cls[features])[:, 1]

# Attach the probabilities to a copy and sort from most to least likely.
df_ranking = (
    df_cls.copy()
    .assign(predicted_habitability_probability=ranking_probabilities)
    .sort_values("predicted_habitability_probability", ascending=False)
)

# Top ten candidates with the columns of interest
ranking_preview_cols = [
    "radius",
    "temp",
    "eccentricity",
    "star_type",
    "predicted_habitability_probability",
]
df_ranking[ranking_preview_cols].head(10)


In [None]:
# Class balance of the target: absolute counts and percentages, shown as a tuple.
y.value_counts(), y.value_counts(normalize=True) * 100


In [None]:
import joblib
# Persist the XGBoost pipeline object (preprocessor + model steps) under /kaggle/working.
joblib.dump(xgb_pipeline,"/kaggle/working/habitability_trained.pkl")


In [None]:
import os
# List the contents of /kaggle/working (where the model file was written).
os.listdir("/kaggle/working")

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler  # puts features on a common scale
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans  # K-means clustering

# Isolated frame used only for clustering
df_cluster_base = df_clean[["radius", "mass", "temp"]].copy()

# Step 1: fill missing values with each column's median.
imputer_cluster = SimpleImputer(strategy="median")
cluster_imputed = imputer_cluster.fit_transform(df_cluster_base)

# Step 2: standardise each column to zero mean / unit variance. K-means is
# distance based, so without this the column with the largest numeric range
# would dominate the cluster assignment. The fitted scaler is kept in
# `scaler_cluster` so it can be saved and reused on new inputs.
scaler_cluster = StandardScaler()
cluster_scale = scaler_cluster.fit_transform(cluster_imputed)



In [None]:
from sklearn.cluster import KMeans

# Three clusters, ten random initialisations, fixed seed.
kmeans_cluster = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit on the prepared matrix and read the per-row cluster labels.
cluster_labels = kmeans_cluster.fit(cluster_scale).labels_

In [None]:
# Copy of the clustering inputs with the assigned cluster label attached.
cluster_analysis_df = df_cluster_base.assign(cluster=cluster_labels)

# Mean radius / mass / temp per cluster
cluster_analysis_df.groupby("cluster").mean()

In [None]:
# Per-cluster fallback values: the median of each numeric column below and
# the most frequent star_type among the cluster's rows in df_clean.
median_default_cols = [
    "orbital_period",
    "distance_star",
    "star_temp",
    "eccentricity",
    "semi_major_axis",
]

cluster_defaults = {}
for c in sorted(cluster_analysis_df["cluster"].unique()):
    # Boolean row selection; aligned to df_clean through the shared index.
    cluster_rows = df_clean.loc[cluster_analysis_df["cluster"] == c]

    defaults_for_cluster = {
        name: cluster_rows[name].median() for name in median_default_cols
    }
    defaults_for_cluster["star_type"] = cluster_rows["star_type"].mode()[0]
    cluster_defaults[c] = defaults_for_cluster


In [None]:
import joblib

# Persist the clustering artifacts. The paths are relative, so the files are
# written to the current working directory.
joblib.dump(cluster_defaults, "cluster_defaults.pkl")
joblib.dump(kmeans_cluster, "cluster_model.pkl")
# NOTE(review): `scaler_cluster` must already be defined by the clustering
# preprocessing cell when this line runs — confirm, otherwise this raises NameError.
joblib.dump(scaler_cluster, "cluster_scaler.pkl")
