In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)
import tensorflow as tf
import os

In [None]:

# Features and labels are stored in separate CSV files and are paired by row
# position (both frames get the default integer index from read_csv).
df = pd.read_csv("exoplanetFeatures.csv")
df3 = pd.read_csv("exoplanetLabels.csv")

# Column assignment aligns on the index, so a row-count mismatch would not
# raise: extra feature rows would get NaN labels and extra label rows would be
# dropped silently. Fail loudly instead of training on mispaired data.
if len(df) != len(df3):
    raise ValueError(
        f"Feature/label row count mismatch: {len(df)} feature rows vs "
        f"{len(df3)} label rows"
    )

# The first column of the labels file is used as the target.
df['label'] = df3.iloc[:, 0]

In [None]:

# --- Structural overview --------------------------------------------------
print("DataFrame shape:", df.shape)
print("\nDataFrame info:")
# DataFrame.info() writes its report directly and returns None, so it must not
# be wrapped in print() (that appended a stray "None" line to the output).
df.info()
display(df.head())

# --- Missing values and class balance -------------------------------------
print("\nMissing values per column:")
print(df.isnull().sum())

print("\nClass distribution:")
print(df['label'].value_counts())

df['label'].value_counts().plot(kind='bar')
plt.title('Class Distribution')
plt.xlabel('Label')
plt.ylabel('Count')
plt.show()

# --- Distributions of the first five columns ------------------------------
df.iloc[:, :5].hist(figsize=(12, 8))
plt.suptitle('Feature Distributions')
plt.show()

# --- Pairwise correlations -------------------------------------------------
# Correlation is only defined for numeric columns; restricting explicitly keeps
# this cell working if the frame contains any non-numeric column (recent pandas
# versions raise instead of silently skipping such columns).
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()

In [None]:

# A row without a label cannot be used for supervised training, and the label
# must never be mean-imputed: the mean of the classes is a fractional value,
# not a valid class. Drop such rows instead.
n_unlabelled = int(df['label'].isnull().sum())
if n_unlabelled:
    print(f"Dropping {n_unlabelled} rows with a missing label")

# Mean-impute numeric feature columns only; 'label' is excluded from the fill
# values so it is left untouched by fillna.
# NOTE(review): the means are computed over every row, i.e. before the
# train/test split performed later in the notebook, so test rows contribute to
# the imputation statistics. Consider imputing after the split using
# training-set means only.
feature_means = df.drop(columns='label').mean(numeric_only=True)
df = df.dropna(subset=['label']).fillna(feature_means)
print("\nMissing values after imputation:\n", df.isnull().sum())

In [None]:


RANDOM_STATE = 42
N_FEATURES_TO_SELECT = 10

X = df.drop('label', axis=1)
y = df['label']

# Split FIRST. Feature selection is a fitted, label-dependent step; running it
# on all rows would let the held-out labels influence which features are kept
# and make the test metrics optimistic. The split arguments are unchanged, so
# the same rows end up in train and test as before.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# RFE eliminates features by the magnitude of the logistic-regression
# coefficients, which is only comparable across features on a common scale.
# Standardise using training-set statistics only. This scaler is used solely
# for ranking features; the selected columns are kept unscaled here.
rfe_scaler = preprocessing.StandardScaler().fit(X_train_full)

estimator = LogisticRegression(max_iter=1000)
# Never ask for more features than exist.
selector = RFE(
    estimator,
    n_features_to_select=min(N_FEATURES_TO_SELECT, X.shape[1]),
)
selector = selector.fit(rfe_scaler.transform(X_train_full), y_train)

# support_ is a boolean mask in the original column order.
selected_features = X.columns[selector.support_]
print("Selected features:", list(selected_features))

# Restrict every view of the data to the selected columns.
X_selected = X[selected_features]
X_train = X_train_full[selected_features]
X_test = X_test_full[selected_features]

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Train class distribution:\n", y_train.value_counts())
print("Test class distribution:\n", y_test.value_counts())

In [None]:

# Learn the per-feature min/max from the training split only, then apply that
# same fitted transform to both splits so the test data never informs scaling.
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:

def build_model(input_dim, learning_rate=0.001):
    """Build and compile a small feed-forward binary classifier.

    Architecture: input -> Dense(12, tanh) -> Dense(8, tanh) -> Dense(1, sigmoid).
    The model is compiled with the Adam optimizer, binary cross-entropy loss
    and accuracy as the tracked metric.

    Args:
        input_dim: Number of input features per sample.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    model = tf.keras.models.Sequential([
        # Declare the input shape with an explicit Input object instead of the
        # legacy `input_dim=` keyword on the first Dense layer.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(12, activation='tanh'),
        tf.keras.layers.Dense(8, activation='tanh'),
        # Single sigmoid unit: output is interpreted as P(label == 1).
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so
# that weight initialisation and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(RANDOM_STATE)

# One input unit per scaled feature column.
model = build_model(X_train_scaled.shape[1])

In [None]:

# Training settings gathered in one place so they are easy to spot and tune.
fit_options = {
    "epochs": 50,             # passes over the training data
    "batch_size": 32,         # samples per gradient update
    "validation_split": 0.2,  # fraction of the training data reserved for validation
    "verbose": 1,             # per-epoch progress output
}

# `history` keeps the per-epoch metrics returned by fit().
history = model.fit(X_train_scaled, y_train, **fit_options)

In [None]:

# --- Threshold-free test metrics -------------------------------------------
# Predicted probability of the positive class for each test row.
y_pred_prob = model.predict(X_test_scaled).flatten()
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"ROC-AUC Score: {roc_auc:.4f}")

# Test-set ROC curve, used only for the plot at the end of this cell.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# --- Decision threshold -----------------------------------------------------
# Choose the operating point on TRAINING predictions. Picking the threshold
# that maximises a test-set statistic and then reporting metrics on that same
# test set would make the confusion matrix and report optimistically biased.
train_pred_prob = model.predict(X_train_scaled).flatten()
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, train_pred_prob)

# Youden's J statistic (TPR - FPR). Index 0 of roc_curve's thresholds is a
# sentinel for "no sample predicted positive", not a real score, so it is
# skipped to guarantee a usable cut-off.
optimal_idx = np.argmax((train_tpr - train_fpr)[1:]) + 1
optimal_threshold = train_thresholds[optimal_idx]
print(f"Optimal threshold: {optimal_threshold:.2f}")

# --- Thresholded test metrics ----------------------------------------------
y_pred = (y_pred_prob >= optimal_threshold).astype(int)

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- ROC curve (test set) ---------------------------------------------------
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()