In [6]:
# This notebook sets up Logistic Regression, Decision Tree, KNN, and Naive Bayes classifiers
# with stratified k-fold cross-validation.


import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [7]:
train = pd.read_csv("../prepared_data/train_prepared.csv")
test = pd.read_csv("../prepared_data/test_prepared.csv")


In [12]:
# Identify label column
label_col = 'label' if 'label' in train.columns else 'operation_kind_id'


# Map labels to binary (planet candidate = 1, false positive = 0)
label_mapping = {
'planet candidate': 1,
'false positive': 0,
1: 1,
0: 0
}
train[label_col] = train[label_col].map(label_mapping)
test[label_col] = test[label_col].map(label_mapping)





# Drop obvious non-numeric identifiers
X = train.drop(columns=["label", "tic_id", "star_name"], errors="ignore")
y = train["label"]

# Keep only numeric columns
X = X.select_dtypes(include=[np.number])

# Check
print("Features being used:", X.columns.tolist())
print("Shape:", X.shape)




Features being used: ['period', 'log_period', 'duration', 'depth', 'planet_radius', 'planet_radius_est', 'stellar_radius', 'stellar_mass', 'stellar_mag', 'ra', 'dec']
Shape: (13222, 11)


In [13]:
# Define baseline models
models = {
"Logistic Regression": LogisticRegression(max_iter=500, class_weight="balanced"),
"Decision Tree": DecisionTreeClassifier(random_state=42, class_weight="balanced"),
"KNN": KNeighborsClassifier(n_neighbors=5),
"Naive Bayes": GaussianNB()
}

In [14]:

# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics to collect
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

results = []

for name, model in models.items():
    cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)
    result_summary = {metric: np.mean(cv_results[f'test_{metric}']) for metric in scoring}
    result_summary['model'] = name
    results.append(result_summary)


# Convert to DataFrame
results_df = pd.DataFrame(results)
print(results_df)


   accuracy  precision    recall        f1   roc_auc                model
0  0.713054   0.799870  0.728870  0.762671  0.768192  Logistic Regression
1  0.770609   0.820609  0.815780  0.818165  0.754299        Decision Tree
2  0.790804   0.808837  0.876509  0.841309  0.831393                  KNN
3  0.715777   0.704777  0.947878  0.808435  0.756498          Naive Bayes


In [16]:
results_df = pd.DataFrame(results)
print(results_df)


# Save baseline results
results_df.to_csv("../metrics/baseline_results.csv", index=False)
print("../Baseline results saved to metrics/baseline_results.csv")

   accuracy  precision    recall        f1   roc_auc                model
0  0.713054   0.799870  0.728870  0.762671  0.768192  Logistic Regression
1  0.770609   0.820609  0.815780  0.818165  0.754299        Decision Tree
2  0.790804   0.808837  0.876509  0.841309  0.831393                  KNN
3  0.715777   0.704777  0.947878  0.808435  0.756498          Naive Bayes
../Baseline results saved to metrics/baseline_results.csv
