# In [None]:
import lightkurve as lk
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os
import warnings

# Ignore UserWarning-category warnings for the rest of the run. append=True
# adds the filter at the end of the filter list, so any filter installed
# earlier still takes precedence.
warnings.filterwarnings("ignore", category=UserWarning, append=True)

# ===============================
# 1. Load Labels
# ===============================
# The label file location can be overridden with the LABELS_CSV environment
# variable; when it is unset the original hard-coded path is used.
DEFAULT_LABELS_CSV = (
    r"C:\Users\sohie\OneDrive\Desktop\NASA Space Apps 2025\Current Datasets\SET 5.csv"
)
labels_df = pd.read_csv(os.environ.get("LABELS_CSV", DEFAULT_LABELS_CSV))
# Expected format: id,label
# Example:
# 8462852,CANDIDATE
# 11446443,FALSE POSITIVE

label_map = {"CANDIDATE": 1, "FALSE POSITIVE": 0}

# Normalise case and surrounding whitespace before mapping so that values such
# as " candidate" are not silently turned into NaN by Series.map.
raw_labels = labels_df["label"]
normalized_labels = raw_labels.astype(str).str.strip().str.upper()
labels_df["label"] = normalized_labels.map(label_map)

# Anything still unmapped becomes NaN; report it so the loss of rows is visible.
unmapped_mask = labels_df["label"].isna()
if unmapped_mask.any():
    unknown_values = sorted(raw_labels[unmapped_mask].astype(str).unique())
    print(
        f"⚠ {int(unmapped_mask.sum())} row(s) have an unrecognised label "
        f"and will be ignored: {unknown_values}"
    )

# ===============================
# 2. Feature Extraction Function
# ===============================
def extract_features(lc):
    """Summarise a light curve as a dict of scalar statistics.

    Reads ``lc.flux.value`` and ``lc.time.value``. Masked arrays (anything
    exposing ``filled``) have their masked entries replaced by NaN, and every
    statistic below ignores NaN.

    Returns a dict whose keys, in insertion order, are: ``mean_flux``,
    ``std_flux``, ``median_flux``, ``q25_flux``, ``q75_flux``, ``flux_range``,
    ``flux_skew``, ``flux_kurtosis`` and ``time_span``. The order matters
    because the values are later consumed positionally.
    """

    def _unmask(values):
        # Masked containers expose ``filled``; plain arrays pass through as-is.
        if hasattr(values, "filled"):
            return values.filled(np.nan)
        return values

    flux_values = _unmask(lc.flux.value)
    time_values = _unmask(lc.time.value)

    # One Series serves both higher-moment statistics.
    flux_series = pd.Series(flux_values)

    stats = {}
    stats["mean_flux"] = np.nanmean(flux_values)
    stats["std_flux"] = np.nanstd(flux_values)
    stats["median_flux"] = np.nanmedian(flux_values)
    stats["q25_flux"] = np.nanpercentile(flux_values, 25)
    stats["q75_flux"] = np.nanpercentile(flux_values, 75)
    stats["flux_range"] = np.nanmax(flux_values) - np.nanmin(flux_values)
    stats["flux_skew"] = flux_series.skew(skipna=True)
    stats["flux_kurtosis"] = flux_series.kurtosis(skipna=True)
    stats["time_span"] = np.nanmax(time_values) - np.nanmin(time_values)
    return stats


# ===============================
# 3. Collect Data
# ===============================
# For every labelled target: search for a TESS light curve, download the first
# search result, and turn it into one feature row. Any target that fails is
# reported and skipped so that a single bad download does not abort the run.
X, y = [], []
feature_names = None
processed_ids = set()
total_rows = len(labels_df)

# Iterate the two columns directly instead of DataFrame.iterrows(): iterrows
# builds one Series per row and upcasts it to a common dtype, so an integer id
# sitting next to a float/NaN label would come back as e.g. 8462852.0 and
# produce the query string "TIC 8462852.0".
for position, (raw_id, raw_label) in enumerate(
    zip(labels_df["id"], labels_df["label"]), start=1
):
    if pd.isna(raw_label):
        continue
    label = int(raw_label)

    # Guard against an id column that was itself parsed as float.
    if isinstance(raw_id, float) and raw_id.is_integer():
        tic_id = int(raw_id)
    else:
        tic_id = raw_id

    # Avoid reprocessing
    if tic_id in processed_ids:
        continue

    try:
        # flush=True so the progress prefix is visible while the download runs.
        print(
            f"[{position}/{total_rows}] Processing TIC {tic_id} ... ",
            end="",
            flush=True,
        )
        # NOTE(review): ids are queried as TIC numbers; if the label file
        # actually holds ids from another catalogue this lookup targets the
        # wrong star — confirm against the data source.
        search = lk.search_lightcurve(f"TIC {tic_id}", mission="TESS")

        if len(search) == 0:
            print("⚠ No data, skipped.")
            continue

        lc = search[0].download()
        if lc is None:
            print("⚠ Download failed, skipped.")
            continue

        features = extract_features(lc)
        feature_values = list(features.values())

        # The SVM trained later rejects NaN/inf inputs, so drop such rows here
        # rather than crash after every download has finished.
        if not np.all(np.isfinite(feature_values)):
            print("⚠ Non-finite features, skipped.")
            continue

        if feature_names is None:
            feature_names = list(features)

        X.append(feature_values)
        y.append(label)

        processed_ids.add(tic_id)
        print("✅ Done")

    except Exception as e:
        # Deliberate best-effort: report the failure and move to the next id.
        print(f"⚠ Skipped due to error: {e}")

# Convert to DataFrame
if not X:
    raise ValueError("❌ No light curves processed successfully!")

X = pd.DataFrame(X, columns=feature_names)
y = np.array(y)

# ===============================
# 4. Train-Test Split (Safe)
# ===============================
unique, counts = np.unique(y, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("\nClass Distribution:", class_distribution)

# A classifier cannot be fitted on a single class; fail here with a clear
# message instead of deep inside SVC.fit.
if len(unique) < 2:
    raise ValueError(
        "❌ Need samples from both classes to train; "
        f"got class distribution {class_distribution}."
    )

TEST_SIZE = 0.2
n_test = int(np.ceil(TEST_SIZE * len(y)))
n_train = len(y) - n_test

# A stratified split needs at least two members per class, and both partitions
# must be able to hold one sample of every class.
can_stratify = (
    np.min(counts) >= 2 and n_test >= len(unique) and n_train >= len(unique)
)
if can_stratify:
    stratify_option = y
else:
    print("⚠ Not enough samples for stratified split. Using random split.")
    stratify_option = None

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=42, stratify=stratify_option
)

# ===============================
# 5. Scale Features
# ===============================
# Fit the scaler on the training partition only, then apply the same
# transformation to the test partition.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ===============================
# 6. Train Model (SVM)
# ===============================
clf = SVC(kernel="rbf", class_weight="balanced", probability=True, random_state=42)
clf.fit(X_train_scaled, y_train)

# ===============================
# 7. Evaluate Model
# ===============================
y_pred = clf.predict(X_test_scaled)

print("\nConfusion Matrix:")
# Pin the label order so the matrix is always 2x2 and lines up with the
# report below, even when the test partition is missing one class.
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=[0, 1],  # Explicitly enforce both classes
        target_names=["FALSE POSITIVE", "CANDIDATE"],
        zero_division=0  # Prevents division by zero errors
    )
)


# ===============================
# 8. Save Model + Scaler
# ===============================
# Artifacts are written to the current working directory. The feature list is
# stored as well so the column order used for training can be recovered.
joblib.dump(clf, "lightcurve_svm_model.joblib")
joblib.dump(scaler, "lightcurve_svm_scaler.joblib")
joblib.dump(feature_names, "lightcurve_feature_names.joblib")

print("\n✅ Model, scaler, and feature list saved.")