ED-RAY AUTRA PROJECT

In [1]:
"""
RAY AUTRA TEAM
NASA SPACE APP CHALLENGE 2025
ED-RAY AUTRA PROJECT

Title: Training a Tabular Data Classifier for Exoplanetology (PyTorch MLP)

Objective:
This script implements a complete pipeline for training a classifier on exoplanet data
(or any similar tabular dataset; the default input is 'cumulative.csv').
The model is a Multi-Layer Perceptron (MLP) built with PyTorch.

Key Steps:
1.  **Data Loading and Cleaning**: Reading a CSV, identifying and dropping
    ID/high-cardinality columns, and managing data types.
2.  **Target Encoding**: The target column ('koi_disposition' by default) is numerically encoded.
    Class weights are calculated to handle imbalance.
3.  **Feature Preparation (scikit-learn)**: A ColumnTransformer is used to apply:
    * **Numeric**: Median imputation, scaling (StandardScaler).
    * **Categorical**: Constant imputation, One-Hot Encoding.
    * Numeric outliers are quantile-clipped.
4.  **Data Split**: Division into stratified training, validation, and test sets.
5.  **PyTorch Model**: Definition and training of an MLP with Batch Normalization and Dropout.
6.  **Monitoring**: TensorBoard is used for tracking metrics and weights.
7.  **Saving**: The best model (based on validation loss with Early Stopping),
    the preprocessor, and all configurations/metrics (JSON) are saved to a
    timestamped folder, then compressed (.zip).

Key Hyperparameters (configurable via environment variables):
* CSV_PATH: Path to the input file.
* TARGET_COLUMN: Column to predict.
* EPOCHS, LR, BATCH_SIZE: Network training parameters.
* HIDDEN_UNITS: MLP architecture.
* PATIENCE: Threshold for Early Stopping.

Author: Automatically detected (otherwise 'unknown_author')
Date: Automatically generated upon execution.
"""


import getpass
import json
import logging
import os
import random
import shutil
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter

# Config / hyperparams
# Most values can be overridden through an environment variable of the same
# name; AUTHOR, the clipping quantiles and MAX_CAT_CARDINALITY are fixed here.


def parse_hidden_units(raw, default=(256, 128)):
    """Parse a comma-separated list of hidden-layer widths.

    Args:
        raw: Text such as ``"512,256,64"``; ``None`` or blank text selects
            the default.
        default: Widths returned when *raw* holds no entries.

    Returns:
        A list of positive integers, one per hidden layer.

    Raises:
        ValueError: If an entry is not an integer or is not positive.
    """
    if not raw:
        return list(default)
    widths = [int(tok) for tok in raw.split(",") if tok.strip()]
    if any(w <= 0 for w in widths):
        raise ValueError(f"HIDDEN_UNITS must contain positive integers, got {raw!r}")
    return widths or list(default)


CSV_PATH = os.environ.get("CSV_PATH", "/content/cumulative.csv")
TARGET_COLUMN = os.environ.get("TARGET_COLUMN", "koi_disposition")
OUTPUT_BASE = os.environ.get("OUTPUT_BASE", "/content/output_model")
AUTHOR = None  # if None -> getpass.getuser() is used when config.json is written
SEED = int(os.environ.get("SEED", 42))
TEST_SIZE = float(os.environ.get("TEST_SIZE", 0.1))
VAL_SIZE = float(os.environ.get("VAL_SIZE", 0.1))
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 128))
EPOCHS = int(os.environ.get("EPOCHS", 40))
LR = float(os.environ.get("LR", 1e-3))
# The module docstring lists HIDDEN_UNITS among the environment-configurable
# values, so honour the variable; the default architecture is unchanged.
HIDDEN_UNITS = parse_hidden_units(os.environ.get("HIDDEN_UNITS"))
PATIENCE = int(os.environ.get("PATIENCE", 8))
CLIP_LOWER_Q = 0.001
CLIP_UPPER_Q = 0.999
MAX_CAT_CARDINALITY = 200  # drop categorical columns with > this many unique values (likely names/ids)

# ---------------------------
# Logging
# ---------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger("train_tabular_v2")

# ---------------------------
# Reproducibility
# ---------------------------
def set_seed(seed: int = 42):
    """Seed the random number generators used by this script.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
    then puts cuDNN into deterministic mode.

    Args:
        seed: Value applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Check for CUDA explicitly instead of swallowing every exception.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # determinism (may slow down)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed the generators once, before any model or DataLoader is created below.
set_seed(SEED)


# Utilities
def now_iso():
    """Return the current UTC time as an ISO-8601 string ending in ``Z``.

    The value has one-second resolution, e.g. ``"2025-10-04T13:41:45Z"``.
    """
    # datetime.utcnow() is deprecated; take a timezone-aware UTC timestamp and
    # format it explicitly so the output keeps the exact "...Z" shape.
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def safe_mkdir(path):
    """Create directory *path*, including any missing parent directories.

    Nothing happens when the directory already exists.
    """
    target = Path(path)
    target.mkdir(exist_ok=True, parents=True)

# ---------------------------
# Load the CSV
# ---------------------------
csv_available = os.path.exists(CSV_PATH)
if not csv_available:
    # Without the input file there is nothing to train on: abort with status 1.
    logger.error(f"CSV not found: {CSV_PATH}. Place your cumulative.csv at this path in Colab.")
    sys.exit(1)

logger.info(f"Loading CSV from {CSV_PATH}...")
df = pd.read_csv(CSV_PATH)
logger.info(f"Loaded CSV with shape {df.shape}")

# ---------------------------
# Resolve the target column
# ---------------------------
# When the configured name is absent, fall back to the right-most column.
target_is_present = TARGET_COLUMN in df.columns
if not target_is_present:
    TARGET_COLUMN = df.columns[-1]
    logger.warning(f"TARGET_COLUMN not found; using last column as target: {TARGET_COLUMN}")


# Drop identifier-like and very high-cardinality columns.
# Heuristic: a column counts as an identifier when its lower-cased name
# contains one of ID_TOKENS or ends with "id". The target is never dropped.
ID_TOKENS = ("rowid", "kepid", "kepoi", "kepler")
id_like = [
    c for c in df.columns
    if c != TARGET_COLUMN
    and (c.lower().endswith("id") or any(tok in c.lower() for tok in ID_TOKENS))
]
logger.info(f"Auto-detected id-like columns to drop: {id_like}")

# Start X,y
# Label missing targets *before* the string cast: casting first turns NaN into
# the text "nan", which made the original fillna("NaN") a no-op.
target_values = df[TARGET_COLUMN]
y_raw = target_values.astype(object).where(target_values.notna(), "NaN").astype(str)
X = df.drop(columns=[TARGET_COLUMN])
# drop id-like columns if present
X = X.drop(columns=[c for c in id_like if c in X.columns])

# Some columns hold numbers stored as text. Probe up to 1000 non-null values
# of every object column and coerce the column when most of them parse.
candidate_numeric = []
for c in X.columns:
    if X[c].dtype != object:
        continue
    non_null = X[c].dropna().head(1000)
    parsed = pd.to_numeric(non_null, errors='coerce')
    # fraction of probed values that parse as numeric
    frac_numeric = parsed.notnull().mean() if len(parsed) > 0 else 0.0
    if frac_numeric > 0.7:  # heuristic: >70% convertible -> numeric
        candidate_numeric.append(c)

if candidate_numeric:
    logger.info(f"Columns that look numeric but had object dtype (will coerce): {candidate_numeric}")
    for c in candidate_numeric:
        # values that still fail to parse become NaN and are imputed later
        X[c] = pd.to_numeric(X[c], errors='coerce')

# Now detect numeric and categorical columns
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
logger.info(f"Detected numeric cols: {len(numeric_cols)} / categorical cols: {len(cat_cols)}")

# Drop categorical columns with extremely high cardinality (likely names/ids)
high_card = [c for c in cat_cols if X[c].nunique(dropna=False) > MAX_CAT_CARDINALITY]
if high_card:
    logger.info(f"Dropping high-cardinality categorical columns (likely names/ids): {high_card}")
    X = X.drop(columns=high_card)

# final lists, recomputed from the frame that is actually kept
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
logger.info(f"Final numeric cols: {len(numeric_cols)} - Final categorical cols: {len(cat_cols)}")

# Target encoding: labels are numbered in sorted order so the mapping is stable
label_to_idx = {lab: i for i, lab in enumerate(sorted(y_raw.unique()))}
idx_to_label = {v: k for k, v in label_to_idx.items()}
y = y_raw.map(label_to_idx).values
num_classes = len(label_to_idx)
logger.info(f"Target mapping: {label_to_idx} | num_classes={num_classes}")

# Split train/val/test (stratified) on row positions FIRST.
# Clip bounds, imputation values, scaling statistics and one-hot vocabularies
# are then estimated on the training rows only; fitting them on the full table
# would leak validation/test information into the features.
# Splitting the positions yields the same partition as splitting the feature
# matrix, because the split depends only on the row count, the labels and SEED.
row_idx = np.arange(len(X))
idx_trainval, idx_test = train_test_split(
    row_idx, test_size=TEST_SIZE, random_state=SEED, stratify=y
)
val_rel = VAL_SIZE / (1.0 - TEST_SIZE)  # validation share relative to train+val
y_trainval = y[idx_trainval]
idx_train, idx_val = train_test_split(
    idx_trainval, test_size=val_rel, random_state=SEED, stratify=y_trainval
)
y_train, y_val, y_test = y[idx_train], y[idx_val], y[idx_test]

# Clip numeric outliers (quantile-based, bounds taken from the training rows)
clip_bounds = {
    "lower_quantile": CLIP_LOWER_Q,
    "upper_quantile": CLIP_UPPER_Q,
    "lower": {},
    "upper": {},
}
if len(numeric_cols) > 0:
    train_numeric = X.iloc[idx_train][numeric_cols]
    lower = train_numeric.quantile(CLIP_LOWER_Q)
    upper = train_numeric.quantile(CLIP_UPPER_Q)
    X[numeric_cols] = X[numeric_cols].clip(lower=lower, upper=upper, axis=1)
    # Keep the bounds so inference can reuse them; columns without a finite
    # bound (e.g. entirely missing) are left out of the saved mapping.
    clip_bounds["lower"] = {c: float(v) for c, v in lower.dropna().items()}
    clip_bounds["upper"] = {c: float(v) for c, v in upper.dropna().items()}
    logger.info("Applied quantile clipping to numeric columns to reduce extreme outliers.")

# sklearn preprocessor (impute + scale + one-hot)
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# OneHotEncoder compatibility: the dense-output keyword was renamed
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    # older sklearn
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="__missing__")),
    ("ohe", ohe)
])

preprocessor = ColumnTransformer([
    ("num", num_pipe, numeric_cols),
    ("cat", cat_pipe, cat_cols)
], remainder="drop", sparse_threshold=0)


def _to_dense(matrix):
    """Return *matrix* as a dense NumPy array (densifying sparse results)."""
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


# Fit on the training rows only, then apply the fitted transform everywhere
logger.info("Fitting preprocessor on the training split...")
X_train = _to_dense(preprocessor.fit_transform(X.iloc[idx_train]))
X_val = _to_dense(preprocessor.transform(X.iloc[idx_val]))
X_test = _to_dense(preprocessor.transform(X.iloc[idx_test]))
logger.info(f"Preprocessed train matrix shape: {X_train.shape}")

# save schema info for configdata.json
schema = {}
for c in numeric_cols:
    schema[c] = {"dtype": "numeric"}
for c in cat_cols:
    nunique = int(df[c].nunique(dropna=False)) if c in df.columns else None
    sample_unique = df[c].dropna().unique().tolist()[:50] if c in df.columns else []
    schema[c] = {"dtype": "categorical", "nunique": nunique, "sample_values": sample_unique}

configdata = {
    "expected_columns": list(X.columns),
    "numeric_columns": numeric_cols,
    "categorical_columns": cat_cols,
    "target_column": TARGET_COLUMN,
    "label_mapping": label_to_idx,
    "schema_summary": schema,
    "clip_bounds": clip_bounds,
    "preprocessor": {
        "description": "ColumnTransformer with SimpleImputer+StandardScaler for numeric, SimpleImputer+OneHotEncoder for categorical. Saved in 'preprocessor.joblib'."
    }
}

logger.info(f"Train/Val/Test shapes: {X_train.shape}, {X_val.shape}, {X_test.shape}")

# Convert the NumPy splits to torch tensors (float32 features, int64 labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_t = torch.tensor(X_train, dtype=torch.float32)
X_val_t = torch.tensor(X_val, dtype=torch.float32)
X_test_t = torch.tensor(X_test, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)
y_val_t = torch.tensor(y_val, dtype=torch.long)
y_test_t = torch.tensor(y_test, dtype=torch.long)

train_ds = TensorDataset(X_train_t, y_train_t)
val_ds = TensorDataset(X_val_t, y_val_t)
test_ds = TensorDataset(X_test_t, y_test_t)

# BatchNorm1d cannot normalise a single-sample batch in training mode, so drop
# the trailing batch in the one case where it would hold exactly one row.
# Evaluation runs in eval() mode and keeps every sample.
drop_single_tail = len(train_ds) > 1 and len(train_ds) % BATCH_SIZE == 1
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=drop_single_tail)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

# Dimensions of the PyTorch MLP defined below
input_dim = X_train.shape[1]
hidden_layers = HIDDEN_UNITS
output_dim = num_classes


class MLP(nn.Module):
    """Fully connected classifier for tabular feature vectors.

    Every entry ``h`` of ``hidden_layers`` adds the stack
    ``Linear -> BatchNorm1d -> ReLU -> Dropout``; a final ``Linear`` layer
    maps to ``output_dim`` raw scores (no softmax is applied here).
    """

    def __init__(self, input_dim, hidden_layers, output_dim, dropout=0.2):
        super().__init__()
        widths = [input_dim, *hidden_layers]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(inplace=True),
                nn.Dropout(dropout),
            ]
        # output head on top of the last hidden width (or the input itself)
        modules.append(nn.Linear(widths[-1], output_dim))
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        return self.net(x)

model = MLP(input_dim, hidden_layers, output_dim).to(device)

# Loss and optimizer.
# Class weights are inversely proportional to the class frequency in the
# train+validation labels, rescaled so that they sum to the number of classes.
class_counts = np.bincount(y_trainval)
inv_freq = 1.0 / (class_counts + 1e-12)  # epsilon avoids a division by zero
weights = inv_freq / inv_freq.sum() * len(class_counts)
weights_tensor = torch.tensor(weights, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=weights_tensor)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# Prepare the output folder and TensorBoard.
# datetime.utcnow() is deprecated; an aware UTC timestamp formats identically.
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
output_dir = f"{OUTPUT_BASE}_{timestamp}"
safe_mkdir(output_dir)
writer = SummaryWriter(log_dir=os.path.join(output_dir, "tb_logs"))

# State for the training loop (early stopping + TensorBoard logging)
best_val_loss = float("inf")
best_epoch = -1
epochs_no_improve = 0
history = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}

def evaluate(loader):
    """Score the module-level ``model`` on every batch of *loader*.

    The model is switched to eval mode and no gradients are tracked.

    Returns:
        ``(loss, accuracy)``, both averaged over the number of samples seen.
        The loss is computed with the module-level ``criterion``.
    """
    model.eval()
    n_seen = 0
    n_hit = 0
    summed_loss = 0.0
    with torch.no_grad():
        for features, targets in loader:
            features = features.to(device)
            targets = targets.to(device)
            logits = model(features)
            batch_len = features.size(0)
            # weight the per-batch loss by the batch size before averaging
            summed_loss += criterion(logits, targets).item() * batch_len
            n_hit += (logits.argmax(dim=1) == targets).sum().item()
            n_seen += batch_len
    return summed_loss / n_seen, n_hit / n_seen

# Snapshot of the weights at the best validation loss (None until one exists)
best_state = None

for epoch in range(1, EPOCHS + 1):
    # ---- one pass over the training data ----
    model.train()
    running_loss = 0.0
    total = 0
    correct = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        # weight the per-batch loss by the batch size before averaging
        running_loss += loss.item() * xb.size(0)
        preds = out.argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += xb.size(0)
    train_loss = running_loss / total
    train_acc = correct / total
    val_loss, val_acc = evaluate(val_loader)

    history["train_loss"].append(train_loss)
    history["val_loss"].append(val_loss)
    history["train_acc"].append(train_acc)
    history["val_acc"].append(val_acc)

    # TensorBoard logs
    writer.add_scalar("loss/train", train_loss, epoch)
    writer.add_scalar("loss/val", val_loss, epoch)
    writer.add_scalar("acc/train", train_acc, epoch)
    writer.add_scalar("acc/val", val_acc, epoch)

    # log histograms of weights (every 5 epochs)
    if epoch % 5 == 0:
        for name, param in model.named_parameters():
            writer.add_histogram(name, param.detach().cpu().numpy(), epoch)

    logger.info(f"Epoch {epoch}/{EPOCHS} | train_loss={train_loss:.6f} val_loss={val_loss:.6f} train_acc={train_acc:.4f} val_acc={val_acc:.4f}")

    # early stopping on the validation loss (improvements below 1e-6 are ignored)
    if val_loss < best_val_loss - 1e-6:
        best_val_loss = val_loss
        best_epoch = epoch
        epochs_no_improve = 0
        # state_dict() hands back references to the live parameter tensors, so
        # they keep changing as training continues. Clone every tensor to
        # freeze the weights of this epoch.
        best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            logger.info(f"Early stopping at epoch {epoch} (no improvement for {PATIENCE} epochs)")
            break

# Restore the best weights before the final evaluation and saving
if best_state is not None:
    model.load_state_dict(best_state)

# Final score on the held-out test split
test_loss, test_acc = evaluate(test_loader)
logger.info(f"Test loss={test_loss:.6f} test_acc={test_acc:.4f}")

# Artifacts written below: bare weights, a full checkpoint, the preprocessor
model_path = os.path.join(output_dir, "model.pth")
checkpoint_path = os.path.join(output_dir, "checkpoint.pt")
preproc_path = os.path.join(output_dir, "preprocessor.joblib")

# 1) state_dict only
torch.save(model.state_dict(), model_path)

# 2) full checkpoint: weights, optimizer state and the architecture settings
architecture = {
    'input_dim': input_dim,
    'hidden_layers': hidden_layers,
    'output_dim': output_dim,
    'num_classes': num_classes,
    'preprocessor_file': 'preprocessor.joblib'
}
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'config': architecture
}
torch.save(checkpoint, checkpoint_path)

# 3) fitted sklearn preprocessor
joblib.dump(preprocessor, preproc_path)

# config.json: resolve the author name first (best effort)
if AUTHOR is None:
    try:
        AUTHOR = getpass.getuser()
    except Exception:
        AUTHOR = "unknown_author"

hyperparameters = {
    "batch_size": BATCH_SIZE,
    "epochs_requested": EPOCHS,
    "learning_rate": LR,
    "patience": PATIENCE
}
data_split = {
    "train_size": int(len(X_train)),
    "val_size": int(len(X_val)),
    "test_size": int(len(X_test))
}
final_metrics = {
    "best_epoch": int(best_epoch),
    "test_loss": float(test_loss),
    "test_accuracy": float(test_acc)
}

config = {
    "author": str(AUTHOR),
    "created_at_utc": now_iso(),
    "model_type": "PyTorch_MLP_tabular",
    "model_file": os.path.basename(model_path),
    "checkpoint_file": os.path.basename(checkpoint_path),
    "preprocessor_file": os.path.basename(preproc_path),
    "input_dim": input_dim,
    "hidden_layers": HIDDEN_UNITS,
    "output_dim": output_dim,
    "num_classes": num_classes,
    "hyperparameters": hyperparameters,
    "data_split": data_split,
    "target_mapping": label_to_idx,
    "final_metrics": final_metrics,
    "notes": "Preprocessor is a ColumnTransformer (num imputer+scaler, cat imputer+onehot). High-cardinality cat columns were dropped automatically."
}

# config.json, configdata.json and the training history share one write loop;
# only the first two are written with ensure_ascii=False.
json_outputs = (
    ("config.json", config, {"indent": 2, "ensure_ascii": False}),
    ("configdata.json", configdata, {"indent": 2, "ensure_ascii": False}),
    ("train_history.json", history, {"indent": 2}),
)
for file_name, payload, dump_kwargs in json_outputs:
    with open(os.path.join(output_dir, file_name), "w", encoding="utf-8") as f:
        json.dump(payload, f, **dump_kwargs)

# Close the TensorBoard writer first so the event files under tb_logs/ are
# flushed to disk before the folder is archived.
writer.close()

# Zip the folder
zipname = f"{output_dir}.zip"
try:
    shutil.make_archive(output_dir, 'zip', output_dir)
except Exception as e:
    # archiving is a convenience; the uncompressed outputs remain usable
    logger.warning(f"Failed to create zip archive: {e}")
else:
    logger.info(f"All outputs written to: {output_dir}")
    logger.info(f"Zipped artifact produced: {zipname}")

print("\n--- FIN ---")
print(f"Outputs in: {output_dir}")
print("Files: model.pth, checkpoint.pt, preprocessor.joblib, config.json, configdata.json, train_history.json, tb_logs/")
# f-string so the real folder is shown instead of the literal "{output_dir}"
print(f"To inspect TensorBoard logs (in Colab), run:\n  %load_ext tensorboard\n  %tensorboard --logdir {output_dir}/tb_logs")


  timestamp = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")



--- FIN ---
Outputs in: /content/output_model_20251004T134145Z
Files: model.pth, checkpoint.pt, preprocessor.joblib, config.json, configdata.json, train_history.json, tb_logs/
To inspect TensorBoard logs (in Colab), run:
  %load_ext tensorboard
  %tensorboard --logdir {output_dir}/tb_logs


  return datetime.utcnow().replace(microsecond=0).isoformat() + "Z"


In [21]:
import torch
import torch.nn as nn
import pandas as pd
import joblib
from pathlib import Path
import numpy as np
import json
import io

# Config
model_dir = Path("output_model_20251004T134145Z")
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load configdata.json to get the column information recorded at training time.
# The file is written as UTF-8 with ensure_ascii=False, so read it as UTF-8.
try:
    with open(model_dir / "configdata.json", "r", encoding="utf-8") as f:
        configdata = json.load(f)
except FileNotFoundError:
    print(f"Error: configdata.json not found in {model_dir}. Cannot proceed without column information.")
    # exit() is only injected by the site module and reports success (0);
    # raise SystemExit directly with a failing status instead.
    raise SystemExit(1) from None

expected_columns = configdata.get("expected_columns", [])
# Explicitly get the lists of numeric and categorical columns from configdata
numeric_cols_train = configdata.get("numeric_columns", [])
cat_cols_train = configdata.get("categorical_columns", [])
label_mapping = configdata.get("label_mapping", {})
# invert label -> index into index -> label for decoding predictions
idx_to_label = {v: k for k, v in label_mapping.items()}

# 🔹 Test data: inline CSV sample used as prediction input
data = """rowid,kepid,kepoi_name,kepler_name,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_teq_err1,koi_teq_err2,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_tce_delivname,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
1,10797460,K00752.01,Kepler-227 b,CANDIDATE,1.0000,0,0,0,0,9.488035570,2.7750000e-05,-2.7750000e-05,170.5387500,2.160000e-03,-2.160000e-03,0.1460,0.3180,-0.1460,2.95750,0.08190,-0.08190,6.1580e+02,1.950e+01,-1.950e+01,2.26,2.600e-01,-1.500e-01,793.0,,,93.59,29.45,-16.65,35.80,1,q1_q17_dr25_tce,5455.00,81.00,-81.00,4.467,0.064,-0.096,0.9270,0.1050,-0.0610,291.934230,48.141651,15.347
2,10797460,K00752.02,Kepler-227 c,CANDIDATE,0.9690,0,0,0,0,54.418382700,2.4790000e-04,-2.4790000e-04,162.5138400,3.520000e-03,-3.520000e-03,0.5860,0.0590,-0.4430,4.50700,0.11600,-0.11600,8.7480e+02,3.550e+01,-3.550e+01,2.83,3.200e-01,-1.900e-01,443.0,,,9.11,2.87,-1.62,25.80,2,q1_q17_dr25_tce,5455.00,81.00,-81.00,4.467,0.064,-0.096,0.9270,0.1050,-0.0610,291.934230,48.141651,15.347
3,10811496,K00753.01,FALSE POSITIVE,0.0000,0,1,0,0,19.899139950,1.4940000e-05,-1.4940000e-05,175.8502520,5.810000e-04,-5.810000e-04,0.9690,5.1260,-0.0770,1.78220,0.03410,-0.03410,1.0829e+04,1.710e+02,-1.710e+02,14.60,3.920e+00,-1.310e+00,638.0,,,39.30,31.04,-10.49,76.30,1,q1_q17_dr25_tce,5853.00,158.00,-176.00,4.544,0.044,-0.176,0.8680,0.2330,-0.0780,297.004820,48.134129,15.436
4,10848459,K00754.01,FALSE POSITIVE,0.0000,0,1,0,0,1.736952453,2.6300000e-07,-2.6300000e-07,170.3075650,1.150000e-04,-1.150000e-04,0.0000,0.0000,0.0000,2.63120,0.00530,-0.00530,1.3900e+02,1.620e+01,-1.620e+01,1.49,8.400e-01,-3.000e-01,1395.0,,,891.96,668.95,-230.23,13.90,1,q1_q17_dr25_tce,5805.00,71.00,-71.00,4.564,0.053,-0.168,0.7910,0.2010,-0.0670,285.534610,48.285210,15.597
,10147276,K07987.01,FALSE POSITIVE,0.0210,0,0,1,0,0.681401611,2.4340000e-06,-2.4340000e-06,132.1817500,2.850000e-03,-2.850000e-03,0.1470,0.3090,-0.1470,0.86500,0.16200,-0.16200,1.0360e+02,1.470e+01,-1.470e+01,1.07,3.600e-01,-1.100e-01,2218.0,,,5713.41,5675.74,-1836.94,12.30,1,q1_q17_dr25_tce,6173.00,193.00,-236.00,4.447,0.056,-0.224,1.0410,0.3410,-0.1140,294.164890,47.176281,15.385
6,10156110,K07989.01,FALSE POSITIVE,0.0000,0,0,1,1,4.856034820,6.3560000e-05,-6.3560000e-05,135.9933000,1.080000e-02,-1.080000e-02,0.1340,0.3230,-0.1340,3.07800,0.28300,-0.28300,7.6700e+01,1.080e+01,-1.080e+01,1.05,3.600e-01,-1.200e-01,1266.0,,,607.42,600.39,-194.33,8.20,1,q1_q17_dr25_tce,6469.00,158.00,-225.00,4.385,0.054,-0.216,1.1930,0.4100,-0.1370,297.009770,47.121021,14.826

"""

# Read the string data into a DataFrame
df = pd.read_csv(io.StringIO(data))

# --- IMPORTANT: Remove the target column 'koi_disposition' before prediction ---
if 'koi_disposition' in df.columns:
    df_with_target = df.copy()  # keep a copy with the target for comparison
    df = df.drop(columns=['koi_disposition'])

# Remove the identifier columns that the training script dropped
id_like_cols = ["rowid", "kepid", "kepoi_name", "kepler_name"]
df = df.drop(columns=[c for c in id_like_cols if c in df.columns])

# Training columns that are actually present in this input (used for display)
filtered_expected_columns = [c for c in expected_columns if c in df.columns]

# Report training columns absent from the input, in a stable order
missing_cols = sorted(set(expected_columns) - set(df.columns))
for c in missing_cols:
    print(f"Warning: Added missing column '{c}' to test data.")

# reindex drops unknown columns, restores the training column order and
# creates the missing columns filled with NaN. It returns a new frame, so the
# assignments below do not write into a slice of another DataFrame.
df = df.reindex(columns=expected_columns)

# A freshly created all-NaN column is float; give missing categorical columns
# object dtype so the categorical pipeline can impute its string placeholder.
for c in missing_cols:
    if c in cat_cols_train:
        df[c] = df[c].astype(object)

# Force coercion for columns identified as numeric during training
for c in numeric_cols_train:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Quantiles used for clipping during training
CLIP_LOWER_Q = 0.001
CLIP_UPPER_Q = 0.999

# Clip only columns that were numeric during training AND are numeric here
numeric_cols_for_clipping = [c for c in numeric_cols_train if c in df.columns and pd.api.types.is_numeric_dtype(df[c])]

# Prefer the bounds recorded at training time (configdata["clip_bounds"]):
# bounds recomputed from the inference batch make a row's features depend on
# which other rows happen to be predicted with it.
saved_bounds = configdata.get("clip_bounds") or {}
saved_lower = saved_bounds.get("lower") or {}
saved_upper = saved_bounds.get("upper") or {}
bounded_cols = [c for c in numeric_cols_for_clipping if c in saved_lower and c in saved_upper]

if bounded_cols:
    lower = pd.Series({c: float(saved_lower[c]) for c in bounded_cols})
    upper = pd.Series({c: float(saved_upper[c]) for c in bounded_cols})
    df[bounded_cols] = df[bounded_cols].clip(lower=lower, upper=upper, axis=1)
    print("Applied training-time clipping bounds to numeric columns in test data.")
elif len(numeric_cols_for_clipping) > 0:
    # Fallback for model folders without saved bounds: quantiles of this batch
    lower = df[numeric_cols_for_clipping].quantile(CLIP_LOWER_Q)
    upper = df[numeric_cols_for_clipping].quantile(CLIP_UPPER_Q)
    df[numeric_cols_for_clipping] = df[numeric_cols_for_clipping].clip(lower=lower, upper=upper, axis=1)
    print("Applied quantile clipping to numeric columns in test data.")
else:
    print("No numeric columns found for clipping in test data.")

# 🔹 Load the fitted preprocessor (a ColumnTransformer, not just a scaler).
# NOTE: joblib files are pickle-based; only load artifacts from a trusted run.
preprocessor = joblib.load(model_dir / "preprocessor.joblib")
scaler = preprocessor  # original name kept as an alias

# Apply the preprocessor to the DataFrame
X_test = preprocessor.transform(df)


# MLP model definition (exactly as for training)
# Get input_dim, hidden_layers, and num_classes from the saved config.
# config.json is written as UTF-8, so read it with the same encoding.
try:
    with open(model_dir / "config.json", "r", encoding="utf-8") as f:
        config = json.load(f)
except FileNotFoundError:
    print(f"Error: config.json not found in {model_dir}. Cannot proceed.")
    # exit() is only injected by the site module and reports success (0);
    # raise SystemExit directly with a failing status instead.
    raise SystemExit(1) from None

input_dim = config["input_dim"]
hidden_layers = config["hidden_layers"]
num_classes = config["num_classes"]
# label_mapping and idx_to_label already loaded from configdata.json


class MLP(nn.Module):
    """Fully connected classifier for tabular feature vectors.

    Every entry ``h`` of ``hidden_layers`` adds the stack
    ``Linear -> BatchNorm1d -> ReLU -> Dropout``; a final ``Linear`` layer
    maps to ``output_dim`` raw scores (no softmax is applied here).
    The layer order fixes the ``net.<index>`` parameter names that the saved
    state_dict is loaded against.
    """

    def __init__(self, input_dim, hidden_layers, output_dim, dropout=0.2):
        super().__init__()
        widths = [input_dim, *hidden_layers]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(inplace=True),
                nn.Dropout(dropout),
            ]
        # output head on top of the last hidden width (or the input itself)
        modules.append(nn.Linear(widths[-1], output_dim))
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        return self.net(x)

# 🔹 Load the model: rebuild the architecture, then restore the saved weights
model = MLP(input_dim, hidden_layers, num_classes).to(device)
state = torch.load(model_dir / "model.pth", map_location=device)
model.load_state_dict(state)
model.eval()

# 🔹 Prediction (no gradients needed)
with torch.no_grad():
    X_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    outputs = model(X_tensor)
    # softmax turns the raw scores into per-class probabilities
    probabilities = torch.softmax(outputs, dim=1).cpu().numpy()
    predictions_idx = torch.argmax(outputs, dim=1).cpu().numpy()
    # decode class indices back to their label strings
    predictions_label = [idx_to_label[i] for i in predictions_idx.tolist()]


print("Probabilities:", probabilities)
print("Predictions (index):", predictions_idx)
print("Predictions (label):", predictions_label)

# Show the input rows next to their predictions.
# Use the frame that still holds the target when it exists, otherwise the
# model input frame. Work on a copy so neither source frame gains a column.
comparison_source = df_with_target if 'df_with_target' in globals() else df
df_to_display = comparison_source.copy()
df_to_display['predicted_disposition'] = predictions_label
print("\nOriginal Data with Predictions:")
# Lead with the true label, the prediction and koi_pdisposition, then the
# remaining feature columns. Columns the frame lacks are skipped, because
# selecting an absent label would raise KeyError.
leading_cols = ['koi_disposition', 'predicted_disposition', 'koi_pdisposition']
candidate_cols = leading_cols + [c for c in filtered_expected_columns if c not in leading_cols]
display_cols = [c for c in candidate_cols if c in df_to_display.columns]
try:
    display(df_to_display[display_cols])
except NameError:
    # display() only exists inside IPython/Jupyter; fall back to plain text
    print(df_to_display[display_cols].to_string())

Applied quantile clipping to numeric columns in test data.
Probabilities: [[7.7814929e-02 9.2212814e-01 5.6918034e-05]
 [6.0487472e-02 9.3947583e-01 3.6704889e-05]
 [0.0000000e+00 0.0000000e+00 1.0000000e+00]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00]]
Predictions (index): [1 1 2 0 0 0]
Predictions (label): ['CONFIRMED', 'CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE', 'CANDIDATE', 'CANDIDATE']

Original Data with Predictions:




Unnamed: 0,koi_disposition,predicted_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,CONFIRMED,CANDIDATE,1.0,0,0,0.0,0.0,9.488036,2.775e-05,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,CONFIRMED,CANDIDATE,0.969,0,0,0.0,0.0,54.418383,0.0002479,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0.0,0.0,19.89914,1.494e-05,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,CANDIDATE,FALSE POSITIVE,0.0,0,1,0.0,0.0,1.736952,2.63e-07,...,-71.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,0.0210,CANDIDATE,0,0.0,1,0,0.681402,2e-06,-2e-06,132.1817,...,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385,,
5,0.0000,CANDIDATE,0,0.0,1,1,4.856035,6.4e-05,-6.4e-05,135.9933,...,0.054,-0.216,1.193,0.41,-0.137,297.00977,47.121021,14.826,,
