Points clés de ce notebook :

> Compatible multi-ordinateur et Google Drive.

> Le dossier src/ est utilisé pour tous les imports.

Le code est bien séparé : les notebooks servent à l’expérimentation et à la présentation, tandis que le dossier src/ contient la logique réutilisable et propre. C'est donc dans le fichier src/models.py que j'ajuste les hyperparamètres.

> Chemins relatifs corrects pour data/raw/, results/, models/ et submissions/.

> Aucun warning de regex.

> Prêt à exécuter Phase 0 → 3.

In [1]:
import os

# Sanity check: show where the kernel is running, what sits next to the
# notebook, and what the parent directory contains.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))

print("Dossier courant :", current_dir)
print("Contenu du dossier courant :", os.listdir())
print("Contenu de la racine :", os.listdir(parent_dir))


Dossier courant : C:\Users\cdiac\Desktop\KaggleProjects\titanic_project\notebooks
Contenu du dossier courant : ['.ipynb_checkpoints', '02_modeling.ipynb']
Contenu de la racine : ['data', 'models', 'notebooks', 'results', 'src', 'submissions']


In [2]:
# =====================================
# ⚡ Phase 0 : Configuration projet portable
# =====================================
import sys
import os
import warnings
import logging
import pandas as pd
import numpy as np
import joblib
import plotly.express as px

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score

# Project root = parent of the current working directory.
# NOTE(review): this assumes the kernel is started from the notebooks/ folder.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Only register the root once, so re-running this cell in the same kernel does
# not pile up duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Imports from src/ (must come after the sys.path update above)
from src.preprocessing import clean_df, get_features, make_preprocessor
from src.models import get_models_and_params, weighted_metric

# NOTE(review): this silences every warning for the whole session, including
# ones that may signal real problems (e.g. convergence warnings).
warnings.filterwarnings("ignore")

# Logger (root logger, timestamped INFO-level messages)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger()

# Create the output folders if they do not exist yet
for _subdir in ("results", "models", "submissions"):
    os.makedirs(os.path.join(PROJECT_ROOT, _subdir), exist_ok=True)

# Join path components individually so the OS separator is used throughout
# (embedding "/" produced mixed "\" and "/" paths on Windows).
RESULTS_FILE = os.path.join(PROJECT_ROOT, "results", "results_live.csv")
DATA_RAW = os.path.join(PROJECT_ROOT, "data", "raw")

print("✅ Configuration projet effectuée")
print("PROJECT_ROOT =", PROJECT_ROOT)


✅ Configuration projet effectuée
PROJECT_ROOT = C:\Users\cdiac\Desktop\KaggleProjects\titanic_project


In [3]:
# Sanity checks: confirm the expected folders and raw files are reachable
train_csv = os.path.join(DATA_RAW, "train.csv")
test_csv = os.path.join(DATA_RAW, "test.csv")
src_dir = os.path.join(PROJECT_ROOT, "src")

print("Dossier courant :", os.getcwd())
print("src existe :", os.path.exists(src_dir))
print("train.csv existe :", os.path.exists(train_csv))
print("test.csv existe :", os.path.exists(test_csv))

# Load both raw files first, then run each through the shared cleaning step
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
df_train, df_test = clean_df(df_train), clean_df(df_test)

print("✅ Données chargées et nettoyées")
df_train.head()


Dossier courant : C:\Users\cdiac\Desktop\KaggleProjects\titanic_project\notebooks
src existe : True
train.csv existe : True
test.csv existe : True
✅ Données chargées et nettoyées


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,0,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,1,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,0,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,1,Mr


In [4]:
# Choose the first Run_ID for this session: continue after the largest numeric
# Run_ID already stored in the results CSV, otherwise fall back to 1.
RUN_ID_START = 1  # default value
has_history = os.path.exists(RESULTS_FILE) and os.path.getsize(RESULTS_FILE) > 0

if has_history:
    try:
        previous_runs = pd.read_csv(RESULTS_FILE)
        if "Run_ID" in previous_runs.columns:
            # Non-numeric IDs become NaN and are ignored by max()
            previous_ids = pd.to_numeric(previous_runs["Run_ID"], errors="coerce")
            last_run_id = previous_ids.max()
            if not np.isnan(last_run_id):
                RUN_ID_START = int(last_run_id) + 1
    except Exception as err:
        # Best effort: an unreadable history file must not block a new session
        logger.warning("Impossible de lire ou traiter %s : %s. RUN_ID_START=1", RESULTS_FILE, err)

logger.info("✅ RUN_ID_START défini à %s", RUN_ID_START)


2025-08-22 18:37:48,608 - INFO - ✅ RUN_ID_START défini à 13


In [5]:
# Grid-search every model returned by get_models_and_params() on each of the
# four feature configurations (IsAlone on/off x Cabin on/off). Each run is
# appended to RESULTS_FILE as soon as it finishes, and kept in memory
# (including the fitted pipeline) in results_list / df_results.
#
# NOTE(review): run_id starts from RUN_ID_START as it was when that cell last
# ran; re-running this cell alone will reuse Run_IDs already written to the CSV.
results_list = []
run_id = RUN_ID_START

X_full, y_full = df_train.drop("Survived", axis=1), df_train["Survived"]

for use_isalone in [True, False]:
    for use_cabin in [False, True]:
        features = get_features(df_train, use_isalone, use_cabin)
        X, y = df_train[features], y_full

        preprocessor = make_preprocessor(df_train, features)
        models = get_models_and_params()

        for model_name, (model, param_grid) in models.items():
            pipe = Pipeline([("preprocessor", preprocessor), ("clf", model)])
            grid = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
            grid.fit(X, y)

            # Only cv_acc is cross-validated. Train accuracy, F1, recall and AUC
            # below are computed on predictions for the same data the model was
            # fitted on, so they are training-set metrics.
            # NOTE(review): weighted_metric therefore mixes a CV score with
            # training scores — confirm this is intended.
            y_pred_train = grid.predict(X)
            train_acc = accuracy_score(y, y_pred_train)
            cv_acc = grid.best_score_
            f1 = f1_score(y, y_pred_train)
            recall = recall_score(y, y_pred_train)
            # AUC needs class probabilities; classifiers without predict_proba get NaN
            has_proba = hasattr(grid.best_estimator_["clf"], "predict_proba")
            auc = roc_auc_score(y, grid.predict_proba(X)[:, 1]) if has_proba else np.nan
            wm = weighted_metric(cv_acc, f1, recall, auc)
            gap = cv_acc - train_acc

            res = {
                "Run_ID": run_id,
                "Model": model_name,
                "Use_IsAlone": use_isalone,
                "Use_Cabin": use_cabin,
                "Best_Params": grid.best_params_,
                "Train Accuracy": train_acc,
                "CV Accuracy": cv_acc,
                "Gap (CV - Train)": gap,
                "F1 Score": f1,
                "Recall": recall,
                "AUC": auc,
                "Weighted Metric": wm,
                "best_pipe": grid.best_estimator_
            }
            results_list.append(res)

            # Append this run to the live CSV. The header is written when the
            # file is missing OR empty: an existing 0-byte file would otherwise
            # receive rows without a header, and the Run_ID column could not be
            # found when the file is read back. Mode "a" creates the file if
            # it does not exist, so no separate "w" branch is needed.
            df_res = pd.DataFrame([res])
            write_header = (
                not os.path.exists(RESULTS_FILE)
                or os.path.getsize(RESULTS_FILE) == 0
            )
            # The fitted pipeline stays in memory only; it is not written to the CSV
            df_res.drop(columns=["best_pipe"]).to_csv(
                RESULTS_FILE, mode="a", header=write_header, index=False
            )

            logger.info("💾 Résultat enregistré dans %s (Run_ID=%s)", RESULTS_FILE, run_id)
            run_id += 1

df_results = pd.DataFrame(results_list)


2025-08-22 18:37:56,155 - INFO - 💾 Résultat enregistré dans C:\Users\cdiac\Desktop\KaggleProjects\titanic_project\results/results_live.csv (Run_ID=13)
2025-08-22 18:38:09,277 - INFO - 💾 Résultat enregistré dans C:\Users\cdiac\Desktop\KaggleProjects\titanic_project\results/results_live.csv (Run_ID=14)
  File "C:\Users\cdiac\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\cdiac\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cdiac\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\cdiac\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20

In [None]:
# Rank runs: highest weighted metric first, ties broken by the smallest
# absolute gap between CV and train accuracy.
df_results["Abs Gap"] = df_results["Gap (CV - Train)"].abs()
df_sorted = df_results.sort_values(by=["Weighted Metric","Abs Gap"], ascending=[False,True])

# Plot-only copy. Run_ID is cast to str so the x axis is categorical: with a
# numeric Run_ID the bars are positioned by value and the ranking order above
# is lost. Best_Params (dicts) is stringified as a precaution — dict values
# may not render readably in the hover box.
df_plot = df_sorted.assign(
    Run_ID=df_sorted["Run_ID"].astype(str),
    Best_Params=df_sorted["Best_Params"].astype(str),
)

fig_rank = px.bar(
    df_plot,
    x="Run_ID", y="Weighted Metric",
    color="Model",
    text="Abs Gap",
    hover_data=["CV Accuracy","Train Accuracy","F1 Score","Recall","AUC","Best_Params"],
    # Force the x categories into ranking order (best run on the left),
    # regardless of how the per-model traces are grouped.
    category_orders={"Run_ID": df_plot["Run_ID"].tolist()},
    title="Classement des modèles (Weighted Metric vs Gap)"
)
fig_rank.update_traces(texttemplate="Gap: %{text:.3f}", textposition="outside")
fig_rank.show()

logger.info("🏆 Classement terminé. Consultez %s et le graphique avant de choisir un modèle.", RESULTS_FILE)


In [None]:
# Optional final step: refit the chosen run's pipeline on the training data,
# predict on the test set, then write the submission CSV and the pickled pipeline.
DO_PREDICTION = False  # set to True to generate a submission

if DO_PREDICTION:
    chosen_run_id = int(input("👉 Entrez le Run_ID du modèle choisi : "))

    # Only runs from the current session are available here: the fitted
    # pipelines live in df_results, not in the results CSV. Fail with an
    # explicit message instead of a bare IndexError from .iloc[0].
    matching_runs = df_results[df_results["Run_ID"] == chosen_run_id]
    if matching_runs.empty:
        available_ids = sorted(df_results["Run_ID"].tolist())
        raise ValueError(
            f"Run_ID {chosen_run_id} introuvable dans df_results "
            f"(Run_ID disponibles : {available_ids})"
        )
    chosen_row = matching_runs.iloc[0]
    best_pipe = chosen_row["best_pipe"]

    # Rebuild the exact feature list that was used for the chosen run
    features_test = get_features(df_train, chosen_row["Use_IsAlone"], chosen_row["Use_Cabin"])
    X_train_full, y_train_full = df_train[features_test], df_train["Survived"]
    X_test = df_test[features_test]

    # NOTE(review): with GridSearchCV's default refit, best_estimator_ should
    # already be fitted on this same data; this fit repeats it in place on the
    # object stored in df_results.
    best_pipe.fit(X_train_full, y_train_full)
    y_test_pred = best_pipe.predict(X_test)

    df_submission = pd.DataFrame({
        "PassengerId": df_test["PassengerId"],
        "Survived": y_test_pred
    })
    # Join path components individually so the OS separator is used throughout
    submission_path = os.path.join(PROJECT_ROOT, "submissions", f"submission_run{chosen_run_id}.csv")
    df_submission.to_csv(submission_path, index=False)
    logger.info("💾 Fichier submissions/submission_run%s.csv généré", chosen_run_id)

    model_path = os.path.join(PROJECT_ROOT, "models", f"best_pipe_run{chosen_run_id}.pkl")
    joblib.dump(best_pipe, model_path)
    logger.info("💾 Pipeline sauvegardé sous models/best_pipe_run%s.pkl", chosen_run_id)
else:
    logger.info("⏸️ DO_PREDICTION=False → aucune prédiction test lancée.")
