In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 02 - Entraînement des Modèles\n",
    "\n",
    "Développement et optimisation des modèles ML pour la détection de neurodiversité."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score\n",
    "from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve\n",
    "from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
    "import joblib\n",
    "import mlflow\n",
    "import mlflow.sklearn\n",
    "from pathlib import Path\n",
    "\n",
    "# MLflow setup: runs started below are recorded under this experiment\n",
    "EXPERIMENT_NAME = \"ubisoft_people_analytics\"\n",
    "mlflow.set_experiment(EXPERIMENT_NAME)\n",
    "\n",
    "# Seed NumPy's legacy global random generator\n",
    "SEED = 42\n",
    "np.random.seed(SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Load the sample dataset and report its (rows, columns) shape\n",
    "DATA_PATH = '../data/sample_data.csv'\n",
    "df = pd.read_csv(DATA_PATH)\n",
    "print(f\"Dataset original: {df.shape}\")\n",
    "\n",
    "# Feature engineering\n",
    "def prepare_features(data):\n",
    "    \"\"\"Return a copy of ``data`` extended with encoded and derived features.\n",
    "\n",
    "    The input frame is not modified. If a ``department`` column is present it\n",
    "    is label-encoded into ``department_encoded`` and the original column is\n",
    "    dropped. Three columns are then appended: ``creativity_burnout_ratio``,\n",
    "    ``high_creativity`` and ``high_burnout``.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame containing ``creative_score`` and ``burnout_scale``.\n",
    "\n",
    "    Returns:\n",
    "        A new DataFrame with the encoded and derived columns added.\n",
    "    \"\"\"\n",
    "    engineered = data.copy()\n",
    "\n",
    "    # Optional categorical column: replace it with integer codes\n",
    "    if 'department' in engineered.columns:\n",
    "        encoder = LabelEncoder()\n",
    "        engineered['department_encoded'] = encoder.fit_transform(engineered['department'])\n",
    "        engineered = engineered.drop('department', axis=1)\n",
    "\n",
    "    creative = engineered['creative_score']\n",
    "    burnout = engineered['burnout_scale']\n",
    "\n",
    "    # The +1 keeps the denominator non-zero when burnout_scale is 0\n",
    "    engineered['creativity_burnout_ratio'] = creative / (burnout + 1)\n",
    "\n",
    "    # 0/1 flags for rows strictly above the 75th percentile of each score.\n",
    "    # NOTE(review): the percentiles come from whatever frame is passed in, so\n",
    "    # calling this before a train/test split lets test rows shape the flags.\n",
    "    creative_cutoff = creative.quantile(0.75)\n",
    "    burnout_cutoff = burnout.quantile(0.75)\n",
    "    engineered['high_creativity'] = (creative > creative_cutoff).astype(int)\n",
    "    engineered['high_burnout'] = (burnout > burnout_cutoff).astype(int)\n",
    "\n",
    "    return engineered\n",
    "\n",
    "# Build the engineered table once; the cells below read df_features\n",
    "df_features = prepare_features(df)\n",
    "print(f\"Après feature engineering: {df_features.shape}\")\n",
    "df_features.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Build the feature matrix and target vector for the ADHD and autism tasks\n",
    "def prepare_ml_data(data, target_col):\n",
    "    \"\"\"Split a feature table into predictors ``X`` and target ``y``.\n",
    "\n",
    "    The identifier column, both risk labels and ``target_col`` itself are kept\n",
    "    out of ``X``; every other column of ``data`` is used as a feature. The\n",
    "    selected feature names and the class balance of the target are printed.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame holding the feature columns and the target column.\n",
    "        target_col: Name of the column to predict.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(X, y)``: ``X`` is the DataFrame of feature columns and ``y``\n",
    "        is the ``target_col`` Series.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If ``target_col`` is not a column of ``data``.\n",
    "    \"\"\"\n",
    "    # Non-feature columns. target_col is listed explicitly so that a target\n",
    "    # other than the two known risk labels cannot leak into X.\n",
    "    exclude_cols = ['employee_id', 'adhd_risk', 'autism_risk', target_col]\n",
    "    feature_cols = [col for col in data.columns if col not in exclude_cols]\n",
    "\n",
    "    X = data[feature_cols]\n",
    "    y = data[target_col]\n",
    "\n",
    "    print(f\"Features utilisées: {feature_cols}\")\n",
    "    print(f\"Distribution de {target_col}:\")\n",
    "    print(y.value_counts(normalize=True))\n",
    "\n",
    "    return X, y\n",
    "\n",
    "# ADHD task: build predictors and target\n",
    "X_adhd, y_adhd = prepare_ml_data(df_features, 'adhd_risk')\n",
    "\n",
    "# Stratified hold-out split (20% test) with a fixed seed\n",
    "adhd_split = train_test_split(\n",
    "    X_adhd,\n",
    "    y_adhd,\n",
    "    test_size=0.2,\n",
    "    random_state=42,\n",
    "    stratify=y_adhd,\n",
    ")\n",
    "X_train_adhd, X_test_adhd, y_train_adhd, y_test_adhd = adhd_split\n",
    "\n",
    "print(f\"\\nTailles des datasets TDAH:\")\n",
    "print(f\"Train: {X_train_adhd.shape}, Test: {X_test_adhd.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Candidate classifiers, keyed by the name used to look up their grid\n",
    "models = dict(\n",
    "    RandomForest=RandomForestClassifier(random_state=42, class_weight='balanced'),\n",
    "    GradientBoosting=GradientBoostingClassifier(random_state=42),\n",
    "    LogisticRegression=LogisticRegression(random_state=42, class_weight='balanced'),\n",
    "    SVM=SVC(random_state=42, class_weight='balanced', probability=True),\n",
    ")\n",
    "\n",
    "# Hyperparameter search spaces, one per entry of `models`\n",
    "param_grids = dict(\n",
    "    RandomForest=dict(\n",
    "        n_estimators=[100, 200],\n",
    "        max_depth=[5, 10, None],\n",
    "        min_samples_split=[2, 5],\n",
    "        min_samples_leaf=[1, 2],\n",
    "    ),\n",
    "    GradientBoosting=dict(\n",
    "        n_estimators=[100, 200],\n",
    "        learning_rate=[0.05, 0.1],\n",
    "        max_depth=[3, 5],\n",
    "    ),\n",
    "    LogisticRegression=dict(\n",
    "        C=[0.1, 1, 10],\n",
    "        penalty=['l1', 'l2'],\n",
    "        solver=['liblinear'],\n",
    "    ),\n",
    "    SVM=dict(\n",
    "        C=[0.1, 1, 10],\n",
    "        kernel=['rbf', 'linear'],\n",
    "        gamma=['scale', 'auto'],\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Training and evaluation routine\n",
    "def train_and_evaluate_model(model, param_grid, X_train, X_test, y_train, y_test, model_name,\n",
    "                             target_name='ADHD'):\n",
    "    \"\"\"Tune one classifier with GridSearchCV, score it on the test set and log it to MLflow.\n",
    "\n",
    "    For 'LogisticRegression' and 'SVM' the inputs are standardized. The scaler\n",
    "    sits in a Pipeline in front of the classifier so that it is re-fitted on\n",
    "    the training part of every cross-validation fold; fitting it once on the\n",
    "    whole training set before the search would let validation-fold statistics\n",
    "    leak into the cross-validated F1.\n",
    "\n",
    "    Args:\n",
    "        model: Unfitted scikit-learn classifier exposing ``predict_proba``.\n",
    "        param_grid: Dict (or list of dicts) of hyperparameter candidates for ``model``.\n",
    "        X_train: Training features.\n",
    "        X_test: Test features.\n",
    "        y_train: Training target.\n",
    "        y_test: Test target; expected to be binary, since the metrics use\n",
    "            scikit-learn's default binary averaging and column 1 of predict_proba.\n",
    "        model_name: Name used in the MLflow run name and printed output; the\n",
    "            values 'LogisticRegression' and 'SVM' switch standardization on.\n",
    "        target_name: Suffix of the MLflow run name. Defaults to 'ADHD'.\n",
    "\n",
    "    Returns:\n",
    "        Dict with keys 'model' (fitted classifier), 'scaler' (fitted\n",
    "        StandardScaler or None), 'best_params', 'metrics', 'predictions' and\n",
    "        'probabilities' (positive-class probabilities on the test set).\n",
    "    \"\"\"\n",
    "    from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score\n",
    "    from sklearn.pipeline import Pipeline\n",
    "\n",
    "    needs_scaling = model_name in ['LogisticRegression', 'SVM']\n",
    "\n",
    "    with mlflow.start_run(run_name=f\"{model_name}_{target_name}\"):\n",
    "        if needs_scaling:\n",
    "            estimator = Pipeline([('scaler', StandardScaler()), ('clf', model)])\n",
    "            # Route every hyperparameter to the classifier step of the pipeline\n",
    "            grids = [param_grid] if isinstance(param_grid, dict) else list(param_grid)\n",
    "            search_space = [\n",
    "                {f'clf__{name}': values for name, values in grid.items()}\n",
    "                for grid in grids\n",
    "            ]\n",
    "        else:\n",
    "            estimator = model\n",
    "            search_space = param_grid\n",
    "\n",
    "        # Grid search with 5-fold cross-validation, optimizing F1\n",
    "        grid_search = GridSearchCV(\n",
    "            estimator, search_space, cv=5, scoring='f1', n_jobs=-1, verbose=1\n",
    "        )\n",
    "        grid_search.fit(X_train, y_train)\n",
    "\n",
    "        if needs_scaling:\n",
    "            # The refitted pipeline holds a scaler fitted on all of X_train and\n",
    "            # the classifier fitted on the scaled training data\n",
    "            fitted_pipeline = grid_search.best_estimator_\n",
    "            scaler = fitted_pipeline.named_steps['scaler']\n",
    "            best_model = fitted_pipeline.named_steps['clf']\n",
    "            # Strip the 'clf__' prefix so parameter names match param_grid\n",
    "            best_params = {\n",
    "                name.split('__', 1)[1]: value\n",
    "                for name, value in grid_search.best_params_.items()\n",
    "            }\n",
    "            X_test_scaled = scaler.transform(X_test)\n",
    "        else:\n",
    "            scaler = None\n",
    "            best_model = grid_search.best_estimator_\n",
    "            best_params = dict(grid_search.best_params_)\n",
    "            X_test_scaled = X_test\n",
    "\n",
    "        # Test-set predictions and positive-class probabilities\n",
    "        y_pred = best_model.predict(X_test_scaled)\n",
    "        y_prob = best_model.predict_proba(X_test_scaled)[:, 1]\n",
    "\n",
    "        accuracy = accuracy_score(y_test, y_pred)\n",
    "        precision = precision_score(y_test, y_pred)\n",
    "        recall = recall_score(y_test, y_pred)\n",
    "        f1 = f1_score(y_test, y_pred)\n",
    "        auc = roc_auc_score(y_test, y_prob)\n",
    "\n",
    "        # Cross-validated F1 of the selected candidate, read from the search\n",
    "        # itself instead of refitting the model five more times\n",
    "        best_idx = grid_search.best_index_\n",
    "        cv_f1_mean = float(grid_search.cv_results_['mean_test_score'][best_idx])\n",
    "        cv_f1_std = float(grid_search.cv_results_['std_test_score'][best_idx])\n",
    "\n",
    "        metrics = {\n",
    "            'accuracy': accuracy,\n",
    "            'precision': precision,\n",
    "            'recall': recall,\n",
    "            'f1_score': f1,\n",
    "            'auc_roc': auc,\n",
    "            'cv_f1_mean': cv_f1_mean,\n",
    "            'cv_f1_std': cv_f1_std\n",
    "        }\n",
    "\n",
    "        # MLflow logging: parameters, metrics, then the fitted artifacts\n",
    "        mlflow.log_params(best_params)\n",
    "        mlflow.log_metrics(metrics)\n",
    "        mlflow.sklearn.log_model(best_model, \"model\")\n",
    "        if scaler is not None:\n",
    "            mlflow.sklearn.log_model(scaler, \"scaler\")\n",
    "\n",
    "        results = {\n",
    "            'model': best_model,\n",
    "            'scaler': scaler,\n",
    "            'best_params': best_params,\n",
    "            'metrics': dict(metrics),\n",
    "            'predictions': y_pred,\n",
    "            'probabilities': y_prob\n",
    "        }\n",
    "\n",
    "        print(f\"\\n=== {model_name} RESULTS ===\")\n",
    "        print(f\"Best params: {best_params}\")\n",
    "        print(f\"F1-Score: {f1:.3f}\")\n",
    "        print(f\"AUC-ROC: {auc:.3f}\")\n",
    "        print(f\"CV F1: {cv_f1_mean:.3f} ± {cv_f1_std:.3f}\")\n",
    "\n",
    "        return results\n",
    "\n",
    "# Fit and evaluate every candidate that has a hyperparameter grid\n",
    "results = {}\n",
    "for model_name, model in models.items():\n",
    "    if model_name not in param_grids:\n",
    "        continue\n",
    "    results[model_name] = train_and_evaluate_model(\n",
    "        model,\n",
    "        param_grids[model_name],\n",
    "        X_train_adhd,\n",
    "        X_test_adhd,\n",
    "        y_train_adhd,\n",
    "        y_test_adhd,\n",
    "        model_name,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Side-by-side comparison of the test-set metrics (one column per model)\n",
    "metric_rows = [\n",
    "    ('F1-Score', 'f1_score'),\n",
    "    ('AUC-ROC', 'auc_roc'),\n",
    "    ('Precision', 'precision'),\n",
    "    ('Recall', 'recall'),\n",
    "    ('Accuracy', 'accuracy'),\n",
    "]\n",
    "comparison_df = pd.DataFrame(\n",
    "    {\n",
    "        model_name: [result['metrics'][key] for label, key in metric_rows]\n",
    "        for model_name, result in results.items()\n",
    "    },\n",
    "    index=[label for label, key in metric_rows],\n",
    ")\n",
    "\n",
    "print(\"=== COMPARAISON DES MODÈLES ===\")\n",
    "print(comparison_df.round(3))\n",
    "\n",
    "# Two panels: metric heatmap on the left, ranked F1 bars on the right\n",
    "fig, axes = plt.subplots(1, 2, figsize=(15, 6))\n",
    "heatmap_ax, f1_ax = axes\n",
    "\n",
    "sns.heatmap(\n",
    "    comparison_df,\n",
    "    annot=True,\n",
    "    cmap='RdYlGn',\n",
    "    ax=heatmap_ax,\n",
    "    cbar_kws={'shrink': 0.8},\n",
    ")\n",
    "heatmap_ax.set_title('Comparaison des Métriques par Modèle')\n",
    "\n",
    "f1_scores = comparison_df.loc['F1-Score'].sort_values(ascending=True)\n",
    "f1_ax.barh(f1_scores.index, f1_scores.values, alpha=0.7)\n",
    "f1_ax.set_title('F1-Score par Modèle')\n",
    "f1_ax.set_xlabel('F1-Score')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# The winner is the model with the highest test-set F1\n",
    "best_model_name = comparison_df.loc['F1-Score'].idxmax()\n",
    "print(f\"\\n🏆 Meilleur modèle: {best_model_name} (F1-Score: {comparison_df.loc['F1-Score', best_model_name]:.3f})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Detailed look at the best model\n",
    "best_result = results[best_model_name]\n",
    "best_model = best_result['model']\n",
    "class_labels = ['No ADHD Risk', 'ADHD Risk']\n",
    "\n",
    "# Confusion matrix on the test set\n",
    "cm = confusion_matrix(y_test_adhd, best_result['predictions'])\n",
    "plt.figure(figsize=(8, 6))\n",
    "sns.heatmap(\n",
    "    cm,\n",
    "    annot=True,\n",
    "    fmt='d',\n",
    "    cmap='Blues',\n",
    "    xticklabels=class_labels,\n",
    "    yticklabels=class_labels,\n",
    ")\n",
    "plt.title(f'Matrice de Confusion - {best_model_name}')\n",
    "plt.ylabel('Vraie Classe')\n",
    "plt.xlabel('Classe Prédite')\n",
    "plt.show()\n",
    "\n",
    "# ROC curve of the test-set probabilities, with the chance diagonal\n",
    "fpr, tpr, thresholds = roc_curve(y_test_adhd, best_result['probabilities'])\n",
    "roc_label = f'ROC Curve (AUC = {best_result[\"metrics\"][\"auc_roc\"]:.3f})'\n",
    "plt.figure(figsize=(8, 6))\n",
    "plt.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)\n",
    "plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')\n",
    "plt.xlim([0.0, 1.0])\n",
    "plt.ylim([0.0, 1.05])\n",
    "plt.xlabel('Taux de Faux Positifs')\n",
    "plt.ylabel('Taux de Vrais Positifs')\n",
    "plt.title(f'Courbe ROC - {best_model_name}')\n",
    "plt.legend(loc=\"lower right\")\n",
    "plt.grid(alpha=0.3)\n",
    "plt.show()\n",
    "\n",
    "# Feature importances, only for models that expose them\n",
    "importances = getattr(best_model, 'feature_importances_', None)\n",
    "if importances is not None:\n",
    "    importance_table = pd.DataFrame(\n",
    "        {'feature': X_adhd.columns, 'importance': importances}\n",
    "    )\n",
    "    # Ascending order puts the most important feature at the top of the barh plot\n",
    "    feature_importance = importance_table.sort_values('importance', ascending=True)\n",
    "\n",
    "    plt.figure(figsize=(10, 6))\n",
    "    plt.barh(feature_importance['feature'], feature_importance['importance'], alpha=0.7)\n",
    "    plt.title(f'Importance des Features - {best_model_name}')\n",
    "    plt.xlabel('Importance')\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    print(\"Top 5 features les plus importantes:\")\n",
    "    print(feature_importance.tail().to_string(index=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "import json\n",
    "\n",
    "# Persist the best model together with its scaler and metadata\n",
    "model_dir = Path('../models')\n",
    "model_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# NOTE(review): the file name stays 'random_forest.pkl' whatever the winning\n",
    "# model type is; the actual type is recorded under 'model_name' in the metadata.\n",
    "model_path = model_dir / 'random_forest.pkl'\n",
    "scaler_path = model_dir / 'scaler.pkl'\n",
    "\n",
    "joblib.dump(best_model, model_path)\n",
    "uses_scaler = best_result['scaler'] is not None\n",
    "if uses_scaler:\n",
    "    joblib.dump(best_result['scaler'], scaler_path)\n",
    "elif scaler_path.exists():\n",
    "    # A scaler left over from an earlier run does not belong to this model\n",
    "    scaler_path.unlink()\n",
    "\n",
    "# Metadata describing the saved model\n",
    "metadata = {\n",
    "    'model_name': best_model_name,\n",
    "    'model_type': 'ADHD_classifier',\n",
    "    'training_date': pd.Timestamp.now().isoformat(),\n",
    "    'feature_names': X_adhd.columns.tolist(),\n",
    "    'best_params': best_result['best_params'],\n",
    "    'metrics': best_result['metrics'],\n",
    "    'target_variable': 'adhd_risk',\n",
    "    # True when inputs must go through scaler.pkl before calling the model\n",
    "    'uses_scaler': uses_scaler\n",
    "}\n",
    "\n",
    "with open(model_dir / 'model_metadata.json', 'w') as f:\n",
    "    json.dump(metadata, f, indent=2)\n",
    "\n",
    "print(f\"✅ Modèle sauvegardé: {model_dir / 'random_forest.pkl'}\")\n",
    "print(f\"✅ Métadonnées sauvegardées: {model_dir / 'model_metadata.json'}\")\n",
    "print(f\"✅ Performances finales: F1={best_result['metrics']['f1_score']:.3f}, AUC={best_result['metrics']['auc_roc']:.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Round-trip check of the saved artifacts\n",
    "print(\"=== TEST DU MODÈLE SAUVEGARDÉ ===\")\n",
    "loaded_model = joblib.load(model_dir / 'random_forest.pkl')\n",
    "\n",
    "# Score the first five rows of the test set\n",
    "test_sample = X_test_adhd.iloc[:5]\n",
    "\n",
    "# A model trained on standardized inputs must receive standardized inputs at\n",
    "# inference time as well, so reload and apply the saved scaler when one was used\n",
    "if best_result['scaler'] is not None:\n",
    "    loaded_scaler = joblib.load(model_dir / 'scaler.pkl')\n",
    "    model_input = loaded_scaler.transform(test_sample)\n",
    "else:\n",
    "    model_input = test_sample\n",
    "\n",
    "predictions = loaded_model.predict(model_input)\n",
    "probabilities = loaded_model.predict_proba(model_input)\n",
    "\n",
    "# The reloaded artifacts must reproduce the in-memory test predictions\n",
    "assert np.array_equal(predictions, best_result['predictions'][:5]), (\n",
    "    \"Saved model does not reproduce the predictions of the trained model\"\n",
    ")\n",
    "\n",
    "print(\"Prédictions sur échantillon:\")\n",
    "for i, (pred, prob) in enumerate(zip(predictions, probabilities)):\n",
    "    print(f\"Employé {i+1}: Prédiction={pred}, Probabilité TDAH={prob[1]:.3f}\")\n",
    "\n",
    "print(\"\\n=== RÉSUMÉ FINAL ===\")\n",
    "print(f\"• Meilleur modèle: {best_model_name}\")\n",
    "print(f\"• F1-Score: {best_result['metrics']['f1_score']:.3f}\")\n",
    "print(f\"• Précision: {best_result['metrics']['precision']:.3f}\")\n",
    "print(f\"• Rappel: {best_result['metrics']['recall']:.3f}\")\n",
    "print(f\"• AUC-ROC: {best_result['metrics']['auc_roc']:.3f}\")\n",
    "print(f\"• Validation croisée: {best_result['metrics']['cv_f1_mean']:.3f} ± {best_result['metrics']['cv_f1_std']:.3f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
