In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 01 - Exploration des Données\n",
    "\n",
    "Analyse exploratoire du dataset employés Ubisoft."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Standard library\n",
    "from pathlib import Path\n",
    "\n",
    "# Third-party: data handling and plotting\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import plotly.express as px\n",
    "import seaborn as sns\n",
    "\n",
    "# Plot appearance: apply the ggplot style first, then the seaborn 'husl' palette\n",
    "plt.style.use('ggplot')\n",
    "sns.set_palette('husl')\n",
    "\n",
    "# Notebook configuration: seed NumPy's global RNG and display every DataFrame column\n",
    "np.random.seed(42)\n",
    "pd.set_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Load the sample dataset (the path is relative to the current working directory)\n",
    "data_path = Path('../data/sample_data.csv')\n",
    "df = pd.read_csv(data_path)\n",
    "\n",
    "# Size report: (rows, columns) and the deep memory footprint converted to KB\n",
    "n_bytes = df.memory_usage(deep=True).sum()\n",
    "print(f\"Dataset shape: {df.shape}\")\n",
    "print(f\"Memory usage: {n_bytes / 1024:.1f} KB\")\n",
    "\n",
    "# Preview the first rows as the cell output\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# General information about the dataset\n",
    "print(\"=== INFORMATIONS DATASET ===\")\n",
    "# DataFrame.info() writes its report to stdout and returns None, so it is called\n",
    "# directly: wrapping it in print() would add a stray \"None\" line after the report.\n",
    "df.info()\n",
    "print(\"\\n=== STATISTIQUES DESCRIPTIVES ===\")\n",
    "# Summary statistics are shown as the cell output\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Distribution of the numeric variables on a 2x2 grid\n",
    "fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
    "fig.suptitle('Distribution des Variables Numériques', fontsize=16, fontweight='bold')\n",
    "\n",
    "# Top row: histograms of the two scores.\n",
    "# Each spec is (axes, column, bins, colour, title, x label).\n",
    "hist_specs = [\n",
    "    (axes[0, 0], 'creative_score', 20, 'skyblue', 'Distribution Creative Score', 'Score Créativité'),\n",
    "    (axes[0, 1], 'burnout_scale', 10, 'salmon', 'Distribution Burnout Scale', 'Échelle Burnout'),\n",
    "]\n",
    "for ax, column, bins, color, title, xlabel in hist_specs:\n",
    "    ax.hist(df[column], bins=bins, alpha=0.7, color=color, edgecolor='black')\n",
    "    ax.set_title(title)\n",
    "    ax.set_xlabel(xlabel)\n",
    "    ax.set_ylabel('Fréquence')\n",
    "\n",
    "# Bottom row: bar charts of the two risk flags.\n",
    "# value_counts() orders its result by frequency, so the counts are re-sorted by class\n",
    "# label: the first colour then always belongs to the lowest label, whatever the class\n",
    "# balance is.\n",
    "adhd_counts = df['adhd_risk'].value_counts().sort_index()\n",
    "autism_counts = df['autism_risk'].value_counts().sort_index()\n",
    "\n",
    "# Each spec is (axes, counts, colours, title, x label).\n",
    "bar_specs = [\n",
    "    (axes[1, 0], adhd_counts, ['lightgreen', 'coral'], 'Répartition Risque TDAH', 'Risque TDAH (0=Non, 1=Oui)'),\n",
    "    (axes[1, 1], autism_counts, ['lightblue', 'orange'], 'Répartition Risque Autisme', 'Risque Autisme (0=Non, 1=Oui)'),\n",
    "]\n",
    "for ax, counts, colors, title, xlabel in bar_specs:\n",
    "    # String labels give a categorical x axis with exactly one tick per class\n",
    "    ax.bar(counts.index.astype(str), counts.values, alpha=0.7, color=colors)\n",
    "    ax.set_title(title)\n",
    "    ax.set_xlabel(xlabel)\n",
    "    ax.set_ylabel(\"Nombre d'employés\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Correlation analysis over the numeric columns only\n",
    "numeric_cols = df.select_dtypes(include=[np.number]).columns\n",
    "correlation_matrix = df[numeric_cols].corr()\n",
    "\n",
    "# Heatmap: the upper triangle and the diagonal are masked, leaving the lower triangle\n",
    "plt.figure(figsize=(10, 8))\n",
    "mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))\n",
    "sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='RdYlBu_r', center=0,\n",
    "            square=True, linewidths=0.5, cbar_kws={\"shrink\": .8})\n",
    "plt.title('Matrice de Corrélation', fontsize=16, fontweight='bold')\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "print(\"Top 5 corrélations positives:\")\n",
    "# Keep only the cells strictly above the diagonal so each pair of variables is listed\n",
    "# once. Unstacking the full symmetric matrix reports every pair twice, as (a, b) and\n",
    "# (b, a), and filtering self-correlations with `< 1.0` relies on them being exactly 1.0.\n",
    "pair_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)\n",
    "# dropna() removes the masked cells (and any undefined correlation) explicitly\n",
    "corr_pairs = correlation_matrix.where(pair_mask).stack().dropna().sort_values(ascending=False)\n",
    "print(corr_pairs.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Per-department comparison, skipped when the dataset has no 'department' column\n",
    "if 'department' in df.columns:\n",
    "    fig, axes = plt.subplots(1, 2, figsize=(15, 6))\n",
    "\n",
    "    # One boxplot panel per metric: (column, panel title, y-axis label)\n",
    "    panels = [\n",
    "        ('creative_score', 'Creative Score par Département', 'Score Créativité'),\n",
    "        ('burnout_scale', 'Burnout Scale par Département', 'Échelle Burnout'),\n",
    "    ]\n",
    "    for ax, (metric, title, ylabel) in zip(axes, panels):\n",
    "        df.boxplot(column=metric, by='department', ax=ax)\n",
    "        ax.set_title(title)\n",
    "        ax.set_xlabel('Département')\n",
    "        ax.set_ylabel(ylabel)\n",
    "\n",
    "    # Remove the automatic figure title\n",
    "    plt.suptitle('')\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Outlier detection\n",
    "def detect_outliers_iqr(data, column, factor=1.5):\n",
    "    \"\"\"Return the rows of `data` whose `column` value lies outside the IQR fences.\n",
    "\n",
    "    The fences are Q1 - factor * IQR and Q3 + factor * IQR, where Q1 and Q3 are the\n",
    "    0.25 and 0.75 quantiles of the column and IQR = Q3 - Q1.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : pandas.DataFrame\n",
    "        Frame to filter.\n",
    "    column : str\n",
    "        Name of the column the fences are computed on.\n",
    "    factor : float, default 1.5\n",
    "        Multiplier applied to the IQR to place the fences.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        The rows strictly below the lower fence or strictly above the upper fence.\n",
    "    \"\"\"\n",
    "    values = data[column]\n",
    "    q1 = values.quantile(0.25)\n",
    "    q3 = values.quantile(0.75)\n",
    "    iqr = q3 - q1\n",
    "    lower_bound = q1 - factor * iqr\n",
    "    upper_bound = q3 + factor * iqr\n",
    "    return data[(values < lower_bound) | (values > upper_bound)]\n",
    "\n",
    "print(\"=== DÉTECTION DES OUTLIERS ===\")\n",
    "for col in ['creative_score', 'burnout_scale']:\n",
    "    outliers = detect_outliers_iqr(df, col)\n",
    "    # Guard the percentage so an empty dataset reports 0.0% instead of dividing by zero\n",
    "    outlier_pct = len(outliers) / len(df) * 100 if len(df) else 0.0\n",
    "    print(f\"{col}: {len(outliers)} outliers détectés ({outlier_pct:.1f}%)\")\n",
    "    if len(outliers) > 0:\n",
    "        print(f\"  Valeurs: {outliers[col].tolist()}\")\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Multivariate view with Plotly: creativity vs burnout, coloured by ADHD risk,\n",
    "# with the marker size following the creativity score\n",
    "axis_labels = {\n",
    "    'adhd_risk': 'Risque TDAH',\n",
    "    'creative_score': 'Score Créativité',\n",
    "    'burnout_scale': 'Échelle Burnout',\n",
    "}\n",
    "fig = px.scatter(\n",
    "    df,\n",
    "    x='creative_score',\n",
    "    y='burnout_scale',\n",
    "    color='adhd_risk',\n",
    "    size='creative_score',\n",
    "    title='Relation Creative Score vs Burnout (coloré par risque TDAH)',\n",
    "    labels=axis_labels,\n",
    ")\n",
    "fig.show()\n",
    "\n",
    "# Interactive 3D scatter, drawn only when the dataset has an 'autism_risk' column\n",
    "has_autism_column = 'autism_risk' in df.columns\n",
    "if has_autism_column:\n",
    "    fig_3d = px.scatter_3d(\n",
    "        df,\n",
    "        x='creative_score',\n",
    "        y='burnout_scale',\n",
    "        z='adhd_risk',\n",
    "        color='autism_risk',\n",
    "        title='Analyse 3D: Créativité, Burnout, TDAH (coloré par Autisme)',\n",
    "    )\n",
    "    fig_3d.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Summary of the key insights\n",
    "print(\"=== INSIGHTS CLÉS ===\")\n",
    "print(f\"1. Dataset: {len(df)} employés, {df.shape[1]} variables\")\n",
    "print(f\"2. Prévalence TDAH: {df['adhd_risk'].mean()*100:.1f}%\")\n",
    "if 'autism_risk' in df.columns:\n",
    "    print(f\"3. Prévalence Autisme: {df['autism_risk'].mean()*100:.1f}%\")\n",
    "print(f\"4. Score créativité moyen: {df['creative_score'].mean():.1f} ± {df['creative_score'].std():.1f}\")\n",
    "print(f\"5. Niveau burnout moyen: {df['burnout_scale'].mean():.1f}/10\")\n",
    "\n",
    "# Strongest positive correlation between two distinct variables.\n",
    "# Only the cells strictly above the diagonal are kept, so self-correlations are excluded\n",
    "# and each pair appears once. The pair labels are read with idxmax() on the stacked\n",
    "# Series: picking one element with .iloc[0] yields a bare float, which has no `.name`.\n",
    "pair_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)\n",
    "pair_corr = correlation_matrix.where(pair_mask).stack().dropna()\n",
    "if not pair_corr.empty:\n",
    "    strongest_pair = pair_corr.idxmax()\n",
    "    strongest_corr = pair_corr.max()\n",
    "    print(f\"6. Corrélation la plus forte: {strongest_pair[0]} ↔ {strongest_pair[1]} (r={strongest_corr:.3f})\")\n",
    "\n",
    "print(\"\\n=== RECOMMANDATIONS SUITE ===\")\n",
    "print(\"• Analyser les facteurs de burnout dans l'équipe créative\")\n",
    "print(\"• Investiguer la relation créativité-neurodiversité\")\n",
    "print(\"• Développer des indicateurs prédictifs de rétention\")\n",
    "print(\"• Créer des profils d'accommodation personnalisés\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
