# 🔍 Validation Complète avec DeepChecks - Crime Dataset 2020

## 📋 Table des Matières
1. [Niveau 1 : Intégrité des Données](#niveau-1)
2. [Niveau 2 : Drift et Distribution](#niveau-2)
3. [Niveau 3 : Performance du Modèle](#niveau-3)
4. [Résumé et Recommandations](#resume)

---

## 📦 Installation des Dépendances

In [None]:
# Installation (si n√©cessaire)
# !pip install deepchecks pandas scikit-learn joblib mlflow dagshub

## 📚 Imports

In [None]:
import pandas as pd
import numpy as np
import joblib
import warnings
from datetime import datetime

# DeepChecks
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import (
    data_integrity,
    train_test_validation,
    model_evaluation
)

# DeepChecks - Checks individuels
from deepchecks.tabular.checks import (
    # Int√©grit√© des donn√©es
    MixedNulls,
    MixedDataTypes,
    StringMismatch,
    DataDuplicates,
    ConflictingLabels,
    OutlierSampleDetection,
    FeatureFeatureCorrelation,
    FeatureLabelCorrelation,
    
    # Drift et Distribution
    TrainTestFeatureDrift,
    TrainTestLabelDrift,
    WholeDatasetDrift,
    FeatureDrift,
    LabelDrift,
    
    # Performance du mod√®le
    PerformanceReport,
    ConfusionMatrixReport,
    RocReport,
    SimpleModelComparison,
    CalibrationScore,
    ModelInfo
)

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning category for the rest of the session so the
# notebook output stays readable.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries imported above — consider narrowing it.
warnings.filterwarnings('ignore')

print("‚úÖ Imports r√©ussis")

## 🗂️ Fonction de Mapping des Crimes

In [None]:
def map_crime_group_4(desc):
    """Map a raw crime description onto one of four coarse crime groups.

    The description is upper-cased and searched for keyword substrings.
    Groups are tried in a fixed order (violent, property/theft, vehicle) and
    the first group with a matching keyword wins, so a description such as
    "THEFT FROM MOTOR VEHICLE" is classified as property crime.

    Args:
        desc: Crime description; any value accepted by ``str()``. Missing
            values (as detected by ``pd.isna``) are allowed.

    Returns:
        The group name as a string. Missing or unmatched descriptions fall
        back to "Other / Fraud / Public Order Crime".
    """
    fallback_group = "Other / Fraud / Public Order Crime"
    if pd.isna(desc):
        return fallback_group

    text = str(desc).upper()

    # Ordered (group, keywords) table: earlier groups take precedence.
    keyword_table = (
        ("Violent Crime", (
            "ASSAULT", "BATTERY", "ROBBERY", "HOMICIDE",
            "MANSLAUGHTER", "KIDNAPPING", "CRIMINAL THREATS",
            "INTIMATE PARTNER", "RAPE", "SEX", "SODOMY",
            "ORAL COPULATION", "LEWD", "PORNOGRAPHY",
            "FALSE IMPRISONMENT",
        )),
        ("Property & Theft Crime", (
            "THEFT", "BURGLARY", "SHOPLIFTING",
            "VANDALISM", "ARSON", "PICKPOCKET",
            "PURSE SNATCH", "TRESPASS", "BIKE",
            "ILLEGAL DUMPING",
        )),
        ("Vehicle-Related Crime", (
            "VEHICLE", "DWOC", "MOTOR VEHICLE",
            "BOAT",
        )),
    )

    for group_name, keywords in keyword_table:
        for keyword in keywords:
            if keyword in text:
                return group_name

    return fallback_group

print("‚úÖ Fonction de mapping d√©finie")

## 📂 Chargement des Données

In [None]:
# Read the preprocessed 2020 crime records from disk
print("üìÇ Chargement des donn√©es...")
df = pd.read_csv('data/processed/crime_2020_processed.csv')

# Report what was read: shape first, then the column names
print(f"\nüìä Shape : {df.shape}")
print("\nüìã Colonnes disponibles :")
print(df.columns.tolist())

# Derive the 4-way target column from the raw crime description
df['Crime_Group'] = df['Crm Cd Desc'].map(map_crime_group_4)

n_loaded_rows = len(df)
print(f"\n‚úÖ Donn√©es charg√©es : {n_loaded_rows:,} lignes")

## 🔧 Préparation des Données

In [None]:
# Columns used as model inputs
feature_cols = ['Hour', 'Day_of_week', 'Month_num', 'LAT', 'LON', 'Vict Age', 'AREA']

X = df[feature_cols].copy()
y = df['Crime_Group']

# Turn the string crime groups into integer class ids
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"\nüìä Features : {feature_cols}")
print("\nüè∑Ô∏è Classes d√©tect√©es :")
n_labels = len(y)
for class_id, class_label in enumerate(le.classes_):
    n_in_class = (y == class_label).sum()
    print(f"   {class_id}. {class_label}: {n_in_class:,} ({n_in_class/n_labels*100:.2f}%)")

# Stratified 80/20 split with a fixed seed
split_parts = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
X_train, X_test, y_train, y_test = split_parts

print("\nüìä Split :")
print(f"   Train : {X_train.shape}")
print(f"   Test  : {X_test.shape}")

# Median imputation: statistics are learned on the training split only and
# then applied to both splits; the original row index is kept on each frame.
imputer = SimpleImputer(strategy='median')
train_values = imputer.fit_transform(X_train)
test_values = imputer.transform(X_test)
X_train_imputed = pd.DataFrame(train_values, columns=feature_cols, index=X_train.index)
X_test_imputed = pd.DataFrame(test_values, columns=feature_cols, index=X_test.index)

print("\n‚úÖ Imputation termin√©e")
print(f"   NaN restants (train) : {X_train_imputed.isna().sum().sum()}")
print(f"   NaN restants (test)  : {X_test_imputed.isna().sum().sum()}")

## 📦 Chargement du Modèle

In [None]:
# Load the persisted LightGBM baseline; `model` stays None when the file is
# missing so the model-dependent cells can skip themselves.
# NOTE(review): joblib.load unpickles the file — only load artifacts you trust.
model_path = 'models/lightgbm_baseline.joblib'

try:
    model = joblib.load(model_path)
except FileNotFoundError:
    model = None
    print(f"‚ö†Ô∏è Mod√®le non trouv√© : {model_path}")
    print("   Veuillez d'abord entra√Æner le mod√®le avec :")
    print("   python src/models/train.py --model lightgbm")
else:
    print(f"‚úÖ Mod√®le charg√© : {model_path}")
    print(f"   Type : {type(model).__name__}")

## 🗂️ Création des Datasets DeepChecks

In [None]:
def _as_deepchecks_dataset(frame, labels):
    """Wrap an imputed feature frame and its labels in a deepchecks Dataset."""
    return Dataset(
        frame,
        label=labels,
        cat_features=[],  # no categorical features: every column is treated as numeric
        features=feature_cols,
    )


# Build one deepchecks dataset per split
train_dataset = _as_deepchecks_dataset(X_train_imputed, y_train)
test_dataset = _as_deepchecks_dataset(X_test_imputed, y_test)

print("‚úÖ Datasets DeepChecks cr√©√©s")
print(f"   Train : {len(train_dataset)} √©chantillons")
print(f"   Test  : {len(test_dataset)} √©chantillons")

---
# <a id="niveau-1"></a>📊 Niveau 1 : Intégrité des Données

Vérification de la qualité des données : valeurs manquantes, duplicatas, outliers, cohérence.

---

## 1.1 - Suite Complète d'Intégrité

In [None]:
print("="*80)
print("üîç NIVEAU 1 : INT√âGRIT√â DES DONN√âES")
print("="*80)

# Build the data-integrity suite imported from deepchecks.tabular.suites
integrity_suite = data_integrity()

# Run it on the training dataset only.
# NOTE(review): train_dataset wraps the median-imputed frame, so missing
# values present in the raw CSV are no longer visible to this suite.
print("\nüìä Ex√©cution de la suite d'int√©grit√© sur les donn√©es d'entra√Ænement...")
integrity_result = integrity_suite.run(train_dataset)

# Display the suite result
integrity_result.show()

## 1.2 - Checks Individuels Détaillés

### 1.2.1 - Valeurs Manquantes

In [None]:
# Check des valeurs manquantes
mixed_nulls_check = MixedNulls()
result = mixed_nulls_check.run(train_dataset)
result.show()

### 1.2.2 - Duplicatas

In [None]:
# Check des duplicatas
duplicates_check = DataDuplicates()
result = duplicates_check.run(train_dataset)
result.show()

### 1.2.3 - Détection d'Outliers

In [None]:
# D√©tection des outliers
outlier_check = OutlierSampleDetection()
result = outlier_check.run(train_dataset)
result.show()

### 1.2.4 - Corrélations Feature-Feature

In [None]:
# Corr√©lations entre features
feature_correlation_check = FeatureFeatureCorrelation()
result = feature_correlation_check.run(train_dataset)
result.show()

### 1.2.5 - Corrélations Feature-Label

In [None]:
# Corr√©lations feature-label
label_correlation_check = FeatureLabelCorrelation()
result = label_correlation_check.run(train_dataset)
result.show()

## 1.3 - Résumé Intégrité des Données

In [None]:
# Plain-text recap of the training split: size, missing values, duplicates
# and the per-class label distribution.
banner = "="*80
print("\n" + banner)
print("üìã R√âSUM√â - INT√âGRIT√â DES DONN√âES")
print(banner)

# General statistics
print("\nüìä Statistiques g√©n√©rales :")
general_stats = [
    f"   ‚Ä¢ Nombre d'√©chantillons : {len(X_train_imputed):,}",
    f"   ‚Ä¢ Nombre de features    : {len(feature_cols)}",
    f"   ‚Ä¢ Nombre de classes     : {len(le.classes_)}",
    f"   ‚Ä¢ Valeurs manquantes    : {X_train_imputed.isna().sum().sum()}",
    f"   ‚Ä¢ Duplicatas            : {X_train_imputed.duplicated().sum()}",
]
print("\n".join(general_stats))

# Class distribution: y_train holds the encoded ids, le.classes_ the names
print("\nüè∑Ô∏è Distribution des classes (train) :")
n_train_labels = len(y_train)
for class_id, class_label in enumerate(le.classes_):
    n_in_class = (y_train == class_id).sum()
    pct = n_in_class / n_train_labels * 100
    print(f"   {class_id}. {class_label:40} : {n_in_class:6,} ({pct:5.2f}%)")

print("\n‚úÖ Niveau 1 termin√© !")

---
# <a id="niveau-2"></a>📈 Niveau 2 : Drift et Distribution

Comparaison des distributions entre train et test, détection de drift.

---

## 2.1 - Suite Complète Train/Test Validation

In [None]:
print("="*80)
print("üìà NIVEAU 2 : DRIFT ET DISTRIBUTION")
print("="*80)

# Build the train/test validation suite imported from deepchecks.tabular.suites
train_test_suite = train_test_validation()

# Run it, comparing the training dataset against the test dataset
print("\nüìä Ex√©cution de la validation train/test...")
train_test_result = train_test_suite.run(
    train_dataset=train_dataset,
    test_dataset=test_dataset
)

# Display the suite result
train_test_result.show()

## 2.2 - Drift des Features (Détaillé)

In [None]:
# Drift des features
feature_drift_check = TrainTestFeatureDrift()
result = feature_drift_check.run(
    train_dataset=train_dataset,
    test_dataset=test_dataset
)
result.show()

## 2.3 - Drift des Labels

In [None]:
# Drift des labels
label_drift_check = TrainTestLabelDrift()
result = label_drift_check.run(
    train_dataset=train_dataset,
    test_dataset=test_dataset
)
result.show()

## 2.4 - Analyse Statistique du Drift

In [None]:
# Per-feature two-sample Kolmogorov-Smirnov test between the imputed train and
# test frames; a feature is flagged as drifting when its p-value is below 0.05.
# NOTE(review): no multiple-testing correction is applied across features.
from scipy.stats import ks_2samp

rule = "="*80
print("\n" + rule)
print("üìä ANALYSE STATISTIQUE DU DRIFT (Test de Kolmogorov-Smirnov)")
print(rule)


def _ks_drift_row(column):
    """Return one result row (dict) for the KS test on a single feature."""
    ks_stat, p_val = ks_2samp(X_train_imputed[column], X_test_imputed[column])
    flagged = p_val < 0.05
    return {
        'Feature': column,
        'KS Statistic': ks_stat,
        'P-Value': p_val,
        'Drift D√©tect√©': 'üî¥ OUI' if flagged else '‚úÖ NON'
    }


drift_results = [_ks_drift_row(column) for column in feature_cols]

# Largest KS statistic first
df_drift = pd.DataFrame(drift_results).sort_values('KS Statistic', ascending=False)

print("\n" + df_drift.to_string(index=False))
print("\n" + rule)

# Count the features flagged as drifting
n_drift = df_drift['Drift D√©tect√©'].str.contains('OUI').sum()
print(f"\nüîç R√©sultat : {n_drift}/{len(feature_cols)} features avec drift d√©tect√©")

if n_drift > 0:
    drift_messages = (
        "\n‚ö†Ô∏è ATTENTION : Drift d√©tect√© !",
        "   Le mod√®le pourrait avoir des performances d√©grad√©es.",
    )
else:
    drift_messages = (
        "\n‚úÖ Aucun drift significatif d√©tect√©.",
        "   Les distributions train/test sont similaires.",
    )
for message in drift_messages:
    print(message)

## 2.5 - Visualisations des Distributions

In [None]:
# Overlay the train and test histograms of every feature, one subplot each,
# then save the figure under reports/.
import math
import os

# Size the grid from the number of features instead of a fixed 3x3, so more
# than nine features no longer overflow the axes array. With the current
# seven features this still yields a 3x3 grid and a 15x12 figure.
n_plot_cols = 3
n_plot_rows = max(1, math.ceil(len(feature_cols) / n_plot_cols))
fig, axes = plt.subplots(n_plot_rows, n_plot_cols, figsize=(15, 4 * n_plot_rows))
axes = axes.ravel()

for ax, feature in zip(axes, feature_cols):
    # density=True makes the two splits comparable despite their different sizes
    ax.hist(X_train_imputed[feature], bins=30, alpha=0.5, label='Train', density=True)
    ax.hist(X_test_imputed[feature], bins=30, alpha=0.5, label='Test', density=True)

    ax.set_xlabel(feature)
    ax.set_ylabel('Densit√©')
    ax.legend()
    ax.set_title(f'Distribution : {feature}')

# Hide the subplots that have no feature to show
for unused_ax in axes[len(feature_cols):]:
    unused_ax.axis('off')

plt.tight_layout()
# savefig does not create missing directories; make sure reports/ exists
# before writing instead of relying on a later cell to create it.
os.makedirs('reports', exist_ok=True)
plt.savefig('reports/deepchecks_distributions.png', dpi=150, bbox_inches='tight')
print("\n‚úÖ Graphique sauvegard√© : reports/deepchecks_distributions.png")
plt.show()

---
# <a id="niveau-3"></a>🎯 Niveau 3 : Performance du Modèle

Évaluation complète des performances du modèle sur le test set.

---

## 3.1 - Prédictions

In [None]:
# Predict on both splits when a model was loaded; otherwise announce the skip.
level3_rule = "="*80
print(level3_rule)
print("üéØ NIVEAU 3 : PERFORMANCE DU MOD√àLE")
print(level3_rule)

if model is None:
    print("\n‚ö†Ô∏è Aucun mod√®le charg√©, skip Niveau 3")
else:
    # Hard class predictions for both splits
    y_pred_train = model.predict(X_train_imputed)
    y_pred_test = model.predict(X_test_imputed)

    # Class probabilities on the test split, when the model exposes them
    y_proba_test = (
        model.predict_proba(X_test_imputed)
        if hasattr(model, 'predict_proba')
        else None
    )

    print("\n‚úÖ Pr√©dictions g√©n√©r√©es")
    print(f"   Train : {len(y_pred_train)} pr√©dictions")
    print(f"   Test  : {len(y_pred_test)} pr√©dictions")

## 3.2 - Suite Complète d'Évaluation

In [None]:
# Run the deepchecks model-evaluation suite, or skip when no model was loaded.
if model is None:
    print("‚ö†Ô∏è Skip suite d'√©valuation (pas de mod√®le)")
else:
    model_eval_suite = model_evaluation()

    print("\nüìä Ex√©cution de l'√©valuation du mod√®le...")
    suite_inputs = {
        'train_dataset': train_dataset,
        'test_dataset': test_dataset,
        'model': model,
    }
    model_eval_result = model_eval_suite.run(**suite_inputs)

    # Display the suite result
    model_eval_result.show()

## 3.3 - Rapport de Performance Détaillé

In [None]:
# Run the PerformanceReport check on both splits when a model is available.
# NOTE(review): PerformanceReport was presumably deprecated in later
# deepchecks releases in favour of TrainTestPerformance — confirm against the
# installed version.
if model is not None:
    performance_check = PerformanceReport()
    result = performance_check.run(
        train_dataset=train_dataset,
        test_dataset=test_dataset,
        model=model
    )
    result.show()
else:
    print("‚ö†Ô∏è Skip performance report")

## 3.4 - Matrice de Confusion

In [None]:
# Confusion matrix of the model on the held-out test split.
# ConfusionMatrixReport is a single-dataset check: its run() takes one dataset
# plus the model, so the train_dataset=/test_dataset= keywords used for
# train-test checks do not match its signature.
if model is not None:
    confusion_matrix_check = ConfusionMatrixReport()
    result = confusion_matrix_check.run(test_dataset, model=model)
    result.show()
else:
    print("‚ö†Ô∏è Skip confusion matrix")

## 3.5 - Métriques Détaillées par Classe

In [None]:
# Global metrics, train/test gap diagnosis and per-class report.
# train_acc, test_acc, test_f1 and gap are reused by the final summary cell.
if model is None:
    print("‚ö†Ô∏è Skip m√©triques d√©taill√©es")
else:
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    metrics_rule = "="*80
    print("\n" + metrics_rule)
    print("üìä M√âTRIQUES D√âTAILL√âES")
    print(metrics_rule)

    # Accuracy on both splits; F1 / precision / recall are support-weighted
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test, average='weighted')
    test_precision = precision_score(y_test, y_pred_test, average='weighted', zero_division=0)
    test_recall = recall_score(y_test, y_pred_test, average='weighted', zero_division=0)

    print("\nüéØ M√©triques Globales :")
    global_metric_lines = [
        f"   ‚Ä¢ Train Accuracy    : {train_acc:.4f} ({train_acc*100:.2f}%)",
        f"   ‚Ä¢ Test Accuracy     : {test_acc:.4f} ({test_acc*100:.2f}%)",
        f"   ‚Ä¢ Test F1-Score     : {test_f1:.4f}",
        f"   ‚Ä¢ Test Precision    : {test_precision:.4f}",
        f"   ‚Ä¢ Test Recall       : {test_recall:.4f}",
    ]
    print("\n".join(global_metric_lines))

    # Train/test accuracy gap.
    # NOTE(review): a negative gap only means the test score is higher than
    # the train score; labelling it underfitting is debatable.
    gap = train_acc - test_acc
    print(f"\nüìâ √âcart Train-Test : {gap:.4f} ({gap*100:.2f}%)")
    if gap > 0.1:
        gap_verdict = "   ‚ö†Ô∏è OVERFITTING d√©tect√© !"
    elif gap < 0:
        gap_verdict = "   ‚ö†Ô∏è UNDERFITTING d√©tect√© !"
    else:
        gap_verdict = "   ‚úÖ Bon √©quilibre train/test"
    print(gap_verdict)

    # Per-class report, labelled with the original class names
    print("\nüìã Classification Report :")
    report_text = classification_report(
        y_test,
        y_pred_test,
        target_names=le.classes_,
        zero_division=0,
    )
    print(report_text)

## 3.6 - Feature Importance

In [None]:
# Tabulate and plot the model's feature importances, when it exposes them.
if model is not None and hasattr(model, 'feature_importances_'):
    import os

    # Pair each importance with its feature name, most important first
    importances = model.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    print("\n" + "="*80)
    print("üåü FEATURE IMPORTANCE")
    print("="*80)
    print(feature_importance_df.to_string(index=False))

    # Horizontal bar chart of the same table
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
    plt.xlabel('Importance')
    plt.title('Feature Importance - LightGBM')
    plt.tight_layout()
    # savefig does not create missing directories; make sure reports/ exists
    # before writing instead of relying on a later cell to create it.
    os.makedirs('reports', exist_ok=True)
    plt.savefig('reports/deepchecks_feature_importance.png', dpi=150, bbox_inches='tight')
    print("\n‚úÖ Graphique sauvegard√© : reports/deepchecks_feature_importance.png")
    plt.show()
else:
    print("‚ö†Ô∏è Feature importance non disponible pour ce mod√®le")

---
# <a id="resume"></a>📋 Résumé et Recommandations

---

In [None]:
# Header of the final summary: run timestamp, dataset name and model type.
print("="*80)
print("üìã R√âSUM√â G√âN√âRAL - VALIDATION DEEPCHECKS")
print("="*80)

print(f"\nüìÖ Date : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"üìä Dataset : crime_2020_processed.csv")
# Compare against None explicitly, like the other cells do: truthiness of an
# estimator object may go through __len__/__bool__ and is not a reliable
# "was a model loaded" test.
print(f"ü§ñ Mod√®le : {type(model).__name__ if model is not None else 'N/A'}")

# Recap of the three validation levels, printed section by section.
section_rule = "="*80

# --- Level 1: data integrity ---
print("\n" + section_rule)
print("‚úÖ NIVEAU 1 : INT√âGRIT√â DES DONN√âES")
print(section_rule)
level1_lines = [
    f"   ‚Ä¢ √âchantillons       : {len(X_train_imputed):,}",
    f"   ‚Ä¢ Features           : {len(feature_cols)}",
    f"   ‚Ä¢ Valeurs manquantes : {X_train_imputed.isna().sum().sum()}",
    f"   ‚Ä¢ Duplicatas         : {X_train_imputed.duplicated().sum()}",
]
print("\n".join(level1_lines))

# --- Level 2: drift (only when the KS analysis cell has been run) ---
print("\n" + section_rule)
print("‚úÖ NIVEAU 2 : DRIFT ET DISTRIBUTION")
print(section_rule)
if 'df_drift' in locals():
    n_drift = df_drift['Drift D√©tect√©'].str.contains('OUI').sum()
    print(f"   ‚Ä¢ Features avec drift : {n_drift}/{len(feature_cols)}")
    if n_drift > 0:
        print("   ‚ö†Ô∏è ATTENTION : Drift d√©tect√© sur certaines features")
        print("      ‚Üí Consid√©rer un r√©entra√Ænement avec donn√©es plus r√©centes")
    else:
        print("   ‚úÖ Distributions train/test similaires")

# --- Level 3: model performance (metrics come from the detailed-metrics cell) ---
print("\n" + section_rule)
print("‚úÖ NIVEAU 3 : PERFORMANCE DU MOD√àLE")
print(section_rule)
if model is None:
    print("   ‚ö†Ô∏è Aucun mod√®le √©valu√©")
else:
    print(f"   ‚Ä¢ Test Accuracy  : {test_acc:.4f} ({test_acc*100:.2f}%)")
    print(f"   ‚Ä¢ Test F1-Score  : {test_f1:.4f}")
    print(f"   ‚Ä¢ √âcart Train-Test : {gap:.4f}")

    if gap > 0.1:
        level3_verdict = "   ‚ö†Ô∏è Overfitting d√©tect√©"
    elif test_acc < 0.6:
        level3_verdict = "   ‚ö†Ô∏è Performance mod√©r√©e (< 60%)"
    else:
        level3_verdict = "   ‚úÖ Performance acceptable"
    print(level3_verdict)

# Assemble the recommendation list from the results gathered above, then print it.
reco_rule = "="*80
print("\n" + reco_rule)
print("üí° RECOMMANDATIONS")
print(reco_rule)

recommendations = []

# Low test accuracy
if model is not None and test_acc < 0.6:
    recommendations.extend([
        "üìà Am√©liorer les performances (<60%) :",
        "   ‚Ä¢ Ajouter plus de features (temporelles, spatiales)",
        "   ‚Ä¢ Optimiser les hyperparam√®tres (GridSearch)",
        "   ‚Ä¢ Combiner avec donn√©es 2021 pour plus de samples",
    ])

# Drift flagged on at least one feature (only if the drift count exists)
if 'n_drift' in locals() and n_drift > 0:
    recommendations.extend([
        "üîÑ Drift d√©tect√© :",
        "   ‚Ä¢ R√©entra√Æner le mod√®le avec donn√©es plus r√©centes",
        "   ‚Ä¢ Mettre en place monitoring continu du drift",
    ])

# Train accuracy more than 0.1 above test accuracy
if model is not None and gap > 0.1:
    recommendations.extend([
        "‚öñÔ∏è Overfitting d√©tect√© :",
        "   ‚Ä¢ Augmenter la r√©gularisation",
        "   ‚Ä¢ R√©duire la complexit√© du mod√®le",
        "   ‚Ä¢ Ajouter plus de donn√©es d'entra√Ænement",
    ])

# Always-listed next steps
recommendations.extend([
    "üöÄ Actions prioritaires :",
    "   1. Promouvoir le meilleur mod√®le en production",
    "   2. Mettre en place l'API et le monitoring",
    "   3. Planifier r√©entra√Ænement p√©riodique",
])

print("\n".join(recommendations))

# Closing banner of the summary
print("\n" + "="*80)
print("‚úÖ VALIDATION DEEPCHECKS TERMIN√âE")
print("="*80)

# List the image reports written by the plotting cells. The feature-importance
# plot is only listed when the model exposes feature_importances_.
print("\nüìä Rapports sauvegard√©s :")
print("   ‚Ä¢ reports/deepchecks_distributions.png")
if model is not None and hasattr(model, 'feature_importances_'):
    print("   ‚Ä¢ reports/deepchecks_feature_importance.png")

# Print (do not execute) the calls that export the suite results as HTML
print("\nüíæ Pour sauvegarder les rapports HTML :")
print("   integrity_result.save_as_html('reports/deepchecks_integrity.html')")
print("   train_test_result.save_as_html('reports/deepchecks_train_test.html')")
if model is not None:
    print("   model_eval_result.save_as_html('reports/deepchecks_model_eval.html')")

## 💾 Sauvegarder les Rapports HTML

In [None]:
# Export every available suite result as an HTML report under reports/.
import os

# Create the output folder if it does not exist yet
os.makedirs('reports', exist_ok=True)

print("üíæ Sauvegarde des rapports HTML...\n")

# (suite result, target file) pairs, in export order
html_exports = [
    (integrity_result, 'reports/deepchecks_integrity.html'),
    (train_test_result, 'reports/deepchecks_train_test.html'),
]
# The model-evaluation result only exists when a model was loaded
if model is not None:
    html_exports.append((model_eval_result, 'reports/deepchecks_model_eval.html'))

for suite_result, html_path in html_exports:
    suite_result.save_as_html(html_path)
    print(f"   ‚úÖ {html_path}")

print("\n‚úÖ Tous les rapports sont sauvegard√©s dans le dossier reports/")

---

## 🎉 Validation Terminée !

Vous pouvez maintenant :
1. Consulter les rapports HTML dans `reports/`
2. Analyser les résultats détaillés
3. Prendre des décisions sur le réentraînement
4. Promouvoir le modèle en production si satisfaisant

---