# 🔍 Validation Complète avec DeepChecks - Crime Dataset 2020

## 📋 Compatible scikit-learn 1.8.0

### Table des Matières
1. [Setup Compatibilité](#setup)
2. [Niveau 1 : Intégrité des Données](#niveau-1)
3. [Niveau 2 : Drift et Distribution](#niveau-2)
4. [Niveau 3 : Performance du Modèle](#niveau-3)
5. [Résumé et Recommandations](#resume)

---

## 📦 Installation des Dépendances

In [None]:
# Installation (si nécessaire)
# !pip install deepchecks pandas scikit-learn joblib matplotlib seaborn scipy

## <a id="setup"></a>🔧 Setup Compatibilité scikit-learn 1.8.0

**⚠️ IMPORTANT : Exécutez cette cellule EN PREMIER avant tous les imports !**

In [None]:
# ==========================================
# 🔧 FIX COMPATIBILITÉ SCIKIT-LEARN 1.8.0
# ==========================================

import sklearn.metrics
from sklearn.metrics import make_scorer
import numpy as np

def max_error_replacement(y_true, y_pred):
    """Return the largest absolute difference between targets and predictions.

    Serves as the metric behind the locally registered ``max_error`` scorer
    when the installed scikit-learn does not list that scorer name.

    Args:
        y_true: Ground-truth values (any array-like: list, ndarray, Series).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The maximum of ``|y_true - y_pred|`` taken element by element.
    """
    # Convert to ndarrays first: plain lists do not support subtraction, and
    # pandas Series would be aligned on their index instead of by position.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    return np.max(np.abs(true_arr - pred_arr))

# Register a stand-in ``max_error`` scorer when scikit-learn does not list one.
# NOTE(review): the scorer registry is a private scikit-learn detail; re-check
# this patch whenever scikit-learn is upgraded.
try:
    if 'max_error' not in sklearn.metrics.get_scorer_names():
        # The registry dict lives in the private ``sklearn.metrics._scorer``
        # module. It is not re-exported on ``sklearn.metrics``, so reaching it
        # as ``sklearn.metrics._SCORERS`` raises AttributeError, which the
        # ``except`` below would swallow and the patch would never be applied.
        from sklearn.metrics._scorer import _SCORERS

        # greater_is_better=False makes the scorer return the negated error.
        _SCORERS['max_error'] = make_scorer(
            max_error_replacement, greater_is_better=False
        )
        print("‚úÖ Fix sklearn 1.8.0 appliqu√© avec succ√®s")
    else:
        print("‚úÖ Scorer max_error d√©j√† disponible")
except Exception as e:
    # Best effort: report the failure and let the notebook continue.
    print(f"‚ö†Ô∏è Erreur: {e}")

print(f"\nüìå scikit-learn version: {sklearn.__version__}")
print("\n‚úÖ Setup termin√© - Continuez !")

## 📚 Imports

In [None]:
import pandas as pd
import numpy as np
import joblib
import warnings
from datetime import datetime

from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import data_integrity, train_test_validation, model_evaluation
from deepchecks.tabular.checks import (
    MixedNulls, DataDuplicates, OutlierSampleDetection,
    FeatureFeatureCorrelation, FeatureLabelCorrelation,
    TrainTestFeatureDrift, TrainTestLabelDrift
)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, f1_score

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp

# Silence every warning category for the rest of the session, presumably to
# keep the notebook output readable.
# NOTE(review): this also hides warnings that may matter — confirm it is intended.
warnings.filterwarnings('ignore')
print("‚úÖ Imports r√©ussis")

## 🗂️ Fonction de Mapping des Crimes

In [None]:
def map_crime_group_4(desc):
    """Map a raw crime description onto one of four coarse crime groups.

    Matching is a case-insensitive substring search. Groups are tried in the
    order violent, property/theft, vehicle, and the first hit wins. Missing
    values and descriptions matching no keyword land in the catch-all group.

    Args:
        desc: Raw crime description; non-string values are converted with
            ``str()`` before matching.

    Returns:
        The name of the crime group as a string.
    """
    fallback = "Other / Fraud / Public Order Crime"
    if pd.isna(desc):
        return fallback

    text = str(desc).upper()
    # Order matters: e.g. a description containing both "THEFT" and "VEHICLE"
    # is classified as property crime because that group is tested first.
    keyword_groups = (
        (("ASSAULT", "BATTERY", "ROBBERY", "HOMICIDE", "RAPE", "SEX"),
         "Violent Crime"),
        (("THEFT", "BURGLARY", "SHOPLIFTING", "VANDALISM"),
         "Property & Theft Crime"),
        (("VEHICLE", "DWOC", "MOTOR VEHICLE"),
         "Vehicle-Related Crime"),
    )
    for keywords, group in keyword_groups:
        for keyword in keywords:
            if keyword in text:
                return group
    return fallback

print("‚úÖ Fonction de mapping d√©finie")

## 📂 Chargement des Données

In [None]:
# Load the processed crime records (path is relative to the working directory).
df = pd.read_csv('data/processed/crime_2020_processed.csv')
# Derive the 4-class target by mapping every raw crime description to its group.
df['Crime_Group'] = df['Crm Cd Desc'].apply(map_crime_group_4)
print(f"‚úÖ Donn√©es charg√©es : {len(df):,} lignes")
print(f"Shape : {df.shape}")

## 🔧 Préparation des Données

In [None]:
# Columns used as model inputs; the target is the derived crime group.
feature_cols = [
    'Hour', 'Day_of_week', 'Month_num', 'LAT', 'LON', 'Vict Age', 'AREA',
]
X = df.loc[:, feature_cols].copy()
y = df['Crime_Group']

# Encode the class names as integers (le.classes_ keeps the original names).
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

print(f"Features : {feature_cols}")
print(f"\nClasses :")
# Show how many rows fall into each class and the matching share of the data.
for i, name in enumerate(le.classes_):
    count = y.eq(name).sum()
    print(f"  {i}. {name}: {count:,} ({count/len(y)*100:.1f}%)")

# 80/20 split, stratified on the encoded target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Learn the per-column medians on the training split only, then apply them to
# both splits so no statistic from the test split is used for imputation.
imputer = SimpleImputer(strategy='median').fit(X_train)
X_train_imputed = pd.DataFrame(
    imputer.transform(X_train),
    index=X_train.index,
    columns=feature_cols,
)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test),
    index=X_test.index,
    columns=feature_cols,
)

print(f"\nTrain: {X_train.shape}, Test: {X_test.shape}")
print(f"NaN train: {X_train_imputed.isna().sum().sum()}")
print(f"NaN test: {X_test_imputed.isna().sum().sum()}")

## 📦 Chargement du Modèle

In [None]:
# Load the persisted baseline model; model-dependent cells are skipped when it
# cannot be loaded (``model`` is then None).
# NOTE(review): joblib.load unpickles arbitrary objects — only load artifacts
# that come from a trusted source.
try:
    model = joblib.load('models/lightgbm_baseline.joblib')
    print(f"‚úÖ Mod√®le charg√©: {type(model).__name__}")
except Exception as exc:
    # Catch Exception rather than using a bare ``except`` so KeyboardInterrupt
    # and SystemExit still propagate. The actual cause is printed as well,
    # because a missing file is not the only way loading can fail (for example
    # a missing library needed to unpickle the model).
    model = None
    print("‚ö†Ô∏è Mod√®le non trouv√© (Niveau 3 sera skipp√©)")
    print(f"   ({type(exc).__name__}: {exc})")

## 🗂️ Création des Datasets DeepChecks

In [None]:
# Wrap the imputed feature frames and their encoded labels in DeepChecks
# Dataset objects, declaring every column in feature_cols as a feature.
# NOTE(review): no cat_features argument is passed, so DeepChecks presumably
# infers which columns are categorical — confirm that is the desired behavior.
train_dataset = Dataset(X_train_imputed, label=y_train, features=feature_cols)
test_dataset = Dataset(X_test_imputed, label=y_test, features=feature_cols)
print(f"‚úÖ Datasets cr√©√©s")
print(f"  Train: {len(train_dataset)} √©chantillons")
print(f"  Test: {len(test_dataset)} √©chantillons")

---
# <a id="niveau-1"></a>📊 Niveau 1 : Intégrité des Données
---

In [None]:
# Level 1: run the stock DeepChecks data-integrity suite on the training set.
print("=" * 80, "üîç NIVEAU 1 : INT√âGRIT√â DES DONN√âES", "=" * 80, sep="\n")

try:
    integrity_suite = data_integrity()
    integrity_result = integrity_suite.run(train_dataset)
    integrity_result.show()
except Exception as e:
    # Report the failure and keep going so later cells can still run.
    print(f"‚ö†Ô∏è Erreur: {e}")
else:
    print("\n‚úÖ Suite d'int√©grit√© OK")

In [None]:
# Individual integrity checks, each paired with the label printed after it runs.
checks = [
    (MixedNulls(), "Valeurs manquantes"),
    (DataDuplicates(), "Duplicatas"),
    (OutlierSampleDetection(), "Outliers"),
    (FeatureFeatureCorrelation(), "Corr√©lations FF"),
    (FeatureLabelCorrelation(), "Corr√©lations FL")
]

# Run every check on the training set and display its result; a failing check
# is reported with its error message and the loop moves on to the next one.
for check, name in checks:
    try:
        check.run(train_dataset).show()
        print(f"‚úÖ {name}")
    except Exception as e:
        print(f"‚ö†Ô∏è {name}: {e}")

---
# <a id="niveau-2"></a>📈 Niveau 2 : Drift et Distribution
---

In [None]:
# Level 2: run the stock DeepChecks train/test validation suite on both splits.
print("=" * 80, "üìà NIVEAU 2 : DRIFT ET DISTRIBUTION", "=" * 80, sep="\n")

try:
    tt_suite = train_test_validation()
    tt_result = tt_suite.run(train_dataset, test_dataset)
    tt_result.show()
except Exception as e:
    # Report the failure and keep going so later cells can still run.
    print(f"‚ö†Ô∏è Erreur: {e}")
else:
    print("\n‚úÖ Suite train/test OK")

In [None]:
# Per-feature two-sample Kolmogorov-Smirnov test between the train and test
# splits, summarised as a table of formatted statistics.
print(
    "\n" + "=" * 80,
    "üìä ANALYSE STATISTIQUE DU DRIFT (Kolmogorov-Smirnov)",
    "=" * 80,
    sep="\n",
)

drift_results = []
for feature in feature_cols:
    stat, pval = ks_2samp(X_train_imputed[feature], X_test_imputed[feature])
    # A p-value below 5% is reported as drift.
    if pval < 0.05:
        drift_flag = 'üî¥ OUI'
    else:
        drift_flag = '‚úÖ NON'
    row = {'Feature': feature}
    row['KS Statistic'] = f"{stat:.4f}"
    row['P-Value'] = f"{pval:.4f}"
    row['Drift'] = drift_flag
    drift_results.append(row)

df_drift = pd.DataFrame(drift_results)
print("\n" + df_drift.to_string(index=False))

# Count the rows whose verdict contains 'OUI' (i.e. flagged as drifting).
drifted_mask = df_drift['Drift'].str.contains('OUI')
n_drift = drifted_mask.sum()
print(f"\nüîç R√©sultat : {n_drift}/{len(feature_cols)} features avec drift")

In [None]:
# Overlay train and test histograms for every feature and save the figure.
import math
import os

# Size the grid from the number of features instead of assuming at most nine;
# with the current seven features this still yields a 3x3 grid of 15x12 inches.
n_cols = 3
n_rows = max(1, math.ceil(len(feature_cols) / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows))
axes = axes.ravel()

for i, col in enumerate(feature_cols):
    # density=True normalises both histograms so splits of different sizes
    # can be compared on the same axis.
    axes[i].hist(X_train_imputed[col], bins=30, alpha=0.5, label='Train', density=True)
    axes[i].hist(X_test_imputed[col], bins=30, alpha=0.5, label='Test', density=True)
    axes[i].set_title(col)
    axes[i].legend()

# Hide the unused panels at the end of the grid.
for i in range(len(feature_cols), len(axes)):
    axes[i].axis('off')

plt.tight_layout()
# The output directory is only created by the final report-saving cell, so
# make sure it exists here; otherwise savefig fails on a fresh checkout.
os.makedirs('reports', exist_ok=True)
plt.savefig('reports/distributions.png', dpi=150, bbox_inches='tight')
print("‚úÖ Graphique sauvegard√©: reports/distributions.png")
plt.show()

---
# <a id="niveau-3"></a>🎯 Niveau 3 : Performance du Modèle
---

In [None]:
# Level 3: run the stock DeepChecks model-evaluation suite, if a model is loaded.
print("=" * 80, "üéØ NIVEAU 3 : PERFORMANCE DU MOD√àLE", "=" * 80, sep="\n")

if not model:
    print("‚ö†Ô∏è Pas de mod√®le charg√© - Niveau 3 skipp√©")
else:
    try:
        eval_suite = model_evaluation()
        eval_result = eval_suite.run(train_dataset, test_dataset, model)
        eval_result.show()
    except Exception as e:
        # Report the failure and keep going so later cells can still run.
        print(f"‚ö†Ô∏è Erreur: {e}")
    else:
        print("\n‚úÖ Suite d'√©valuation OK")

In [None]:
# Accuracy / F1 on both splits plus a per-class report, when a model is loaded.
if model:
    y_pred_train = model.predict(X_train_imputed)
    train_acc = accuracy_score(y_train, y_pred_train)

    y_pred_test = model.predict(X_test_imputed)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test, average='weighted')

    # Difference between train and test accuracy, used as an overfitting signal.
    gap = train_acc - test_acc

    print("\n" + "=" * 80, "üìä M√âTRIQUES D√âTAILL√âES", "=" * 80, sep="\n")
    print(
        f"\nTrain Accuracy : {train_acc:.4f} ({train_acc*100:.2f}%)",
        f"Test Accuracy  : {test_acc:.4f} ({test_acc*100:.2f}%)",
        f"Test F1-Score  : {test_f1:.4f}",
        f"Gap Train-Test : {gap:.4f} ({gap*100:.2f}%)",
        sep="\n",
    )

    # A gap of more than 10 accuracy points is reported as overfitting.
    print(
        "\n‚ö†Ô∏è OVERFITTING d√©tect√© !"
        if gap > 0.1
        else "\n‚úÖ Bon √©quilibre train/test"
    )

    print("\n" + classification_report(y_test, y_pred_test, target_names=le.classes_))

In [None]:
# Print and plot the model's feature importances, when the model exposes them.
if model and hasattr(model, 'feature_importances_'):
    import os

    # Pair each feature name with its importance, most important first.
    # NOTE(review): assumes the model was trained on exactly feature_cols, in
    # this order — confirm against the training code.
    imp_df = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)

    print("\n" + "="*80)
    print("üåü FEATURE IMPORTANCE")
    print("="*80)
    print(imp_df.to_string(index=False))

    plt.figure(figsize=(10, 6))
    plt.barh(imp_df['Feature'], imp_df['Importance'])
    plt.xlabel('Importance')
    plt.title('Feature Importance - LightGBM')
    plt.tight_layout()
    # The output directory is only created by the final report-saving cell, so
    # make sure it exists here; otherwise savefig fails on a fresh checkout.
    os.makedirs('reports', exist_ok=True)
    plt.savefig('reports/feature_importance.png', dpi=150, bbox_inches='tight')
    print("\n‚úÖ Graphique sauvegard√©: reports/feature_importance.png")
    plt.show()

---
# <a id="resume"></a>📋 Résumé et Recommandations
---

In [None]:
# Final text summary of the three validation levels, followed by a fixed list
# of recommendations.
print("="*80)
print("üìã R√âSUM√â G√âN√âRAL - VALIDATION DEEPCHECKS")
print("="*80)
# Timestamp uses the local clock (naive datetime).
print(f"\nüìÖ Date : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"üìä Dataset : crime_2020_processed.csv")
print(f"ü§ñ Mod√®le : {type(model).__name__ if model else 'N/A'}")

# Level 1: basic facts about the imputed training frame.
print("\n" + "="*80)
print("‚úÖ NIVEAU 1 : INT√âGRIT√â")
print("="*80)
print(f"  √âchantillons    : {len(X_train_imputed):,}")
print(f"  Features        : {len(feature_cols)}")
print(f"  Classes         : {len(le.classes_)}")
print(f"  NaN             : {X_train_imputed.isna().sum().sum()}")
print(f"  Duplicatas      : {X_train_imputed.duplicated().sum()}")

# Level 2: drift count; only printed when n_drift has been defined earlier.
print("\n" + "="*80)
print("‚úÖ NIVEAU 2 : DRIFT")
print("="*80)
if 'n_drift' in locals():
    print(f"  Features drift  : {n_drift}/{len(feature_cols)}")
    if n_drift > 0:
        print("  ‚ö†Ô∏è Drift d√©tect√© sur certaines features")
    else:
        print("  ‚úÖ Pas de drift significatif")

# Level 3: model metrics.
# NOTE(review): test_acc, test_f1 and gap are not defined in this cell; they
# must already exist when a model is loaded, otherwise this raises NameError.
print("\n" + "="*80)
print("‚úÖ NIVEAU 3 : PERFORMANCE")
print("="*80)
if model:
    print(f"  Test Accuracy   : {test_acc:.4f} ({test_acc*100:.2f}%)")
    print(f"  Test F1-Score   : {test_f1:.4f}")
    print(f"  Gap Train-Test  : {gap:.4f}")
else:
    print("  ‚ö†Ô∏è Aucun mod√®le √©valu√©")

# NOTE(review): the recommendations below are static text; they are printed
# regardless of the results computed above.
print("\n" + "="*80)
print("üí° RECOMMANDATIONS")
print("="*80)
print("  1. ‚úÖ Donn√©es pr√™tes pour production")
print("  2. üîÑ Surveiller le drift en continu")
print("  3. üìà Am√©liorer features si accuracy < 60%")
print("  4. üöÄ D√©ployer l'API de pr√©diction")
print("\n‚úÖ Validation termin√©e !")

## 💾 Sauvegarder les Rapports HTML

In [None]:
import os

# Export every suite result that exists in the session as a standalone HTML file.
os.makedirs('reports', exist_ok=True)

print("üíæ Sauvegarde des rapports HTML...\n")

# (result variable name, output path, confirmation line printed once saved)
report_targets = (
    ('integrity_result', 'reports/deepchecks_integrity.html',
     "  ‚úÖ reports/deepchecks_integrity.html"),
    ('tt_result', 'reports/deepchecks_train_test.html',
     "  ‚úÖ reports/deepchecks_train_test.html"),
    ('eval_result', 'reports/deepchecks_model_eval.html',
     "  ‚úÖ reports/deepchecks_model_eval.html"),
)

for result_name, html_path, saved_msg in report_targets:
    # The model-evaluation report is only written when a model is loaded.
    if result_name == 'eval_result' and not model:
        continue
    # A result is absent when its suite cell failed or was never executed.
    if result_name in locals():
        locals()[result_name].save_as_html(html_path)
        print(saved_msg)

print("\n‚úÖ Tous les rapports sont sauvegard√©s !")