In [1]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Make the parent of the current working directory importable
sys.path.append(os.path.join(os.getcwd(), '..'))

print("‚úÖ Imports de base effectu√©s")

# Deepchecks is optional: record whether it could be imported so the rest of
# the notebook can skip the suite instead of crashing.
try:
    from deepchecks.tabular import Dataset
    from deepchecks.tabular.suites import data_integrity
except ImportError as e:
    DEEPCHECKS_AVAILABLE = False
    print(f"‚ö†Ô∏è Deepchecks non disponible: {e}")
    print("Installation: pip install deepchecks[all]")
else:
    DEEPCHECKS_AVAILABLE = True
    print("‚úÖ Deepchecks import√©")

# Path configuration: when the working directory is named 'notebooks', its
# parent is used as the base directory; otherwise the working directory is.
_cwd = Path.cwd()
BASE_DIR = _cwd.parent if _cwd.name == 'notebooks' else _cwd
DATA_DIR = BASE_DIR / 'data'
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
REPORTS_DIR = BASE_DIR / 'reports' / 'deepchecks'
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Specific file to load
TARGET_FILE = PROCESSED_DATA_DIR / 'crime_2020_processed2.csv'

# Banner summarising the run configuration
print("=" * 80)
print("üìä DEEPCHECKS CRIME LA - NIVEAU 1 : INT√âGRIT√â DES DONN√âES (PR√âPROCESS√âES 2020)")
print("=" * 80)
print(f"üìÖ Date: {datetime.now():%Y-%m-%d %H:%M:%S}")
print(f"üìÅ Base: {BASE_DIR}")
print(f"üìÅ Fichier cible: {TARGET_FILE}")
print(f"üìÅ Reports: {REPORTS_DIR}")
print()

def load_crime_processed_data(path=None):
    """Load the preprocessed crime CSV into a DataFrame and print a summary.

    Args:
        path: Optional location of the CSV file (``str`` or ``Path``).
            When omitted, the module-level ``TARGET_FILE`` is used, so
            existing no-argument calls behave as before.

    Returns:
        pandas.DataFrame: The result of ``pd.read_csv`` on the file.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    target = TARGET_FILE if path is None else Path(path)

    print("üì¶ Chargement des donn√©es pr√©process√©es")
    print("-" * 80)

    if not target.exists():
        raise FileNotFoundError(
            f"‚ùå Fichier non trouv√© : {target}\n"
            "   ‚Üí Ex√©cute d'abord preprocessed.py pour g√©n√©rer ce fichier."
        )

    df = pd.read_csv(target)
    # Report success only after the read has actually succeeded, so a parse
    # error is never preceded by a misleading "loaded" message.
    print(f"‚úÖ Donn√©es charg√©es: {target.name}")

    print(f"   Shape: {df.shape}")
    print(f"   Colonnes: {len(df.columns)}")
    print(f"   Colonnes disponibles: {list(df.columns)}")
    print()

    return df

# Load the preprocessed dataset (reads TARGET_FILE)
df_crime = load_crime_processed_data()

# Preview the first rows.
# NOTE(review): `display` is not imported in this cell; presumably the
# IPython/Jupyter builtin — confirm before running this outside a notebook.
print("üìã Aper√ßu des donn√©es pr√©process√©es:")
display(df_crime.head())

# Per-column dtype and non-null summary
print(f"\nüìä Info des colonnes:")
df_crime.info()

def prepare_crime_dataset_for_deepchecks(df):
    """Wrap the preprocessed DataFrame in a Deepchecks ``Dataset``.

    The label is ``'Crime_Group'`` when that column exists, otherwise
    ``None``. Categorical features are the subset of a fixed candidate list
    that is actually present in ``df``.

    Args:
        df: DataFrame holding the preprocessed crime data.

    Returns:
        tuple: ``(dataset, label_col)``. ``dataset`` is ``None`` when
        Deepchecks is unavailable or when no label column was found.
    """
    print("\nüìù Pr√©paration Dataset Deepchecks")
    print("-" * 80)

    columns = df.columns

    # Label: Crime_Group (expected to be created during preprocessing)
    if 'Crime_Group' in columns:
        label_col = 'Crime_Group'
    else:
        label_col = None
        print("‚ö†Ô∏è Attention: 'Crime_Group' non trouv√© ‚Üí pas de label pour Deepchecks")

    # Categorical features: keep only the candidates present in the frame
    cat_features = []
    for candidate in ('Vict Sex', 'Vict Descent', 'Day of Week', 'Month', 'Day Type'):
        if candidate in columns:
            cat_features.append(candidate)

    print(f"‚úÖ Configuration:")
    print(f"   Label: {label_col}")
    print(f"   Features cat√©gorielles: {cat_features}")

    # Without the library or without a label there is nothing to build.
    if not (DEEPCHECKS_AVAILABLE and label_col):
        return None, label_col

    return Dataset(df, label=label_col, cat_features=cat_features), label_col

# Build the Deepchecks dataset; crime_dataset is None when it could not be built
crime_dataset, label_column = prepare_crime_dataset_for_deepchecks(df_crime)

def run_crime_data_integrity_checks(dataset, df):
    """Run the Deepchecks ``data_integrity`` suite and save an HTML report.

    The suite result is shown inline and written under ``REPORTS_DIR``.
    Nothing is run when Deepchecks is unavailable or ``dataset`` is ``None``.

    Args:
        dataset: Deepchecks ``Dataset`` to check, or ``None`` to skip.
        df: Source DataFrame. Currently unused; kept so existing callers
            keep working.

    Returns:
        None
    """
    print("\n" + "="*80)
    print("üìä EX√âCUTION DES CHECKS DEEPCHECKS")
    print("="*80)

    # Explicit None test: relying on the object's truthiness would depend on
    # how the Dataset class defines its length.
    if not DEEPCHECKS_AVAILABLE or dataset is None:
        print("‚ö†Ô∏è Deepchecks non ex√©cut√© (pas de label ou biblioth√®que absente)")
        return

    print("\nüîç Lancement de la suite data_integrity()...")
    integrity_suite = data_integrity()
    result = integrity_suite.run(dataset)

    print("\nüìä RAPPORT INTERACTIF (dans le notebook) :")
    result.show()

    # Save the HTML report (optional)
    report_path = REPORTS_DIR / 'deepchecks_integrity_processed_2020.html'
    # save_as_html returns the name of the file it actually wrote, which can
    # differ from the requested one when that file already exists; report the
    # real name and fall back to the requested one if nothing is returned.
    saved_path = result.save_as_html(str(report_path))
    saved_name = Path(saved_path).name if saved_path else report_path.name
    print(f"üíæ Rapport HTML sauvegard√© : {saved_name}")

# Run the integrity suite (skipped inside the function when crime_dataset is None)
run_crime_data_integrity_checks(crime_dataset, df_crime)

# Manual critical checks (tailored to the cleaned, preprocessed data)
def critical_checks_processed(df):
    """Run manual sanity checks on the preprocessed crime DataFrame.

    Checks, each applied only when its columns are present (the duplicate
    check always runs):

    * GPS: ``LAT`` must lie in [33.7, 34.4] and ``LON`` in [-118.7, -118.1].
      More than 0.01 % of rows outside that box is reported as an issue.
    * ``Hour`` must lie in 0-23.
    * ``Vict Age`` must lie in 0-120.
    * No fully duplicated rows.
    * No NaN in the critical columns ``Hour``, ``LAT``, ``LON`` and
      ``Crime_Group``; an absent critical column is reported as an issue
      instead of raising ``KeyError``.

    An empty DataFrame is reported as an issue instead of raising
    ``ZeroDivisionError``.

    Args:
        df: pandas DataFrame to validate.

    Returns:
        bool: ``True`` when no issue was found, ``False`` otherwise.
    """
    print("\nüö® V√âRIFICATIONS CRITIQUES MANUELLES")
    print("-" * 80)

    issues = []
    n_rows = len(df)

    if n_rows == 0:
        issues.append("üî¥ DataFrame vide: 0 ligne")

    # GPS
    if {'LAT', 'LON'}.issubset(df.columns):
        # Zero and NaN coordinates fall outside both ranges, so the range
        # test alone also covers the "zero" and "missing" cases.
        in_box = df['LAT'].between(33.7, 34.4) & df['LON'].between(-118.7, -118.1)
        invalid_gps = int((~in_box).sum())
        # Guard the percentage against an empty frame.
        pct = invalid_gps / n_rows * 100 if n_rows else 0.0
        if pct > 0.01:
            issues.append(f"üî¥ GPS invalides: {invalid_gps} lignes ({pct:.2f}%)")
        else:
            print(f"‚úÖ GPS: {invalid_gps} invalides ({pct:.2f}%) ‚Üí Excellent")

    # Hours
    if 'Hour' in df.columns:
        invalid_hours = ((df['Hour'] < 0) | (df['Hour'] > 23)).sum()
        if invalid_hours == 0:
            print(f"‚úÖ Heures: Toutes valides (0-23)")
        else:
            issues.append(f"üü° Heures invalides: {invalid_hours}")

    # Ages
    if 'Vict Age' in df.columns:
        invalid_ages = ((df['Vict Age'] < 0) | (df['Vict Age'] > 120)).sum()
        if invalid_ages == 0:
            print(f"‚úÖ √Çges: Tous raisonnables")
        else:
            issues.append(f"üü° √Çges aberrants: {invalid_ages}")

    # Duplicates
    duplicates = df.duplicated().sum()
    if duplicates == 0:
        print(f"‚úÖ Doublons: 0 d√©tect√©s")
    else:
        issues.append(f"üî¥ Doublons: {duplicates}")

    # Missing values in critical columns
    critical_cols = ['Hour', 'LAT', 'LON', 'Crime_Group']
    present_cols = [col for col in critical_cols if col in df.columns]
    absent_cols = [col for col in critical_cols if col not in df.columns]
    if absent_cols:
        # An absent critical column is itself a data problem; report it
        # rather than letting the column selection below raise KeyError.
        issues.append(f"üî¥ Colonnes critiques absentes: {absent_cols}")
    missing_critical = int(df[present_cols].isna().sum().sum()) if present_cols else 0
    if missing_critical != 0:
        issues.append(f"üî¥ NaN dans colonnes critiques: {missing_critical}")
    elif not absent_cols:
        print(f"‚úÖ Aucune valeur manquante dans les colonnes critiques")

    if issues:
        print(f"\n‚ö†Ô∏è {len(issues)} probl√®me(s) d√©tect√©(s) :")
        for issue in issues:
            print(f"   {issue}")
        return False

    print(f"\nüéâ TOUTES LES V√âRIFICATIONS CRITIQUES PASS√âES !")
    return True

# Run the manual checks and keep the overall verdict
data_ok = critical_checks_processed(df_crime)

print(f"\n{'=' * 80}")
print("‚úÖ DEEPCHECKS TERMIN√â")
print("=" * 80)

# Final summary: size, label and overall quality verdict
n_rows, n_cols = df_crime.shape
quality = '‚úÖ EXCELLENTE' if data_ok else '‚ö†Ô∏è √Ä AM√âLIORER'
print("\nüìä R√©sum√© final:")
print(f"   Lignes: {n_rows:,} | Colonnes: {n_cols}")
print(f"   Label: {label_column}")
print(f"   Qualit√© globale: {quality}")

# Class distribution, shown only when the label column exists
if 'Crime_Group' in df_crime.columns:
    print("\n   Distribution Crime_Group:")
    print(df_crime['Crime_Group'].value_counts())

print("\nüí° Prochaines √©tapes:")
print("   ‚Üí Tu es pr√™t pour l'entra√Ænement des mod√®les !")
print("   ‚Üí Lance train.py ou ensemble.py avec tes donn√©es crime_2020_processed2.csv")

print(f"\n{'=' * 80}")

‚úÖ Imports de base effectu√©s
‚úÖ Deepchecks import√©
üìä DEEPCHECKS CRIME LA - NIVEAU 1 : INT√âGRIT√â DES DONN√âES (PR√âPROCESS√âES 2020)
üìÖ Date: 2025-12-21 18:33:04
üìÅ Base: d:\Ing√©nierie3\MLops
üìÅ Fichier cible: d:\Ing√©nierie3\MLops\data\processed\crime_2020_processed2.csv
üìÅ Reports: d:\Ing√©nierie3\MLops\reports\deepchecks

üì¶ Chargement des donn√©es pr√©process√©es
--------------------------------------------------------------------------------
‚úÖ Donn√©es charg√©es: crime_2020_processed2.csv
   Shape: (197864, 18)
   Colonnes: 18
   Colonnes disponibles: ['AREA', 'Part 1-2', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Crm Cd 1', 'LAT', 'LON', 'Day of Week', 'Month', 'Year', 'Day Type', 'Day_of_week', 'Month_num', 'Hour', 'Hour of Day', 'Crime_Group']

üìã Aper√ßu des donn√©es pr√©process√©es:


Unnamed: 0,AREA,Part 1-2,Vict Age,Vict Sex,Vict Descent,Premis Cd,Crm Cd 1,LAT,LON,Day of Week,Month,Year,Day Type,Day_of_week,Month_num,Hour,Hour of Day,Crime_Group
0,7,1,0,M,O,101.0,510.0,34.0375,-118.3506,Sunday,March,2020,Weekend,6,3,21,21,Vehicle-Related Crime
1,1,1,47,M,O,128.0,330.0,34.0444,-118.2628,Saturday,February,2020,Weekend,5,2,18,18,Property & Theft Crime
2,3,1,19,X,X,502.0,480.0,34.021,-118.3002,Wednesday,November,2020,Weekday,2,11,17,17,Other / Fraud / Public Order Crime
3,9,1,19,M,O,405.0,343.0,34.1576,-118.4387,Tuesday,March,2020,Weekday,1,3,20,20,Property & Theft Crime
4,4,1,0,M,H,101.0,510.0,34.082,-118.213,Wednesday,September,2020,Weekday,2,9,6,6,Vehicle-Related Crime



üìä Info des colonnes:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197864 entries, 0 to 197863
Data columns (total 18 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   AREA          197864 non-null  int64  
 1   Part 1-2      197864 non-null  int64  
 2   Vict Age      197864 non-null  int64  
 3   Vict Sex      197864 non-null  object 
 4   Vict Descent  197864 non-null  object 
 5   Premis Cd     197864 non-null  float64
 6   Crm Cd 1      197864 non-null  float64
 7   LAT           197864 non-null  float64
 8   LON           197864 non-null  float64
 9   Day of Week   197864 non-null  object 
 10  Month         197864 non-null  object 
 11  Year          197864 non-null  int64  
 12  Day Type      197864 non-null  object 
 13  Day_of_week   197864 non-null  int64  
 14  Month_num     197864 non-null  int64  
 15  Hour          197864 non-null  int64  
 16  Hour of Day   197864 non-null  int64  
 17  Crime_Group   197864 no


üìä RAPPORT INTERACTIF (dans le notebook) :


Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_2LENMSTNFL16VOD0CR7BK3G82">Data Integrity Sui‚Ä¶

üíæ Rapport HTML sauvegard√© : deepchecks_integrity_processed_2020.html

üö® V√âRIFICATIONS CRITIQUES MANUELLES
--------------------------------------------------------------------------------
‚úÖ GPS: 0 invalides (0.00%) ‚Üí Excellent
‚úÖ Heures: Toutes valides (0-23)
‚úÖ √Çges: Tous raisonnables
‚úÖ Doublons: 0 d√©tect√©s
‚úÖ Aucune valeur manquante dans les colonnes critiques

üéâ TOUTES LES V√âRIFICATIONS CRITIQUES PASS√âES !

‚úÖ DEEPCHECKS TERMIN√â

üìä R√©sum√© final:
   Lignes: 197,864 | Colonnes: 18
   Label: Crime_Group
   Qualit√© globale: ‚úÖ EXCELLENTE

   Distribution Crime_Group:
Crime_Group
Property & Theft Crime                95002
Violent Crime                         57227
Other / Fraud / Public Order Crime    24158
Vehicle-Related Crime                 21477
Name: count, dtype: int64

üí° Prochaines √©tapes:
   ‚Üí Tu es pr√™t pour l'entra√Ænement des mod√®les !
   ‚Üí Lance train.py ou ensemble.py avec tes donn√©es crime_2020_processed2.csv

