# 📊 DEEPCHECKS CRIME LA - NIVEAU 1 : INTÉGRITÉ DES DONNÉES

## 🎯 Objectif
Vérifier l'intégrité des données de criminalité de Los Angeles avant l'entraînement :
- Détection des valeurs manquantes critiques
- Identification des doublons
- Vérification de la cohérence des features
- Analyse des outliers géographiques et temporels

---

## 📦 Imports et Configuration

In [1]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Ajouter src au path pour imports (si besoin)
sys.path.append(os.path.join(os.getcwd(), '..'))

# Confirm that the base imports above completed.
print("‚úÖ Imports de base effectu√©s")

# Deepchecks Tabular is treated as an optional dependency: the flag
# DEEPCHECKS_AVAILABLE records whether the import worked so that later code
# can skip the Deepchecks suite instead of failing.
try:
    from deepchecks.tabular import Dataset
    from deepchecks.tabular.suites import data_integrity
    DEEPCHECKS_AVAILABLE = True
    print("‚úÖ Deepchecks import√©")
except ImportError as e:
    print(f"‚ö†Ô∏è Deepchecks non disponible: {e}")
    print("Installation: pip install deepchecks[all]")
    DEEPCHECKS_AVAILABLE = False

# Path configuration.
# When the working directory is named 'notebooks', its parent is used as the
# base directory; otherwise the working directory itself is the base.
BASE_DIR = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
DATA_DIR = BASE_DIR / 'data'
RAW_DATA_DIR = DATA_DIR / 'raw'
REPORTS_DIR = BASE_DIR / 'reports' / 'deepchecks'
# Create the report directory (and any missing parents); no error if it exists.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Run banner: timestamp and the resolved directories.
print("="*80)
print("üìä DEEPCHECKS CRIME LA - NIVEAU 1 : INT√âGRIT√â DES DONN√âES (DONN√âES BRUTES UNIQUEMENT)")
print("="*80)
print(f"üìÖ Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"üìÅ Base: {BASE_DIR}")
print(f"üìÅ Donn√©es brutes: {RAW_DATA_DIR}")
print(f"üìÅ Reports: {REPORTS_DIR}")
print()

def load_crime_raw_data():
    """Load the raw crime CSV found in ``RAW_DATA_DIR`` into a DataFrame.

    The ``*.csv`` files directly inside ``RAW_DATA_DIR`` are sorted by path and
    the first one is read, so the same file is selected on every run (the
    order in which a directory is listed is otherwise not guaranteed). The
    shape, the column count and, when a ``DATE OCC`` column exists, its
    minimum and maximum values are printed.

    Returns:
        pandas.DataFrame: content of the selected CSV file.

    Raises:
        FileNotFoundError: if ``RAW_DATA_DIR`` contains no ``*.csv`` file.
    """
    print("üì¶ Chargement exclusif des donn√©es BRUTES crime")
    print("-" * 80)

    # Sorted so that the choice of "first file" is deterministic.
    raw_files = sorted(RAW_DATA_DIR.glob('*.csv'))

    if not raw_files:
        raise FileNotFoundError(f"‚ùå Aucun fichier CSV trouv√© dans {RAW_DATA_DIR}")

    data_file = raw_files[0]
    df = pd.read_csv(data_file)
    # Report success only after the file has actually been read.
    print(f"‚úÖ Donn√©es brutes charg√©es: {data_file.name}")

    print(f"   Shape: {df.shape}")
    print(f"   Colonnes: {len(df.columns)}")
    if 'DATE OCC' in df.columns:
        print(f"   P√©riode: {df['DATE OCC'].min()} √† {df['DATE OCC'].max()}")
    print()

    return df

# Load the raw data once; the rest of the file works on df_crime.
df_crime = load_crime_raw_data()

print("üìã Aper√ßu des donn√©es brutes:")
# NOTE(review): display() is not imported in this file — presumably the
# IPython/Jupyter builtin; confirm before running outside a notebook.
display(df_crime.head())

print(f"\nüìä Info des colonnes:")
# DataFrame.info() prints its summary itself; nothing is assigned here.
df_crime.info()

def prepare_crime_dataset_for_deepchecks(df):
    """Describe the columns of ``df`` and wrap it in a Deepchecks ``Dataset``.

    The label is ``'Crm Cd Desc'`` when that column exists, otherwise ``None``.
    The returned feature list keeps those of ``LAT``, ``LON``, ``Vict Age`` and
    ``AREA`` that are present. Every ``object``-dtype column other than the
    label is passed to Deepchecks as a categorical feature.

    Args:
        df: DataFrame holding the raw crime data.

    Returns:
        tuple: ``(dataset, label_col, feature_cols)``. ``dataset`` is ``None``
        when Deepchecks is unavailable or no label column was found.
    """
    print("üìù Pr√©paration Dataset Deepchecks")
    print("-" * 80)

    available = df.columns

    label_col = None
    if 'Crm Cd Desc' in available:
        label_col = 'Crm Cd Desc'

    feature_cols = []
    for name in ('LAT', 'LON', 'Vict Age', 'AREA'):
        if name in available:
            feature_cols.append(name)

    cat_features = []
    for name in df.select_dtypes(include=['object']).columns:
        if name != label_col:
            cat_features.append(name)

    # Only the first five categorical columns are shown, with a marker when
    # the list was shortened.
    preview = cat_features[:5]
    suffix = '...' if len(cat_features) > 5 else ''

    print(f"‚úÖ Configuration Dataset:")
    print(f"   Label: {label_col}")
    print(f"   Features num√©riques d√©tect√©es: {feature_cols}")
    print(f"   Features cat√©gorielles (exemple): {preview}{suffix}")

    dataset = None
    if DEEPCHECKS_AVAILABLE and label_col:
        dataset = Dataset(df, label=label_col, cat_features=cat_features)
    return dataset, label_col, feature_cols

# Build the Deepchecks dataset once. label_column is also read as a module
# global inside run_crime_data_integrity_checks.
crime_dataset, label_column, feature_columns = prepare_crime_dataset_for_deepchecks(df_crime)

def run_crime_data_integrity_checks(dataset, df):
    """Run the level-1 integrity checks and collect summary statistics.

    When Deepchecks is available and a dataset was supplied, the
    ``data_integrity`` suite is run, displayed, and saved as an HTML file under
    ``REPORTS_DIR``. The manual statistics are then computed from ``df``
    whether or not the suite ran.

    Args:
        dataset: Deepchecks ``Dataset`` to check, or ``None`` to skip the suite.
        df: DataFrame holding the raw crime data.

    Returns:
        dict: always contains ``missing_analysis`` and ``missing_pct``; contains
        ``deepchecks_result`` when the suite ran, ``gps_valid_pct`` when both
        ``LAT`` and ``LON`` columns exist, and ``crime_types_count`` /
        ``most_common_crime`` when the module-level ``label_column`` is set
        (``most_common_crime`` is ``None`` if the label column has no value).
    """
    print("\n" + "="*80)
    print("üìä NIVEAU 1: INT√âGRIT√â DES DONN√âES BRUTES CRIME LA")
    print("="*80)

    results = {}

    # Explicit None test: do not rely on the truth value of the dataset object.
    if DEEPCHECKS_AVAILABLE and dataset is not None:
        print("\nüîç Ex√©cution des checks Deepchecks...")
        integrity_suite = data_integrity()
        result = integrity_suite.run(dataset)

        # 1. Render the report directly in the notebook.
        print("\nüìä AFFICHAGE DU RAPPORT DIRECTEMENT DANS LE NOTEBOOK :\n")
        result.show()

        # 2. Save the report as a standalone HTML file.
        # NOTE(review): the original comment states as_widget=True is the
        # default — confirm against the installed Deepchecks version.
        integrity_report_path = REPORTS_DIR / 'crime_raw_data_integrity_report.html'
        result.save_as_html(str(integrity_report_path))

        print(f"\n‚úÖ Rapport HTML sauvegard√© : {integrity_report_path.name}")
        print("   Ouvre-le dans ton navigateur si tu veux une version fichier s√©par√©e.")

        results['deepchecks_result'] = result

    # Manual analyses
    if {'LAT', 'LON'}.issubset(df.columns):
        lat_ok = (df['LAT'] >= 33.0) & (df['LAT'] <= 35.0)
        lon_ok = (df['LON'] >= -119.0) & (df['LON'] <= -117.0)
        # A point is valid only when BOTH coordinates are in range on the same
        # row; taking the smaller of two independent counts overestimates the
        # share when different rows fail on latitude and on longitude.
        results['gps_valid_pct'] = (lat_ok & lon_ok).sum() / len(df) * 100

    missing_analysis = df.isna().sum().sort_values(ascending=False)
    missing_pct = (missing_analysis / len(df) * 100).round(2)
    results['missing_analysis'] = missing_analysis
    results['missing_pct'] = missing_pct

    # label_column is the module-level value produced when the dataset was prepared.
    if label_column:
        crime_dist = df[label_column].value_counts()
        results['crime_types_count'] = len(crime_dist)
        # value_counts() is empty when the column holds no value; avoid IndexError.
        results['most_common_crime'] = crime_dist.index[0] if len(crime_dist) else None

    return results

# Run the Deepchecks suite (when available) plus the manual statistics.
integrity_results = run_crime_data_integrity_checks(crime_dataset, df_crime)

# === The rest of the code (critical checks, summary, etc.) is unchanged ===
# (copied here so that the file is complete)

def critical_crime_data_checks(df):
    """Run the blocking sanity checks on the raw crime DataFrame.

    Each check is applied only when the columns it needs exist:

    * ``LAT`` / ``LON``: rows outside 33..35 / -119..-117 or with a missing
      coordinate; an issue is recorded above 10 % of rows.
    * ``DATE OCC``: values that cannot be parsed as dates; issue above 5 %.
    * ``Vict Age``: values below 0, above 120, or present but not numeric;
      any such value records an issue.
    * Fully duplicated rows: issue above 1 %.

    Every recorded issue, including the age warning, makes the result False.

    Args:
        df: raw crime DataFrame; it is not modified.

    Returns:
        bool: True when no issue was recorded, False otherwise.
    """
    print("\nüö® V√âRIFICATIONS CRITIQUES (DONN√âES BRUTES)")
    print("-" * 80)

    issues = []
    total_rows = len(df)

    def _pct(count):
        # Share of rows in percent; 0.0 for an empty frame instead of NaN.
        return count / total_rows * 100 if total_rows else 0.0

    if {'LAT', 'LON'}.issubset(df.columns):
        invalid_coords = ((df['LAT'] < 33.0) | (df['LAT'] > 35.0) |
                          (df['LON'] < -119.0) | (df['LON'] > -117.0) |
                          df['LAT'].isna() | df['LON'].isna()).sum()
        pct = _pct(invalid_coords)
        if pct > 10:
            issues.append(f"üî¥ CRITIQUE: {invalid_coords} coordonn√©es GPS invalides ({pct:.1f}%)")
        else:
            print(f"‚úÖ Coordonn√©es GPS: {invalid_coords} invalides ({pct:.1f}%) - OK")

    if 'DATE OCC' in df.columns:
        # Parse only the date column; copying the whole frame is unnecessary.
        parsed_dates = pd.to_datetime(df['DATE OCC'], errors='coerce')
        invalid_dates = parsed_dates.isna().sum()
        pct = _pct(invalid_dates)
        if pct > 5:
            issues.append(f"üî¥ CRITIQUE: {invalid_dates} dates invalides ({pct:.1f}%)")
        else:
            print(f"‚úÖ Dates: {invalid_dates} invalides ({pct:.1f}%) - OK")

    if 'Vict Age' in df.columns:
        raw_ages = df['Vict Age']
        # Coerce so that a text-typed column is compared numerically instead of
        # raising; values that are present but not numeric count as aberrant.
        ages = pd.to_numeric(raw_ages, errors='coerce')
        not_numeric = ages.isna() & raw_ages.notna()
        invalid_ages = ((ages < 0) | (ages > 120) | not_numeric).sum()
        if invalid_ages > 0:
            issues.append(f"üü° ATTENTION: {invalid_ages} √¢ges aberrants")
        else:
            print(f"‚úÖ √Çges victimes: OK")

    duplicates = df.duplicated().sum()
    pct_dup = _pct(duplicates)
    if pct_dup > 1:
        issues.append(f"üî¥ CRITIQUE: {duplicates} doublons ({pct_dup:.1f}%)")
    else:
        print(f"‚úÖ Doublons: {duplicates} ({pct_dup:.1f}%) - Acceptable")

    if issues:
        print(f"\n‚ö†Ô∏è {len(issues)} PROBL√àME(S) CRITIQUE(S) :")
        for issue in issues:
            print(f"   {issue}")
        return False

    print(f"\n‚úÖ TOUTES LES V√âRIFICATIONS CRITIQUES PASS√âES")
    return True

# Overall verdict: False as soon as any check recorded an issue.
data_quality_ok = critical_crime_data_checks(df_crime)

print("\n" + "="*80)
print("‚úÖ NIVEAU 1 : INT√âGRIT√â DES DONN√âES BRUTES - TERMIN√â")
print("="*80)

# Global summary of the raw data.
print(f"\nüìä R√©sum√© Global (donn√©es brutes):")
print(f"   Lignes: {df_crime.shape[0]:,} | Colonnes: {df_crime.shape[1]}")
print(f"   Qualit√© globale: {'‚úÖ BONNE' if data_quality_ok else '‚ö†Ô∏è PROBL√àMES √Ä CORRIGER'}")

# These keys are only present when the matching columns existed in the data.
if 'gps_valid_pct' in integrity_results:
    print(f"   GPS valides estim√©s: {integrity_results['gps_valid_pct']:.1f}%")
if 'crime_types_count' in integrity_results:
    print(f"   Types de crimes distincts: {integrity_results['crime_types_count']}")
    print(f"   Crime le plus fr√©quent: {integrity_results['most_common_crime']}")

# Next steps, driven by the outcome of the critical checks.
print(f"\nüí° Prochaines √©tapes:")
if data_quality_ok:
    print("   1. ‚úÖ Donn√©es brutes OK ‚Üí passe au pr√©processing")
else:
    # Generic advice: the previous text hard-coded "41 aberrant ages" from one
    # particular run, which is wrong whenever a different check fails or the
    # data changes. The detailed list is printed by the critical checks.
    print("   1. üîß Corrige les anomalies ci-dessus avant de poursuivre")

# Only point to the report when the Deepchecks suite actually ran and stored
# its result; otherwise no report was displayed or written.
if 'deepchecks_result' in integrity_results:
    print(f"\nüìã Rapport interactif visible ci-dessus dans le notebook !")
    print(f"   Fichier HTML aussi disponible ici : {REPORTS_DIR / 'crime_raw_data_integrity_report.html'}")

print("\n" + "="*80)

‚úÖ Imports de base effectu√©s
‚úÖ Deepchecks import√©
üìä DEEPCHECKS CRIME LA - NIVEAU 1 : INT√âGRIT√â DES DONN√âES (DONN√âES BRUTES UNIQUEMENT)
üìÖ Date: 2025-12-21 18:33:59
üìÅ Base: d:\Ing√©nierie3\MLops
üìÅ Donn√©es brutes: d:\Ing√©nierie3\MLops\data\raw
üìÅ Reports: d:\Ing√©nierie3\MLops\reports\deepchecks

üì¶ Chargement exclusif des donn√©es BRUTES crime
--------------------------------------------------------------------------------
‚úÖ Donn√©es brutes charg√©es: crime_Data_2020.csv
   Shape: (199840, 28)
   Colonnes: 28
   P√©riode: 2020-01-01 √† 2020-12-31

üìã Aper√ßu des donn√©es brutes:


Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,190326475,03/01/2020 12:00:00 AM,2020-03-01,2130,7,Wilshire,784,1,510,VEHICLE - STOLEN,...,AA,Adult Arrest,510.0,998.0,,,1900 S LONGWOOD AV,,34.0375,-118.3506
1,200106753,02/09/2020 12:00:00 AM,2020-02-08,1800,1,Central,182,1,330,BURGLARY FROM VEHICLE,...,IC,Invest Cont,330.0,998.0,,,1000 S FLOWER ST,,34.0444,-118.2628
2,200320258,11/11/2020 12:00:00 AM,2020-11-04,1700,3,Southwest,356,1,480,BIKE - STOLEN,...,IC,Invest Cont,480.0,,,,1400 W 37TH ST,,34.021,-118.3002
3,200907217,05/10/2023 12:00:00 AM,2020-03-10,2037,9,Van Nuys,964,1,343,SHOPLIFTING-GRAND THEFT ($950.01 & OVER),...,IC,Invest Cont,343.0,,,,14000 RIVERSIDE DR,,34.1576,-118.4387
4,200412582,09/09/2020 12:00:00 AM,2020-09-09,630,4,Hollenbeck,413,1,510,VEHICLE - STOLEN,...,IC,Invest Cont,510.0,,,,200 E AVENUE 28,,34.082,-118.213



üìä Info des colonnes:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199840 entries, 0 to 199839
Data columns (total 28 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   DR_NO           199840 non-null  int64  
 1   Date Rptd       199840 non-null  object 
 2   DATE OCC        199840 non-null  object 
 3   TIME OCC        199840 non-null  int64  
 4   AREA            199840 non-null  int64  
 5   AREA NAME       199840 non-null  object 
 6   Rpt Dist No     199840 non-null  int64  
 7   Part 1-2        199840 non-null  int64  
 8   Crm Cd          199840 non-null  int64  
 9   Crm Cd Desc     199840 non-null  object 
 10  Mocodes         173083 non-null  object 
 11  Vict Age        199840 non-null  int64  
 12  Vict Sex        174354 non-null  object 
 13  Vict Descent    174351 non-null  object 
 14  Premis Cd       199838 non-null  float64
 15  Premis Desc     199771 non-null  object 
 16  Weapon Used Cd  72976 non-null 


üìä AFFICHAGE DU RAPPORT DIRECTEMENT DANS LE NOTEBOOK :



Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_I9CBDL1HKU31QM5NYDXYJP3Z8">Data Integrity Sui‚Ä¶


‚úÖ Rapport HTML sauvegard√© : crime_raw_data_integrity_report.html
   Ouvre-le dans ton navigateur si tu veux une version fichier s√©par√©e.

üö® V√âRIFICATIONS CRITIQUES (DONN√âES BRUTES)
--------------------------------------------------------------------------------
‚úÖ Coordonn√©es GPS: 791 invalides (0.4%) - OK
‚úÖ Dates: 0 invalides (0.0%) - OK
‚úÖ Doublons: 0 (0.0%) - Acceptable

‚ö†Ô∏è 1 PROBL√àME(S) CRITIQUE(S) :
   üü° ATTENTION: 41 √¢ges aberrants

‚úÖ NIVEAU 1 : INT√âGRIT√â DES DONN√âES BRUTES - TERMIN√â

üìä R√©sum√© Global (donn√©es brutes):
   Lignes: 199,840 | Colonnes: 28
   Qualit√© globale: ‚ö†Ô∏è PROBL√àMES √Ä CORRIGER
   GPS valides estim√©s: 99.6%
   Types de crimes distincts: 129
   Crime le plus fr√©quent: VEHICLE - STOLEN

üí° Prochaines √©tapes:
   1. üîß Nettoie surtout les 41 √¢ges aberrants (valeurs n√©gatives ou >120)

üìã Rapport interactif visible ci-dessus dans le notebook !
   Fichier HTML aussi disponible ici : d:\Ing√©nierie3\MLops\reports\de