In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
from IPython.display import display, HTML
import seaborn as sns
import csv
import os
import logging
import re
import json

In [85]:
# Configure the root logger for the notebook: INFO level, with timestamp and level name in each message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [86]:
# Directory holding the raw yearly CSV files (relative path, resolved against the current working directory).
PATH_TO_CSVS = 'data/raw'
# Seed value for stochastic steps. NOTE(review): not referenced by any of the cells visible here.
RANDOM_STATE = 1

In [87]:
# Automatically detect the delimiter of a CSV file
def get_delimiter(file_path, bytes=4096):
    """Guess the field delimiter of a CSV file from a sample of its beginning.

    Args:
        file_path: Path of the CSV file to inspect.
        bytes: Size of the sample handed to ``csv.Sniffer``. The file is opened
            in text mode, so this is a number of characters. The name shadows
            the builtin but is kept so existing keyword callers keep working.

    Returns:
        The detected delimiter string, or ``None`` if the file cannot be read
        or no delimiter can be determined (the failure is logged).
    """
    try:
        # Only the delimiter matters here, so undecodable bytes are replaced
        # instead of raising. Without this, a Latin-1 file fails under a UTF-8
        # default encoding and read_csv_file never reaches its own encoding
        # fallbacks.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            sample = file.read(bytes)
        return csv.Sniffer().sniff(sample).delimiter
    except Exception as e:
        # Deliberately best-effort: any failure is reported as "no delimiter".
        logging.error(f"Erreur lors de la détection du délimiteur: {e}")
        return None

In [88]:
# Read a CSV file, trying several encodings in turn
def read_csv_file(file_path):
    """Load a CSV file into a DataFrame, falling back across encodings.

    The delimiter is detected with ``get_delimiter`` and each encoding is
    tried in order until one parses.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(df, success, error)`` tuple: ``(DataFrame, True, None)`` when the
        file was read, ``(None, False, message)`` otherwise.
    """
    if not os.path.exists(file_path):
        return None, False, f"Fichier non trouvé: {file_path}"

    delimiter = get_delimiter(file_path)
    if not delimiter:
        return None, False, f"Impossible de détecter le délimiteur pour le fichier: {file_path}"

    # 'latin1' and 'ISO-8859-1' are aliases of the same codec, so listing both
    # only repeated the same failing attempt and its warning.
    encodings = ['utf-8', 'latin1']
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, low_memory=False, encoding=encoding, delimiter=delimiter)
        except (UnicodeDecodeError, pd.errors.ParserError) as e:
            logging.warning(f"Erreur avec l'encodage {encoding} pour le fichier {file_path}: {e}")
        else:
            return df, True, None

    return None, False, f"Impossible de lire le fichier {file_path} avec les encodages: {encodings}."

In [89]:
# Load the datasets into nested dictionaries
def load_datasets(prefixes, years, base_path=PATH_TO_CSVS):
    """Read every ``<prefix><connector><year>.csv`` file under ``base_path``.

    The connector between prefix and year is ``_`` for years up to 2016 and
    ``-`` for later years. Files that cannot be read are logged as errors and
    left out of the result.

    Args:
        prefixes: Iterable of file-name prefixes.
        years: Iterable of years to load for each prefix.
        base_path: Directory containing the CSV files.

    Returns:
        A dict mapping each prefix to a dict ``{year: DataFrame}`` of the
        files that were read successfully.
    """
    loaded = {}

    for prefix in prefixes:
        per_year = {}
        for year in years:
            separator = '-' if year > 2016 else '_'
            csv_path = os.path.join(base_path, f'{prefix}{separator}{year}.csv')
            frame, ok, message = read_csv_file(csv_path)
            if not ok:
                logging.error(message)
                continue
            per_year[year] = frame
        loaded[prefix] = per_year

    return loaded

In [90]:
# Dataset families to load, and the years covered (2019 through 2022 inclusive)
prefixes = ['caracteristiques', 'lieux', 'usagers', 'vehicules']
years = list(range(2019, 2023))

dataframes = load_datasets(prefixes, years)

# Log how many yearly files were loaded for each family, then the grand total
for prefix, df_dict in dataframes.items():
    logging.info(f'{prefix}: {len(df_dict)} datasets chargés.')

logging.info(f'Total datasets chargés: {sum(map(len, dataframes.values()))}.')

2024-11-25 14:16:48,519 - INFO - caracteristiques: 4 datasets chargés.
2024-11-25 14:16:48,520 - INFO - lieux: 4 datasets chargés.
2024-11-25 14:16:48,520 - INFO - usagers: 4 datasets chargés.
2024-11-25 14:16:48,521 - INFO - vehicules: 4 datasets chargés.
2024-11-25 14:16:48,521 - INFO - Total datasets chargés: 16.


In [91]:
# Convert the dtypes of a DataFrame
def convert_dtypes(df, reference_dtypes):
    """Cast the columns of ``df`` to the dtypes given in ``reference_dtypes``.

    Columns that are not keys of ``reference_dtypes`` are left as they are.
    When a cast fails, the column keeps its current dtype and the failure is
    logged.

    Args:
        df: DataFrame to convert.
        reference_dtypes: Mapping of column name to target dtype.

    Returns:
        A new DataFrame with the converted columns; ``df`` itself is not modified.
    """
    # Work on a copy: the caller may pass a column selection of another frame,
    # and assigning into it would mutate the caller's data and can trigger
    # pandas' SettingWithCopyWarning.
    converted = df.copy()
    for col in converted.columns:
        if col not in reference_dtypes:
            continue
        try:
            converted[col] = converted[col].astype(reference_dtypes[col])
        except (ValueError, TypeError) as e:
            # astype raises TypeError (not ValueError) for e.g. object columns
            # holding None / pd.NA when cast to a numeric dtype.
            logging.error(f"Erreur lors de la conversion de la colonne {col} en type {reference_dtypes[col]}: {e}")
    return converted

In [92]:
# Extract the reference structure of each group of DataFrames
def extract_reference_structure(dataframes):
    """Map each prefix to the column dtypes of its last DataFrame.

    "Last" means the last value in the per-prefix dict's iteration order.
    Prefixes whose dict is empty are omitted from the result.

    Args:
        dataframes: Dict mapping prefix to a dict of DataFrames.

    Returns:
        Dict mapping prefix to ``{column name: dtype}``.
    """
    structures = {}

    for prefix, per_year in dataframes.items():
        if not per_year:
            continue
        # Unpack to grab the final value without building an indexable list.
        *_, latest = per_year.values()
        structures[prefix] = latest.dtypes.to_dict()
    return structures

In [93]:
# Format one raw 'hrmn' value as an "HH:MM" string
def _format_hrmn(value):
    # Leave missing values missing instead of turning them into text like "0n:an".
    if pd.isna(value):
        return value
    text = str(value)
    if ':' in text:
        # Already separated (e.g. "01:30" or "1:30"): only pad each part, so
        # the function is idempotent and never produces "01::30".
        hours, _, minutes = text.partition(':')
        return f"{hours.zfill(2)}:{minutes.zfill(2)}"
    text = text.zfill(4)
    return f"{text[:2]}:{text[2:]}"


# Handle the preprocessing of the DataFrames
def preprocess(df, prefix):
    """Clean one raw DataFrame according to the dataset family it belongs to.

    Per ``prefix``:
      * ``'caracteristiques'``: rename ``Accident_Id`` to ``Num_Acc``, add 2000
        to ``an`` values below 2000, format ``hrmn`` as ``"HH:MM"``, fill
        missing ``lum``/``int``/``atm``/``col`` with -1, drop ``adr``/``lat``/``long``.
      * ``'lieux'``: fill the listed columns with -1, map 0 and missing
        ``lartpc`` to -1, map missing ``vma`` or ``vma`` above 130 to -1, drop
        ``voie``/``v1``/``v2``/``larrout``.
      * ``'usagers'``: fill the listed columns with -1, map ``catu == 4`` to -1,
        blank out ``an_nais`` values below 1900.
      * ``'vehicules'``: fill the listed columns with -1, fill missing ``catv``
        with 0, drop ``occutc``.
    Any other prefix returns an unchanged copy. Columns that are absent are skipped.

    Args:
        df: Raw DataFrame for one year.
        prefix: Dataset family name.

    Returns:
        A new, cleaned DataFrame; ``df`` itself is not modified.
    """
    # Copy up front so the column assignments below never write into the
    # caller's frame, whichever branch runs.
    df = df.copy()

    if prefix == 'caracteristiques':
        if 'Accident_Id' in df.columns:
            df = df.rename(columns={'Accident_Id': 'Num_Acc'})
        if 'an' in df.columns:
            # Values below 2000 are offset by 2000 (e.g. 19 -> 2019).
            df['an'] = df['an'].mask(df['an'] < 2000, df['an'] + 2000)
        if 'hrmn' in df.columns:
            df['hrmn'] = df['hrmn'].apply(_format_hrmn)
        df = df.fillna({'lum': -1, 'int': -1, 'atm': -1, 'col': -1})
        df = df.drop(columns=['adr', 'lat', 'long'], errors='ignore')
    elif prefix == 'lieux':
        df = df.fillna({'circ': -1, 'vosp': -1, 'prof': -1, 'pr': -1, 'pr1': -1, 'plan': -1, 'surf': -1, 'infra': -1, 'situ': -1})
        if 'lartpc' in df.columns:
            df['lartpc'] = df['lartpc'].replace(0, -1).fillna(-1)
        if 'vma' in df.columns:
            # Missing values and values above 130 are both mapped to -1.
            df['vma'] = df['vma'].mask(df['vma'].isna() | (df['vma'] > 130), -1)
        df = df.drop(columns=['voie', 'v1', 'v2', 'larrout'], errors='ignore')
    elif prefix == 'usagers':
        df = df.fillna({'place': -1, 'catu': -1, 'grav': -1, 'sexe': -1, 'trajet': -1, 'secu1': -1, 'secu2': -1, 'secu3': -1, 'locp': -1, 'actp': -1, 'etatp': -1})
        if 'catu' in df.columns:
            df['catu'] = df['catu'].replace(4, -1)
        if 'an_nais' in df.columns:
            # Vectorised mask keeps the column numeric (missing marker instead
            # of an object column holding pd.NA) and tolerates existing gaps.
            df['an_nais'] = df['an_nais'].mask(df['an_nais'] < 1900)
    elif prefix == 'vehicules':
        df = df.fillna({'senc': -1, 'obs': -1, 'obsm': -1, 'choc': -1, 'manv': -1, 'motor': -1})
        if 'catv' in df.columns:
            df['catv'] = df['catv'].fillna(0)
        df = df.drop(columns=['occutc'], errors='ignore')
    return df

In [94]:
# Apply the preprocessing to every dataset
def preprocess_datasets(dataframes):
    """Run ``preprocess`` on every DataFrame of every prefix.

    The inner dicts are updated in place, so the argument itself is modified.

    Args:
        dataframes: Dict mapping prefix to a dict ``{year: DataFrame}``.

    Returns:
        The same ``dataframes`` object, with each DataFrame replaced by its
        preprocessed version.
    """
    for prefix, per_year in dataframes.items():
        # Snapshot the keys; only existing entries are reassigned.
        for year in list(per_year):
            per_year[year] = preprocess(per_year[year], prefix)
    return dataframes

In [95]:
# Harmonize the DataFrames against the reference structure
def harmonize_dataframes(dataframes, reference_structures):
    """Restrict each DataFrame to its prefix's reference columns and dtypes.

    For each prefix, columns that are not in the reference structure are
    dropped and the remaining ones are passed through ``convert_dtypes``.
    A prefix with no reference structure uses an empty one, so all of its
    columns are dropped.

    Args:
        dataframes: Dict mapping prefix to a dict ``{year: DataFrame}``.
        reference_structures: Dict mapping prefix to ``{column name: dtype}``.

    Returns:
        A new dict with the same prefix/year keys and the harmonized DataFrames.
    """
    result = {}
    for prefix, per_year in dataframes.items():
        target_dtypes = reference_structures.get(prefix, {})
        result[prefix] = {
            year: convert_dtypes(
                frame[[name for name in frame.columns if name in target_dtypes]],
                target_dtypes,
            )
            for year, frame in per_year.items()
        }
    return result

In [96]:
# Clean every yearly frame; `dataframes` is rebound to the preprocessed result.
dataframes = preprocess_datasets(dataframes)
# The reference dtypes are read from the already-preprocessed frames, one structure per prefix.
reference_structures = extract_reference_structure(dataframes)
# Keep only the reference columns and cast them to the reference dtypes.
harmonized_dataframes = harmonize_dataframes(dataframes, reference_structures)

In [97]:
# Concatenate the DataFrames of each type
def concat_harmonized_dataframes(dataframes):
    """Stack the DataFrames of each prefix into a single DataFrame.

    Rows are concatenated in the per-prefix dict's iteration order and the
    index is renumbered from 0.

    Args:
        dataframes: Dict mapping prefix to a dict of DataFrames.

    Returns:
        Dict mapping each prefix to its concatenated DataFrame.
    """
    return {
        prefix: pd.concat(per_year.values(), ignore_index=True)
        for prefix, per_year in dataframes.items()
    }

In [98]:
# One DataFrame per prefix, with all years stacked together.
concatenated_dataframes = concat_harmonized_dataframes(harmonized_dataframes)

In [99]:
# Merge the DataFrames

def merge_dataframes(concatenated_dataframes):
    """Inner-join the four dataset families into a single DataFrame.

    ``caracteristiques`` is joined with ``lieux`` and then ``usagers`` on
    ``Num_Acc``; the result is joined with ``vehicules`` on ``Num_Acc``,
    ``id_vehicule`` and ``num_veh``.

    Args:
        concatenated_dataframes: Dict holding the DataFrames under the keys
            ``'caracteristiques'``, ``'lieux'``, ``'usagers'`` and ``'vehicules'``.

    Returns:
        The merged DataFrame.
    """
    caracteristiques, lieux, usagers, vehicules = (
        concatenated_dataframes[key]
        for key in ('caracteristiques', 'lieux', 'usagers', 'vehicules')
    )

    return (
        caracteristiques
        .merge(lieux, on='Num_Acc', how='inner')
        .merge(usagers, on='Num_Acc', how='inner')
        .merge(vehicules, on=['Num_Acc', 'id_vehicule', 'num_veh'], how='inner')
    )

In [100]:
final_merged_df = merge_dataframes(concatenated_dataframes)

In [101]:
# Preprocessing applied after the merge
def preprocessing_final_dataframe(df):
    """Prepare the merged DataFrame for encoding.

    Drops the identifier-like columns listed below when present, replaces
    ``hrmn`` with an integer ``hour`` column built from its first two
    characters, and fills missing ``an_nais`` with the column's mode before
    casting it to int.

    Args:
        df: Merged DataFrame.

    Returns:
        A new DataFrame with those transformations applied.
    """
    unused_columns = ['id_usager', 'Num_Acc', 'com', 'id_vehicule', 'num_veh', 'lartpc']
    result = df.drop(columns=unused_columns, errors='ignore')

    if 'hrmn' in result.columns:
        hours = result['hrmn'].str[:2].astype(int)
        result = result.assign(hour=hours).drop(columns=['hrmn'])

    if 'an_nais' in result.columns:
        birth_years = result['an_nais']
        # mode() returns a Series; its first entry is used as the fill value.
        most_common = birth_years.mode()[0]
        result['an_nais'] = birth_years.fillna(most_common).astype(int)

    return result

In [102]:
# One-hot encode the DataFrame
def encode_dataframe(df):
    """One-hot encode the categorical columns of the merged DataFrame.

    Each listed column is replaced by indicator columns, and the first level
    of each is dropped (``drop_first=True``). Every listed column must be
    present in ``df``.

    Args:
        df: DataFrame containing the columns listed below.

    Returns:
        A new DataFrame with the listed columns replaced by their indicators.
    """
    # The list order determines the order of the generated indicator columns.
    categorical_columns = [
        'lum', 'agg', 'int', 'atm', 'col', 'catr', 'circ', 'prof',
        'place', 'catu', 'sexe', 'trajet', 'secu1', 'secu2', 'secu3',
        'locp', 'actp', 'etatp', 'senc', 'catv', 'obs', 'obsm', 'choc',
        'manv', 'motor', 'plan', 'surf', 'an', 'infra', 'dep', 'situ', 'vosp',
    ]
    return pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Post-merge cleanup first, then one-hot encoding; `final_merged_df` is rebound at each step.
final_merged_df = preprocessing_final_dataframe(final_merged_df)
final_merged_df = encode_dataframe(final_merged_df)

In [103]:
# Save the final DataFrame
PATH_TO_CSVS_PROCESSED = 'data/processed/data.csv'
# Create the output directory when it is missing; to_csv does not create it
# and would otherwise fail on a fresh checkout.
os.makedirs(os.path.dirname(PATH_TO_CSVS_PROCESSED), exist_ok=True)
# Rebind instead of inplace=True: same resulting frame, no in-place mutation.
final_merged_df = final_merged_df.reset_index(drop=True)
final_merged_df.to_csv(PATH_TO_CSVS_PROCESSED, index=False)

In [104]:
# Preview the first rows of the encoded DataFrame.
final_merged_df.head()

Unnamed: 0,jour,mois,nbv,pr,pr1,vma,grav,an_nais,hour,lum_1,...,situ_2,situ_3,situ_4,situ_5,situ_6,situ_8,vosp_0,vosp_1,vosp_2,vosp_3
0,30,11,10,6,900,70,4,2002,1,False,...,False,False,False,False,False,False,True,False,False,False
1,30,11,10,6,900,70,4,1993,1,False,...,False,False,False,False,False,False,True,False,False,False
2,30,11,10,6,900,70,1,1959,1,False,...,False,False,False,False,False,False,True,False,False,False
3,30,11,2,3,845,70,4,1994,2,False,...,False,False,False,False,False,False,True,False,False,False
4,28,11,8,10,500,90,1,1996,15,True,...,False,False,False,False,False,False,True,False,False,False


In [105]:
# Data-quality summary of the final DataFrame: duplicated rows and missing values.
duplicate_rows = final_merged_df.duplicated().sum()
total_rows = len(final_merged_df)
# Guard against an empty frame so the rate is 0 rather than a division by zero.
duplicate_rate = duplicate_rows / total_rows * 100 if total_rows else 0.0

print(f"Nombre de doublons : {duplicate_rows}")
print(f"Taux de doublons : {duplicate_rate:.2f}%")

missing_values = final_merged_df.isnull().sum().sum()
# Missing values are counted per cell, so the rate is taken over the total
# number of cells (rows x columns); dividing by rows could exceed 100%.
total_cells = final_merged_df.size
missing_rate = missing_values / total_cells * 100 if total_cells else 0.0

print(f"Nombre de valeur manquante : {missing_values}")
print(f"Taux de valeur manquante : {missing_rate:.2f}%")

Nombre de doublons : 469
Taux de doublons : 0.09%
Nombre de valeur manquante : 0
Taux de valeur manquante : 0.00%
