In [420]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
from IPython.display import display, HTML
import csv
import os
import logging
import math
import re
import json

In [421]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [422]:
PATH_TO_CSVS = 'data/raw'

In [423]:
# https://stackoverflow.com/questions/46135839/auto-detect-the-delimiter-in-a-csv-file-using-pd-read-csv comme base
# detecte automatiquement le sep d'un fichier csv


def get_delimiter(file_path, bytes=4096):
    try:
        with open(file_path, 'r') as file:
            data = file.read(bytes)
        sniffer = csv.Sniffer()
        delimiter = sniffer.sniff(data).delimiter
        return delimiter
    except Exception as e:
        logging.error(f"Erreur lors de la détection du délimiteur: {e}")
        return None

In [424]:
# lecture d'un fichier csv en essayant différents encodage.


def read_csv_file(file_path):
    if not os.path.exists(file_path):
        return None, False, f"Fichier non trouvé: {file_path}"
    
    delimiter = get_delimiter(file_path)
    if not delimiter:
        return None, False, f"Impossible de détecter le délimiteur pour le fichier: {file_path}"
    
    encodings = ['utf-8', 'latin1', 'ISO-8859-1']
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, low_memory=False, encoding=encoding, delimiter=delimiter)
            return df, True, None
        except (UnicodeDecodeError, pd.errors.ParserError) as e:
            logging.warning(f"Erreur avec l'encodage {encoding} pour le fichier {file_path}: {e}")
    
    return None, False, f"Impossible de lire le fichier {file_path} avec les encodages: {encodings}."

    #modification github

In [425]:
# chargement des datasets dans des listes


def load_datasets(prefixes, years, base_path= PATH_TO_CSVS):
    dataframes = []
    
    for prefix in prefixes:
        datasets = []
        for year in years:
            connector = '_' if year <= 2016 else '-'
            file_name = os.path.join(base_path, f'{prefix}{connector}{year}.csv')
            df, success, error = read_csv_file(file_name)
            if success:
                datasets.append({file_name: df})
            else:
                logging.error(error)
        dataframes.append(datasets)
    
    return dataframes

In [426]:
years = list(range(2005, 2023))
prefixes= ['caracteristiques', 'lieux', 'usagers', 'vehicules']

dataframes = load_datasets(prefixes, years)

# log
for prefix, df_list in zip(prefixes, dataframes):
        logging.info(f'{prefix}: {len(df_list)} datasets chargés.')

logging.info(f'Total datasets chargés: {sum(len(dfs) for dfs in dataframes)}.')

2024-06-21 16:39:28,170 - INFO - caracteristiques: 18 datasets chargés.
2024-06-21 16:39:28,171 - INFO - lieux: 18 datasets chargés.
2024-06-21 16:39:28,171 - INFO - usagers: 18 datasets chargés.
2024-06-21 16:39:28,171 - INFO - vehicules: 18 datasets chargés.
2024-06-21 16:39:28,172 - INFO - Total datasets chargés: 72.


In [427]:
# Convertir les dtypes d'un dataframe


def convert_dtypes(df, reference_dtypes):
    for col in df.columns:
        if col in reference_dtypes:
            try:
                df.loc[:, col] = df[col].astype(reference_dtypes[col])  # Utilisation de .loc pour éviter les warnings.
            except ValueError as e:
                logging.error(f"Erreur lors de la conversion de la colonne {col} en type {reference_dtypes[col]}: {e}")
    return df

In [428]:
# Extraire la structure de référence de chaque groupe de dataframes


def extract_reference_structure(dataframes):
    reference_structures = {}
    
    for prefix, df_list in zip(prefixes, dataframes):
        if df_list:
            last_df = list(df_list[-1].values())[0]
            reference_structures[prefix] = last_df.dtypes.to_dict()
    return reference_structures

In [429]:
# Gérer le préprocessing


def preprocess_and_convert_dtypes(df, reference_dtypes, prefix):
    if prefix == 'caracteristiques':
        # Renommer 'Accident_Id' en 'Num_Acc' si nécessaire
        if 'Accident_Id' in df.columns:
            df = df.rename(columns={'Accident_Id': 'Num_Acc'})

        # Ajouter 2000 à 'an' si < 2000
        if 'an' in df.columns:
            df.loc[:, 'an'] = df['an'].apply(lambda x: x + 2000 if x < 2000 else x)
        
        # Convertir 'hrmn' de 'HHMM' à 'HH:MM'
        if 'hrmn' in df.columns:
            df.loc[:, 'hrmn'] = df['hrmn'].apply(lambda x: f"{str(x).zfill(4)[:2]}:{str(x).zfill(4)[2:]}")
        
        # Remplacer les valeurs NaN par -1
        for col in ['lum', 'int', 'atm', 'col']:
            if col in df.columns:
                df[col] = df[col].fillna(-1)
        
        # Supprimer les colonnes 'adr', 'lat', 'long'
        df = df.drop(columns=['adr', 'lat', 'long'], errors='ignore')

    elif prefix == 'lieux':
        # Remplacer les valeurs NaN par -1
        for col in ['circ', 'vosp', 'prof', 'pr', 'pr1', 'plan', 'surf', 'infra', 'situ']:
            if col in df.columns:
                df[col] = df[col].fillna(-1)
        
        # Remplacer les valeurs NaN et 0 par -1 pour 'lartpc'
        if 'lartpc' in df.columns:
            df['lartpc'] = df['lartpc'].replace(0, -1).fillna(-1)
        
        # Remplacer les valeurs NaN et > 130 par -1 pour 'vma'
        if 'vma' in df.columns:
            df['vma'] = df['vma'].apply(lambda x: -1 if pd.isna(x) or x > 130 else x)
        
        # Supprimer les colonnes 'voie', 'v1', 'v2', 'larrout'
        df = df.drop(columns=['voie', 'v1', 'v2', 'larrout'], errors='ignore')

    elif prefix == 'usagers':
        # Remplacer les valeurs NaN par -1
        for col in ['place', 'catu', 'grav', 'sexe', 'trajet', 'secu1', 'secu2', 'secu3', 'locp', 'actp', 'etatp']:
            if col in df.columns:
                df[col] = df[col].fillna(-1)
        
        # Remplacer les valeurs 4 par -1 pour 'catu'
        if 'catu' in df.columns:
            df['catu'] = df['catu'].replace(4, -1)
        
        # Gérer les outliers pour 'an_nais'
        if 'an_nais' in df.columns:
            df['an_nais'] = df['an_nais'].apply(lambda x: pd.NA if x < 1900 else x)
    
    elif prefix == 'vehicules':
        # Remplacer les valeurs NaN par -1
        for col in ['senc', 'obs', 'obsm', 'choc', 'manv', 'motor']:
            if col in df.columns:
                df[col] = df[col].fillna(-1)
        
        # Remplacer les valeurs NaN par 0 pour 'catv'
        if 'catv' in df.columns:
            df['catv'] = df['catv'].fillna(0)
        
        # Supprimer la colonne 'occutc'
        df = df.drop(columns=['occutc'], errors='ignore')
    
        for col in df.columns:
            if df[col].dtype == np.float64 or df[col].dtype == np.int64:
                df.loc[:, col] = df[col].fillna(-1)

        # Gestion des cas particuliers
        if 'lartpc' in df.columns:
            df.loc[:, 'lartpc'] = df['lartpc'].replace(0, -1).fillna(-1)
        if 'catu' in df.columns:
            df.loc[:, 'catu'] = df['catu'].replace(4, -1)
        if 'an_nais' in df.columns:
            df.loc[:, 'an_nais'] = df['an_nais'].apply(lambda x: pd.NA if x < 1900 else x)
        
    # Conversion des types de données en fonction des types de référence
    for col, dtype in reference_dtypes.items():
        if col in df.columns:
            try:
                # Remplacer les valeurs pd.NA par np.nan pour permettre la conversion
                df[col] = df[col].replace({pd.NA: np.nan})
                df[col] = df[col].astype(dtype)
            except ValueError:
                print(f"Impossible de convetir {col} en {dtype}.")
    
    return df
    


In [430]:
# Harmoniser les autres dataframes selon la structure de référence


def harmonize_dataframes(dataframes, reference_structures):
    harmonized_dataframes = []
    
    for prefix, df_list in zip(prefixes, dataframes):
        reference_dtypes = reference_structures.get(prefix, {})
        harmonized_group = []
        
        for df_dict in df_list:
            for file_name, df in df_dict.items():
                # Garder uniquement les colonnes présentes dans la structure de référence
                df = df[[col for col in df.columns if col in reference_dtypes]]
                
                # Prétraiter et convertir les types de données
                df = preprocess_and_convert_dtypes(df, reference_dtypes, prefix)
                
                harmonized_group.append({file_name: df})
        
        harmonized_dataframes.append(harmonized_group)
    
    return harmonized_dataframes


In [431]:
# Changements à prévoir

# Structure de référence pour caractéristiques: {
# 'Num_Acc': dtype('int64'), -> changer Accident_Id du dernier dataframe
# 'jour': dtype('int64'), -> OK
#  'mois': dtype('int64'), -> OK
# 'an': dtype('int64'), -> +2000 si <2000
# 'hrmn': dtype('O'), -> il faut traiter le changement de HHMM à HH:MM
# 'lum': dtype('int64'), -> N/A to -1
#  'dep': dtype('O'), -> la saisie change en 2019
# 'com': dtype('O'), -> changement de saisie
# 'agg': dtype('int64'), -> rien à gérer
#  'int': dtype('int64'), -> N/A to -1
#  'atm': dtype('int64'), -> N/A to -1
#  'col': dtype('int64'), -> N/A to -1
#  'adr': dtype('O'), -> à supprimer
#  'lat': dtype('O'), -> à supprimer
#  'long': dtype('O')} -> à supprimer

# Structure de référence pour lieux: {
# 'Num_Acc': dtype('int64'),
#  'catr': dtype('int64'), -> ok
#  'voie': dtype('O'), -> à supprimer
#  'v1': dtype('int64'), -> à supprimer
#  'v2': dtype('O'), -> à supprimer
#  'circ': dtype('int64'), -> N/A to -1
#  'nbv': dtype('O'), -> Abhérations à gérer
#  'vosp': dtype('int64'), -> N/A to -1
#  'prof': dtype('int64'), -> N/A to -1
#  'pr': dtype('O'), -> N/A to -1
#  'pr1': dtype('O'), -> N/A to -1
#  'plan': dtype('int64'), -> N/A to -1
#  'lartpc': dtype('O'), -> N/A and 0 to -1
#  'larrout': dtype('O'), -> à supprimer
#  'surf': dtype('int64'), -> N/A to -1
#  'infra': dtype('int64'), -> N/A to -1
#  'situ': dtype('int64'), -> N/A to -1
#  'vma': dtype('int64')} -> Si > 130 alors -1 et N/A to -1

# Structure de référence pour usagers: {
# 'Num_Acc': dtype('int64'),
#  'id_usager': dtype('O'), 
#  'id_vehicule': dtype('O'), 
#  'num_veh': dtype('O'), 
#  'place': dtype('int64'), N/A to -1
#  'catu': dtype('int64'), N/A and 4 to -1
#  'grav': dtype('int64'), N/A to -1
#  'sexe': dtype('int64'), -> N/A to -1
#  'an_nais': dtype('float64'), Attention aux outliers très bas.
#  'trajet': dtype('int64'), N/A to -1
#  'secu1': dtype('int64'), N/A to -1
#  'secu2': dtype('int64'), N/A to -1
#  'secu3': dtype('int64'), N/A to -1
#  'locp': dtype('int64'), N/A to -1 
#  'actp': dtype('O'), N/A to -1
#  'etatp': dtype('int64')} N/A to -1

# Structure de référence pour vehicules: {
# 'Num_Acc': dtype('int64'),
#  'id_vehicule': dtype('O'),
#  'num_veh': dtype('O'), 
#  'senc': dtype('int64'), N/A to -1
#  'catv': dtype('int64'), N/A to 0
#  'obs': dtype('int64'), N/A to -1
#  'obsm': dtype('int64'), N/A to -1
#  'choc': dtype('int64'), N/A to -1
#  'manv': dtype('int64'), N/A to -1
#  'motor': dtype('int64'), N/A to -1
#  'occutc': dtype('float64')} à supprimer

In [432]:
reference_structures = extract_reference_structure(dataframes)
harmonized_dataframes = harmonize_dataframes(dataframes, reference_structures)

  df.loc[:, 'hrmn'] = df['hrmn'].apply(lambda x: f"{str(x).zfill(4)[:2]}:{str(x).zfill(4)[2:]}")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(-1)
  df.loc[:, 'hrmn'] = df['hrmn'].apply(lambda x: f"{str(x).zfill(4)[:2]}:{str(x).zfill(4)[2:]}")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(-1)
  df.loc[:, 'hrmn'] = df['hrmn'].apply(lambda x: f"{str(x).zfill(4)[:2]}:{str(x).zfill(4)[2:]}")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

Impossible de convetir catr en int64.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lartpc'] = df['lartpc'].replace(0, -1).fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

In [433]:
for prefix, df_list in zip(prefixes, harmonized_dataframes):
    logging.info(f'{prefix}: {len(df_list)} datasets harmonisés.')

logging.info(f'Total datasets harmonisés: {sum(len(dfs) for dfs in harmonized_dataframes)}.')

2024-06-21 16:39:33,088 - INFO - caracteristiques: 18 datasets harmonisés.
2024-06-21 16:39:33,089 - INFO - lieux: 18 datasets harmonisés.
2024-06-21 16:39:33,090 - INFO - usagers: 18 datasets harmonisés.
2024-06-21 16:39:33,091 - INFO - vehicules: 18 datasets harmonisés.
2024-06-21 16:39:33,092 - INFO - Total datasets harmonisés: 72.


In [434]:
# Afficher les structures de référence pour vérification
for prefix, structure in reference_structures.items():
    logging.info(f'Structure de référence pour {prefix}: {structure}')

2024-06-21 16:39:33,107 - INFO - Structure de référence pour caracteristiques: {'Accident_Id': dtype('int64'), 'jour': dtype('int64'), 'mois': dtype('int64'), 'an': dtype('int64'), 'hrmn': dtype('O'), 'lum': dtype('int64'), 'dep': dtype('O'), 'com': dtype('O'), 'agg': dtype('int64'), 'int': dtype('int64'), 'atm': dtype('int64'), 'col': dtype('int64'), 'adr': dtype('O'), 'lat': dtype('O'), 'long': dtype('O')}
2024-06-21 16:39:33,108 - INFO - Structure de référence pour lieux: {'Num_Acc': dtype('int64'), 'catr': dtype('int64'), 'voie': dtype('O'), 'v1': dtype('int64'), 'v2': dtype('O'), 'circ': dtype('int64'), 'nbv': dtype('O'), 'vosp': dtype('int64'), 'prof': dtype('int64'), 'pr': dtype('O'), 'pr1': dtype('O'), 'plan': dtype('int64'), 'lartpc': dtype('O'), 'larrout': dtype('O'), 'surf': dtype('int64'), 'infra': dtype('int64'), 'situ': dtype('int64'), 'vma': dtype('int64')}
2024-06-21 16:39:33,110 - INFO - Structure de référence pour usagers: {'Num_Acc': dtype('int64'), 'id_usager': dtyp

In [435]:
# Comparer structure réelles et référence


def compare_structures(harmonized_dataframes, reference_structures):
    for prefix, df_list in zip(prefixes, harmonized_dataframes):
        reference_dtypes = reference_structures.get(prefix, {})
        for df_dict in df_list:
            for file_name, df in df_dict.items():
                df_dtypes = df.dtypes.to_dict()
                missing_cols = set(reference_dtypes.keys()) - set(df_dtypes.keys())
                extra_cols = set(df_dtypes.keys()) - set(reference_dtypes.keys())
                diff_types = {col: (reference_dtypes[col], df_dtypes[col]) for col in df_dtypes if col in reference_dtypes and reference_dtypes[col] != df_dtypes[col]}

                table_data = [
                    ["Colonnes manquantes", ", ".join(missing_cols)],
                    ["Colonnes supplémentaires", ", ".join(extra_cols)],
                    ["Différences de types", ", ".join([f"{col} (réf: {reference_dtypes[col]}, fichier: {df_dtypes[col]})" for col in diff_types])]
                ]
                table = tabulate(table_data, headers=["Type de différence", "Colonnes"], tablefmt="grid")
                
                logging.info(f"Comparaison pour le fichier {file_name}:\n{table}")

In [436]:
compare_structures(harmonized_dataframes, reference_structures)

2024-06-21 16:39:33,162 - INFO - Comparaison pour le fichier data/raw\caracteristiques_2005.csv:
+--------------------------+-----------------------------+
| Type de différence       | Colonnes                    |
| Colonnes manquantes      | lat, adr, long, Accident_Id |
+--------------------------+-----------------------------+
| Colonnes supplémentaires |                             |
+--------------------------+-----------------------------+
| Différences de types     |                             |
+--------------------------+-----------------------------+
2024-06-21 16:39:33,164 - INFO - Comparaison pour le fichier data/raw\caracteristiques_2006.csv:
+--------------------------+-----------------------------+
| Type de différence       | Colonnes                    |
| Colonnes manquantes      | lat, adr, long, Accident_Id |
+--------------------------+-----------------------------+
| Colonnes supplémentaires |                             |
+--------------------------+-----------

In [437]:
# Fonction pour concaténer les DataFrames par type


def concat_harmonized_dataframes(dataframes):
    concatenated_dataframes = []

    for df_list in dataframes:
        
        df_only_list = [list(d.values())[0] for d in df_list]
        concatenated_df = pd.concat(df_only_list, ignore_index=True)
        concatenated_dataframes.append(concatenated_df)
    
    return concatenated_dataframes

In [438]:
# Appel de la fonction pour concaténer les DataFrames
concatenated_dataframes = concat_harmonized_dataframes(dataframes)

# Affichage des DataFrames concaténés pour vérifier
for prefix, df in zip(prefixes, concatenated_dataframes):
    logging.info(f"{prefix}: {df.shape}")

2024-06-21 16:39:33,914 - INFO - caracteristiques: (1176873, 17)
2024-06-21 16:39:33,914 - INFO - lieux: (1176873, 19)
2024-06-21 16:39:33,915 - INFO - usagers: (2636377, 17)
2024-06-21 16:39:33,916 - INFO - vehicules: (2009395, 11)


In [439]:
# Fusionne les dataframes en un seul 

# /!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\
# /!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\
# /!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\
# /!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\
# /!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\
# /!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\
# /!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\
# /!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\/!\

def merge_dataframes_on_usager(concatenated_dataframes):
    # Définir les DataFrames
    df_usagers = concatenated_dataframes[2]
    df_vehicules = concatenated_dataframes[3]
    df_caracteristiques = concatenated_dataframes[1]
    df_lieux = concatenated_dataframes[4]

    merged_usagers_vehicules = pd.merge(df_usagers, df_vehicules, on=['Num_Acc', 'num_veh'], how='inner')

    merged_usagers_vehicules_caracteristiques = pd.merge(merged_usagers_vehicules, df_caracteristiques, on='Num_Acc', how='inner')

    final_merged_df = pd.merge(merged_usagers_vehicules_caracteristiques, df_lieux, on='Num_Acc', how='inner')

    #columns_to_drop = ['num_veh_x', 'id_vehicule_x', 'num_veh_y', 'id_vehicule_y', 'adr', 'lat', 'long', 'voie', 'v1', 'v2', 'larrout', 'occutc']
    #final_merged_df = final_merged_df.drop(columns=columns_to_drop, errors='ignore')

    return final_merged_df

In [440]:
final_df = merge_dataframes_on_usager(concatenated_dataframes)

IndexError: list index out of range

In [None]:
display(final_df.head(50))
display(final_df.tail(50))
logging.info(f"DataFrame final: {final_df.shape}")
logging.info(f'Column: {final_df.columns}')

Unnamed: 0_level_0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,senc,catv,occutc,obs,obsm,choc,manv,num_veh_y,id_vehicule_y,motor
id_usager,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,200500000000.0,5,1,12,1900,3,2,1,1.0,3.0,...,0.0,7,0.0,0.0,2.0,1.0,1.0,A01,,
,200500000000.0,5,1,12,1900,3,2,1,1.0,3.0,...,0.0,7,0.0,0.0,2.0,8.0,10.0,B02,,
,200500000000.0,5,1,12,1900,3,2,1,1.0,3.0,...,0.0,7,0.0,0.0,2.0,1.0,1.0,A01,,
,200500000000.0,5,1,12,1900,3,2,1,1.0,3.0,...,0.0,7,0.0,0.0,2.0,8.0,10.0,B02,,
,200500000000.0,5,1,12,1900,3,2,1,1.0,3.0,...,0.0,7,0.0,0.0,2.0,1.0,1.0,A01,,
,200500000000.0,5,1,12,1900,3,2,1,1.0,3.0,...,0.0,7,0.0,0.0,2.0,8.0,10.0,B02,,
,200500000000.0,5,1,12,1900,3,2,1,1.0,3.0,...,0.0,7,0.0,0.0,2.0,1.0,1.0,A01,,
,200500000000.0,5,1,12,1900,3,2,1,1.0,3.0,...,0.0,7,0.0,0.0,2.0,8.0,10.0,B02,,
,200500000000.0,5,1,12,1900,3,2,1,1.0,3.0,...,0.0,7,0.0,0.0,2.0,1.0,1.0,A01,,
,200500000000.0,5,1,12,1900,3,2,1,1.0,3.0,...,0.0,7,0.0,0.0,2.0,8.0,10.0,B02,,


Unnamed: 0_level_0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,senc,catv,occutc,obs,obsm,choc,manv,num_veh_y,id_vehicule_y,motor
id_usager,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
133 849,202100100000.0,2021,1,1,18:45,3,1,1,1.0,6.0,...,2.0,7,,0.0,2.0,1.0,3.0,A01,100 909,1.0
133 845,202100100000.0,2021,1,1,18:00,2,1,1,1.0,6.0,...,1.0,7,,4.0,0.0,2.0,14.0,A01,100 906,1.0
133 846,202100100000.0,2021,1,1,18:00,2,1,1,1.0,6.0,...,1.0,7,,4.0,0.0,2.0,14.0,A01,100 906,1.0
133 843,202100100000.0,2021,1,1,08:55,1,1,1,8.0,3.0,...,1.0,7,,4.0,2.0,8.0,17.0,B01,100 904,1.0
133 843,202100100000.0,2021,1,1,08:55,1,1,1,8.0,3.0,...,1.0,7,,0.0,2.0,8.0,1.0,A01,100 905,1.0
133 844,202100100000.0,2021,1,1,08:55,1,1,1,8.0,3.0,...,1.0,7,,4.0,2.0,8.0,17.0,B01,100 904,1.0
133 844,202100100000.0,2021,1,1,08:55,1,1,1,8.0,3.0,...,1.0,7,,0.0,2.0,8.0,1.0,A01,100 905,1.0
133 842,202100100000.0,2021,1,1,16:30,1,1,1,1.0,6.0,...,1.0,33,,6.0,0.0,2.0,0.0,A01,100 903,1.0
133 841,202100100000.0,2021,1,1,03:30,4,2,1,1.0,6.0,...,1.0,7,,6.0,0.0,1.0,13.0,A01,100 902,1.0
133 840,202100100000.0,2021,1,1,05:15,5,2,3,1.0,6.0,...,0.0,33,,6.0,0.0,1.0,0.0,A01,100 901,1.0


2024-06-21 16:29:04,843 - INFO - DataFrame final: (4763356, 60)
2024-06-21 16:29:04,849 - INFO - Column: Index(['Num_Acc', 'an', 'mois', 'jour', 'hrmn', 'lum', 'agg', 'int', 'atm',
       'col', 'com', 'adr', 'gps', 'lat', 'long', 'dep', 'Accident_Id', 'catr',
       'voie', 'v1', 'v2', 'circ', 'nbv', 'pr', 'pr1', 'vosp', 'prof', 'plan',
       'lartpc', 'larrout', 'surf', 'infra', 'situ', 'env1', 'vma', 'place',
       'catu', 'grav', 'sexe', 'trajet', 'secu', 'locp', 'actp', 'etatp',
       'an_nais', 'num_veh_x', 'id_vehicule_x', 'secu1', 'secu2', 'secu3',
       'senc', 'catv', 'occutc', 'obs', 'obsm', 'choc', 'manv', 'num_veh_y',
       'id_vehicule_y', 'motor'],
      dtype='object')
