In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [68]:
#https://stackoverflow.com/questions/46135839/auto-detect-the-delimiter-in-a-csv-file-using-pd-read-csv
import csv

def get_delimiter(file_path, bytes = 4096, encoding=None):
    """Guess the field delimiter of a CSV file from a sample of its beginning.

    Args:
        file_path: Path of the CSV file to inspect.
        bytes: Maximum amount of text read from the start of the file and
            handed to the sniffer. The file is opened in text mode, so this
            counts characters rather than raw bytes. The name shadows the
            builtin inside this function only; it is kept for backward
            compatibility with existing callers.
        encoding: Text encoding used to open the file. ``None`` (default)
            keeps the platform default, as before.

    Returns:
        The delimiter string reported by ``csv.Sniffer``.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        UnicodeDecodeError: If the sample cannot be decoded.
        csv.Error: If the sniffer cannot determine a delimiter.
    """
    # The context manager closes the handle even when read() raises; the
    # previous one-liner left the file open until garbage collection.
    with open(file_path, "r", encoding=encoding) as handle:
        sample = handle.read(bytes)
    return csv.Sniffer().sniff(sample).delimiter

In [72]:
def read_csv_file(file_path):
    """Load a CSV file into a DataFrame, detecting its delimiter and encoding.

    The delimiter is sniffed once with ``get_delimiter``; the file is then
    parsed with each candidate encoding until one succeeds.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        A ``(df, success, error)`` tuple:
        * ``(DataFrame, True, None)`` when the file was parsed;
        * ``(None, False, message)`` when the delimiter could not be detected
          or no candidate encoding could parse the file (``message`` is a str);
        * ``(None, False, exc)`` when the file does not exist (``exc`` is the
          ``FileNotFoundError`` instance).
    """
    try:
        # Sniff once: the delimiter does not depend on the decoding attempts below.
        try:
            delimiter = get_delimiter(file_path)
        except (csv.Error, UnicodeDecodeError) as e:
            # Report through the function's own error channel instead of crashing the caller.
            return None, False, f"Impossible de détecter le délimiteur de {file_path} : {e}"

        # UTF-8 must be tried first because it rejects invalid byte sequences.
        # latin1 (an alias of ISO-8859-1) accepts any byte, so it is the
        # catch-all and anything listed after it would never be reached.
        for encoding in ['utf-8', 'latin1']:
            try:
                df = pd.read_csv(file_path, low_memory=False, encoding=encoding, delimiter=delimiter)
                return df, True, None
            except (UnicodeDecodeError, pd.errors.ParserError):
                continue  # try the next candidate encoding
        return None, False, f"Impossible de lire {file_path}."
    except FileNotFoundError as e:
        return None, False, e

In [73]:
# Load one CSV per (category prefix, year). `dataframes` ends up as one list
# per prefix, each holding single-entry {file_name: DataFrame} dicts.
years = [*range(2005, 2023)]
prefixes = ['caracteristiques', 'lieux', 'usagers', 'vehicules']
dataframes = []

for prefix in prefixes:
    datasets = []
    for year in years:
        # File names use "_" before the year up to 2016 and "-" afterwards.
        connector = '-' if year > 2016 else '_'
        file_name = f'datas/{prefix}{connector}{year}.csv'
        df, success, error = read_csv_file(file_name)
        if not success:
            # Report the failure and move on to the next file.
            print(f'{file_name} : {error}')
            continue
        key = {file_name: df}
        datasets.append(key)
    dataframes.append(datasets)

# Summary: number of prefix groups, then number of files loaded per prefix.
print(f'Total datasets: {len(dataframes)}.')

for prefix, df_list in zip(prefixes, dataframes):
    print(f'{prefix}: {len(df_list)}.')

# The two files whose names contained a typo were corrected by hand.


Total datasets: 4.
caracteristiques: 18.
lieux: 18.
usagers: 18.
vehicules: 18.


In [None]:
# Per-file report: row count, then for each column its dtype and the rounded
# percentage of missing values.
for df_list in dataframes:
    print('------------')  # separator between prefix groups
    for item in df_list:
        print('************')  # separator between files
        for filename, df in item.items():
            total_rows = len(df.index)
            print(f'{filename} - {total_rows} lignes.')
            print('colonne:dtype:null%')
            for column in df.columns:
                series = df[column]
                missing_percent = series.isna().sum() / total_rows * 100
                print(f'{column}:{series.dtype}:{round(missing_percent)}%')


------------
************
datas/caracteristiques_2005.csv
colonne:dtype:null%
Num_Acc:int64:0%
an:int64:0%
mois:int64:0%
jour:int64:0%
hrmn:int64:0%
lum:int64:0%
agg:int64:0%
int:int64:0%
atm:int64:0%
col:int64:0%
com:float64:0%
adr:object:19%
gps:object:69%
lat:float64:69%
long:float64:69%
dep:int64:0%
************
datas/caracteristiques_2006.csv
colonne:dtype:null%
Num_Acc:int64:0%
an:int64:0%
mois:int64:0%
jour:int64:0%
hrmn:int64:0%
lum:int64:0%
agg:int64:0%
int:int64:0%
atm:int64:0%
col:int64:0%
com:int64:0%
adr:object:13%
gps:object:71%
lat:float64:71%
long:float64:71%
dep:int64:0%
************
datas/caracteristiques_2007.csv
colonne:dtype:null%
Num_Acc:int64:0%
an:int64:0%
mois:int64:0%
jour:int64:0%
hrmn:int64:0%
lum:int64:0%
agg:int64:0%
int:int64:0%
atm:int64:0%
col:int64:0%
com:int64:0%
adr:object:18%
gps:object:72%
lat:float64:72%
long:float64:72%
dep:int64:0%
************
datas/caracteristiques_2008.csv
colonne:dtype:null%
Num_Acc:int64:0%
an:int64:0%
mois:int64:0%
jour:in

In [None]:
# TODO: unfinished cell — the per-file processing step has not been written yet.
for df_list in dataframes:
    for item in df_list:
        for filename, df in item.items():
            # Placeholder body so the cell parses and "Run All" does not stop
            # on a SyntaxError; replace with the intended per-file logic.
            pass

In [78]:
# Visualize "groups" of dataframes whose structure is similar.
# This is only meant to give an overview.
from collections import defaultdict

grouped_dataframes = defaultdict(lambda: {"filenames": [], "num_cols": 0, "num_dtypes": 0})

# Files sharing the same sorted list of (column name, dtype name) pairs fall
# into the same group; the stringified list is used as the group key.
for df_list in dataframes:
    for item in df_list:
        for filename, df in item.items():
            column_dtypes = sorted((col, str(dtype)) for col, dtype in df.dtypes.items())
            structure_key = str(column_dtypes)
            num_cols = len(df.columns)
            num_dtypes = len({dtype_name for _, dtype_name in column_dtypes})
            group = grouped_dataframes[structure_key]
            group["filenames"].append(filename)
            group["num_cols"] = max(group["num_cols"], num_cols)
            group["num_dtypes"] = max(group["num_dtypes"], num_dtypes)

# Print each group: its structure, its column / distinct-dtype counts, and its files.
for i, (structure, info) in enumerate(grouped_dataframes.items(), 1):
    filenames, num_cols, num_dtypes = info["filenames"], info["num_cols"], info["num_dtypes"]
    print(
        f"Groupe numéro {i}:",
        f"{structure}",
        f"Col = {num_cols}. Unique Dtype = {num_dtypes}.",
        f"{filenames}",
        sep="\n",
        end="\n\n",
    )

Groupe numéro 1:
[('Num_Acc', 'int64'), ('adr', 'object'), ('agg', 'int64'), ('an', 'int64'), ('atm', 'int64'), ('col', 'int64'), ('com', 'float64'), ('dep', 'int64'), ('gps', 'object'), ('hrmn', 'int64'), ('int', 'int64'), ('jour', 'int64'), ('lat', 'float64'), ('long', 'float64'), ('lum', 'int64'), ('mois', 'int64')]
Col = 16. Unique Dtype = 3.
['datas/caracteristiques_2005.csv']

Groupe numéro 2:
[('Num_Acc', 'int64'), ('adr', 'object'), ('agg', 'int64'), ('an', 'int64'), ('atm', 'int64'), ('col', 'int64'), ('com', 'int64'), ('dep', 'int64'), ('gps', 'object'), ('hrmn', 'int64'), ('int', 'int64'), ('jour', 'int64'), ('lat', 'float64'), ('long', 'float64'), ('lum', 'int64'), ('mois', 'int64')]
Col = 16. Unique Dtype = 3.
['datas/caracteristiques_2006.csv', 'datas/caracteristiques_2007.csv', 'datas/caracteristiques_2008.csv', 'datas/caracteristiques_2015.csv']

Groupe numéro 3:
[('Num_Acc', 'int64'), ('adr', 'object'), ('agg', 'int64'), ('an', 'int64'), ('atm', 'float64'), ('col', 'in

In [None]:
# Modify the structure of the "caracteristiques" datasets.
# Placeholder mapping, currently empty — presumably meant to hold the target
# structure; TODO confirm and fill in.
structure_caracteristiques = dict()



