In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw semicolon-separated export. Rows the parser cannot handle are
# skipped (on_bad_lines="skip") instead of aborting the whole load.
df = pd.read_csv("dataset.csv", on_bad_lines="skip", sep=";")
print("Dataset chargé avec succès")

# Structural overview of what was loaded: shape, column names, inferred dtypes.
column_names = df.columns.tolist()
print("Dimensions :", df.shape)
print("Colonnes :", column_names)

print("\nTypes de données :\n", df.dtypes)

# Parse "Date" into datetimes. Values that cannot be parsed become NaT
# (errors="coerce"), and rows left without a date are then discarded.
if "Date" in df.columns:
    parsed_dates = pd.to_datetime(df["Date"], errors="coerce")
    df["Date"] = parsed_dates
    df = df.dropna(subset=["Date"])

# Per-column count of missing cells, taken after the date filter above.
missing_per_column = df.isnull().sum()
print("\nValeurs manquantes par colonne :\n", missing_per_column)

# Remove rows that are exact repeats of another row.
df = df.drop_duplicates()

# Keep only rows whose departure station is made up exclusively of characters
# in the ranges A-Z and À-Ü, whitespace and hyphens. Cells are stringified
# first, so anything that does not fully match the pattern is dropped.
if "Departure station" in df.columns:
    pattern = r"^[A-ZÀ-Ü\s\-]+$"
    departure_names = df["Departure station"].astype(str)
    is_valid_name = departure_names.str.fullmatch(pattern, na=False)
    df = df[is_valid_name]

# Discard rows in which every column is missing.
df = df.dropna(how="all")

# Placeholder strings that stand for "no value" but are not real missing cells.
# Matching is case-insensitive and ignores surrounding whitespace, so padded
# variants such as " NA" or a blank-only cell are counted as well.
textual_na_patterns = ["NA", "N/A", "n/a", "na", "-", ""]
# Normalise the patterns once (not once per column) into a set for the
# membership test below.
normalized_na_patterns = {p.strip().upper() for p in textual_na_patterns}
textual_na_count = {}

for col in df.columns:
    # astype(str) lets the same test run on non-text columns. Genuinely
    # missing cells are stringified too (e.g. to "nan"), which matches none of
    # the placeholders, so they are not counted here.
    normalized_cells = df[col].astype(str).str.strip().str.upper()
    textual_na_count[col] = normalized_cells.isin(normalized_na_patterns).sum()

# Report only the columns that actually contain placeholder values.
print("\nAnalyse des 'textual NA' :")
for col, count in textual_na_count.items():
    if count > 0:
        print(f"{col} : {count} valeurs 'textual NA'")

# Look for routes that exist in both directions (A->B and B->A).
if "Departure station" in df.columns and "Arrival station" in df.columns:
    print("\nRecherche de doublons de paires de stations (A->B et B->A) :")

    # Reduce to one row per *directed* route first. Without this step, a route
    # that simply appears on several rows would share its sorted key with
    # itself and be reported as an A->B / B->A duplicate.
    df_pairs = (
        df[["Departure station", "Arrival station"]]
        .dropna()
        .drop_duplicates()
        .copy()
    )
    # Direction-independent key: both endpoints as a sorted tuple. Built with
    # zip instead of a row-wise apply so that an empty frame is handled too.
    df_pairs["Sorted Pair"] = pd.Series(
        [
            tuple(sorted(pair))
            for pair in zip(
                df_pairs["Departure station"], df_pairs["Arrival station"]
            )
        ],
        index=df_pairs.index,
        dtype=object,
    )
    # After the de-duplication above, two rows can only share a key when they
    # are the two directions of the same station pair.
    duplicate_pairs = df_pairs.duplicated(subset="Sorted Pair", keep=False)

    if duplicate_pairs.any():
        print(f"{duplicate_pairs.sum()} doublons de paires de stations trouvés.")
        # Show one representative row per bidirectional pair.
        print(df_pairs[duplicate_pairs].drop_duplicates("Sorted Pair").head())
    else:
        print("Aucun doublon de paires de stations trouvé.")

# Detailed missing-value table: absolute count and share of rows per column,
# sorted by count (largest first) and limited to columns with at least one gap.
print("\nAnalyse des valeurs manquantes détaillée :")
missing_values = df.isnull().sum()
missing_percent = missing_values.div(len(df)).mul(100)
missing_df = pd.DataFrame(
    {"Missing Values": missing_values, "Percentage": missing_percent}
)
missing_df = missing_df.sort_values(by="Missing Values", ascending=False)
has_missing = missing_df["Missing Values"] > 0
print(missing_df[has_missing])

# Summary statistics for every column (numeric and non-numeric alike).
# The result is printed explicitly: as a bare expression in the middle of the
# script it was computed and then discarded without ever being shown.
print("\nStatistiques descriptives :")
print(df.describe(include="all"))

# One histogram (30 bins, KDE overlay) per average-delay column that is
# present in the data. Each entry is (column name, bar colour, chart title);
# the two charts share everything else.
delay_histograms = (
    (
        "Average delay of all trains at arrival",
        "orange",
        "Distribution des retards à l'arrivée (tous trains)",
    ),
    (
        "Average delay of all trains at departure",
        "green",
        "Distribution des retards au départ (tous trains)",
    ),
)

for delay_column, bar_color, chart_title in delay_histograms:
    # A missing column is skipped rather than treated as an error.
    if delay_column not in df.columns:
        continue

    plt.figure(figsize=(10, 4))
    sns.histplot(df[delay_column].dropna(), bins=30, kde=True, color=bar_color)
    plt.title(chart_title)
    plt.xlabel("Minutes de retard")
    plt.ylabel("Nombre de trajets")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Derive calendar features from the "Date" column: the month number and the
# weekday name as returned by day_name().
if "Date" in df.columns:
    date_accessor = df["Date"].dt
    df["Mois"] = date_accessor.month
    df["Jour de la semaine"] = date_accessor.day_name()

# Persist the cleaned frame without the index column.
df.to_csv("cleaned_dataset.csv", index=False)
print("OK: Données nettoyées enregistrées dans 'cleaned_dataset.csv'")

FileNotFoundError: [Errno 2] No such file or directory: 'dataset.csv'