In [None]:
import pandas as pd
from type_imputation import map_type 

# Change from Richard to enable use of split data sets
def impute_drive(df):
    """Impute missing ``drive`` values with the most common ``drive`` of each ``type``.

    For every ``type`` the most frequent observed ``drive`` value (the mode)
    is determined. Rows whose ``drive`` is missing receive the mode of their
    ``type``. Rows whose ``type`` is missing, or whose ``type`` has no
    observed ``drive`` value at all, keep their missing ``drive``.

    The ``drive`` distribution per ``type`` is printed before and after the
    imputation, together with the mode and its share per ``type``.

    Args:
        df: DataFrame with a ``type`` and a ``drive`` column. It is modified
            in place: its ``drive`` column is overwritten with the imputed
            values.

    Returns:
        The imputed ``drive`` column of ``df``.
    """
    # Row counts per (type, drive) pair. Computed once and reused for both
    # the report and the mode lookup.
    drive_distribution = df.groupby(["type", "drive"]).size().unstack(fill_value=0)

    print("Verteilung von `drive` pro `type` vor Imputation:")
    print(drive_distribution)

    # Mode of `drive` and its percentage share for every type.
    mode_info = {}
    for type_name, counts in drive_distribution.iterrows():
        total_count = counts.sum()
        # idxmax picks the first column holding the maximum, so ties are
        # resolved by column order of the distribution table.
        mode_drive = counts.idxmax()
        mode_count = counts.max()
        mode_percentage = (mode_count / total_count * 100) if total_count > 0 else 0
        mode_info[type_name] = {"mode": mode_drive, "percentage": mode_percentage}

    drive_mode_per_type = {type_name: info["mode"] for type_name, info in mode_info.items()}

    # Vectorized imputation: map() yields NaN for types without a known mode,
    # so those rows stay missing; existing drive values are never replaced.
    df["drive"] = df["drive"].fillna(df["type"].map(drive_mode_per_type))

    print("\nVerteilung von `drive` pro `type` nach Imputation:")
    print(df.groupby(["type", "drive"]).size().unstack(fill_value=0))

    # The shares below refer to the data as it was before imputation.
    print("\nModus und prozentualer Anteil pro `type` (vor Imputation):")
    for type_name, info in mode_info.items():
        print(f"{type_name}: Modus = {info['mode']}, Anteil = {info['percentage']:.1f}%")

    return df["drive"]


# Load the training split and report the state of `drive` before and after
# imputation.
X_train = pd.read_csv("../data/train_data.csv")

# Frequency of each `drive` value in the raw training data.
drive_counts = X_train.value_counts("drive")
display(drive_counts)

# Number of rows with a missing `drive` before imputation.
missing_before = X_train["drive"].isna().sum()
print(missing_before)

# impute_drive overwrites X_train["drive"] and prints its own report first;
# the count printed afterwards is what is still missing after imputation.
imputed_drive = impute_drive(X_train)
print(f"amount of missing values: {imputed_drive.isna().sum()}")

drive
4wd    105594
fwd     84533
rwd     46896
Name: count, dtype: int64

104481
Verteilung von `drive` pro `type` vor Imputation:
drive          4wd    fwd    rwd
type                            
SUV          36011  12152   2984
bus             13     40    231
convertible    375   1022   3635
coupe          580   3087   8499
hatchback      838   8352    459
mini-van       146   3134    125
offroad        442      7     22
other         2830   3252   2422
pickup       21675   1427   3906
sedan         6303  37900  10543
truck        19896    387   6176
van            236   2787   2424
wagon         3550   3293    245

Verteilung von `drive` pro `type` nach Imputation:
drive          4wd    fwd    rwd
type                            
SUV          46735  12152   2984
bus             13     40    367
convertible    375   1022   4856
coupe          580   3087  11625
hatchback      838  12030    459
mini-van       146   3667    125
offroad        454      7     22
other         2830  12448   2422
pickup       29416   1427   3906
sedan         6303  52684  10543
