In [76]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress all runtime warnings (e.g., divide by zero, overflow)
warnings.filterwarnings("ignore", category=RuntimeWarning)
np.seterr(all='ignore')

cars_raw = pd.read_csv("otomoto_cars_parsed2.csv", low_memory=False)
cars = cars_raw.copy()

In [77]:
# Data cleaning and diagnostics

# 1. Drop listings with missing 'Transmission' (critical field)
print(f"Number of records before dropping missing 'Transmission': {cars.shape[0]}")
cars = cars.dropna(subset=["Transmission"])
print(f"Number of records after dropping missing 'Transmission': {cars.shape[0]}")

# 2. Remove extreme price values (below 1,000 PLN or above 3,000,000 PLN)
print(f"Number of records before filtering extreme prices: {cars.shape[0]}")
cars = cars[(cars["Price"] >= 1000) & (cars["Price"] <= 3_000_000)]
print(f"Number of records after filtering extreme prices: {cars.shape[0]}")

# 3. Remove rare Make‚ÄìModel combinations (less than 20 occurrences)
make_model_counts = cars.groupby(["Make", "Model"]).size()
rare_combinations = make_model_counts[make_model_counts < 20]
print(f"Number of rare Make‚ÄìModel pairs (<20 listings): {len(rare_combinations)}")
valid_pairs = make_model_counts[make_model_counts >= 20].index
cars = cars[cars.set_index(["Make", "Model"]).index.isin(valid_pairs)]
cars = cars.reset_index(drop=True)
print(f"Number of records after removing rare Make‚ÄìModel pairs: {cars.shape[0]}")

# 4. Fill missing values in 'Power_per_Liter' if 'Engine_Capacity' > 0
missing_before = cars["Power_per_Liter"].isna().sum()
mask = cars["Power_per_Liter"].isna() & (cars["Engine_Capacity"] > 0)

cars.loc[mask, "Power_per_Liter"] = (
    cars.loc[mask, "Engine_Power"] / (cars.loc[mask, "Engine_Capacity"] / 1000)
)

# Fill remaining missing values (e.g., electric vehicles) with 0
cars["Power_per_Liter"] = cars["Power_per_Liter"].fillna(0)
missing_after = cars["Power_per_Liter"].isna().sum()

print(f"Missing values in 'Power_per_Liter' before: {missing_before}")
print(f"Missing values in 'Power_per_Liter' after: {missing_after}")

# 5. Remove implausible engine power values (> 1000 HP)
outlier_count = (cars["Engine_Power"] > 1000).sum()
cars = cars[cars["Engine_Power"] <= 1000]
print(f"Removed {outlier_count} records with 'Engine_Power' > 1000 HP (likely errors)")

# 6. Summary of remaining missing values (to be handled later)
missing_summary = cars.isnull().sum()
print("\nRemaining missing values after initial cleaning:")
print(missing_summary[missing_summary > 0])


Number of records before dropping missing 'Transmission': 206562
Number of records after dropping missing 'Transmission': 206562
Number of records before filtering extreme prices: 206562
Number of records after filtering extreme prices: 206562
Number of rare Make‚ÄìModel pairs (<20 listings): 1
Number of records after removing rare Make‚ÄìModel pairs: 206560
Missing values in 'Power_per_Liter' before: 2271
Missing values in 'Power_per_Liter' after: 0
Removed 15 records with 'Engine_Power' > 1000 HP (likely errors)

Remaining missing values after initial cleaning:
Equipment_Audio_and_Multimedia         15992
Equipment_Comfort_and_Extras            7601
Equipment_Driver_Assistance_Systems     9751
Equipment_Performance_and_Tuning       50488
Equipment_Safety                       11626
dtype: int64


In [78]:
# Feature Engineering: numerical, categorical, and textual features

# --- 4.2.1 Numerical Features ---


# Print summary statistics
print("Numerical feature transformations completed.\n")
print(cars[["Price", "Log_Price", "Mileage", "Log_Mileage", "Age", "Log_Age", "Mileage_per_Year"]].describe())


# --- 4.2.2 Categorical Features ---

# Sanitize text formatting (optional)
cars["Make"] = cars["Make"].str.strip().str.title()
cars["Model"] = cars["Model"].str.strip().str.title()
cars["Fuel_Type"] = cars["Fuel_Type"].str.strip().str.title()
cars["Gearbox"] = cars["Gearbox"].str.strip().str.title()
cars["Transmission"] = cars["Transmission"].str.strip().str.upper()

# Print unique counts for key categorical variables
print("\nUnique values in selected categorical features:")
for col in ["Make", "Model", "Body_Type", "Fuel_Type", "Gearbox", "Transmission"]:
    print(f"{col}: {cars[col].nunique()} unique values")

# Optional: store categorical variables for later encoding
categorical_features = ["Make", "Model", "Body_Type", "Fuel_Type", "Gearbox", "Transmission",
                        "No_Accidents", "Service_Record", "Is_Imported", "First_Owner", "Professional_Seller"]


# --- 4.2.3 Text Features (Descriptions) ---


# Basic cleanup of the existing 'Full_Description' field
cars["Full_Description"] = cars["Full_Description"].fillna("").astype(str).str.strip()

# Preview sample description
print("\nSample cleaned full description:")
print(cars["Full_Description"].iloc[0])

Numerical feature transformations completed.

              Price      Log_Price       Mileage    Log_Mileage  \
count  2.065450e+05  206545.000000  2.065450e+05  206545.000000   
mean   7.331867e+04      10.780469  1.521148e+05      11.619395   
std    8.566146e+04       0.928395  9.155483e+04       1.082653   
min    1.000000e+03       6.908755  1.000000e+00       0.693147   
25%    2.590000e+04      10.162037  8.380000e+04      11.336200   
50%    4.890000e+04      10.797553  1.500000e+05      11.918397   
75%    8.990000e+04      11.406464  2.100000e+05      12.254868   
max    2.699000e+06      14.808392  2.901980e+06      14.880904   

                 Age        Log_Age  Mileage_per_Year  
count  206545.000000  206545.000000     206545.000000  
mean        9.815101       2.206074      14452.518839  
std         5.882359       0.639701       7512.439410  
min         0.000000       0.000000          0.016393  
25%         5.000000       1.791759       9764.333333  
50%         9.

In [79]:
pd.set_option('display.max_columns', None)
cars.head()

Unnamed: 0,Listing_URL,Make,Model,Body_Type,Fuel_Type,Gearbox,Transmission,Mileage,Engine_Capacity,Engine_Power,No_Accidents,Service_Record,Is_Imported,First_Owner,Professional_Seller,Full_Description,Equipment_Audio_and_Multimedia,Equipment_Comfort_and_Extras,Equipment_Driver_Assistance_Systems,Equipment_Performance_and_Tuning,Equipment_Safety,Price,Log_Price,Log_Mileage,Age,Log_Age,Mileage_per_Year,Power_per_Liter
0,https://www.otomoto.pl/osobowe/oferta/volkswag...,Volkswagen,Golf,Kompakt,Petrol-Based,Manualna,FWD,150000,1984,210,1,1,1,0,0,Volkswagen Golf VI 2.0 TSI GTI NUMER TELEFONU:...,"[""Radio""]","[""Klimatyzacja automatyczna, dwustrefowa"", ""Ta...","[""Kontrola odleg≈Ço≈õci z przodu (przy parkowani...","[""Zawieszenie sportowe""]","[""ABS"", ""ESP"", ""System wspomagania hamowania"",...",39900,10.594157,11.918397,13,2.639057,10714.285714,105.846774
1,https://www.otomoto.pl/osobowe/oferta/opel-mok...,Opel,Mokka,SUV,Diesel,Manualna,FWD,151200,1686,130,1,1,1,1,0,Opel Mokka 1.7 CDTI Cosmo S&S 4x4 Do sprzedani...,"[""Interfejs Bluetooth"", ""Radio"", ""Zestaw g≈Ço≈õn...","[""Klimatyzacja automatyczna"", ""Tapicerka czƒô≈õc...","[""Tempomat"", ""Lampy ksenonowe"", ""Kontrola odle...","[""Felgi aluminiowe 18""]","[""ABS"", ""ESP"", ""System powiadamiania o wypadku...",32900,10.401258,11.926365,12,2.564949,11630.769231,77.105575
2,https://www.otomoto.pl/osobowe/oferta/hyundai-...,Hyundai,Ix35,SUV,Petrol-Based,Manualna,FWD,125000,1591,135,1,1,1,1,0,Hyundai ix35 1.6 2WD Comfort Rok produkcji: 20...,"[""Interfejs Bluetooth"", ""Radio"", ""Zestaw g≈Ço≈õn...","[""Klimatyzacja automatyczna, dwustrefowa"", ""Ta...","[""Podgrzewane lusterka boczne"", ""Kontrola trak...","[""Felgi aluminiowe 16""]","[""ABS"", ""ESP"", ""Elektroniczny system rozdzia≈Çu...",45900,10.734242,11.736077,10,2.397895,11363.636364,84.852294
3,https://www.otomoto.pl/osobowe/oferta/kia-ceed...,Kia,Ceed,Kombi,Petrol-Based,Manualna,FWD,145000,1591,135,1,1,1,1,0,Kia Ceed 1.6 GDI SW Vision Rok produkcji: 2015...,"[""Interfejs Bluetooth"", ""Radio"", ""Zestaw g≈Ço≈õn...","[""Klimatyzacja manualna"", ""Szyberdach szklany ...","[""Tempomat"", ""Lampy ksenonowe"", ""Kontrola odle...",,"[""ABS"", ""ESP"", ""Elektroniczny system rozdzia≈Çu...",34900,10.460271,11.884496,10,2.397895,13181.818182,84.852294
4,https://www.otomoto.pl/osobowe/oferta/audi-q7-...,Audi,Q7,SUV,Petrol-Based,Automatyczna,AWD,89000,2995,340,0,0,1,0,0,Audi Q7 55 TFSI quattro tiptronic Sprzedam Aud...,"[""Android Auto"", ""Interfejs Bluetooth"", ""Radio...","[""Klimatyzacja automatyczna, dwustrefowa"", ""Da...","[""Tempomat"", ""Lampy przednie w technologii LED...","[""Filtr czƒÖstek sta≈Çych"", ""Felgi aluminiowe 19""]","[""ABS"", ""ESP"", ""System hamowania awaryjnego dl...",129000,11.767575,11.396403,6,1.94591,12714.285714,113.522538


In [80]:
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# ‚úÖ Parametry
embedding_file = "cars_with_embeddings.parquet"
MODEL_NAME = "allegro/herbert-base-cased"
MAX_LENGTH = 256
BATCH_SIZE = 128
EMB_SIZE = 768  # pe≈Çny wymiar HerBERTa

# ‚úÖ Wyb√≥r urzƒÖdzenia (dla Mac: MPS lub CPU)
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"‚úÖ Using device: {DEVICE}")

# ‚úÖ Wstƒôpne czyszczenie p√≥l tekstowych
cars["Full_Description"] = cars["Full_Description"].fillna("").str.strip()
cars["Make"] = cars["Make"].fillna("").astype(str)
cars["Model"] = cars["Model"].fillna("").astype(str)
cars["Fuel_Type"] = cars["Fuel_Type"].fillna("").astype(str)
cars["Gearbox"] = cars["Gearbox"].fillna("").astype(str)
cars["Transmission"] = cars["Transmission"].fillna("").astype(str)

cars["Mileage_str"] = (cars["Mileage"].fillna(0) / 1000).round().astype(int).astype(str) + " tys. km"
cars["Power_str"] = cars["Engine_Power"].fillna(0).round().astype(int).astype(str) + " KM"
cars["Age_str"] = cars["Age"].fillna(0).astype(int).astype(str) + " lat"

# ‚úÖ Generowanie Equipment_Desc z binarnych kolumn
equipment_cols = [col for col in cars.columns if col.startswith("Equipment_")]
if equipment_cols:
    print(f"üîß Generowanie Equipment_Desc z {len(equipment_cols)} kolumn wyposa≈ºenia...")
    equipment_summaries = []
    for i, row in cars[equipment_cols].iterrows():
        items = [col.replace("Equipment_", "").replace("_", " ") for col in equipment_cols if row[col] == 1]
        summary = ", ".join(items) if items else "brak danych"
        equipment_summaries.append(summary)
    cars["Equipment_Desc"] = equipment_summaries
else:
    print("‚ö†Ô∏è Brak kolumn wyposa≈ºenia. Equipment_Desc ustawione na 'brak danych'.")
    cars["Equipment_Desc"] = "brak danych"

# ‚úÖ Funkcja budujƒÖca tekstowy opis
def build_text(row):
    return (
        f"{row['Make']} {row['Model']}, {row['Age_str']}, {row['Fuel_Type'].lower()}, "
        f"przebieg {row['Mileage_str']}, {row['Power_str']}, "
        f"skrzynia {row['Gearbox'].lower()}, napƒôd {row['Transmission'].lower()}. "
        f"Wyposa≈ºenie: {row['Equipment_Desc']}. "
        f"Opis: {row['Full_Description']}"
    )

# ‚úÖ Je≈ºeli plik istnieje ‚Äî wczytaj
if os.path.exists(embedding_file):
    print(f"üìÇ Plik istnieje. Wczytywanie z {embedding_file}...")
    cars_with_embeddings = pd.read_parquet(embedding_file)
    print(f"‚úÖ Wczytano. Kszta≈Çt: {cars_with_embeddings.shape}")

else:
    print("üöÄ Plik nie istnieje. Generowanie embedding√≥w z HerBERTa...")

    # ‚úÖ Zbuduj tekst wej≈õciowy
    cars["Textual_Input"] = cars.apply(build_text, axis=1)
    texts = cars["Textual_Input"].fillna("").astype(str).tolist()
    print(f"üìù Liczba rekord√≥w: {len(texts)}")

    # ‚úÖ Za≈Çaduj tokenizer i model HerBERT
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
    model.eval()
    print(f"‚úÖ Model {MODEL_NAME} za≈Çadowany.")

    # ‚úÖ Generowanie embedding√≥w z HerBERTa ([CLS] token)
    embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="üîÑ Embedding batches"):
            batch_texts = texts[i:i + BATCH_SIZE]
            inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(cls_embeddings)

    embeddings = np.vstack(embeddings)
    print(f"‚úÖ Gotowe. Kszta≈Çt embedding√≥w: {embeddings.shape}")

    # ‚úÖ Zbuduj DataFrame z embeddingami
    embedding_cols = [f"desc_emb_{i}" for i in range(EMB_SIZE)]
    embedding_df = pd.DataFrame(embeddings, columns=embedding_cols)

    # ‚úÖ Po≈ÇƒÖcz z oryginalnym DataFrame
    cars_with_embeddings = pd.concat([cars.reset_index(drop=True), embedding_df], axis=1)
    cars_with_embeddings.to_parquet(embedding_file, index=False)
    print(f"‚úÖ Zapisano embeddingi do {embedding_file}. Finalny kszta≈Çt: {cars_with_embeddings.shape}")

‚úÖ Using device: mps
üîß Generowanie Equipment_Desc z 5 kolumn wyposa≈ºenia...
üìÇ Plik istnieje. Wczytywanie z cars_with_embeddings.parquet...
‚úÖ Wczytano. Kszta≈Çt: (206545, 801)


In [81]:
from sklearn.preprocessing import MultiLabelBinarizer
import ast

# Kolumny z wyposa≈ºeniem (stringowe listy)
equipment_cols = [
    "Equipment_Audio_and_Multimedia",
    "Equipment_Comfort_and_Extras",
    "Equipment_Driver_Assistance_Systems",
    "Equipment_Performance_and_Tuning",
    "Equipment_Safety"
]

# Lista do przechowania zakodowanych DataFrame'√≥w
equipment_dfs = []

# Przetwarzanie ka≈ºdej kolumny wyposa≈ºenia
for col in equipment_cols:
    print(f"Processing {col}...")

    # Zamiana tekstu na listƒô (np. '["ABS", "ESP"]' -> ["ABS", "ESP"])
    parsed_equipment = cars_with_embeddings[col].fillna("[]").apply(ast.literal_eval)

    # Binarne kodowanie
    mlb = MultiLabelBinarizer(sparse_output=False)
    equipment_matrix = mlb.fit_transform(parsed_equipment)

    # Nadanie nazw kolumn
    feature_names = [f"{col}_{item.strip().replace(' ', '_')}" for item in mlb.classes_]
    df_equipment = pd.DataFrame(equipment_matrix, columns=feature_names, index=cars_with_embeddings.index)

    # Do≈ÇƒÖcz do listy
    equipment_dfs.append(df_equipment)

# Po≈ÇƒÖcz wszystkie wyposa≈ºeniowe kolumny
equipment_binary = pd.concat(equipment_dfs, axis=1)

# Do≈ÇƒÖcz do g≈Ç√≥wnego zbioru danych (TRWALE)
cars_with_embeddings = pd.concat([cars_with_embeddings, equipment_binary], axis=1)

# (Opcjonalnie) Zaktualizuj listƒô cech modelowych
try:
    all_model_features += list(equipment_binary.columns)
except NameError:
    all_model_features = list(equipment_binary.columns)  # je≈õli nie istnieje

# Statystyki
print(f"\nAdded {equipment_binary.shape[1]} binary equipment features.")
print("\nTop 20 most common equipment features:")
print(equipment_binary.sum().sort_values(ascending=False).head(20))

Processing Equipment_Audio_and_Multimedia...
Processing Equipment_Comfort_and_Extras...
Processing Equipment_Driver_Assistance_Systems...
Processing Equipment_Performance_and_Tuning...
Processing Equipment_Safety...

Added 176 binary equipment features.

Top 20 most common equipment features:
Equipment_Safety_ABS                                                          186877
Equipment_Safety_Poduszka_powietrzna_pasa≈ºera                                 186137
Equipment_Safety_Poduszka_powietrzna_kierowcy                                 185355
Equipment_Comfort_and_Extras_Elektryczne_szyby_przednie                       183397
Equipment_Audio_and_Multimedia_Radio                                          176524
Equipment_Safety_ESP                                                          175962
Equipment_Safety_Boczne_poduszki_powietrzne_-_prz√≥d                           167205
Equipment_Safety_Isofix_(punkty_mocowania_fotelika_dzieciƒôcego)               161271
Equipment_Driver_Assist

In [82]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold

# --- 4.4.1 Train-Test Split ---
TARGET = "Log_Price"
price_bins = pd.qcut(cars_with_embeddings[TARGET], q=10, labels=False)

X_full = cars_with_embeddings.drop(columns=["Price", "Log_Price"])
y_full = cars_with_embeddings[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full,
    test_size=0.2,
    stratify=price_bins,
    random_state=42
)

# Stw√≥rz nowƒÖ kolumnƒô "split"
cars_with_embeddings["split"] = "test"
cars_with_embeddings.loc[X_train.index, "split"] = "train"

# --- 4.4.2 K-Fold tylko na TRAIN ---
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cars_with_embeddings["cv_fold"] = -1

train_bins = price_bins.loc[X_train.index]
for fold, (_, val_idx) in enumerate(kf.split(X_train, train_bins)):
    val_ids = X_train.iloc[val_idx].index
    cars_with_embeddings.loc[val_ids, "cv_fold"] = fold

# Podsumowanie
print(cars_with_embeddings["split"].value_counts())
print(cars_with_embeddings["cv_fold"].value_counts().sort_index())

split
train    165236
test      41309
Name: count, dtype: int64
cv_fold
-1    41309
 0    33048
 1    33047
 2    33047
 3    33047
 4    33047
Name: count, dtype: int64


In [83]:
# DO U≈ªYTKU W Linear, Ridge, Lasso, AutoML

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# --- Load data (musi zawieraƒá equipment features) ---
df = cars_with_embeddings.copy()

# --- Remove duplicated columns if they exist ---
df = df.loc[:, ~df.columns.duplicated()]

# --- Identify feature groups ---
embedding_cols = [col for col in df.columns if col.startswith("desc_emb_")]

binary_features = [
    "No_Accidents", "Service_Record", "Is_Imported",
    "First_Owner", "Professional_Seller"
]

continuous_features = [
    "Mileage", "Log_Mileage", "Age", "Log_Age",
    "Mileage_per_Year", "Engine_Power", "Engine_Capacity", "Power_per_Liter"
]
df["Make_Model"] = df["Make"] + " " + df["Model"]
categorical_features = [
    "Make", "Make_Model", "Body_Type", "Fuel_Type", "Gearbox", "Transmission"
]

# --- Drop unnecessary columns ---
drop_cols = ["Listing_URL", "Full_Description", "price_bin", "Price",
             "Equipment_Audio_and_Multimedia",
             "Equipment_Comfort_and_Extras",
             "Equipment_Driver_Assistance_Systems",
             "Equipment_Performance_and_Tuning",
             "Equipment_Safety", "Equipment_Desc","Model"]
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

# --- One-hot encode categorical variables ---
df_encoded = pd.get_dummies(df[categorical_features], prefix=categorical_features)
df_encoded = df_encoded.astype(int)

# --- Standardize continuous features only ---
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[continuous_features]),
    columns=continuous_features,
    index=df.index
)

# --- Identify equipment binary features ---
known_cols = set(continuous_features + binary_features + categorical_features + embedding_cols + ["Log_Price"])
equipment_features = [col for col in df.columns if col.startswith("Equipment_") and col not in known_cols]
print(f"Found {len(equipment_features)} equipment features.")

# --- Combine all features ---
final_df = pd.concat([
    df_scaled,                      # scaled continuous features
    df[binary_features],            # binary features (0/1)
    df_encoded,                     # one-hot encoded categorical features
    df[equipment_features],         # binary equipment features
    df[embedding_cols],             # description embeddings
    df[["Log_Price"]],              # target
    df[["cv_fold"]],                # cv_fold
    df[["split"]]                   # split
], axis=1)

# --- Final verification ---
assert final_df.columns.duplicated().sum() == 0, "There are still duplicate columns!"

# --- Save dataset ---
final_df.to_parquet("cars_ready_LinearRegression.parquet", index=False)
print(f"‚úÖ Final dataset saved. Shape: {final_df.shape}")

Found 176 equipment features.
‚úÖ Final dataset saved. Shape: (206545, 1636)


In [84]:
# DO U≈ªYTKU W DT, RF, XGB, LGBM, CatBoost, TabNet, MLP

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# --- Load base dataframe ---
df = cars_with_embeddings.copy()

# --- Remove duplicated columns if any ---
df = df.loc[:, ~df.columns.duplicated()]
df["Make_Model"] = df["Make"] + " " + df["Model"]

# --- Drop unnecessary columns ---
drop_cols = ["Listing_URL", "Full_Description", "price_bin", "Price",
             "Equipment_Audio_and_Multimedia",
             "Equipment_Comfort_and_Extras",
             "Equipment_Driver_Assistance_Systems",
             "Equipment_Performance_and_Tuning",
             "Equipment_Safety", "Equipment_Desc", "Model"]
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

# --- Identify column groups ---
embedding_cols = [col for col in df.columns if col.startswith("desc_emb_")]

binary_features = [
    "No_Accidents", "Service_Record", "Is_Imported",
    "First_Owner", "Professional_Seller"
]

continuous_features = [
    "Mileage", "Log_Mileage", "Age", "Log_Age",
    "Mileage_per_Year", "Engine_Power", "Engine_Capacity", "Power_per_Liter"
]

numerical_features = continuous_features + binary_features

categorical_features = [
    "Make", "Make_Model", "Body_Type", "Fuel_Type", "Gearbox", "Transmission"
]

equipment_features = [
    col for col in df.columns
    if col.startswith("Equipment_") and col not in numerical_features + categorical_features + embedding_cols
]

print(f"Found {len(equipment_features)} equipment features.")

# --- Scale only continuous features ---
scaler = StandardScaler()
df[continuous_features] = scaler.fit_transform(df[continuous_features])

print("Continuous features normalized (binary features left unchanged).")

# --- Cross-validated target encoding for categorical variables ---
target_mean = df["Log_Price"].mean()
original_categories = df[categorical_features].copy()

for col in categorical_features:
    encoded_col = np.full(len(df), np.nan)

    for fold in df["cv_fold"].unique():
        train_idx = df["cv_fold"] != fold
        val_idx = df["cv_fold"] == fold

        encoding_map = df.loc[train_idx].groupby(col)["Log_Price"].mean()
        mapped = original_categories.loc[val_idx, col].map(encoding_map)

        encoded_col[val_idx] = mapped

    df[col] = np.where(np.isnan(encoded_col), target_mean, encoded_col)

print("Categorical features safely target-encoded using cross-validation folds.")

# --- Final column list ---
all_model_features = numerical_features + categorical_features + equipment_features + embedding_cols

# --- Save final dataset ---
df[all_model_features + ["Log_Price", "cv_fold", "split"]].to_parquet("cars_ready_DecisionTree.parquet", index=False)
print(f"\n‚úÖ Final dataset saved. Shape: {df[all_model_features + ['Log_Price']].shape}")

Found 176 equipment features.
Continuous features normalized (binary features left unchanged).
Categorical features safely target-encoded using cross-validation folds.

‚úÖ Final dataset saved. Shape: (206545, 964)


In [85]:
# --- Przygotowanie danych tekstowych + strukturalnych do BART/LLM ---
import pandas as pd
import numpy as np

# --- Za≈Çaduj dane ≈∫r√≥d≈Çowe ---
df = cars_with_embeddings.copy()
# --- Drop unnecessary columns ---
drop_cols = ["Listing_URL", "price_bin", "Price",
             "Equipment_Audio_and_Multimedia",
             "Equipment_Comfort_and_Extras",
             "Equipment_Driver_Assistance_Systems",
             "Equipment_Performance_and_Tuning",
             "Equipment_Safety", "Equipment_Desc"]
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

# --- Wstƒôpne uzupe≈Çnienia ---
df["Full_Description"] = df["Full_Description"].fillna("").str.strip()
df["Make"] = df["Make"].fillna("").astype(str)
df["Model"] = df["Model"].fillna("").astype(str)
df["Fuel_Type"] = df["Fuel_Type"].fillna("").astype(str)
df["Gearbox"] = df["Gearbox"].fillna("").astype(str)
df["Transmission"] = df["Transmission"].fillna("").astype(str)

# --- ZaokrƒÖglone liczby do tekstu ---
df["Mileage_str"] = (df["Mileage"].fillna(0) / 1000).round().astype(int).astype(str) + " tys. km"
df["Power_str"] = df["Engine_Power"].fillna(0).round().astype(int).astype(str) + " KM"
df["Age_str"] = df["Age"].fillna(0).astype(int).astype(str) + " lat"

# --- Wyposa≈ºenie (binary) ---
equipment_cols = [col for col in df.columns if col.startswith("Equipment_")]
equipment_summaries = []
for i, row in df[equipment_cols].iterrows():
    items = [col.replace("Equipment_", "").replace("_", " ") for col in equipment_cols if row[col] == 1]
    summary = ", ".join(items) if items else "brak danych"
    equipment_summaries.append(summary)
df["Equipment_Desc"] = equipment_summaries

# --- Zbuduj syntetyczny opis ---
def build_text(row):
    return (
        f"{row['Make']} {row['Model']}, {row['Age_str']}, {row['Fuel_Type'].lower()}, "
        f"przebieg {row['Mileage_str']}, {row['Power_str']}, "
        f"skrzynia {row['Gearbox'].lower()}, napƒôd {row['Transmission'].lower()}. "
        f"Wyposa≈ºenie: {row['Equipment_Desc']}. "
        f"Opis: {row['Full_Description']}"
    )

df["Textual_Input"] = df.apply(build_text, axis=1)

binary_features = [
    "No_Accidents", "Service_Record", "Is_Imported",
    "First_Owner", "Professional_Seller"
]

continuous_features = [
    "Mileage", "Log_Mileage", "Age", "Log_Age",
    "Mileage_per_Year", "Engine_Power", "Engine_Capacity", "Power_per_Liter"
]

numerical_features = continuous_features + binary_features

categorical_features = [
    "Make", "Model", "Body_Type", "Fuel_Type", "Gearbox", "Transmission"
]


Equipment_cols = [col for col in df.columns if col.startswith("Equipment_")]

# --- Kolumny do eksportu ---
final_cols = ["Textual_Input", "Log_Price", "cv_fold", "split"] + numerical_features + categorical_features + equipment_features

# --- Zapisz do pliku ---
df_final = df[final_cols].copy()
df_final.to_parquet("cars_ready_BART.parquet", index=False)
print(f"‚úÖ Zapisano cars_ready_BART.parquet ‚Üí shape: {df_final.shape}")

‚úÖ Zapisano cars_ready_BART.parquet ‚Üí shape: (206545, 199)
