In [58]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split


In [59]:
df_train = pd.read_csv('Cleaned_train.csv')
df_test = pd.read_csv('Cleaned_test.csv')
common_cols = list(set(df_train.columns) & set(df_test.columns))
df_train = df_train[common_cols]
df_test = df_test[common_cols]

In [60]:
len(common_cols)

118

In [61]:
dtype_dict = {
    # Category columns
    "Naped": "category",
    "Skrzynia_biegow": "category",
    "Typ_nadwozia": "category",
    "Kolor": "category",
    "Kraj_pochodzenia": "category",

    # String columns
    "Marka_pojazdu": "category",
    # "Model_pojazdu": "string",
    # "Wersja_pojazdu": "string",
    # "Nazwa_generacji": "string",
    "Typ_silnika": "category",

    # Float columns
    "Rok_produkcji": "float64",
    "Przebieg_km": "float64",
    "Moc_KM": "float64",
    "Pojemnosc_cm3": "float64",
    "Emisja_CO2": "float64",
    "Liczba_drzwi": "float64",
    "Rok_startu_produkcji": "float64",
    "Rok_konca_produkcji": "float64",

    # Integer columns
    "Rok_publikacji_oferty": "float",
    "Miesiac_publikacji_oferty": "float",

    # Boolean columns
    "Xenon lights": "bool",
    "Factory radio": "bool",
    "Velor upholstery": "bool",
    "DVD player": "bool",
    "Start-Stop system": "bool",
    "Electrically adjustable seats": "bool",
    "Fog lights": "bool",
    "Shift paddles": "bool",
    "Passengers airbag": "bool",
    "Daytime running lights": "bool",
    "Power steering": "bool",
    "Four-zone air conditioning": "bool",
    "Sunroof": "bool",
    "GPS navigation": "bool",
    "Manual air conditioning": "bool",
    "Heated side mirrors": "bool",
    "Parking assistant": "bool",
    "ESP(stabilization of the track)": "bool",
    "Immobilizer": "bool",
    "Leather upholstery": "bool",
    "Lane assistant": "bool",
    "Drivers airbag": "bool",
    "USB socket": "bool",
    "Electric front windows": "bool",
    "Blind spot sensor": "bool",
    "SD socket": "bool",
    "Speed limiter": "bool",
    "Electrochromic side mirrors": "bool",
    "LED lights": "bool",
    "AUX socket": "bool",
    "Auxiliary heating": "bool",
    "Heated front seats": "bool",
    "Rain sensor": "bool",
    "Rear parking sensors": "bool",
    "Twilight sensor": "bool",
    "Front side airbags": "bool",
    "Automatic air conditioning": "bool",
    "Roof rails": "bool",
    "Central locking": "bool",
    "ABS": "bool",
    "CD": "bool",
    "Bluetooth": "bool",
    "Dual zone air conditioning": "bool",
    "Alloy wheels": "bool",
    "Cruise control": "bool",
    "Airbag protecting the knees": "bool",
    "TV tuner": "bool",
    "Rear side airbags": "bool",
    "Multifunction steering wheel": "bool",
    "Air curtains": "bool",
    "On-board computer": "bool",
    "Active cruise control": "bool",
    "CD changer": "bool",
    "Electrochromic rear view mirror": "bool",
    "Tinted windows": "bool",
    "Adjustable suspension": "bool",
    "Rear view camera": "bool",
    "MP3": "bool",
    "Heated rear seats": "bool",
    "Electric rear windows": "bool",
    "Alarm": "bool",
    "Hook": "bool",
    "Heated windscreen": "bool",
    "Isofix": "bool",
    "Front parking sensors": "bool",
    "Panoramic roof": "bool",
    "Aftermarket radio": "bool",
    "HUD(head-up display)": "bool",
    "Electrically adjustable mirrors": "bool",
    "ASR (traction control)": "bool",

    # Object columns (to be handled separately)
    "Pierwszy_wlasciciel": "category",  # Leave as object or map to category if needed
    "Cena": "float",  # Leave as object or convert to float after cleaning
    "Rok_pierwszej_rejestracji": "float",  # Leave as object or convert to datetime
    "Miesiac_pierwszej_rejestracji": "float",  # Leave as object or convert to datetime
    "Wojewodztwo": "category",  # Leave as object or map to category
    # "Miejscowosc": "object",  # Leave as object or map to category
}
ls = list(set(list(dtype_dict.keys())) & set(common_cols))
new_dtype_dict = {}
for element in ls:
    new_dtype_dict[element] = dtype_dict[element]

df_train = df_train[ls].astype(new_dtype_dict)
df_test = df_test[ls].astype(new_dtype_dict)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(df_train.iloc[:, df_train.columns != "Cena"], df_train['Cena'], test_size=.2)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import numpy as np

# Parametry do tuningu
param_grid = {
    "n_estimators": [50, 100, 200, 300],  # Liczba drzew
    "max_depth": [3, 5, 7, 9],  # Maksymalna głębokość drzewa
    "learning_rate": [0.01, 0.05, 0.1, 0.2],  # Szybkość uczenia
    "subsample": [0.6, 0.8, 1.0],  # Frakcja próbek do treningu
    "colsample_bytree": [0.6, 0.8, 1.0],  # Frakcja cech do treningu
    "gamma": [0, 0.1, 0.2],  # Minimalna redukcja straty do podziału
    "reg_alpha": [0, 0.1, 1],  # Regularyzacja L1 (alpha)
    "reg_lambda": [0, 0.1, 1],  # Regularyzacja L2 (lambda)
}
param_grid_restricted = {
    "n_estimators": [100, 200],  # Zamiast [50, 100, 200, 300]
    "max_depth": [5, 7],  # Zamiast [3, 5, 7, 9]
    "learning_rate": [0.05, 0.1],  # Zamiast [0.01, 0.05, 0.1, 0.2]
    "subsample": [0.8, 1.0],  # Zamiast [0.6, 0.8, 1.0]
    "colsample_bytree": [0.8, 1.0],  # Zamiast [0.6, 0.8, 1.0]
    "gamma": [0, 0.1],  # Zamiast [0, 0.1, 0.2]
    "reg_alpha": [0, 0.1],  # Zamiast [0, 0.1, 1]
    "reg_lambda": [0, 0.1],  # Zamiast [0, 0.1, 1]
}

# Parametry specyficzne dla GPU
params = {
    "tree_method": "gpu_hist",  # Użyj GPU do budowy drzew
    "objective": "reg:squarederror",  # Zadanie regresji
    "eval_metric": "rmse",  # Metryka RMSE
    "gpu_id": 0,  # Użyj GPU o indeksie 0
    "predictor": "gpu_predictor",  # Użyj GPU do predykcji
}

# Inicjalizacja modelu XGBRegressor z parametrami GPU
xgb = XGBRegressor(**params, enable_categorical=True, random_state=42)

# Konfiguracja Grid Search
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_restricted,
    scoring="neg_mean_squared_error",  # Metryka do optymalizacji (RMSE)
    cv=5,  # 5-krotna walidacja krzyżowa
    n_jobs=-1,  # Użyj wszystkich dostępnych rdzeni CPU
    verbose=2,  # Wyświetl postęp
)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=50,  # Przetestuj tylko 50 losowych kombinacji
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Trenowanie modelu z Grid Search
grid_search.fit(X_train, y_train)

# Najlepsze parametry i wynik
print("Najlepsze parametry:", grid_search.best_params_)
print("Najlepszy wynik (Negative MSE):", grid_search.best_score_)

# Ocena modelu na zbiorze testowym
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

from sklearn.metrics import mean_squared_error, r2_score

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE na zbiorze testowym: {rmse}")
print(f"R² na zbiorze testowym: {r2}")

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


In [None]:
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=50,  # Przetestuj tylko 50 losowych kombinacji
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [52]:
bst.predict(X_test)

array([132307.61 ,  17083.883, 109430.82 , ..., 109430.82 ,  17083.883,
        17083.883], dtype=float32)