In [13]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split


In [125]:
df_train = pd.read_csv('Cleaned_train.csv', index_col=["ID"])
df_test = pd.read_csv('Cleaned_test.csv', index_col=["ID"])
test_id = df_test.index
common_cols = list(set(df_train.columns) & set(df_test.columns))
df_train = df_train[common_cols]
df_test = df_test[common_cols]

In [126]:
df_test.columns.tolist()

['Sunroof',
 'Electrochromic side mirrors',
 'Shift paddles',
 'Heated front seats',
 'CD',
 'Used',
 'Rain sensor',
 'Parking assistant',
 'Alloy wheels',
 'Pierwszy_wlasciciel',
 'Electrically adjustable seats',
 'Heated side mirrors',
 'TV tuner',
 'Four-zone air conditioning',
 'ASR (traction control)',
 'Rok_publikacji_oferty',
 'Gasoline + CNG',
 'HUD(head-up display)',
 'CD changer',
 'Diesel',
 'Power steering',
 'Wersja_pojazdu',
 'Wojewodztwo',
 'Immobilizer',
 'coupe',
 'Fog lights',
 'ESP(stabilization of the track)',
 'On-board computer',
 'compact',
 'Front side airbags',
 'Nazwa_generacji',
 'Heated rear seats',
 'Rok_startu_produkcji',
 'Bluetooth',
 'Isofix',
 'Leather upholstery',
 'Rear side airbags',
 'ABS',
 'Passengers airbag',
 'city_cars',
 'Gasoline + LPG',
 '4x4 (permanent)',
 'Manual',
 'SD socket',
 'LED lights',
 'Kraj_pochodzenia',
 'Central locking',
 'Rok_pierwszej_rejestracji',
 'Electrically adjustable mirrors',
 'MP3',
 'Air curtains',
 '4x4 (attached

In [127]:
dtype_dict = {
    # Category columns
    "Naped": "category",
    "Skrzynia_biegow": "category",
    "Typ_nadwozia": "category",
    "Kolor": "category",
    "Kraj_pochodzenia": "category",

    # String columns
    "Marka_pojazdu": "category",
    "Model_pojazdu": "string",
    # "Wersja_pojazdu": "string",
    # "Nazwa_generacji": "string",
    "Typ_silnika": "category",

    # Float columns
    "Rok_produkcji": "float64",
    "Przebieg_km": "float64",
    "Moc_KM": "float64",
    "Pojemnosc_cm3": "float64",
    "Emisja_CO2": "float64",
    "Liczba_drzwi": "float64",
    "Rok_startu_produkcji": "float64",
    "Rok_konca_produkcji": "float64",
    'Wspolczynnik_Rok_produkcji': "float64",
    'Wspolczynnik_Przebieg_km': "float64",
    'Wspolczynnik_Moc_KM': "float64",
    'Wspolczynnik_Pojemnosc_cm3': "float64",
    'Wspolczynnik_Liczba_drzwi': "float64",
    'Wspolczynnik_Automatic': "float64",
    'Wspolczynnik_Manual': "float64",
    'Wspolczynnik_SUV': "float64",
    'Wspolczynnik_sedan': "float64",
    'Wspolczynnik_station_wagon': "float64",

    # Integer columns
    "Rok_publikacji_oferty": "float",
    "Miesiac_publikacji_oferty": "float",

    # Boolean columns
    "Xenon lights": "bool",
    "Factory radio": "bool",
    "Velor upholstery": "bool",
    "DVD player": "bool",
    "Start-Stop system": "bool",
    "Electrically adjustable seats": "bool",
    "Fog lights": "bool",
    "Shift paddles": "bool",
    "Passengers airbag": "bool",
    "Daytime running lights": "bool",
    "Power steering": "bool",
    "Four-zone air conditioning": "bool",
    "Sunroof": "bool",
    "GPS navigation": "bool",
    "Manual air conditioning": "bool",
    "Heated side mirrors": "bool",
    "Parking assistant": "bool",
    "ESP(stabilization of the track)": "bool",
    "Immobilizer": "bool",
    "Leather upholstery": "bool",
    "Lane assistant": "bool",
    "Drivers airbag": "bool",
    "USB socket": "bool",
    "Electric front windows": "bool",
    "Blind spot sensor": "bool",
    "SD socket": "bool",
    "Speed limiter": "bool",
    "Electrochromic side mirrors": "bool",
    "LED lights": "bool",
    "AUX socket": "bool",
    "Auxiliary heating": "bool",
    "Heated front seats": "bool",
    "Rain sensor": "bool",
    "Rear parking sensors": "bool",
    "Twilight sensor": "bool",
    "Front side airbags": "bool",
    "Automatic air conditioning": "bool",
    "Roof rails": "bool",
    "Central locking": "bool",
    "ABS": "bool",
    "CD": "bool",
    "Bluetooth": "bool",
    "Dual zone air conditioning": "bool",
    "Alloy wheels": "bool",
    "Cruise control": "bool",
    "Airbag protecting the knees": "bool",
    "TV tuner": "bool",
    "Rear side airbags": "bool",
    "Multifunction steering wheel": "bool",
    "Air curtains": "bool",
    "On-board computer": "bool",
    "Active cruise control": "bool",
    "CD changer": "bool",
    "Electrochromic rear view mirror": "bool",
    "Tinted windows": "bool",
    "Adjustable suspension": "bool",
    "Rear view camera": "bool",
    "MP3": "bool",
    "Heated rear seats": "bool",
    "Electric rear windows": "bool",
    "Alarm": "bool",
    "Hook": "bool",
    "Heated windscreen": "bool",
    "Isofix": "bool",
    "Front parking sensors": "bool",
    "Panoramic roof": "bool",
    "Aftermarket radio": "bool",
    "HUD(head-up display)": "bool",
    "Electrically adjustable mirrors": "bool",
    "ASR (traction control)": "bool",
    'Marka_pojazdu_encoded': "float",
     'Marka_Audi': "float",
     'Marka_BMW': "float",
     'Marka_Citroën': "float",
     'Marka_Fiat': "float",
     'Marka_Ford': "float",
     'Marka_Honda': "float",
     'Marka_Hyundai': "float",
     'Marka_Jeep': "float",
     'Marka_Mazda': "float",
     'Marka_Mercedes-Benz': "float",
     'Marka_Mitsubishi': "float",
     'Marka_Nissan': "float",
     'Marka_Opel': "float",
     'Marka_Peugeot': "float",
     'Marka_Renault': "float",
     'Marka_Seat': "float",
     'Marka_Suzuki': "float",
     'Marka_Toyota': "float",
     'Marka_Volkswagen': "float",
     'Marka_Volvo': "float",
     'Marka_Škoda': "float",

    # Object columns (to be handled separately)
    "Pierwszy_wlasciciel": "category",  # Leave as object or map to category if needed
    "Cena": "float",  # Leave as object or convert to float after cleaning
    "Rok_pierwszej_rejestracji": "float",  # Leave as object or convert to datetime
    "Miesiac_pierwszej_rejestracji": "float",  # Leave as object or convert to datetime
    "Wojewodztwo": "category",  # Leave as object or map to category
    # "Miejscowosc": "object",  # Leave as object or map to category
}
ls = list(set(list(dtype_dict.keys())) & set(common_cols))
new_dtype_dict = {}
for element in ls:
    new_dtype_dict[element] = dtype_dict[element]

df_train = df_train[ls].astype(new_dtype_dict)
df_test = df_test[ls].astype(new_dtype_dict)

In [128]:
df_train

Unnamed: 0_level_0,Sunroof,Electrochromic side mirrors,Shift paddles,Heated front seats,CD,Rain sensor,Parking assistant,Alloy wheels,Pierwszy_wlasciciel,Electrically adjustable seats,...,Electric rear windows,Aftermarket radio,Auxiliary heating,Marka_pojazdu,Rok_produkcji,Automatic air conditioning,Tinted windows,Velor upholstery,Electric front windows,Roof rails
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,False,False,False,True,True,False,True,,False,...,False,False,False,Renault,2005.0,False,False,False,True,False
2,False,True,False,True,True,True,False,True,,False,...,True,False,False,Renault,2010.0,False,False,True,True,True
3,False,False,False,True,True,True,False,False,,False,...,False,False,False,Opel,2015.0,True,True,False,True,False
4,False,False,False,False,True,False,False,False,,False,...,False,False,False,Ford,2007.0,False,False,False,True,True
5,False,False,False,False,True,False,False,True,Yes,False,...,False,False,False,Toyota,2013.0,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135393,False,False,False,False,False,False,False,False,Yes,False,...,False,False,False,Opel,2018.0,True,False,False,True,False
135394,False,False,True,True,False,True,False,True,,False,...,True,False,False,Mercedes-Benz,2021.0,True,True,True,True,True
135395,False,True,False,False,True,True,False,True,,False,...,True,False,False,,,True,False,False,True,False
135396,False,False,False,False,True,False,False,False,,False,...,False,True,False,Renault,2001.0,False,False,True,True,False


In [144]:
df_train["Marka_Model"] = (df_train["Marka_pojazdu"].astype("str") + " " + df_train["Model_pojazdu"].astype("str")).astype("category")

df_test["Marka_Model"] = (df_test["Marka_pojazdu"].astype("str") + " " + df_test["Marka_pojazdu"].astype("str")).astype("category")

In [195]:
col_list = []
for col in df_train.columns:
    col_type = df_train[col].dtype

    if not pd.api.types.is_integer_dtype(col_type) and not pd.api.types.is_float_dtype(col_type) and not pd.api.types.is_bool_dtype(col_type):
        col_list.append(col)
df_train[col_list]


Unnamed: 0_level_0,Pierwszy_wlasciciel,Wojewodztwo,Kraj_pochodzenia,Kolor,Typ_silnika,Model_pojazdu,Marka_pojazdu,Marka_Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,,ŁÓDZKIE,,blue,,Grand Espace,Renault,Renault Grand Espace
2,,ŚLĄSKIE,,silver,1.6 16V,Megane,Renault,Renault Megane
3,,,Denmark,white,1.6 CDTI,Zafira,Opel,Opel Zafira
4,,,,blue,1.6 TDCi,Focus,Ford,Ford Focus
5,Yes,,Poland,other,1.8,Avensis,Toyota,Toyota Avensis
...,...,...,...,...,...,...,...,...
135393,Yes,DOLNOŚLĄSKIE,Poland,silver,,Astra,Opel,Opel Astra
135394,,MAZOWIECKIE,,white,,Vito,Mercedes-Benz,Mercedes-Benz Vito
135395,,MAZOWIECKIE,,black,,Zafira,,nan Zafira
135396,,,Germany,blue,1.2i,Clio,Renault,Renault Clio


In [202]:
X_train, X_test, y_train, y_test = train_test_split(df_train.iloc[:, df_train.columns != "Cena"], df_train['Cena'],test_size=.2)


In [203]:
car_model_category_mean = pd.concat([X_train, y_train], axis=1).groupby(['Marka_pojazdu'], observed=True)['Cena'].mean()
car_model_category_mean

Marka_pojazdu
Abarth         69845.939394
Acura          64162.416667
Aixam          27989.441863
Alfa Romeo     72820.290671
Alpine        419225.000000
                  ...      
Wołga          26250.000000
Zaporożec      33333.000000
Zastava         9733.333333
Škoda          53768.957331
Żuk            13500.000000
Name: Cena, Length: 101, dtype: float64

In [200]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108317 entries, 122222 to 11100
Data columns (total 90 columns):
 #   Column                           Non-Null Count   Dtype   
---  ------                           --------------   -----   
 0   Sunroof                          108317 non-null  bool    
 1   Electrochromic side mirrors      108317 non-null  bool    
 2   Shift paddles                    108317 non-null  bool    
 3   Heated front seats               108317 non-null  bool    
 4   CD                               108317 non-null  bool    
 5   Rain sensor                      108317 non-null  bool    
 6   Parking assistant                108317 non-null  bool    
 7   Alloy wheels                     108317 non-null  bool    
 8   Pierwszy_wlasciciel              32719 non-null   category
 9   Electrically adjustable seats    108317 non-null  bool    
 10  Heated side mirrors              108317 non-null  bool    
 11  TV tuner                         108317 non-null  boo

In [201]:
for index in car_model_category_mean.index:
    X_train.loc[X_train["Model_pojazdu"] == index, "Model_pojazdu_refactored"] = car_model_category_mean[index]
    X_test.loc[X_test["Model_pojazdu"] == index, "Model_pojazdu_refactored"] = car_model_category_mean[index]
    df_test.loc[df_test["Model_pojazdu"] == index, "Model_pojazdu_refactored"] = car_model_category_mean[index]


# X_test.drop("Marka_pojazdu", inplace=True)

In [None]:
car_brand_category_mean = pd.concat([X_train, y_train], axis=1).groupby(['Marka_pojazdu'], observed=True)['Cena'].mean()
for index in car_brand_category_mean.index:
    X_train.loc[X_train["Marka_pojazdu"] == index, "Marka_pojazdu_refactored"] = car_brand_category_mean[index]
    X_test.loc[X_test["Marka_pojazdu"] == index, "Marka_pojazdu_refactored"] = car_brand_category_mean[index]
    df_test.loc[df_test["Marka_pojazdu"] == index, "Marka_pojazdu_refactored"] = car_brand_category_mean[index]


In [205]:
del X_train["Model_pojazdu"]
del X_test["Model_pojazdu"]

In [206]:
del X_train["Marka_pojazdu"]
del X_test["Marka_pojazdu"]

In [188]:
# Downcasting data types
def reduce_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        col_type = df[col].dtype
        if pd.api.types.is_numeric_dtype(col_type):
            if pd.api.types.is_integer_dtype(col_type):
                df[col] = pd.to_numeric(df[col].round(), downcast='integer')
            elif pd.api.types.is_float_dtype(col_type):
                df[col] = pd.to_numeric(df[col].round(), downcast='float')
    return df

In [190]:
X_train = reduce_dtypes(X_train)
X_test = reduce_dtypes(X_test)

In [204]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108317 entries, 65843 to 37221
Data columns (total 90 columns):
 #   Column                           Non-Null Count   Dtype   
---  ------                           --------------   -----   
 0   Sunroof                          108317 non-null  bool    
 1   Electrochromic side mirrors      108317 non-null  bool    
 2   Shift paddles                    108317 non-null  bool    
 3   Heated front seats               108317 non-null  bool    
 4   CD                               108317 non-null  bool    
 5   Rain sensor                      108317 non-null  bool    
 6   Parking assistant                108317 non-null  bool    
 7   Alloy wheels                     108317 non-null  bool    
 8   Pierwszy_wlasciciel              32690 non-null   category
 9   Electrically adjustable seats    108317 non-null  bool    
 10  Heated side mirrors              108317 non-null  bool    
 11  TV tuner                         108317 non-null  bool

In [158]:
del X_train["Marka_Model"]
del X_test["Marka_Model"]

## Szukanie najlepszych parametrów

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import numpy as np
import cupy as cp

# Parametry do tuningu
param_grid = {
    # 5h <
    "n_estimators": [500, 195],  # Liczba drzew
    "max_depth": [9, 14, 16],  # Maksymalna głębokość drzewa
    "learning_rate": [ 0.05, 0.07],  # Szybkość uczenia
    "subsample": [1.0],  # Frakcja próbek do treningu
    "colsample_bytree": [0.6, 0.8, 1.0],  # Frakcja cech do treningu
    "gamma": [0.1, 0.2, 0.3],  # Minimalna redukcja straty do podziału
    "reg_alpha": [0, 0.1, 1],  # Regularyzacja L1 (alpha)
    "reg_lambda": [0, 0.1, 1],  # Regularyzacja L2 (lambda)
}
param_grid_restricted = {
    # 21 min
    "n_estimators": [ 500, 600],  # Zamiast [50, 100, 200, 300]
    "max_depth": [17],  # Zamiast [3, 5, 7, 9]
    "learning_rate": [0.04, 0.05],  # Zamiast [0.01, 0.05, 0.1, 0.2]
    "subsample": [0,9, 1],  # Zamiast [0.6, 0.8, 1.0]
    "colsample_bytree": [0.6],  # Zamiast [0.6, 0.8, 1.0]
    "gamma": [0.3, 0.04],  # Zamiast [0, 0.1, 0.2]
    "reg_alpha": [0.8, 1],  # Zamiast [0, 0.1, 1]
    "reg_lambda": [0.1, 0.06],  # Zamiast [0, 0.1, 1]
}

# Parametry specyficzne dla GPU
params = {
    # "device" : "cuda",
    "tree_method" : "gpu_hist",
    "objective": "reg:squarederror",  # Zadanie regresji
    "eval_metric": "rmse",  # Metryka RMSE
    "gpu_id": 0,  # Użyj GPU o indeksie 0
    "predictor": "gpu_predictor",  # Użyj GPU do predykcji
}

# Inicjalizacja modelu XGBRegressor z parametrami GPU
xgb = XGBRegressor(**params, enable_categorical=True, random_state=42)

# Konfiguracja Grid Search
# grid_search = GridSearchCV(
#     estimator=xgb,
#     param_grid=param_grid_restricted,
#     scoring="neg_mean_squared_error",  # Metryka do optymalizacji (RMSE)
#     cv=5,  # 5-krotna walidacja krzyżowa
#     n_jobs=-1,  # Użyj wszystkich dostępnych rdzeni CPU
#     verbose=2,  # Wyświetl postęp
# )

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid_restricted,
    n_iter=50,  # Przetestuj tylko 50 losowych kombinacji
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-2,
    verbose=3,
    random_state=42,
)

# Trenowanie modelu z Grid Search
# grid_search.fit(X_train, y_train)
random_search.fit(X_train, y_train)

# Najlepsze parametry i wynik
print("Najlepsze parametry:", random_search.best_params_)
print("Najlepszy wynik (Negative MSE):", random_search.best_score_)

# Ocena modelu na zbiorze testowym
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

from sklearn.metrics import mean_squared_error, r2_score

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE na zbiorze testowym: {rmse}")
print(f"R² na zbiorze testowym: {r2}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [208]:
y_pred = best_model.predict(df_test.loc[:, X_train.columns])

In [182]:
param_grid_restricted = {
    'subsample': [1],
    'reg_lambda': [0.06],
    'reg_alpha': [1],
    'n_estimators': [195, 196],
    'max_depth': [11],
    'learning_rate': [0.05],
    'gamma': [0.3],
    'colsample_bytree':[0.6]
}

# Parametry specyficzne dla GPU
params = {
    # "device" : "cuda",
    "tree_method" : "gpu_hist",
    "objective": "reg:squarederror",  # Zadanie regresji
    "eval_metric": "rmse",  # Metryka RMSE
    "gpu_id": 0,  # Użyj GPU o indeksie 0
    "predictor": "gpu_predictor",  # Użyj GPU do predykcji
}

# Inicjalizacja modelu XGBRegressor z parametrami GPU
xgb = XGBRegressor(**params, enable_categorical=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid_restricted,
    n_iter=50,  # Przetestuj tylko 50 losowych kombinacji
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-2,
    verbose=3,
    random_state=42
)

# Trenowanie modelu z Grid Search
# grid_search.fit(X_train, y_train)
random_search.fit(df_train.loc[:, (df_train.columns != "Cena") & (df_train.columns != "Model_pojazdu")],
        df_train["Cena"])

# Najlepsze parametry i wynik
print("Najlepsze parametry:", random_search.best_params_)
print("Najlepszy wynik (Negative MSE):", random_search.best_score_)

# Ocena modelu na zbiorze testowym
best_model = random_search.best_estimator_
y_pred = best_model.predict(df_test[:, (df_test.columns != "Model_pojazdu")])



Fitting 5 folds for each of 2 candidates, totalling 10 fits



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.



Najlepsze parametry: {'subsample': 1, 'reg_lambda': 0.06, 'reg_alpha': 1, 'n_estimators': 196, 'max_depth': 11, 'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 0.6}
Najlepszy wynik (Negative MSE): -1322341477.6955178


ValueError: feature_names mismatch: ['Sunroof', 'Electrochromic side mirrors', 'Shift paddles', 'Heated front seats', 'CD', 'Rain sensor', 'Parking assistant', 'Alloy wheels', 'Pierwszy_wlasciciel', 'Electrically adjustable seats', 'Heated side mirrors', 'TV tuner', 'Four-zone air conditioning', 'ASR (traction control)', 'Rok_publikacji_oferty', 'HUD(head-up display)', 'CD changer', 'Power steering', 'Wojewodztwo', 'Immobilizer', 'Fog lights', 'ESP(stabilization of the track)', 'On-board computer', 'Front side airbags', 'Heated rear seats', 'Rok_startu_produkcji', 'Bluetooth', 'Isofix', 'Leather upholstery', 'Rear side airbags', 'ABS', 'Passengers airbag', 'SD socket', 'LED lights', 'Kraj_pochodzenia', 'Central locking', 'Rok_pierwszej_rejestracji', 'Electrically adjustable mirrors', 'MP3', 'Air curtains', 'Miesiac_publikacji_oferty', 'Kolor', 'Airbag protecting the knees', 'Daytime running lights', 'Pojemnosc_cm3', 'Typ_silnika', 'Manual air conditioning', 'Liczba_drzwi', 'Panoramic roof', 'Electrochromic rear view mirror', 'Twilight sensor', 'Emisja_CO2', 'GPS navigation', 'Heated windscreen', 'Blind spot sensor', 'Przebieg_km', 'Speed limiter', 'Multifunction steering wheel', 'Rok_konca_produkcji', 'Hook', 'Dual zone air conditioning', 'Front parking sensors', 'AUX socket', 'Active cruise control', 'Drivers airbag', 'Adjustable suspension', 'Start-Stop system', 'Rear view camera', 'DVD player', 'Rear parking sensors', 'Miesiac_pierwszej_rejestracji', 'Cruise control', 'Moc_KM', 'Alarm', 'USB socket', 'Factory radio', 'Xenon lights', 'Lane assistant', 'Electric rear windows', 'Aftermarket radio', 'Auxiliary heating', 'Marka_pojazdu', 'Rok_produkcji', 'Automatic air conditioning', 'Tinted windows', 'Velor upholstery', 'Electric front windows', 'Roof rails', 'Marka_Model'] ['Sunroof', 'Electrochromic side mirrors', 'Shift paddles', 'Heated front seats', 'CD', 'Rain sensor', 'Parking assistant', 'Alloy wheels', 'Pierwszy_wlasciciel', 'Electrically adjustable seats', 'Heated side mirrors', 'TV tuner', 'Four-zone air conditioning', 'ASR (traction control)', 'Rok_publikacji_oferty', 'HUD(head-up display)', 'CD changer', 'Power steering', 'Wojewodztwo', 'Immobilizer', 'Fog lights', 'ESP(stabilization of the track)', 'On-board computer', 'Front side airbags', 'Heated rear seats', 'Rok_startu_produkcji', 'Bluetooth', 'Isofix', 'Leather upholstery', 'Rear side airbags', 'ABS', 'Passengers airbag', 'SD socket', 'LED lights', 'Kraj_pochodzenia', 'Central locking', 'Rok_pierwszej_rejestracji', 'Electrically adjustable mirrors', 'MP3', 'Air curtains', 'Miesiac_publikacji_oferty', 'Kolor', 'Airbag protecting the knees', 'Daytime running lights', 'Pojemnosc_cm3', 'Typ_silnika', 'Manual air conditioning', 'Liczba_drzwi', 'Panoramic roof', 'Electrochromic rear view mirror', 'Twilight sensor', 'Emisja_CO2', 'GPS navigation', 'Heated windscreen', 'Blind spot sensor', 'Przebieg_km', 'Speed limiter', 'Multifunction steering wheel', 'Rok_konca_produkcji', 'Hook', 'Dual zone air conditioning', 'Front parking sensors', 'AUX socket', 'Active cruise control', 'Drivers airbag', 'Adjustable suspension', 'Start-Stop system', 'Rear view camera', 'DVD player', 'Rear parking sensors', 'Miesiac_pierwszej_rejestracji', 'Cruise control', 'Moc_KM', 'Cena', 'Alarm', 'USB socket', 'Factory radio', 'Xenon lights', 'Lane assistant', 'Electric rear windows', 'Aftermarket radio', 'Auxiliary heating', 'Marka_pojazdu', 'Rok_produkcji', 'Automatic air conditioning', 'Tinted windows', 'Velor upholstery', 'Electric front windows', 'Roof rails', 'Marka_Model']
training data did not have the following fields: Cena

In [211]:
print(df_test.shape)
print(y_pred.shape)

anwser_df = pd.DataFrame({"ID": df_test.index,"Cena": y_pred})
anwser_df.to_csv("Anwser.csv", index=False)

(72907, 92)
(72907,)


In [73]:
# Parametry specyficzne dla GPU
param_grid_restricted = {
    # 21 min
    "n_estimators": [200],  # Zamiast [50, 100, 200, 300]
    "max_depth": [7],  # Zamiast [3, 5, 7, 9]
    "learning_rate": [0.05],  # Zamiast [0.01, 0.05, 0.1, 0.2]
    "subsample": [1],  # Zamiast [0.6, 0.8, 1.0]
    "colsample_bytree": [0.8],  # Zamiast [0.6, 0.8, 1.0]
    "gamma": [0],  # Zamiast [0, 0.1, 0.2]
    "reg_alpha": [0.1],  # Zamiast [0, 0.1, 1]
    "reg_lambda": [0.1],  # Zamiast [0, 0.1, 1]
}
params = {
    "tree_method": "gpu_hist",  # Użyj GPU do budowy drzew
    "objective": "reg:squarederror",  # Zadanie regresji
    "eval_metric": "rmse",  # Metryka RMSE
    "gpu_id": 0,  # Użyj GPU o indeksie 0
    "predictor": "gpu_predictor",  # Użyj GPU do predykcji
}

# Inicjalizacja modelu XGBRegressor z parametrami GPU
xgb = XGBRegressor(**params, enable_categorical=True, random_state=42)

# Konfiguracja Grid Search
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_restricted,
    scoring="neg_mean_squared_error",  # Metryka do optymalizacji (RMSE)
    cv=5,  # 5-krotna walidacja krzyżowa
    n_jobs=-1,  # Użyj wszystkich dostępnych rdzeni CPU
    verbose=2,  # Wyświetl postęp
)

# random_search = RandomizedSearchCV(
#     estimator=xgb,
#     param_distributions=param_grid,
#     n_iter=50,  # Przetestuj tylko 50 losowych kombinacji
#     scoring="neg_mean_squared_error",
#     cv=5,
#     n_jobs=-1,
#     verbose=2,
#     random_state=42,
# )

# Trenowanie modelu z Grid Search
grid_search.fit(X_train, y_train)

# Najlepsze parametry i wynik
print("Najlepsze parametry:", grid_search.best_params_)
print("Najlepszy wynik (Negative MSE):", grid_search.best_score_)

# Ocena modelu na zbiorze testowym
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

from sklearn.metrics import mean_squared_error, r2_score

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE na zbiorze testowym: {rmse}")
print(f"R² na zbiorze testowym: {r2}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.



Najlepsze parametry: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 1}
Najlepszy wynik (Negative MSE): -1444583433.5396085
RMSE na zbiorze testowym: 38406.08798380288
R² na zbiorze testowym: 0.7980510874610559



    E.g. tree_method = "hist", device = "cuda"



In [174]:
final_anwser = best_model.predict(df_test)

IndexError: Boolean index has wrong length: 89 instead of 27080

In [129]:
anwser_df = pd.DataFrame()
anwser_df.loc[:, "ID"] = X_test.index
anwser_df.loc[:, "Cena"] = final_anwser

In [130]:
anwser_df

Unnamed: 0,ID,Cena
0,22440,38665.882812
1,4159,20987.648438
2,80228,3643.710938
3,89706,7591.453125
4,74416,9124.679688
...,...,...
27075,124795,19188.574219
27076,64019,28267.101562
27077,107470,44034.601562
27078,47227,20445.632812


In [26]:
df_train["Marka_pojazdu"].unique()

['Renault', 'Opel', 'Ford', 'Toyota', 'BMW', ..., 'Autobianchi', 'Nysa', 'Vanderhall', 'Zaporożec', 'Saturn']
Length: 103
Categories (102, object): ['Abarth', 'Acura', 'Aixam', 'Alfa Romeo', ..., 'Zaporożec', 'Zastava', 'Škoda', 'Żuk']