In [13]:
!pip install seaborn -q
!pip install catboost -q
!pip install scikit-learn -q

import numpy as np
import pandas as pd

In [14]:
# Load the training table used by the CatBoost section of the notebook.
catboost_data = pd.read_csv(filepath_or_buffer="./input.csv")

In [15]:
# Column roles: categorical inputs, prediction targets, and identifier columns.
cat_features = ["model", "car_type", "fuel_type"]
targets = ["target_class", "target_reg"]
features2drop = ["car_id"]

# Model inputs are all columns that are neither a target nor a dropped identifier;
# the numeric inputs are whatever remains after removing the categorical ones.
excluded = set(targets) | set(features2drop)
filtered_features = [col for col in catboost_data.columns if col not in excluded]
num_features = [col for col in filtered_features if col not in cat_features]

print("cat_features", cat_features)
print("num_features", len(num_features))
print("targets", targets)

# Cast the categorical columns to str to get rid of NaN values.
for col in cat_features:
    as_text = catboost_data[col].astype(str)
    catboost_data[col] = as_text

cat_features ['model', 'car_type', 'fuel_type']
num_features 11
targets ['target_class', 'target_reg']


In [16]:
from sklearn.model_selection import train_test_split

# Regression setup: model inputs vs. the target_reg column, with a fixed-seed
# 80/20 train / hold-out split.
X = catboost_data[filtered_features].drop(columns=targets, errors="ignore")
y = catboost_data["target_reg"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

# Baseline: CatBoost regressor with library defaults, tracking RMSE.
baseline_params = {"cat_features": cat_features, "eval_metric": "RMSE"}
model = CatBoostRegressor(**baseline_params)

# No eval_set is passed here, so best_score_ only contains the training ("learn") score.
model.fit(X_train, y_train, verbose=500, plot=False)

print(model.best_score_)

Learning rate set to 0.045195
0:	learn: 17.1862834	total: 53.6ms	remaining: 53.5s
500:	learn: 7.8920293	total: 976ms	remaining: 972ms
999:	learn: 6.0539782	total: 1.86s	remaining: 0us
{'learn': {'RMSE': 6.053978188487934}}


In [18]:
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

# Hand-tuned CatBoost regressor, validated against the hold-out split.
cbr_params = dict(
    depth=4,
    iterations=1000,
    learning_rate=0.09,
    cat_features=cat_features,
    colsample_bylevel=0.99,
    max_bin=190,
    l2_leaf_reg=5,
    subsample=0.5,
)
cbr = CatBoostRegressor(**cbr_params)
cbr.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=500, plot=False)

print(cbr.best_score_)

# Prepare the submission frame the same way as the training data:
# categorical columns as str, then the same feature selection.
test = pd.read_csv('test.csv')
for col in cat_features:
    test[col] = test[col].astype(str)

x_test = test[filtered_features].drop(columns=targets, errors="ignore")

# Predict target_reg for every car and write the submission file.
y_pred = cbr.predict(x_test)
CatBoostReg_result = pd.DataFrame({'car_id': test['car_id'], 'target_reg': y_pred})
CatBoostReg_result.to_csv('catboost_result.csv', index=False)

0:	learn: 16.8709193	test: 17.5903213	best: 17.5903213 (0)	total: 2.65ms	remaining: 2.65s
500:	learn: 8.6450532	test: 12.2367538	best: 11.9853255 (111)	total: 512ms	remaining: 510ms
999:	learn: 7.2909046	test: 12.3802641	best: 11.9853255 (111)	total: 970ms	remaining: 0us

bestTest = 11.98532547
bestIteration = 111

Shrink model to first 112 iterations.
{'learn': {'RMSE': 7.2909045759399325}, 'validation': {'RMSE': 11.985325469871208}}


In [20]:
# LGBM
!pip install lightgbm -q

import lightgbm as lgb

# Reload the raw table for the LightGBM experiments.
df = pd.read_csv("input.csv")
cat_cols = ["car_type", "fuel_type", "model"]
drop_cols = ["car_id", "target_reg", "target_class"]

# Inputs are every column except the identifier and the two targets.
X = df.drop(columns=drop_cols)
y = df["target_reg"]

In [21]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column in place. A single encoder object is
# refit for every column, so afterwards it only holds the last column's mapping.
label_encoder = LabelEncoder()
for column in cat_cols:
    encoded = label_encoder.fit_transform(X[column])
    X[column] = encoded

In [22]:
# Fixed-seed 80/20 train / hold-out split of the label-encoded frame.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
from lightgbm import Dataset

# Native LightGBM datasets for the train / hold-out split.
# free_raw_data=False asks LightGBM not to free the raw frames after construction.
train_data = Dataset(
    X_train,
    label=y_train,
    categorical_feature=cat_cols,
    free_raw_data=False,
)

# A validation Dataset must point at the training Dataset via `reference`,
# so that it is binned with the training set's feature discretisation
# instead of building its own from the hold-out rows.
val_data = Dataset(
    X_test,
    label=y_test,
    reference=train_data,
    categorical_feature=cat_cols,
    free_raw_data=False,
)

In [24]:
from lightgbm import LGBMRegressor, train

# Baseline: LightGBM regressor with default hyper-parameters, reporting RMSE
# on the hold-out split.
reg = LGBMRegressor(metric="RMSE")

holdout_sets = [(X_test, y_test)]
reg.fit(X_train, y_train, eval_set=holdout_sets)

print(reg.best_score_)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1634
[LightGBM] [Info] Number of data points in the train set: 1869, number of used features: 13
[LightGBM] [Info] Start training from score 44.797913
defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('rmse', 12.286649812737659)])})


In [25]:
# Tuned LightGBM regressor: a small ensemble of shallow trees, evaluated on the
# hold-out split with RMSE.
reg = LGBMRegressor(
    n_estimators=40,
    learning_rate=0.09,
    num_leaves=8,
    metric="RMSE",
)

reg.fit(
    X_train,
    y_train,
    eval_set=[
        (X_test, y_test),
    ],
    # Name the categorical columns explicitly instead of assuming they sit at
    # positions 0-2, which silently breaks if the CSV column order changes.
    categorical_feature=cat_cols,
)

print(reg.best_score_)

# Build the submission frame: drop the identifier and the unused target column.
test = pd.read_csv("./test.csv")

drop_cols = ["car_id", "target_class"]
x_test = test.drop(columns=drop_cols)

for col in cat_cols:
    # Encode with the mapping learned from the training table. Refitting the
    # encoder on the test column would assign integer codes independently of
    # training, so the same category could get a different code than the one
    # the model was trained on. Categories absent from the training data raise
    # a ValueError here instead of being silently mis-encoded.
    train_encoder = LabelEncoder().fit(df[col])
    x_test[col] = train_encoder.transform(x_test[col])

# Predict target_reg for every car and write the submission file.
y_pred = reg.predict(x_test)
LGBMReg_result = pd.DataFrame({'car_id': test['car_id'], 'target_reg': y_pred})
LGBMReg_result.to_csv('lbgm_result.csv', index=False)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000180 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1636
[LightGBM] [Info] Number of data points in the train set: 1869, number of used features: 13
[LightGBM] [Info] Start training from score 44.797913
defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('rmse', 11.980153054703948)])})


In [26]:
# XGBoost
!pip install xgboost -q
import xgboost as xgb
# NOTE: this installs a blanket "ignore" filter, so every Python warning raised
# after this point (including library deprecation warnings) is hidden.
import warnings; warnings.filterwarnings("ignore")

In [28]:
# Reload the raw table for the XGBoost experiments.
xgb_data = pd.read_csv("./input.csv")
drop_cols = ['car_id', 'target_reg', 'target_class']
cat_cols = ['car_type', 'fuel_type', 'model']

# Inputs are every column except the identifier and the two targets.
X = xgb_data.drop(columns=drop_cols)
y = xgb_data['target_reg']

# Store the categorical columns with pandas' 'category' dtype before splitting,
# so the train and hold-out frames share the same category sets.
X = X.astype({name: 'category' for name in cat_cols})

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
from sklearn.metrics import mean_poisson_deviance, mean_squared_error

# Baseline XGBoost regressor with native categorical support.
# early_stopping_rounds is configured on the estimator: passing it to fit() is
# deprecated (and rejected by newer XGBoost releases), while the constructor
# form is available in every version that accepts a callable eval_metric here.
# NOTE(review): the eval log shows both rmse and mean_poisson_deviance; early
# stopping presumably tracks the latter while this cell reports RMSE - confirm
# that this is intended.
reg = xgb.XGBRegressor(
    tree_method="hist",
    eval_metric=mean_poisson_deviance,
    enable_categorical=True,
    n_estimators=30,
    n_jobs=-1,
    early_stopping_rounds=10,
)
reg.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=3,
)

# Predict with the trees up to (and including) the best early-stopping iteration
# and display the hold-out RMSE as the cell result.
y_pred = reg.predict(X_test, iteration_range=(0, reg.best_iteration + 1))
mean_squared_error(y_test, y_pred) ** 0.5

[0]	validation_0-rmse:15.73818	validation_0-mean_poisson_deviance:4.83291
[3]	validation_0-rmse:13.04271	validation_0-mean_poisson_deviance:3.25132
[6]	validation_0-rmse:12.64687	validation_0-mean_poisson_deviance:3.07632
[9]	validation_0-rmse:12.65160	validation_0-mean_poisson_deviance:3.07671
[12]	validation_0-rmse:12.67780	validation_0-mean_poisson_deviance:3.08867
[15]	validation_0-rmse:12.75195	validation_0-mean_poisson_deviance:3.13961
[18]	validation_0-rmse:12.88637	validation_0-mean_poisson_deviance:3.20034


12.621295288019933

In [30]:
# Tuned XGBoost regressor: more rounds, coarser histograms and strong L1/L2
# regularisation, with native categorical support.
# early_stopping_rounds is configured on the estimator: passing it to fit() is
# deprecated (and rejected by newer XGBoost releases), while the constructor
# form is available in every version that accepts a callable eval_metric here.
reg = xgb.XGBRegressor(
    tree_method="hist",
    eval_metric=mean_poisson_deviance,
    enable_categorical=True,
    n_estimators=80,
    n_jobs=-1,
    min_child_weight=16,
    max_bin=128,
    reg_alpha=275,
    reg_lambda=275,
    early_stopping_rounds=10,
)

reg.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=3,
)

# Predict with the trees up to (and including) the best early-stopping iteration
# and display the hold-out RMSE as the cell result.
y_pred = reg.predict(X_test, iteration_range=(0, reg.best_iteration + 1))
mean_squared_error(y_test, y_pred) ** 0.5

[0]	validation_0-rmse:16.96575	validation_0-mean_poisson_deviance:5.67149
[3]	validation_0-rmse:14.67370	validation_0-mean_poisson_deviance:4.13927
[6]	validation_0-rmse:13.60951	validation_0-mean_poisson_deviance:3.54065
[9]	validation_0-rmse:13.03824	validation_0-mean_poisson_deviance:3.24689
[12]	validation_0-rmse:12.70254	validation_0-mean_poisson_deviance:3.08529
[15]	validation_0-rmse:12.46497	validation_0-mean_poisson_deviance:2.97226
[18]	validation_0-rmse:12.30981	validation_0-mean_poisson_deviance:2.90461
[21]	validation_0-rmse:12.22979	validation_0-mean_poisson_deviance:2.86940
[24]	validation_0-rmse:12.15357	validation_0-mean_poisson_deviance:2.83741
[27]	validation_0-rmse:12.10781	validation_0-mean_poisson_deviance:2.81949
[30]	validation_0-rmse:12.06578	validation_0-mean_poisson_deviance:2.80452
[33]	validation_0-rmse:12.05146	validation_0-mean_poisson_deviance:2.80217
[36]	validation_0-rmse:12.01737	validation_0-mean_poisson_deviance:2.78888
[39]	validation_0-rmse:12.002

11.983180884345987

In [31]:
# Build the submission frame: drop the identifier and the unused target column.
test = pd.read_csv('./test.csv')

drop_cols = ['car_id', 'target_class']
x_test = test.drop(columns=drop_cols)

for col in cat_cols:
    # Reuse the training categories. A plain astype('category') would derive
    # the category set from the test data alone, so the integer codes behind
    # each label could differ from the ones the model was fitted on.
    # Labels that never occurred in training become NaN (missing).
    x_test[col] = pd.Categorical(
        x_test[col],
        categories=X_train[col].cat.categories,
    )

# Predict target_reg for every car and write the submission file.
y_pred = reg.predict(x_test)
XGBReg_result = pd.DataFrame({'car_id': test['car_id'], 'target_reg': y_pred})
XGBReg_result.to_csv('xgb_result.csv', index=False)