# Import the BMW dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn import model_selection
import pandas_profiling as pp
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [None]:
# Load the BMW listings CSV into a DataFrame.
bmw_csv_path = '../input/used-car-dataset-ford-and-mercedes/bmw.csv'
df = pd.read_csv(bmw_csv_path)

In [None]:
# Take a quick look at the dataset: its dimensions first, then the per-column
# summary, then the first rows.
print(f"Shape of the BMW dataset: {df.shape}\n")

# Prints the column names, dtypes and non-null counts.
df.info()

# Last expression of the cell, so the notebook renders the first rows.
df.head()

The dataset has 9 columns and 10,781 rows, with no missing values.

# Auto EDA

In [None]:
# Build the automated profiling report and embed it in the notebook output.
report_title = "Exploratory Data Analysis of the BMW dataset"
eda = pp.ProfileReport(df, title=report_title, explorative=True)
eda.to_notebook_iframe()

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
df_train, df_test = model_selection.train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Re-index both splits to 0..n-1 so that positional indices (e.g. those
# produced by KFold.split) can be used directly with .loc.
# This is done *before* adding the column: reset_index returns a fresh
# DataFrame, so the assignment below no longer targets the object handed back
# by train_test_split, which can trigger pandas' SettingWithCopyWarning.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Placeholder fold id; overwritten once the folds are assigned.
df_train['kfold'] = -1

In [None]:
# Separate numerical features from categorical features.
# 'price' is the target and 'kfold' is bookkeeping, so neither is a feature.
excluded = ('price', 'kfold')
feats, cat_feats, num_feats = [], [], []
for name in df_train.columns:
    if name in excluded:
        continue
    feats.append(name)
    # Object-dtype columns are treated as categorical, everything else as numeric.
    if df_train[name].dtype == 'object':
        cat_feats.append(name)
    else:
        num_feats.append(name)
print(f"Numerical features: {num_feats}\nCategorical features: {cat_feats}\n")

In [None]:
# Tag every training row with the id of the fold in which it is held out.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (_, holdout_idx) in enumerate(kf.split(X=df_train)):
    # df_train has a 0..n-1 index, so the positional indices from split()
    # are valid .loc labels.
    df_train.loc[holdout_idx, 'kfold'] = fold

In [None]:
# Train one XGBoost regressor per fold: fit on the rows outside the fold, use
# the fold's rows for early stopping, and score on the held-out test split.
fold_rmse = []
for fold in range(5):
    # Rows tagged with the current fold form the validation set.
    X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
    X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)
    X_test = df_test.copy()

    y_train = X_train.price
    y_valid = X_valid.price
    y_test = X_test.price

    X_train = X_train[feats]
    X_valid = X_valid[feats]
    X_test = X_test[feats]

    # Fit the encoder on the training rows only; categories unseen during
    # fitting are encoded as all-zero rows (handle_unknown='ignore').
    # The default sparse output is densified with .toarray() rather than via
    # the `sparse=False` keyword, which newer scikit-learn releases no longer
    # accept.
    ohe = preprocessing.OneHotEncoder(handle_unknown='ignore')
    X_train_ohe = pd.DataFrame(ohe.fit_transform(X_train[cat_feats]).toarray())
    X_valid_ohe = pd.DataFrame(ohe.transform(X_valid[cat_feats]).toarray())
    X_test_ohe = pd.DataFrame(ohe.transform(X_test[cat_feats]).toarray())

    # All frames carry a fresh 0..n-1 index, so the column-wise concat aligns
    # each encoded row with its numerical features.
    X_train = pd.concat([X_train_ohe, X_train[num_feats]], axis=1)
    X_valid = pd.concat([X_valid_ohe, X_valid[num_feats]], axis=1)
    X_test = pd.concat([X_test_ohe, X_test[num_feats]], axis=1)

    # Baseline alternative:
    # ensemble.RandomForestRegressor(n_estimators=1200, random_state=42)
    # NOTE(review): newer XGBoost releases appear to expect
    # `early_stopping_rounds` in the constructor and `device="cuda"` in place
    # of gpu_hist / gpu_id / predictor -- confirm against the installed version.
    model = XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=fold,
                         tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(X_train, y_train,
              early_stopping_rounds=5,
              eval_set=[(X_valid, y_valid)],
              verbose=False)

    # The reported metric is the *root* mean squared error. Taking the square
    # root explicitly avoids the `squared=False` keyword, and the arguments
    # follow the documented (y_true, y_pred) order.
    rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    fold_rmse.append(rmse)
    print(f'Root mean squared error of fold {fold}: {rmse}')

# Single summary figure across all folds.
print(f'Mean RMSE across folds: {np.mean(fold_rmse)}')

# Improvement
This is just a baseline. Several things still need to be improved:
* Apply GridSearch to evaluate many models
* Feature engineering
* Create pipelines