In [None]:
import numpy as np
import pandas as pd

# Import cross-validation functions
from sklearn.model_selection import KFold, train_test_split, cross_val_score

# Import One-Hot encoder
from sklearn.preprocessing import OneHotEncoder

# Import evaluation metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Import XGBoost library
import xgboost as xgb

In [2]:
# Load the car listings into a DataFrame. low_memory=False makes pandas parse
# the file in one pass instead of in chunks, which avoids mixed-dtype columns.
df = pd.read_csv(
    './datasets/cars_dataset.csv',
    low_memory=False,
)

In [3]:
# Number of rows in the dataset
df.shape[0]

195928

In [4]:
# Column to predict
target_variable = 'price'

# Input columns, grouped by how they are handled: the categorical ones are
# one-hot encoded further down (note that 'year' is treated as a category),
# the numerical ones are used as they are.
categorical_features = ['make', 'model', 'trim', 'year', 'transmission']
numerical_features = ['kilometers', 'age_years']
all_features = [*categorical_features, *numerical_features]

In [5]:
# Preview the first five rows, restricted to the feature columns
df[all_features].head()

Unnamed: 0,make,model,trim,year,transmission,kilometers,age_years
0,volkswagen,touareg,sel,2014,automatic,29000,3.0
1,ford,focus,trend,2014,automatic,23000,3.0
2,kia,rio,ex,2015,automatic,40000,2.0
3,dodge,challenger,other,2013,automatic,28000,4.0
4,hyundai,veloster,other,2013,automatic,120000,4.0


In [6]:
# One-hot encode the categorical columns. The numerical features and the
# target are not listed in `columns`, so they pass through unchanged.
final_df = pd.get_dummies(
    df[[*all_features, target_variable]],
    prefix=categorical_features,
    columns=categorical_features,
)

In [7]:
# Model inputs: every column of the encoded frame except the target
oh_features = list(final_df.columns)
del oh_features[oh_features.index(target_variable)]

In [8]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    final_df[oh_features],
    final_df[[target_variable]],
    random_state=70,
    test_size=0.2,
)

In [9]:
# Report how many rows ended up in each subset
print(' Training set size: %d \n Test set size: %d' % (X_train.shape[0], X_test.shape[0]))

 Training set size: 156742 
 Test set size: 39186


In [10]:
# Gradient-boosted tree regressor: 500 estimators, each tree at most 5 levels deep
regressor = xgb.XGBRegressor(
    max_depth=5,
    n_estimators=500,
)

In [11]:
# Train the regressor on the training split.
# fit() is the last expression in the cell, so the value it returns is
# displayed as the cell output.
regressor.fit(X_train, y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [12]:
# Predict prices for the held-out test rows.
# The result is reused by the error-metric cells that follow.
predictions = regressor.predict(X_test)

In [13]:
# Mean squared error of the test-set predictions (in squared price units)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print('Mean squared error = {0:.6f}'.format(mse))

Mean squared error = 1176143117.425487


In [14]:
# Mean absolute error of the test-set predictions (in the same units as price)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print('Mean absolute error = {0:.6f}'.format(mae))

Mean absolute error = 17162.403842


In [15]:
# R^2 score (coefficient of determination): 1 means that all predictions are perfect.
# Computed with the imported sklearn metric from the predictions made earlier,
# in the same way as the MSE and MAE cells, so the test set is not predicted a
# second time.
# The value is stored in `r2`, not `r2_score`: assigning to `r2_score` would
# overwrite the function imported from sklearn.metrics, and any later call to
# it would fail with "'float' object is not callable".
r2 = r2_score(y_test, predictions)
print('R^2 score (explained variance): {0:.6f}'.format(r2))

R^2 score (explained variance): 0.923917
