In this notebook I will demonstrate the process of model building and selection to predict used bike prices.

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Data

I will split the data into training and testing sets, and during the model-building phase I will use only the training set. Think of the testing set as "new" data that I cannot see at all until I have finished building my models.

In [None]:
from sklearn.model_selection import train_test_split

# Load the raw listings; 'price' is the response, every other column is a predictor.
full_data = pd.read_csv('../input/used-bikes-prices-in-india/Used_Bikes.csv')
y = full_data['price']
X = full_data.drop(columns = 'price')

# Hold out 33% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.33,
    random_state = 2207
)

In [None]:
# Preview the first rows of the training predictors.
X_train.head()

In [None]:
# Summarise the training predictors: column dtypes and non-null counts.
X_train.info()

## No missing values
## 3 continuous and 4 categorical predictors

In [None]:
# Display the training response (the 'price' column of the training rows).
y_train

# Response

In [None]:
# Side-by-side histograms of the training response:
# left panel on the raw scale, right panel on a log-scaled x-axis.
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 5))
panel_specs = zip(ax, (None, True), ('Price', 'Price, in log-scale'))
for panel, use_log, panel_title in panel_specs:
    sns.histplot(y_train, kde = True, log_scale = use_log, ax = panel).set_title(panel_title)
plt.show()

## The distribution of the response variable is skewed,
# so it may be a good idea to try log-transforming the response later.

# Continuous predictor

In [None]:
# One regression jointplot per continuous predictor:
# log1p(predictor) on the x-axis against log(price) on the y-axis.
log_price = np.log(y_train)

for column in ('power', 'age', 'kms_driven'):
    sns.jointplot(
        x = np.log1p(X_train[column]),
        y = log_price,
        kind = 'reg'
    )

plt.show()

# Preprocessors

In [None]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

# Column-wise preprocessing: each listed column is routed to its own transformer.
#   - continuous columns  -> log1p
#   - categorical columns -> one-hot encoding (categories unseen during fit are ignored)
# The bike name is deliberately left out so that it is dropped entirely.
# NOTE(review): presumably every other column not listed here is dropped as well
# (DataFrameMapper's default handling of unselected columns), so only two of the
# four categorical predictors reach the model — confirm this is intended.
preprocessor = DataFrameMapper(
    [
        ([column], FunctionTransformer(np.log1p))
        for column in ('kms_driven', 'age', 'power')
    ] + [
        ([column], OneHotEncoder(handle_unknown = 'ignore'))
        for column in ('city', 'brand')
    ],
    df_out = True
)

# Model

In [None]:
## Regularized linear regression. The regularization strength (alpha) is not
## fixed here; it is chosen later by cross-validated grid search.

from sklearn.linear_model import Lasso, Ridge

# Candidate regularization strengths, one per decade from 1e-4 to 1e3.
alphas = [
    0.0001, 0.001, 0.01, 0.1,
    1, 10, 100, 1000
]

# Base estimators with default settings; alpha is overridden by the grid search.
lasso, ridge = Lasso(), Ridge()

# Joining into pipeline

The preprocessor and the model are joined into a pipeline, so raw data can be fed straight into it. The preprocessors, transformers and models inside the pipeline are applied automatically (there is no need to transform the data manually).

In [None]:
from sklearn.pipeline import Pipeline

# Each pipeline chains the shared preprocessing step ('prep') with one
# regularized regressor ('clf'); the step names are what the grid search
# uses to address hyper-parameters (e.g. 'clf__alpha').
lasso_pipeline = Pipeline(steps = [('prep', preprocessor), ('clf', lasso)])
ridge_pipeline = Pipeline(steps = [('prep', preprocessor), ('clf', ridge)])

# Wrapping the grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Wrap each pipeline in a grid search over the regressor's alpha.
# Candidates are ranked by negative MSE (higher is better), so the
# alpha with the lowest cross-validated squared error is selected.
lasso_gs = GridSearchCV(
    estimator = lasso_pipeline,
    param_grid = dict(clf__alpha = alphas),
    scoring = 'neg_mean_squared_error'
)

ridge_gs = GridSearchCV(
    estimator = ridge_pipeline,
    param_grid = dict(clf__alpha = alphas),
    scoring = 'neg_mean_squared_error'
)

# Cross-validation score on training set

This is just to demonstrate how I can gauge/evaluate the performance of my model only using the training set. Remember, up until now, I haven't seen the test set yet! ;)

In [None]:
%%capture --no-display --no-stdout
# This just jupyter magic
# to suppress so many ConvergenceWarnings that becomes
# annoying to read.

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

# I have to define RMSE manually because regression models in
# sklearn have R^2 as default scoring method.
# Also, sklearn only provides -RMSE instead of RMSE.
RMSE_score = make_scorer(mean_squared_error, squared = False)

lasso_score = cross_val_score(
    lasso_gs,
    X_train, y_train,
    scoring = RMSE_score
)

ridge_score = cross_val_score(
    ridge_gs,
    X_train, y_train,
    scoring = RMSE_score
)

In [None]:
# Per-fold RMSE for both models, one row per outer CV fold ...
_score = pd.DataFrame({'lasso': lasso_score, 'ridge': ridge_score})

# ... plus a final 'mean' row with the average over the folds.
_score = pd.concat([_score, _score.mean().rename('mean').to_frame().T])
_score

In [None]:
%%capture

# Fitting the model using entire data
lasso_pipeline.fit(X_train, y_train)
ridge_pipeline.fit(X_train, y_train)

# The final test

In [None]:
from sklearn.metrics import mean_squared_error

# Predict prices for the held-out test set with the fitted pipelines.
lasso_pred = lasso_pipeline.predict(X_test)
ridge_pred = ridge_pipeline.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of mean_squared_error(squared = False):
# the `squared` keyword was deprecated and later removed from scikit-learn,
# while this form returns the same value on every version.
print("Lasso test RMSE:", np.sqrt(mean_squared_error(y_test, lasso_pred)))
print("Ridge test RMSE:", np.sqrt(mean_squared_error(y_test, ridge_pred)))

Thanks for visiting my notebook!