In [13]:
from warnings import filterwarnings
filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("poster")
sns.set_palette("dark")
sns.set_style("white")

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LinearRegression

# import dataset

In [9]:
df = pd.read_feather("dataset/carUsedPriceFix.feather")

In [11]:
X = df.drop(columns="price").copy()
y = df["price"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((61268, 8), (15318, 8), (61268,), (15318,))

# pipeline

In [27]:
X_train.head()

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
30718,C Class,2018,Semi-Auto,10798,Diesel,120.256183,55.166825,2.0
19353,Crossland X,2019,Manual,8971,Petrol,145.0,44.8,1.2
32951,Focus,2017,Manual,28001,Petrol,120.256183,55.166825,1.0
29920,C Class,2015,Manual,43927,Diesel,120.256183,55.166825,2.1
75671,Aygo,2017,Manual,31245,Petrol,145.0,68.9,1.0


In [36]:
num_pipe = make_pipeline(KNNImputer(n_neighbors=5), StandardScaler())
cat_pipe = make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore"))

preprocesor = make_column_transformer(
    (num_pipe, X_train.select_dtypes(include=["int64", "float64"]).columns),
    (cat_pipe, X_train.select_dtypes(include=["object"]).columns)
)

pipeline_model = make_pipeline(preprocesor, LinearRegression(n_jobs=-1, normalize=True))

# training

In [38]:
pipeline_model.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('pipeline-1',
                                    Pipeline(steps=[('knnimputer', KNNImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    Index(['year', 'mileage', 'tax', 'mpg', 'engineSize'], dtype='object')),
                                   ('pipeline-2',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    Index(['model', 'transmission', 'fuelType'], dtype='object'))])),
  ('linearregression', LinearRegression(n_jobs=-1, normalize=True))],
 'verbose':

In [39]:
params_tune = {
     'linearregression__normalize': [True, False]
}

model = RandomizedSearchCV(pipeline_model, params_tune, cv=4, n_iter=10, n_jobs=-1, verbose=3, random_state=42)
model.fit(X_train, y_train)

Fitting 4 folds for each of 2 candidates, totalling 8 fits


RandomizedSearchCV(cv=4,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('pipeline-1',
                                                                               Pipeline(steps=[('knnimputer',
                                                                                                KNNImputer()),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               Index(['year', 'mileage', 'tax', 'mpg', 'engineSize'], dtype='object')),
                                                                              ('pipeline-2',
                                                                               Pipeline(steps=[('simpleimputer',
     

In [40]:
model.best_params_

{'linearregression__normalize': False}

In [42]:
model.score(X_train, y_train), model.score(X_test, y_test)

(0.8953969325320684, 0.8922710134533144)