## Showing data

In [1]:
import pandas as pd
# Load the raw car-sales CSV and display it to inspect the columns and spot missing values.
data = pd.read_csv('car-sales-extended-missing-data.csv')
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [2]:
# Count the missing (NaN) entries in each column.
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [3]:
# Inspect column dtypes: object columns must be encoded as numbers before modelling.
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

### Steps to follow (all in one cell):
1. Fill missing values.
2. Convert data to numbers.
3. Build a model on the data.

In [4]:
#Getting data ready 
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

#Setup random seed
# Seeding NumPy's global RNG makes the split and the forest in this cell repeatable:
# train_test_split and RandomForestRegressor are called without random_state,
# and scikit-learn then draws from the global np.random state.
import numpy as np
np.random.seed(9)

# Load the dataset and discard rows whose target ('Price') is missing --
# a row without a label cannot be used for training or scoring.
data = (
    pd.read_csv('car-sales-extended-missing-data.csv')
    .dropna(subset=['Price'])
)

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
cat_features = ['Make', 'Colour']
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

# Door count: fill gaps with the constant 4.
door_feature = ['Doors']
door_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=4)),
])

# Odometer reading: fill gaps with the column mean.
numeric_feature = ['Odometer (KM)']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Setting up the preprocessing step (fill missing data, convert to numbers):
# each column group is routed through its own transformer.
preprocessor = ColumnTransformer([
    ('cat', cat_transformer, cat_features),
    ('door', door_transformer, door_feature),
    ('num', numeric_transformer, numeric_feature),
])

# Full estimator: preprocessing followed by a random forest regressor.
# The step names ('preprocessor', 'model') are what parameter paths such as
# 'model__n_estimators' refer to.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor()),
])

# Separate the features from the target and hold out 20% of the rows for testing.
X = data.drop(columns='Price')
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train on the training split, then report the score on the held-out split.
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.32675658770525595

In [7]:
# Hyperparameter grid for the whole pipeline. Keys follow scikit-learn's
# '<step>__<param>' convention, so the search tunes both the numeric imputer
# and the random forest.
pipe_grid = {
    'preprocessor__num__imputer__strategy' : ['mean','median'],
    'model__n_estimators' : [200,1000],
    'model__max_depth' : [30],
    # 1.0 = consider every feature at each split. That is what 'auto' meant for
    # RandomForestRegressor; 'auto' was deprecated in scikit-learn 1.1 and
    # removed in 1.3, where passing it raises a parameter-validation error.
    'model__max_features' : [1.0,'sqrt'],
    'model__min_samples_split' : [4],
    'model__min_samples_leaf' : [2,4],
}

# 2 x 2 x 1 x 2 x 1 x 2 = 16 candidates, each evaluated with 5-fold
# cross-validation on the training split only (the test split stays unseen).
gs_model = GridSearchCV(model,pipe_grid,cv=5,verbose=2)
gs_model.fit(X_train,y_train);

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] model__max_depth=30, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__max_depth=30, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean, total=   0.4s
[CV] model__max_depth=30, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  model__max_depth=30, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean, total=   0.4s
[CV] model__max_depth=30, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=30, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean, total=   0.4s
[CV] model__max_depth=30, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=30, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean, total=   0.4s
[CV] model__max_depth=30, model__max_features=auto, model__min_samples_leaf=2, m

[CV]  model__max_depth=30, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=30, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=30, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median, total=   0.3s
[CV] model__max_depth=30, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=30, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median, total=   0.4s
[CV] model__max_depth=30, model__max_features=auto, model__min_samples_l

[CV]  model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median, total=   0.3s
[CV] model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median, total=   0.3s
[CV] model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median, total=   0.3s
[CV] model__max_depth=30, model__max_features=sqrt, model__min_samples

[CV]  model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean, total=   1.4s
[CV] model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean, total=   1.2s
[CV] model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean, total=   1.3s
[CV] model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.3min finished


In [8]:
# Score the grid search's best pipeline on the held-out test split.
gs_model.score(X_test,y_test)

0.37111429887575276