# Putting All Together

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
data=pd.read_csv("car-sales-extended-missing-data.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [3]:
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [4]:
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

Steps we have to do in one cell:
    
    1. Fill the missing values.
    
    2. Convert data into numbers.
    
    3. Build a model on data.

In [5]:
# Getting data ready
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split , GridSearchCV

np.random.seed(42)

# Import Data And drop rows with missing labels
data=pd.read_csv("car-sales-extended-missing-data.csv")
data.dropna(subset=["Price"],inplace=True)
 
# Define different features and Pipelines
categorical_features=["Make","Colour"]
categorical_transformer=Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="constant" , fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

door_features=["Doors"]
door_transformer=Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="constant" , fill_value=4))
])
numeric_features=["Odometer (KM)"]
numeric_transformer=Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="mean"))
])

# Setup Preprocessing steps (fill missing values, then convert them into numbers)
preprocessor=ColumnTransformer(
                               transformers=[
                                   ("cat",categorical_transformer,categorical_features),
                                   ("door",door_transformer,door_features),
                                   ("num",numeric_transformer,numeric_features)
                                   
                               ]
)

# Creating a preprocessing and modelling pipeline
model=Pipeline(steps=[("preprocessor",preprocessor),
                      ("model",RandomForestRegressor())
                     ])

# Split Data
X=data.drop("Price", axis=1)
y=data["Price"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

# Fit and score model
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.22188417408787875

### It's possible to use GridSearchCV or RandomizedSearchCV with pipeline

In [7]:
# Use GridSearchCV with our regression Pipeline
from sklearn.model_selection import GridSearchCV

pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [100, 1000],
    "model__max_depth": [None, 5],
    "model__max_features": ["sqrt","log2"],
    "model__min_samples_split": [2, 4]    
}

gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train,y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samp

In [13]:
gs_model.best_score_
gs_model.best_params_

{'model__max_depth': 5,
 'model__max_features': 'sqrt',
 'model__min_samples_split': 4,
 'model__n_estimators': 100,
 'preprocessor__num__imputer__strategy': 'median'}

In [8]:
gs_model.score(X_test,y_test)

0.28553567798887214

In [9]:
# pipeline1 = Pipeline([('std', StandardScaler()),('reg1', MultiOutputRegressor(SVR()))])

In [10]:
gcv = GridSearchCV(estimator=model,
                       param_grid=pipe_grid,
                       
                       n_jobs=-1,
                       
                       verbose=0,
                       refit='r2',
                       return_train_score=True)

In [11]:
# Split data in train and test set
    # X_train, X_test, y_train, y_test = train_test_split(weighted_grid, params, test_size=0.2,random_state=1)
# Take SVR model from the gridcvs dictionary
gcv_model_select = gcv
gcv_model_select.fit(X_train, y_train)
    # results = gcv_model_select.cv_results_
    # bm = gcv_model_select.best_estimator_