# Predict car price

In [20]:
import pandas as pd
import numpy as np

from sklearn.inspection import permutation_importance
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_selector
from sklearn.model_selection import train_test_split
from sklearn import set_config; set_config(display='diagram')

In [6]:
data = pd.read_csv("data/cars_price.csv")
data.drop_duplicates(inplace=True)
data.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


👇 Build an optimal pipeline to predict the price of cars according to their specificities. Once your pipeline is ready, use `permutation_importance` to find out which feature is the most informative of the car price.

In [14]:
X = data.drop(columns=['price', 'car_ID', 'CarName'])
y = data['price']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)

## Preprocess and model the data in a pipeline

In [16]:
class DataframeFunctionTransformer():
    def __init__(self, func):
        self.func = func

    def transform(self, input_df, **transform_params):
        return self.func(input_df)

    def fit(self, X, y=None, **fit_params):
        return self

def process_dataframe(input_df):
    return pd.DataFrame(input_df.map({'four': 5, 'six': 6, 'five': 5, 'three': 3, 'twelve': 12, 'two': 2, 'eight': 8}))

In [18]:
# Impute then Scale for numerical variables: 
robust_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', RobustScaler())])

standard_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())])

oho_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

label_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

custom_transformer = Pipeline([
    ("custom", DataframeFunctionTransformer(process_dataframe))])

oho_columns = ['symboling', 'carbody', 'drivewheel', 'enginetype', 'fuelsystem']
label_columns = ['aspiration', 'fueltype', 'doornumber', 'enginelocation']
robust_columns = ['peakrpm','wheelbase', 'carlength', 'carwidth', 'enginesize', 'stroke', 'compressionratio', 'horsepower', 'citympg', 'highwaympg']
standard_columns = ['carheight', 'curbweight', 'boreratio']
custom_columns = 'cylindernumber'

preprocessor = ColumnTransformer([
    ('robust_transformer', robust_transformer, robust_columns),
    ('standard_transformer', standard_transformer, standard_columns),
    ('oho_transformer', oho_transformer, oho_columns),
    ('label_transformer', label_transformer, label_columns),
    ('custom_transformer', custom_transformer, custom_columns)],
    remainder='passthrough')

final_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_regression', LinearRegression())])

final_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_regression', LinearRegression())])
final_pipe

In [19]:
final_pipe.fit(X_train, y_train)
final_pipe.score(X_train, y_train)

0.9544259751565938

In [21]:
permutation_score = permutation_importance(final_pipe, X_train, y_train, n_repeats=10,
                                random_state=10, n_jobs=-1)

importance_df = pd.DataFrame(np.vstack((X.columns,
                                        permutation_score.importances_mean)).T)
importance_df.columns=['feature','score decrease']

importance_df.sort_values(by="score decrease", ascending = False)

Unnamed: 0,feature,score decrease
14,enginesize,0.830042
18,compressionratio,0.76296
1,fueltype,0.333386
11,curbweight,0.223304
12,enginetype,0.190935
15,fuelsystem,0.091744
6,enginelocation,0.091416
16,boreratio,0.088402
17,stroke,0.08025
7,wheelbase,0.058656
