# Car Price Prediction Model

### Automobile Data Set url:   
https://archive.ics.uci.edu/ml/datasets/Automobile

In [1]:
# Importing required library
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn import set_config
set_config(display='diagram')

In [2]:
# Load dataset
data=pd.read_csv("data.csv")
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450


In [3]:
# Shape of dataset
data.shape

(201, 26)

In [4]:
# Info of complete dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          201 non-null    int64  
 1   normalized-losses  164 non-null    float64
 2   make               201 non-null    object 
 3   fuel-type          201 non-null    object 
 4   aspiration         201 non-null    object 
 5   num-of-doors       199 non-null    object 
 6   body-style         201 non-null    object 
 7   drive-wheels       201 non-null    object 
 8   engine-location    201 non-null    object 
 9   wheel-base         201 non-null    float64
 10  length             201 non-null    float64
 11  width              201 non-null    float64
 12  height             201 non-null    float64
 13  curb-weight        201 non-null    int64  
 14  engine-type        201 non-null    object 
 15  num-of-cylinders   201 non-null    object 
 16  engine-size        201 non

In [5]:
# Checking the null values
data.isnull().sum()

symboling             0
normalized-losses    37
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [6]:
X=data.drop(columns=['price'])
y=data['price']

In [7]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [8]:
X_train.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
198,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,six,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23
38,0,85.0,honda,gas,std,four,sedan,fwd,front,96.5,...,four,110,1bbl,3.15,3.58,9.0,86.0,5800.0,27,33
24,1,148.0,dodge,gas,std,four,sedan,fwd,front,93.7,...,four,90,2bbl,2.97,3.23,9.4,68.0,5500.0,31,38
122,3,186.0,porsche,gas,std,two,hatchback,rwd,front,94.5,...,four,151,mpfi,3.94,3.11,9.5,143.0,5500.0,19,27
196,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,four,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28


In [9]:
y_train.head()

198    21485
38      8845
24      6692
122    22018
196    16845
Name: price, dtype: int64

In [10]:
# Imputation transformer
trf1 = ColumnTransformer([
    ('impute', SimpleImputer(),[1,18,19,21,22]),
    ('impute_num-of-doors'), SimpleImputer(strategy='most_frequent'), [5]
], remainder='passthrough')

In [11]:
# Ordinal encoding transformer
trf2 = ColumnTransformer([
    ('order_num-of-doors', OrdinalEncoder(categories=['two', 'four']),[5]),
    ('order_num-of-cylinders', OrdinalEncoder(categories=['twelve', 'eight', 'six', 'five', 'four', 'three', 'two']),[15])
], remainder='passthrough')

In [12]:
# One hot encoding transformer
trf3 = ColumnTransformer([
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore', drop='first'),[2,3,4,6,7,14,17,8])
], remainder='passthrough')

In [13]:
# Scaling transformer
trf4 = ColumnTransformer([
    ('scale', StandardScaler(),slice(0,40))
])

In [14]:
# Feature selection transformer
trf5 = SelectKBest(score_func= f_regression,k=20)

In [15]:
# Model transformer
trf6 = LinearRegression()

In [16]:
# Creating Pipeline
pipe = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5),
    ('trf6', trf6)
])

In [17]:
# Train the model
pipe.fit(X_train, y_train)

TypeError: 'SimpleImputer' object is not iterable