# Car Price Prediction Project
#### This dataset contains various features related to cars, including the year of manufacture, selling price, kilometers driven, fuel type, seller type, transmission type, number of previous owners, mileage, and engine specifications. These attributes provide valuable insights into the factors influencing car prices and can be used to develop predictive models for estimating the selling price of cars.

## Importing Necessary Libraries

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Loading dataset

In [63]:
dataset = pd.read_csv("car_prices.csv")

In [64]:
dataset.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


## Columns that needs to be removed to simplify the model building process.
- seller_type
- transmission
- owner


In [65]:
cars_df = dataset.drop(["seller_type","transmission","owner"],axis=1)

In [66]:
cars_df.head(5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,16.1,1298.0,88.2,5.0


In [67]:
cars_df.shape

(8128, 9)

In [68]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                8128 non-null   object 
 1   year                8128 non-null   int64  
 2   selling_price       8128 non-null   int64  
 3   km_driven           8128 non-null   int64  
 4   fuel                8128 non-null   object 
 5   mileage(km/ltr/kg)  7907 non-null   float64
 6   engine              7907 non-null   float64
 7   max_power           7913 non-null   object 
 8   seats               7907 non-null   float64
dtypes: float64(3), int64(3), object(3)
memory usage: 571.6+ KB


## Data Cleaning

In [69]:
backup = cars_df.copy()

In [70]:
cars_df['year']

0       2014
1       2014
2       2006
3       2010
4       2007
        ... 
8123    2013
8124    2007
8125    2009
8126    2013
8127    2013
Name: year, Length: 8128, dtype: int64

In [71]:
#cars_df["max_power"] = cars_df["max_power"].astype(float)
cars_df.dropna(inplace=True)

In [72]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7907 entries, 0 to 8127
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                7907 non-null   object 
 1   year                7907 non-null   int64  
 2   selling_price       7907 non-null   int64  
 3   km_driven           7907 non-null   int64  
 4   fuel                7907 non-null   object 
 5   mileage(km/ltr/kg)  7907 non-null   float64
 6   engine              7907 non-null   float64
 7   max_power           7907 non-null   object 
 8   seats               7907 non-null   float64
dtypes: float64(3), int64(3), object(3)
memory usage: 617.7+ KB


In [73]:
cars_df["max_power"] = pd.to_numeric(cars_df['max_power'],errors = 'coerce')

In [74]:
cars_df['name']=cars_df['name'].str.split(' ').str.slice(0,3).str.join(' ')

In [75]:
cars_df

Unnamed: 0,name,year,selling_price,km_driven,fuel,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire,2014,450000,145500,Diesel,23.40,1248.0,74.00,5.0
1,Skoda Rapid 1.5,2014,370000,120000,Diesel,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020,2006,158000,140000,Petrol,17.70,1497.0,78.00,5.0
3,Hyundai i20 Sportz,2010,225000,127000,Diesel,23.00,1396.0,90.00,5.0
4,Maruti Swift VXI,2007,130000,120000,Petrol,16.10,1298.0,88.20,5.0
...,...,...,...,...,...,...,...,...,...
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,18.50,1197.0,82.85,5.0
8124,Hyundai Verna CRDi,2007,135000,119000,Diesel,16.80,1493.0,110.00,5.0
8125,Maruti Swift Dzire,2009,382000,120000,Diesel,19.30,1248.0,73.90,5.0
8126,Tata Indigo CR4,2013,290000,25000,Diesel,23.57,1396.0,70.00,5.0


In [76]:
cars_df.reset_index(drop=True)

Unnamed: 0,name,year,selling_price,km_driven,fuel,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire,2014,450000,145500,Diesel,23.40,1248.0,74.00,5.0
1,Skoda Rapid 1.5,2014,370000,120000,Diesel,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020,2006,158000,140000,Petrol,17.70,1497.0,78.00,5.0
3,Hyundai i20 Sportz,2010,225000,127000,Diesel,23.00,1396.0,90.00,5.0
4,Maruti Swift VXI,2007,130000,120000,Petrol,16.10,1298.0,88.20,5.0
...,...,...,...,...,...,...,...,...,...
7902,Hyundai i20 Magna,2013,320000,110000,Petrol,18.50,1197.0,82.85,5.0
7903,Hyundai Verna CRDi,2007,135000,119000,Diesel,16.80,1493.0,110.00,5.0
7904,Maruti Swift Dzire,2009,382000,120000,Diesel,19.30,1248.0,73.90,5.0
7905,Tata Indigo CR4,2013,290000,25000,Diesel,23.57,1396.0,70.00,5.0


In [77]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7907 entries, 0 to 8127
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                7907 non-null   object 
 1   year                7907 non-null   int64  
 2   selling_price       7907 non-null   int64  
 3   km_driven           7907 non-null   int64  
 4   fuel                7907 non-null   object 
 5   mileage(km/ltr/kg)  7907 non-null   float64
 6   engine              7907 non-null   float64
 7   max_power           7906 non-null   float64
 8   seats               7907 non-null   float64
dtypes: float64(4), int64(3), object(2)
memory usage: 617.7+ KB


In [78]:
cars_df.describe()

Unnamed: 0,year,selling_price,km_driven,mileage(km/ltr/kg),engine,max_power,seats
count,7907.0,7907.0,7907.0,7907.0,7907.0,7906.0,7907.0
mean,2013.982168,649741.7,69192.56,19.418783,1458.625016,91.587374,5.416719
std,3.86665,813556.5,56789.76,4.037145,503.916303,35.747216,0.959588
min,1994.0,29999.0,1.0,0.0,624.0,32.8,2.0
25%,2012.0,270000.0,35000.0,16.78,1197.0,68.05,5.0
50%,2015.0,450000.0,60000.0,19.3,1248.0,82.0,5.0
75%,2017.0,690000.0,95750.0,22.32,1582.0,102.0,5.0
max,2020.0,10000000.0,2360457.0,42.0,3604.0,400.0,14.0


In [79]:
cars_df[cars_df['selling_price']>6e6]

Unnamed: 0,name,year,selling_price,km_driven,fuel,mileage(km/ltr/kg),engine,max_power,seats
170,Volvo XC90 T8,2017,10000000,30000,Petrol,42.0,1969.0,400.0,4.0
2938,BMW X7 xDrive,2020,7200000,5000,Diesel,13.38,2993.0,265.0,7.0
4950,Audi A6 35,2019,6223000,7800,Petrol,15.26,1798.0,187.74,5.0
4952,Audi A6 35,2019,6523000,23600,Petrol,15.26,1798.0,187.74,5.0


In [80]:
cars_df=cars_df[cars_df['selling_price']<6e6]

In [81]:
cars_df=cars_df.reset_index(drop=True)

In [82]:
cars_df

Unnamed: 0,name,year,selling_price,km_driven,fuel,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire,2014,450000,145500,Diesel,23.40,1248.0,74.00,5.0
1,Skoda Rapid 1.5,2014,370000,120000,Diesel,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020,2006,158000,140000,Petrol,17.70,1497.0,78.00,5.0
3,Hyundai i20 Sportz,2010,225000,127000,Diesel,23.00,1396.0,90.00,5.0
4,Maruti Swift VXI,2007,130000,120000,Petrol,16.10,1298.0,88.20,5.0
...,...,...,...,...,...,...,...,...,...
7892,Hyundai i20 Magna,2013,320000,110000,Petrol,18.50,1197.0,82.85,5.0
7893,Hyundai Verna CRDi,2007,135000,119000,Diesel,16.80,1493.0,110.00,5.0
7894,Maruti Swift Dzire,2009,382000,120000,Diesel,19.30,1248.0,73.90,5.0
7895,Tata Indigo CR4,2013,290000,25000,Diesel,23.57,1396.0,70.00,5.0


In [87]:
cars_df.isna().sum()

name                  0
year                  0
selling_price         0
km_driven             0
fuel                  0
mileage(km/ltr/kg)    0
engine                0
max_power             0
seats                 0
dtype: int64

In [86]:
cars_df.dropna(inplace=True)

In [88]:
cars_df.to_csv("cleaned_car_prices.csv")

# Model

In [89]:
X=cars_df.drop(columns="selling_price")
y=cars_df["selling_price"]

In [90]:
X

Unnamed: 0,name,year,km_driven,fuel,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire,2014,145500,Diesel,23.40,1248.0,74.00,5.0
1,Skoda Rapid 1.5,2014,120000,Diesel,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020,2006,140000,Petrol,17.70,1497.0,78.00,5.0
3,Hyundai i20 Sportz,2010,127000,Diesel,23.00,1396.0,90.00,5.0
4,Maruti Swift VXI,2007,120000,Petrol,16.10,1298.0,88.20,5.0
...,...,...,...,...,...,...,...,...
7892,Hyundai i20 Magna,2013,110000,Petrol,18.50,1197.0,82.85,5.0
7893,Hyundai Verna CRDi,2007,119000,Diesel,16.80,1493.0,110.00,5.0
7894,Maruti Swift Dzire,2009,120000,Diesel,19.30,1248.0,73.90,5.0
7895,Tata Indigo CR4,2013,25000,Diesel,23.57,1396.0,70.00,5.0


In [91]:
y

0       450000
1       370000
2       158000
3       225000
4       130000
         ...  
7892    320000
7893    135000
7894    382000
7895    290000
7896    290000
Name: selling_price, Length: 7896, dtype: int64

In [92]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=39)

In [93]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [94]:
ohe = OneHotEncoder()

In [95]:
ohe.fit(X[["name","fuel"]])

In [96]:
ohe.categories_

[array(['Ambassador CLASSIC 1500', 'Ambassador Classic 2000',
        'Ambassador Grand 1500', 'Ambassador Grand 2000',
        'Ashok Leyland Stile', 'Audi A3 35', 'Audi A3 40', 'Audi A4 1.8',
        'Audi A4 2.0', 'Audi A4 35', 'Audi A6 2.0', 'Audi A6 35',
        'Audi Q3 2.0', 'Audi Q3 35', 'Audi Q5 2.0', 'Audi Q5 3.0',
        'Audi Q5 35TDI', 'Audi Q5 45', 'Audi Q7 3.0', 'Audi Q7 35',
        'BMW 3 Series', 'BMW 5 Series', 'BMW 6 Series', 'BMW 7 Series',
        'BMW X1 sDrive', 'BMW X1 sDrive20d', 'BMW X1 sDrive20i',
        'BMW X3 xDrive20d', 'BMW X4 M', 'BMW X5 3.0d', 'BMW X6 xDrive30d',
        'Chevrolet Aveo 1.4', 'Chevrolet Aveo U-VA',
        'Chevrolet Beat Diesel', 'Chevrolet Beat LS', 'Chevrolet Beat LT',
        'Chevrolet Captiva 2.2', 'Chevrolet Captiva LT',
        'Chevrolet Cruze LT', 'Chevrolet Cruze LTZ', 'Chevrolet Enjoy 1.3',
        'Chevrolet Enjoy 1.4', 'Chevrolet Enjoy Petrol',
        'Chevrolet Enjoy TCDi', 'Chevrolet Optra 1.6',
        'Chevrolet O

In [97]:
column_trans = make_column_transformer((OneHotEncoder(categories=ohe.categories_),['name','fuel']),remainder='passthrough')

In [98]:
lr = LinearRegression()

In [99]:
pipe=make_pipeline(column_trans,lr)

In [100]:
pipe.fit(X_train,y_train)

In [101]:
y_pred=pipe.predict(X_test)

In [102]:
y_pred

array([317510.25236858, 744146.21295395, 119717.94662465, ...,
       305132.55496138, 630811.32586102, 633000.36465974])

In [103]:
r2_score(y_test,y_pred)

0.8471490051823811

In [104]:
import pickle

In [105]:
pickle.dump(pipe,open("LinearRegressionModel.pkl","wb"))