# Importing Libraries

In [178]:
import pandas as pd
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

# Loading dataset

In [179]:
data = pd.read_csv("D:\\Projects\\car_price_prediction\\car_data.csv")
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [180]:
data.shape

(8128, 13)

In [181]:
data.describe()

Unnamed: 0,year,selling_price,km_driven,seats
count,8128.0,8128.0,8128.0,7907.0
mean,2013.804011,638271.8,69819.51,5.416719
std,4.044249,806253.4,56550.55,0.959588
min,1983.0,29999.0,1.0,2.0
25%,2011.0,254999.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,675000.0,98000.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


In [182]:
data.isna().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

In [183]:
data.dropna(inplace=True)

In [184]:
data.duplicated().sum()

1189

In [185]:
data.drop_duplicates(inplace=True)

In [186]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6717 entries, 0 to 8125
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6717 non-null   object 
 1   year           6717 non-null   int64  
 2   selling_price  6717 non-null   int64  
 3   km_driven      6717 non-null   int64  
 4   fuel           6717 non-null   object 
 5   seller_type    6717 non-null   object 
 6   transmission   6717 non-null   object 
 7   owner          6717 non-null   object 
 8   mileage        6717 non-null   object 
 9   engine         6717 non-null   object 
 10  max_power      6717 non-null   object 
 11  torque         6717 non-null   object 
 12  seats          6717 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 734.7+ KB


In [187]:
# Getting the unique value of all column

In [188]:
for col in data.columns:
    print("Unique values of column: " + col)
    print(data[col].unique())
    print("==============================")

Unique values of column: name
['Maruti Swift Dzire VDI' 'Skoda Rapid 1.5 TDI Ambition'
 'Honda City 2017-2020 EXi' ... 'Tata Nexon 1.5 Revotorq XT'
 'Ford Freestyle Titanium Plus Diesel BSIV'
 'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV']
Unique values of column: year
[2014 2006 2010 2007 2017 2001 2011 2013 2005 2009 2016 2012 2002 2015
 2018 2019 2008 2020 1999 2000 2003 2004 1994 1998 1997 1995 1996]
Unique values of column: selling_price
[  450000   370000   158000   225000   130000   440000    96000    45000
   350000   200000   500000    92000   280000   180000   400000   778000
   150000   680000   174000   950000   525000   600000   575000   275000
   300000   220000   254999   670000   730000   650000   330000   366000
  1149000   425000  2100000   925000   675000   819999   390000  1500000
   700000  1450000  1090000   850000  1650000  1750000  1590000  1689999
  1425000   265000   190000   630000   540000   448000   745000  1025000
   235000  1700000  1200000   610000  250

In [189]:
# Getting first word of column

In [190]:
def first_word(column_name):
    column_name = column_name.split(' ')[0]
    return column_name.strip()

In [191]:
def get_value(value):
    value = value.split(' ')[0]
    value = value.strip()
    if value == '':
        value = 0
    return float(value)

In [192]:
first_word("Tata Nexon 1.5 Revotorq XT")

'Tata'

In [193]:
data['name'] = data['name'].apply(first_word)

In [194]:
data['name'].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz',
       'Mitsubishi', 'Audi', 'Volkswagen', 'BMW', 'Nissan', 'Lexus',
       'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo', 'Kia', 'Fiat', 'Force',
       'Ambassador', 'Ashok', 'Isuzu', 'Opel'], dtype=object)

In [195]:
# Applying same method for different column

In [196]:
data['mileage'] = data['mileage'].apply(first_word)

In [197]:
data['engine'] = data['engine'].apply(first_word)

In [198]:
data['max_power'] = data['max_power'].apply(first_word)

In [199]:
data = data.drop(columns=['torque'],axis=1)

In [200]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data['name'] = encoder.fit_transform(data['name'])
data['fuel'] = encoder.fit_transform(data['fuel'])
data['seller_type'] = encoder.fit_transform(data['seller_type'])
data['transmission'] = encoder.fit_transform(data['transmission'])
data['owner'] = encoder.fit_transform(data['owner'])

In [201]:
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,20,2014,450000,145500,1,1,1,0,23.4,1248,74.0,5.0
1,26,2014,370000,120000,1,1,1,2,21.14,1498,103.52,5.0
2,10,2006,158000,140000,3,1,1,4,17.7,1497,78.0,5.0
3,11,2010,225000,127000,1,1,1,0,23.0,1396,90.0,5.0
4,20,2007,130000,120000,3,1,1,0,16.1,1298,88.2,5.0


In [202]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6717 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6717 non-null   int32  
 1   year           6717 non-null   int64  
 2   selling_price  6717 non-null   int64  
 3   km_driven      6717 non-null   int64  
 4   fuel           6717 non-null   int32  
 5   seller_type    6717 non-null   int32  
 6   transmission   6717 non-null   int32  
 7   owner          6717 non-null   int32  
 8   mileage        6717 non-null   object 
 9   engine         6717 non-null   object 
 10  max_power      6717 non-null   object 
 11  seats          6717 non-null   float64
dtypes: float64(1), int32(5), int64(3), object(3)
memory usage: 551.0+ KB


In [203]:
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,20,2014,450000,145500,1,1,1,0,23.4,1248,74.0,5.0
1,26,2014,370000,120000,1,1,1,2,21.14,1498,103.52,5.0
2,10,2006,158000,140000,3,1,1,4,17.7,1497,78.0,5.0
3,11,2010,225000,127000,1,1,1,0,23.0,1396,90.0,5.0
4,20,2007,130000,120000,3,1,1,0,16.1,1298,88.2,5.0


In [204]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6717 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6717 non-null   int32  
 1   year           6717 non-null   int64  
 2   selling_price  6717 non-null   int64  
 3   km_driven      6717 non-null   int64  
 4   fuel           6717 non-null   int32  
 5   seller_type    6717 non-null   int32  
 6   transmission   6717 non-null   int32  
 7   owner          6717 non-null   int32  
 8   mileage        6717 non-null   object 
 9   engine         6717 non-null   object 
 10  max_power      6717 non-null   object 
 11  seats          6717 non-null   float64
dtypes: float64(1), int32(5), int64(3), object(3)
memory usage: 551.0+ KB


# Model training

In [205]:
X = data.drop(['selling_price'],axis=1)
y = data['selling_price']

In [206]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [207]:
model = LinearRegression()

In [208]:
model.fit(X_train,y_train)

In [209]:
y_pred = model.predict(X_test)

In [210]:
y_pred

array([1518263.77923562,  697839.1259103 ,  238732.05918653, ...,
        392994.53239796,  897597.34946352,   42407.09547326])

In [224]:
input = pd.DataFrame([[5,2014,1500,1,1,1,2,21.14,1500,103.52,8.0]],
                    columns=['name','year','km_driven','fuel','seller_type','transmission','owner','mileage','engine','max_power','seats'])

In [225]:
model.predict(input)

array([674003.79414371])

In [226]:
import pickle
pickle.dump(model,open('carprice_model.pkl','wb'))