In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../data/Cars.csv")
df.head(3)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


In [4]:
df.describe()

Unnamed: 0,year,selling_price,km_driven,seats
count,8128.0,8128.0,8128.0,7907.0
mean,2013.804011,638271.8,69819.51,5.416719
std,4.044249,806253.4,56550.55,0.959588
min,1983.0,29999.0,1.0,2.0
25%,2011.0,254999.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,675000.0,98000.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


In [5]:
df['owner'].isnull().unique()

array([False])

In [6]:
#Using manual mapping for the owner column
owner_mapping = {
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4,
    'Test Drive Car': 5
}
print(df['owner'].unique())

df["owner"] = df["owner"].map(owner_mapping)

print(df["owner"].unique())


['First Owner' 'Second Owner' 'Third Owner' 'Fourth & Above Owner'
 'Test Drive Car']
[1 2 3 4 5]


In [7]:
df["fuel"].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

In [8]:
# Removing rows with fuel LPG or CNG.
df = df[~df['fuel'].isin(['LPG', 'CNG'])]
df["fuel"].unique()

array(['Diesel', 'Petrol'], dtype=object)

In [9]:
df["mileage"].head(1)

0    23.4 kmpl
Name: mileage, dtype: object

In [10]:
# Striping kmpl from mileage and converting it into float
df['mileage'] = df['mileage'].astype(str).str.replace('kmpl', '').str.strip().astype(float)
df["mileage"].head(5)

0    23.40
1    21.14
2    17.70
3    23.00
4    16.10
Name: mileage, dtype: float64

In [11]:
# Removing CC from engine column and converting to float.
df['engine'] = df['engine'].astype(str).str.replace('CC', '').str.strip().astype(float)
df["engine"].head(5)

0    1248.0
1    1498.0
2    1497.0
3    1396.0
4    1298.0
Name: engine, dtype: float64

In [12]:
# replacing bhp from max power
df['max_power'] = df['max_power'].astype(str).str.replace('bhp', '').str.strip().astype(float)
df["max_power"].head(5)

0     74.00
1    103.52
2     78.00
3     90.00
4     88.20
Name: max_power, dtype: float64

In [13]:
# Only taking first name of the brand by splitting the name..

print(df['name'].unique())
df['name'] = df['name'].astype(str).str.split(' ').str[0]
print("----------------------------------------------------------")
print(df['name'].unique())

['Maruti Swift Dzire VDI' 'Skoda Rapid 1.5 TDI Ambition'
 'Honda City 2017-2020 EXi' ... 'Tata Nexon 1.5 Revotorq XT'
 'Ford Freestyle Titanium Plus Diesel BSIV'
 'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV']
----------------------------------------------------------
['Maruti' 'Skoda' 'Honda' 'Hyundai' 'Toyota' 'Ford' 'Renault' 'Mahindra'
 'Tata' 'Chevrolet' 'Fiat' 'Datsun' 'Jeep' 'Mercedes-Benz' 'Mitsubishi'
 'Audi' 'Volkswagen' 'BMW' 'Nissan' 'Lexus' 'Jaguar' 'Land' 'MG' 'Volvo'
 'Daewoo' 'Kia' 'Force' 'Ambassador' 'Ashok' 'Isuzu' 'Opel' 'Peugeot']


In [14]:
## dropping torque column
## ignoring errors as repeated action will cause column not found error
df = df.drop(columns=['torque'], errors='ignore')
df.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage', 'engine', 'max_power', 'seats'],
      dtype='object')

In [15]:
# Test Drive Cars are ridiculously expensive. Since we do not want to
# involve this, we will simply delete all samples related to it.
print(df["owner"].unique())
df = df[~df['owner'].isin(['Test Drive Car'])]
df["owner"].unique()

[1 2 3 4 5]


array([1, 2, 3, 4, 5])

In [16]:
from sklearn.preprocessing import LabelEncoder

features = ["transmission", "fuel", "name", "seller_type"]

for feat in features:
    le = LabelEncoder()
    df[feat] = le.fit_transform(df[feat])

print(df["transmission"].unique())
print(df["fuel"].unique())
print(df["name"].unique())
print(df["seller_type"].unique())

[1 0]
[0 1]
[20 27 10 11 29  9 26 19 28  4  7  6 14 21 22  2 30  3 23 17 13 16 18 31
  5 15  8  0  1 12 24 25]
[1 0 2]


In [17]:
df.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage', 'engine', 'max_power', 'seats'],
      dtype='object')

3. Feature Selection

In [18]:
#x is our features
X = df[ ['name', 'year', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage', 'engine', 'max_power', 'seats'] ]

# y is selling price (to be predicted).
# y = df["selling_price"]
y = np.log(df['selling_price'])

Train Test Split

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 39)

In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8033 entries, 0 to 8127
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          8033 non-null   int64  
 1   year          8033 non-null   int64  
 2   km_driven     8033 non-null   int64  
 3   fuel          8033 non-null   int64  
 4   seller_type   8033 non-null   int64  
 5   transmission  8033 non-null   int64  
 6   owner         8033 non-null   int64  
 7   mileage       7819 non-null   float64
 8   engine        7819 non-null   float64
 9   max_power     7825 non-null   float64
 10  seats         7819 non-null   float64
dtypes: float64(4), int64(7)
memory usage: 753.1 KB


In [21]:
X_train.isna().sum()

name              0
year              0
km_driven         0
fuel              0
seller_type       0
transmission      0
owner             0
mileage         171
engine          171
max_power       167
seats           171
dtype: int64

In [22]:
# from sklearn.linear_model import LinearRegression  #we are using regression models
# from sklearn.metrics import mean_squared_error, r2_score

# lr = LinearRegression()
# lr.fit(X_train, y_train)
# yhat = lr.predict(X_test)

# print("MSE: ", mean_squared_error(y_test, yhat))
# print("r2: ", r2_score(y_test, yhat))

In [23]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,20,2014,450000,145500,0,1,1,1,23.4,1248.0,74.0,5.0
1,27,2014,370000,120000,0,1,1,2,21.14,1498.0,103.52,5.0
2,10,2006,158000,140000,1,1,1,3,17.7,1497.0,78.0,5.0
3,11,2010,225000,127000,0,1,1,1,23.0,1396.0,90.0,5.0
4,20,2007,130000,120000,1,1,1,1,16.1,1298.0,88.2,5.0
