In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


In [2]:
# Load the used-car listings from the CSV next to the notebook and preview the first rows.
df = pd.read_csv('./used_cars_data.csv')
df.head()

Unnamed: 0,S.No.,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [3]:
# Show each column's dtype and non-null count to spot columns that need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7253 entries, 0 to 7252
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   S.No.              7253 non-null   int64  
 1   Name               7253 non-null   object 
 2   Location           7253 non-null   object 
 3   Year               7253 non-null   int64  
 4   Kilometers_Driven  7253 non-null   int64  
 5   Fuel_Type          7253 non-null   object 
 6   Transmission       7253 non-null   object 
 7   Owner_Type         7253 non-null   object 
 8   Mileage            7251 non-null   object 
 9   Engine             7207 non-null   object 
 10  Power              7207 non-null   object 
 11  Seats              7200 non-null   float64
 12  New_Price          1006 non-null   object 
 13  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(9)
memory usage: 793.4+ KB


In [4]:
# Display any rows that exactly repeat an earlier row (all columns equal).
df[df.duplicated()]

Unnamed: 0,S.No.,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price


In [5]:
# Columns that are not used as model features in this notebook.
unused_columns = ["S.No.", "Name", "Engine", "Power", "New_Price"]

df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Seats,Price
0,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,5.0,1.75
1,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,5.0,12.5
2,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,5.0,4.5
3,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,7.0,6.0
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,5.0,17.74


In [6]:
# Count missing values per column.
df.isna().sum()

Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Seats                  53
Price                1234
dtype: int64

In [7]:
# (rows, columns) before rows with missing values are removed.
df.shape

(7253, 9)

In [8]:
# Remove every row that has a missing value in any remaining column
# (this includes rows without a Price, which cannot be used for supervised training).
df.dropna(inplace=True)

In [9]:
# (rows, columns) after rows with missing values were removed.
df.shape

(5975, 9)

In [10]:
def field_value_counts(field, frame=None):
    """Return how often each distinct value occurs in one column.

    Parameters
    ----------
    field : str
        Name of the column to tabulate.
    frame : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is looked up at call time, so existing one-argument calls
        keep working and always see the current state of ``df``.

    Returns
    -------
    pandas.Series
        Result of ``Series.value_counts()`` for the column: counts indexed
        by value, most frequent first.
    """
    source = df if frame is None else frame
    return source[field].value_counts()

In [11]:
# Number of listings per city.
field_value_counts('Location')

Location
Mumbai        784
Hyderabad     741
Kochi         648
Coimbatore    634
Pune          613
Delhi         549
Kolkata       530
Chennai       490
Jaipur        410
Bangalore     353
Ahmedabad     223
Name: count, dtype: int64

In [12]:
# Number of listings per Year value.
field_value_counts('Year')

Year
2014    795
2015    741
2016    740
2013    648
2017    586
2012    572
2011    461
2010    339
2018    298
2009    196
2008    169
2007    123
2019    102
2006     75
2005     55
2004     29
2003     15
2002     14
2001      7
2000      4
1998      4
1999      2
Name: count, dtype: int64

In [13]:
# Number of listings per fuel type.
field_value_counts("Fuel_Type")

Fuel_Type
Diesel    3195
Petrol    2714
CNG         56
LPG         10
Name: count, dtype: int64

In [14]:
# Number of listings per transmission type.
field_value_counts("Transmission")

Transmission
Manual       4266
Automatic    1709
Name: count, dtype: int64

In [15]:
# Number of listings per ownership category.
field_value_counts("Owner_Type")

Owner_Type
First             4903
Second             953
Third              111
Fourth & Above       8
Name: count, dtype: int64

In [16]:
# Look for listings recorded with a seat count of 0.
df[df['Seats'] == 0]

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Seats,Price
3999,Hyderabad,2012,125000,Petrol,Automatic,First,10.5 kmpl,0.0,18.0


In [17]:
# No car has 0 seats, so treat such rows as bad records and remove them.
drop_records = df.index[df['Seats'] == 0].tolist()

df.drop(index=drop_records, inplace=True)

In [18]:
# Re-run the zero-seat filter to confirm no such rows remain.
df[df['Seats'] == 0]

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Seats,Price


In [19]:
# Number of listings per seat count, before the seat counts are grouped.
field_value_counts("Seats")

Seats
5.0     5012
7.0      674
8.0      134
4.0       99
6.0       31
2.0       16
10.0       5
9.0        3
Name: count, dtype: int64

In [20]:
## Group seat counts into four buckets to reduce the number of categories:
##   4 -> four seats or fewer, 5 -> exactly five, 7 -> above 5 up to 7, 8 -> more than 7

def _club_seats(seats):
    """Map a raw seat count onto its bucket value (4, 5, 7 or 8)."""
    if seats <= 4:
        return 4.0
    if 5 < seats <= 7:
        return 7
    if seats > 7:
        return 8
    # Values in (4, 5] - and anything that fails every comparison - stay as they are.
    return seats

df['Seats'] = df['Seats'].apply(_club_seats)

In [21]:
# Number of listings per seat bucket, after the seat counts were grouped.
field_value_counts("Seats")

Seats
5.0    5012
7.0     705
8.0     142
4.0     115
Name: count, dtype: int64

In [22]:
# Fold every model year up to and including 2010 into a single 2010 bucket.
df['Year'] = df['Year'].clip(lower=2010)

In [23]:
# Number of listings per Year value after older years were folded into 2010.
field_value_counts('Year')

Year
2010    1032
2014     795
2015     741
2016     740
2013     648
2017     586
2012     571
2011     461
2018     298
2019     102
Name: count, dtype: int64

In [24]:
# Keep only the numeric part of Mileage (e.g. "26.6 km/kg" -> 26.6) and store it as a
# float, so downstream scalers and models receive a numeric column instead of relying on
# implicit string-to-number coercion.
# astype(str) keeps the cell safe to re-run once the column is already numeric.
# NOTE(review): the unit suffix differs between rows (km/kg vs kmpl) and is discarded
# here - confirm that treating both on one scale is intended.
df['Mileage'] = df['Mileage'].astype(str).str.split(' ').str[0].astype(float)

In [25]:
# Target: the Price column. Features: every other column.
y = df["Price"]
X = df.drop(columns=["Price"])

In [26]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
)

In [27]:
# List the columns left after cleaning (the feature columns plus the Price target).
df.columns

Index(['Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission',
       'Owner_Type', 'Mileage', 'Seats', 'Price'],
      dtype='object')

## Without ColumnTransformer

In [28]:
# One-hot encode the nominal columns Location, Fuel_Type and Transmission.
# No `drop` option is set, so every category seen in the training data keeps
# its own indicator column:
#   Location: 11, Fuel_Type: 4, Transmission: 2  ->  17 columns
# handle_unknown='ignore' matches the pipeline version of this encoder and lets
# transform() cope with a category that only appears in the test split
# (it is encoded as all zeros instead of raising).
nominal_cols = ['Location', 'Fuel_Type', 'Transmission']

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_ohe = ohe.fit_transform(X_train[nominal_cols])
X_test_ohe = ohe.transform(X_test[nominal_cols])

# Ordinal columns
# Year is treated as ordinal because price may vary with the model year.
# Year: lower (2010, which also holds all older years) -> higher (2019)
# Owner_Type: lower (Fourth & Above) -> higher (First)
# Seats: 4, 5, 7, 8
# The category lists use the same types as the columns themselves
# (Year is an integer column, Seats a float column).
ordinal_cols = ["Year", "Owner_Type", "Seats"]

ordEnc = OrdinalEncoder(categories=[
    list(range(2010, 2020)),
    ["Fourth & Above", "Third", "Second", "First"],
    [4.0, 5.0, 7.0, 8.0],
])

X_train_ordencode = ordEnc.fit_transform(X_train[ordinal_cols])
X_test_ordencode = ordEnc.transform(X_test[ordinal_cols])

# Standardise Kilometers_Driven and Mileage.
# The explicit float cast guarantees the scaler sees numbers even if Mileage
# is still stored as text such as "26.6".
numeric_cols = ['Kilometers_Driven', 'Mileage']

stdScalar = StandardScaler()
X_train_scaled = stdScalar.fit_transform(X_train[numeric_cols].astype(float))
X_test_scaled = stdScalar.transform(X_test[numeric_cols].astype(float))

In [29]:
# Join the three encoded blocks column-wise: scaled numerics, ordinal codes, one-hot indicators.
train_parts = (X_train_scaled, X_train_ordencode, X_train_ohe)
test_parts = (X_test_scaled, X_test_ordencode, X_test_ohe)

X_train_all = np.hstack(train_parts)
X_test_all = np.hstack(test_parts)

In [30]:
# Check the stacked training matrix: rows x (scaled + ordinal + one-hot columns).
X_train_all.shape

(4779, 22)

## Apply ColumnTransformer and Pipeline

In [31]:
# Stage 1 - encode the categorical columns.
#
# One-hot encode Location, Fuel_Type and Transmission
#   Location: 11, Fuel_Type: 4, Transmission: 2  ->  17 indicator columns
#
# Ordinal-encode Year, Owner_Type and Seats
#   Year is treated as ordinal because price may vary with the model year.
#   Year: lower (2010, which also holds all older years) -> higher (2019)
#   Owner_Type: lower (Fourth & Above) -> higher (First)
#   Seats: 4, 5, 7, 8
#
# remainder='passthrough' appends the untouched columns (Kilometers_Driven,
# Mileage) AFTER the encoder outputs, so the output column order is:
#   [one-hot columns..., Year, Owner_Type, Seats, Kilometers_Driven, Mileage]
encoding_trns = ColumnTransformer(
    [
        (
            "OneHot_Encoder",
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            ['Location', 'Fuel_Type', 'Transmission'],
        ),
        (
            "Ordinal_Encoder",
            OrdinalEncoder(
                categories=[
                    list(range(2010, 2020)),
                    ["Fourth & Above", "Third", "Second", "First"],
                    [4.0, 5.0, 7.0, 8.0],
                ]
            ),
            ['Year', 'Owner_Type', 'Seats'],
        ),
    ],
    remainder='passthrough'
)

# Stage 2 - standardise Kilometers_Driven and Mileage.
# This transformer runs on the OUTPUT of stage 1, where the original positions
# 2 and 6 now hold one-hot indicator columns. The two numeric columns are the
# last two columns of that output, so they are addressed from the end; this
# stays correct however many one-hot columns stage 1 produces.
stnd_scaler_trns = ColumnTransformer(
    [("std_scaler", StandardScaler(), [-2, -1])],
    remainder='passthrough'
)



In [34]:
# Random forest used as the final pipeline step. The fixed random_state makes the
# bootstrap / feature sampling - and therefore the reported score - repeatable across runs.
rf = RandomForestRegressor(n_estimators=100, random_state=23)

In [35]:
# Render estimators and pipelines as an HTML diagram when they are displayed in the notebook.
# NOTE(review): this import lives mid-notebook; consider moving it to the import cell at the top.
from sklearn import set_config
set_config(display='diagram')

In [36]:
# Chain encoding, scaling and the regressor so that fit() and predict()
# always apply the same preprocessing steps in the same order.
pipeline_steps = [
    ('encoding', encoding_trns),
    ('standard_scaler', stnd_scaler_trns),
    ('random_forest', rf),
]
pipe = Pipeline(steps=pipeline_steps)

pipe.fit(X_train, y_train)

In [37]:
# Predict prices for the held-out rows with the fitted pipeline.
y_pipe_pred = pipe.predict(X_test)

In [38]:
# R^2 of the pipeline (random forest) predictions on the held-out rows.
r2_score(y_test, y_pipe_pred)

0.7760744553663026

In [39]:
# Baseline: ordinary least-squares regression on the manually preprocessed features.
lr = LinearRegression()
lr.fit(X_train_all, y_train)

In [40]:
# Predict prices for the held-out rows with the linear model.
y_pred = lr.predict(X_test_all)

In [41]:
# R^2 of the linear-regression predictions on the held-out rows.
r2_score(y_test, y_pred)

0.5656929918232674

In [42]:
# Random forest on the manually preprocessed features, for comparison with the pipeline.
# Rebinding `rf` does not affect the already-fitted pipeline, which keeps its own
# reference to the earlier estimator object.
# The fixed random_state makes the forest - and the reported score - repeatable across runs.
rf = RandomForestRegressor(n_estimators=100, random_state=23)
rf.fit(X_train_all, y_train)
y_pred_1 = rf.predict(X_test_all)
r2_score(y_test, y_pred_1)

0.779047070609078