In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("car-details.csv")
df.sample(5)

Unnamed: 0,name,company,model,edition,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
2602,Ford Endeavour 2.5L 4X2,Ford,Endeavour,2.5L 4X2,2013,First,Diesel,Dealer,Manual,62000,30.78,2499.0,141.0,330.0,7.0,700000
3315,Maruti Alto 800 LXI,Maruti,Alto,800 LXI,2013,Second,Petrol,Individual,Manual,60000,53.44,796.0,47.3,69.0,5.0,170000
3547,Maruti Swift VDI Optional,Maruti,Swift,VDI Optional,2016,First,Diesel,Individual,Manual,50000,59.22,1248.0,74.0,190.0,5.0,550000
2905,Tata Indigo CS eLX BS IV,Tata,Indigo,CS eLX BS IV,2014,Second,Diesel,Individual,Manual,100000,58.75,1396.0,69.01,140.0,5.0,215000
440,Chevrolet Enjoy 1.3 TCDi LS 7,Chevrolet,Enjoy,1.3 TCDi LS 7,2016,Second,Diesel,Individual,Manual,110000,42.78,1248.0,73.74,172.5,7.0,420000


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6926 entries, 0 to 6925
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6926 non-null   object 
 1   company        6926 non-null   object 
 2   model          6926 non-null   object 
 3   edition        6926 non-null   object 
 4   year           6926 non-null   int64  
 5   owner          6926 non-null   object 
 6   fuel           6926 non-null   object 
 7   seller_type    6926 non-null   object 
 8   transmission   6926 non-null   object 
 9   km_driven      6926 non-null   int64  
 10  mileage_mpg    6718 non-null   float64
 11  engine_cc      6718 non-null   float64
 12  max_power_bhp  6717 non-null   float64
 13  torque_nm      6717 non-null   float64
 14  seats          6718 non-null   float64
 15  selling_price  6926 non-null   int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 865.9+ KB


In [4]:
df.isna().sum()

name               0
company            0
model              0
edition            0
year               0
owner              0
fuel               0
seller_type        0
transmission       0
km_driven          0
mileage_mpg      208
engine_cc        208
max_power_bhp    209
torque_nm        209
seats            208
selling_price      0
dtype: int64

In [5]:
df.shape

(6926, 16)

In [8]:
for col in df.select_dtypes(include="O").columns:
    print(f'Column:{col}')
    print(f"Cardinality:{df[col].nunique()}")
    print(df[col].unique())
    print(df[col].value_counts(normalize=True))
    print()

Column:name
Cardinality:2058
['Maruti Swift Dzire VDI' 'Skoda Rapid 1.5 TDI Ambition'
 'Honda City 2017-2020 EXi' ... 'Tata Nexon 1.5 Revotorq XT'
 'Ford Freestyle Titanium Plus Diesel BSIV'
 'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV']
name
Maruti Swift Dzire VDI                          0.017037
Maruti Alto 800 LXI                             0.010973
Maruti Alto LXi                                 0.009962
Maruti Swift VDI                                0.008663
Maruti Swift VDI BSIV                           0.008085
                                                  ...   
Mahindra KUV 100 G80 K4 Plus                    0.000144
Hyundai Getz 1.1 GVS                            0.000144
Maruti 800 EX BSII                              0.000144
Datsun GO T Option                              0.000144
Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV    0.000144
Name: proportion, Length: 2058, dtype: float64

Column:company
Cardinality:32
['Maruti' 'Skoda' 'Honda' 'Hyundai' 'Toyota' 'Ford

In [9]:
df=df.drop(columns=["name","model","edition"])

In [10]:
df

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti,2014,First,Diesel,Individual,Manual,145500,55.00,1248.0,74.00,190.000000,5.0,450000
1,Skoda,2014,Second,Diesel,Individual,Manual,120000,49.70,1498.0,103.52,250.000000,5.0,370000
2,Honda,2006,Third,Petrol,Individual,Manual,140000,41.60,1497.0,78.00,124.544455,5.0,158000
3,Hyundai,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.00,219.668960,5.0,225000
4,Maruti,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.20,112.776475,5.0,130000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6921,Maruti,2013,Second,Petrol,Individual,Manual,50000,44.40,998.0,67.10,90.000000,5.0,260000
6922,Hyundai,2014,Second,Diesel,Individual,Manual,80000,52.97,1396.0,88.73,219.700000,5.0,475000
6923,Hyundai,2013,First,Petrol,Individual,Manual,110000,43.47,1197.0,82.85,113.700000,5.0,320000
6924,Hyundai,2007,Fourth & Above,Diesel,Individual,Manual,119000,39.47,1493.0,110.00,235.359600,5.0,135000


In [11]:
df.info

<bound method DataFrame.info of       company  year           owner    fuel seller_type transmission  \
0      Maruti  2014           First  Diesel  Individual       Manual   
1       Skoda  2014          Second  Diesel  Individual       Manual   
2       Honda  2006           Third  Petrol  Individual       Manual   
3     Hyundai  2010           First  Diesel  Individual       Manual   
4      Maruti  2007           First  Petrol  Individual       Manual   
...       ...   ...             ...     ...         ...          ...   
6921   Maruti  2013          Second  Petrol  Individual       Manual   
6922  Hyundai  2014          Second  Diesel  Individual       Manual   
6923  Hyundai  2013           First  Petrol  Individual       Manual   
6924  Hyundai  2007  Fourth & Above  Diesel  Individual       Manual   
6925   Maruti  2009           First  Diesel  Individual       Manual   

      km_driven  mileage_mpg  engine_cc  max_power_bhp   torque_nm  seats  \
0        145500        55.

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6926 entries, 0 to 6925
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   company        6926 non-null   object 
 1   year           6926 non-null   int64  
 2   owner          6926 non-null   object 
 3   fuel           6926 non-null   object 
 4   seller_type    6926 non-null   object 
 5   transmission   6926 non-null   object 
 6   km_driven      6926 non-null   int64  
 7   mileage_mpg    6718 non-null   float64
 8   engine_cc      6718 non-null   float64
 9   max_power_bhp  6717 non-null   float64
 10  torque_nm      6717 non-null   float64
 11  seats          6718 non-null   float64
 12  selling_price  6926 non-null   int64  
dtypes: float64(5), int64(3), object(5)
memory usage: 703.6+ KB


In [13]:
df.duplicated().sum()

19

In [18]:
df=df.drop_duplicates()

In [20]:
df.duplicated().sum()
df.head()

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


In [22]:
X=df.drop(columns="selling_price")
y=df.selling_price.copy()
print(X.shape,y.shape)

(6907, 12) (6907,)


In [26]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(5525, 12) (5525,)
(1382, 12) (1382,)


In [37]:
num_cols=X_train.select_dtypes(include="number").columns.tolist()
cat_cols=[col for col in X_train.columns if col not in num_cols]
print(num_cols)
print(cat_cols)

['year', 'km_driven', 'mileage_mpg', 'engine_cc', 'max_power_bhp', 'torque_nm', 'seats']
['company', 'owner', 'fuel', 'seller_type', 'transmission']


In [42]:
num_pipe=Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="median")),
    ("scaler",StandardScaler())

])

cat_pipe = Pipeline(steps=[
    ('imputer' ,SimpleImputer(strategy='constant',fill_value="missing") ),
    ("encoder",OneHotEncoder(handle_unknown="ignore",sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ("num",num_pipe,num_cols),
    ("cat",cat_pipe,cat_cols)
])
preprocessor.fit_transform(X_train)

regressor = RandomForestRegressor(
    n_estimators = 10,max_depth=5,random_state=42
)

rf_model = Pipeline(steps=[
    ("pre",preprocessor),
    ("reg",regressor)
])
rf_model.fit(X_train,y_train)

In [45]:
y_train_pred = rf_model.predict(X_train)
train_rmse = mean_squared_error(y_train,y_train_pred,squared=False)
print(f"Train RMSE :{train_rmse:,.3f}")

y_test_pred = rf_model.predict(X_test)
test_rmse = mean_squared_error(y_test,y_test_pred,squared=False)
print(f"Test RMSE :{test_rmse:,.3f}")



Train RMSE :169,947.490
Test RMSE :172,392.131


