In [36]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [23]:
# Load the raw used-car listings; the path is relative to the notebook's working directory.
df = pd.read_csv('car-details.csv')
# Preview a random handful of rows. The seed is fixed (same value the split and
# the model use) so the preview shows the same rows on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,name,company,model,edition,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
2714,Maruti Swift AMT VXI,Maruti,Swift,AMT VXI,2020,First,Petrol,Individual,Automatic,1000,49.84,1197.0,81.8,113.0,5.0,654000
6098,Ford EcoSport 1.5 Diesel Titanium Plus BSIV,Ford,EcoSport,1.5 Diesel Titanium Plus BSIV,2017,First,Diesel,Individual,Manual,33000,54.06,1498.0,98.96,205.0,5.0,880000
5092,Ford Fusion 1.4 TDCi Diesel,Ford,Fusion,1.4 TDCi Diesel,2009,Third,Diesel,Individual,Manual,110000,41.6,1399.0,68.0,156.9064,5.0,155000
5600,Tata Nexon 1.5 Revotorq XZ Plus Dual Tone,Tata,Nexon,1.5 Revotorq XZ Plus Dual Tone,2018,First,Diesel,Individual,Manual,60000,50.53,1497.0,108.5,260.0,5.0,860000
3050,Hyundai Xcent 1.1 CRDi Base,Hyundai,Xcent,1.1 CRDi Base,2014,First,Diesel,Dealer,Manual,42312,57.34,1120.0,71.0,180.4,5.0,455000


In [24]:
# Summarise the frame: column names, dtypes and non-null counts per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6926 entries, 0 to 6925
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6926 non-null   object 
 1   company        6926 non-null   object 
 2   model          6926 non-null   object 
 3   edition        6926 non-null   object 
 4   year           6926 non-null   int64  
 5   owner          6926 non-null   object 
 6   fuel           6926 non-null   object 
 7   seller_type    6926 non-null   object 
 8   transmission   6926 non-null   object 
 9   km_driven      6926 non-null   int64  
 10  mileage_mpg    6718 non-null   float64
 11  engine_cc      6718 non-null   float64
 12  max_power_bhp  6717 non-null   float64
 13  torque_nm      6717 non-null   float64
 14  seats          6718 non-null   float64
 15  selling_price  6926 non-null   int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 865.9+ KB


In [16]:
# Count missing values per column.
df.isna().sum()

name               0
company            0
model              0
edition            0
year               0
owner              0
fuel               0
seller_type        0
transmission       0
km_driven          0
mileage_mpg      208
engine_cc        208
max_power_bhp    209
torque_nm        209
seats            208
selling_price      0
dtype: int64

In [17]:
# Drop the free-text descriptor columns. In the sampled rows `name` is just
# company + model + edition joined together; `model` and `edition` are
# presumably dropped for being high-cardinality text (TODO confirm intent).
# errors='ignore' keeps this cell idempotent: because it reassigns `df`,
# re-running it would otherwise raise KeyError once the columns are gone.
df = df.drop(columns=['name', 'model', 'edition'], errors='ignore')
df.head()

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


In [18]:
# Remove rows that are exact repeats of an earlier row, keeping the first
# occurrence. The original index labels are retained (no reset).
df = df.drop_duplicates(keep='first')
df.head(n=5)

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


In [19]:
# Count rows that duplicate an earlier row in the current frame.
df.duplicated().sum()

np.int64(0)

In [20]:
# Separate the regression target (selling_price) from the predictor columns.
# The target is copied so it is independent of `df`.
y = df['selling_price'].copy()
x = df.drop(columns=['selling_price'])

print(x.shape, y.shape)


(6907, 12) (6907,)


In [21]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)



In [26]:
# Partition the feature columns by dtype: numeric columns go through the
# numeric pipeline, everything else is treated as categorical.
num_cols = x_train.select_dtypes(include='number').columns.tolist()
cat_cols = x_train.select_dtypes(exclude='number').columns.tolist()

print(num_cols)
print(cat_cols)

['year', 'km_driven', 'mileage_mpg', 'engine_cc', 'max_power_bhp', 'torque_nm', 'seats']
['company', 'owner', 'fuel', 'seller_type', 'transmission']


In [31]:
# Numeric branch: fill gaps with the column median, then standardise.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode
# into a dense array; categories not seen during fit are ignored at transform time.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
])

# Small, shallow forest (10 trees, depth 5) with a fixed seed.
regressor = RandomForestRegressor(n_estimators=10, max_depth=5, random_state=42)

# Preprocessing and model are fitted together on the training split only.
rf_model = Pipeline([('pre', preprocessor), ('reg', regressor)])
rf_model.fit(x_train, y_train)

0,1,2
,steps,"[('pre', ...), ('reg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [39]:
# Score the fitted pipeline on the rows it was trained on ...
y_train_pred = rf_model.predict(x_train)
train_rmse = root_mean_squared_error(y_train, y_train_pred)

# ... and on the held-out rows.
y_test_pred = rf_model.predict(x_test)
test_rmse = root_mean_squared_error(y_test, y_test_pred)

# Report both errors (same units as selling_price) so the train/test gap is
# visible; previously the metrics were computed but never displayed.
print(f"Train RMSE: {train_rmse:,.2f}")
print(f"Test RMSE:  {test_rmse:,.2f}")

