## Forward Feature Selection

In [1]:
# Silence every warning for the rest of the session so library notices do not
# clutter the cell outputs. NOTE(review): this also hides deprecation and
# data-conversion warnings that may be worth reading while developing.
from warnings import filterwarnings
filterwarnings(action='ignore')

In [2]:
import pandas as pd

# Location of the raw Cars93 CSV.
path = r"https://raw.githubusercontent.com/sindhura-nk/Datasets/refs/heads/main/Cars93.csv"

# pandas' built-in list of NA tokens is switched off; only empty cells and the
# literal 'NA' are parsed as missing. Presumably this keeps labels such as
# "None" in AirBags as real categories -- confirm against the raw file.
df = pd.read_csv(
    path,
    keep_default_na=False,
    na_values=['', 'NA'],
)
df.head()

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


## Perform basic data quality checks

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  93 non-null     int64  
 1   Manufacturer        93 non-null     object 
 2   Model               93 non-null     object 
 3   Type                93 non-null     object 
 4   Min.Price           93 non-null     float64
 5   Price               93 non-null     float64
 6   Max.Price           93 non-null     float64
 7   MPG.city            93 non-null     int64  
 8   MPG.highway         93 non-null     int64  
 9   AirBags             89 non-null     object 
 10  DriveTrain          93 non-null     object 
 11  Cylinders           93 non-null     object 
 12  EngineSize          93 non-null     float64
 13  Horsepower          93 non-null     int64  
 14  RPM                 93 non-null     int64  
 15  Rev.per.mile        93 non-null     int64  
 16  Man.trans.

In [4]:
df.shape

(93, 28)

## Separate X and Y features
    Y: Weight (the target)
    X: all remaining features, excluding Weight and id

In [5]:
df.columns

Index(['id', 'Manufacturer', 'Model', 'Type', 'Min.Price', 'Price',
       'Max.Price', 'MPG.city', 'MPG.highway', 'AirBags', 'DriveTrain',
       'Cylinders', 'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile',
       'Man.trans.avail', 'Fuel.tank.capacity', 'Passengers', 'Length',
       'Wheelbase', 'Width', 'Turn.circle', 'Rear.seat.room', 'Luggage.room',
       'Weight', 'Origin', 'Make'],
      dtype='object')

In [6]:
# Target: Weight, kept as a one-column DataFrame.
Y = df.loc[:, ['Weight']]
# Predictors: every other column except the row identifier.
X = df.drop(['id', 'Weight'], axis=1)

In [7]:
X.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25,31,,Front,...,13.2,5,177,102,68,37,26.5,11.0,non-USA,Acura Integra
1,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,...,18.0,5,195,115,71,38,30.0,15.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,Front,...,16.9,5,180,102,67,37,28.0,14.0,non-USA,Audi 90
3,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,21.1,6,193,106,70,37,31.0,17.0,non-USA,Audi 100
4,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,...,21.1,4,186,109,69,39,27.0,13.0,non-USA,BMW 535i


In [8]:
Y.head()

Unnamed: 0,Weight
0,2705
1,3560
2,3375
3,3405
4,3640


In [9]:
X.isna().sum()

Manufacturer           0
Model                  0
Type                   0
Min.Price              0
Price                  0
Max.Price              0
MPG.city               0
MPG.highway            0
AirBags                4
DriveTrain             0
Cylinders              0
EngineSize             0
Horsepower             0
RPM                    0
Rev.per.mile           0
Man.trans.avail        0
Fuel.tank.capacity     0
Passengers             0
Length                 0
Wheelbase              0
Width                  0
Turn.circle            0
Rear.seat.room         2
Luggage.room          11
Origin                 0
Make                   0
dtype: int64

## Data Cleaning and Data preprocessing

In [11]:
# Split the predictors by dtype: numeric columns are treated as continuous and
# every non-numeric column (text, bool, ...) as categorical.
# select_dtypes is used instead of comparing dtypes to 'object' so that text
# columns are still recognised when pandas stores them with a dedicated string
# dtype rather than object.
cat = list(X.select_dtypes(exclude='number').columns)
con = list(X.select_dtypes(include='number').columns)

In [10]:
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline


In [12]:
# Continuous features: fill missing values with the column mean, then standardise.
num_pipe = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

In [13]:
# Categorical features: fill missing values with the most frequent label, then
# map each label to an integer code. One column per feature keeps the matrix
# narrow for the feature-selection step.
cat_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OrdinalEncoder())

In [14]:
# Route each column group through its own pipeline. The result is a DataFrame
# whose column names carry the transformer prefix ("cat__" / "con__").
pre = ColumnTransformer(
    transformers=[("cat", cat_pipe, cat), ("con", num_pipe, con)]
)
pre = pre.set_output(transform='pandas')

In [15]:
pre

0,1,2
,transformers,"[('cat', ...), ('con', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [16]:
# Fit the preprocessor on all predictors and transform them in one pass; the
# result is fully numeric (ordinal codes plus standardised continuous columns).
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,cat__Manufacturer,cat__Model,cat__Type,cat__AirBags,cat__DriveTrain,cat__Cylinders,cat__Man.trans.avail,cat__Origin,cat__Make,con__Min.Price,...,con__RPM,con__Rev.per.mile,con__Fuel.tank.capacity,con__Passengers,con__Length,con__Wheelbase,con__Width,con__Turn.circle,con__Rear.seat.room,con__Luggage.room
0,0.0,48.0,3.0,2.0,1.0,1.0,1.0,1.0,0.0,-0.485787,...,1.717489,1.12953,-1.062184,-0.083243,-0.427186,-0.286932,-0.366184,-0.610436,-0.452197,-1.033015
1,0.0,55.0,2.0,0.0,1.0,3.0,1.0,1.0,1.0,1.388017,...,0.369586,0.005661,0.409445,-0.083243,0.812171,1.629649,0.431983,-0.29851,0.73809,0.396643
2,1.0,8.0,0.0,1.0,1.0,3.0,1.0,1.0,3.0,1.008658,...,0.369586,-0.105713,0.072197,-0.083243,-0.220626,-0.286932,-0.632239,-0.610436,0.057926,0.039228
3,1.0,0.0,2.0,1.0,1.0,3.0,1.0,1.0,2.0,1.571949,...,0.369586,0.410659,1.359872,0.884457,0.674465,0.302785,0.165927,-0.610436,1.078172,1.111472
4,2.0,5.0,2.0,1.0,2.0,1.0,1.0,1.0,4.0,0.755752,...,0.706562,0.430909,1.359872,-1.050944,0.192493,0.745073,-0.100128,0.013416,-0.282156,-0.318186


## Feature Selection: Forward Selection

In [17]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

In [18]:
# Greedy forward selection wrapped around a plain linear regression: start with
# no features and keep adding one at a time until 10 are selected. Candidates
# are compared with the selector's default scorer and 5-fold cross-validation.
base_model = LinearRegression()
for_sel = SequentialFeatureSelector(
    estimator=base_model,
    n_features_to_select=10,
    direction='forward',
)
for_sel.fit(X_pre, Y)

0,1,2
,estimator,LinearRegression()
,n_features_to_select,10
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [19]:
for_sel.get_feature_names_out()

array(['cat__Manufacturer', 'cat__Origin', 'con__MPG.highway',
       'con__Horsepower', 'con__RPM', 'con__Fuel.tank.capacity',
       'con__Passengers', 'con__Wheelbase', 'con__Width',
       'con__Rear.seat.room'], dtype=object)

In [20]:
# Names of the features kept by the selector; they still carry the
# "cat__" / "con__" prefix added by the ColumnTransformer.
imp_cols = for_sel.get_feature_names_out()
imp_cols

array(['cat__Manufacturer', 'cat__Origin', 'con__MPG.highway',
       'con__Horsepower', 'con__RPM', 'con__Fuel.tank.capacity',
       'con__Passengers', 'con__Wheelbase', 'con__Width',
       'con__Rear.seat.room'], dtype=object)

In [21]:
len(imp_cols)

10

In [23]:
# Recover the original column names by stripping the "<transformer>__" prefix.
# Split on the first "__" only, so a column whose own name contains a double
# underscore is returned intact instead of being truncated to its last piece.
sel_cols = [name.split('__', 1)[-1] for name in imp_cols]
sel_cols

['Manufacturer',
 'Origin',
 'MPG.highway',
 'Horsepower',
 'RPM',
 'Fuel.tank.capacity',
 'Passengers',
 'Wheelbase',
 'Width',
 'Rear.seat.room']

In [24]:
# Subset the raw (un-encoded) predictors to the selected columns so they can be
# re-encoded differently in the second preprocessing step.
X_sel = X[sel_cols]
X_sel.head()

Unnamed: 0,Manufacturer,Origin,MPG.highway,Horsepower,RPM,Fuel.tank.capacity,Passengers,Wheelbase,Width,Rear.seat.room
0,Acura,non-USA,31,140,6300,13.2,5,102,68,26.5
1,Acura,non-USA,25,200,5500,18.0,5,115,71,30.0
2,Audi,non-USA,26,172,5500,16.9,5,102,67,28.0
3,Audi,non-USA,26,172,5500,21.1,6,106,70,31.0
4,BMW,non-USA,30,208,5700,21.1,4,109,69,27.0


## Second preprocessing step (selected features only)

In [25]:
from sklearn.preprocessing import OneHotEncoder

In [26]:
# Split the selected predictors by dtype: numeric columns are continuous and
# every non-numeric column is categorical. select_dtypes is used instead of
# comparing dtypes to 'object' so that text columns are still recognised when
# pandas stores them with a dedicated string dtype rather than object.
X_sel_cat = list(X_sel.select_dtypes(exclude='number').columns)
X_sel_con = list(X_sel.select_dtypes(include='number').columns)

In [27]:
# Continuous features: fill missing values with the column mean, then standardise.
num_pipe2 = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

In [28]:
# Categorical features: fill missing values with the most frequent label, then
# expand each label into its own dense indicator column. With
# handle_unknown='ignore', a label not seen during fit does not raise at
# transform time.
cat_pipe2 = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
)

In [30]:
# Second-stage preprocessor for the selected columns only: one-hot encoding for
# the categoricals, mean-impute plus scaling for the numerics, returned as a
# DataFrame.
pre2 = ColumnTransformer(
    transformers=[('cat', cat_pipe2, X_sel_cat), ('con', num_pipe2, X_sel_con)]
)
pre2 = pre2.set_output(transform='pandas')

In [31]:
pre2

0,1,2
,transformers,"[('cat', ...), ('con', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [33]:
# Fit the second preprocessor on the selected columns and transform them; each
# categorical label becomes its own indicator column.
X_sel_pre = pre2.fit_transform(X_sel)
X_sel_pre.head()

Unnamed: 0,cat__Manufacturer_Acura,cat__Manufacturer_Audi,cat__Manufacturer_BMW,cat__Manufacturer_Buick,cat__Manufacturer_Cadillac,cat__Manufacturer_Chevrolet,cat__Manufacturer_Chrylser,cat__Manufacturer_Chrysler,cat__Manufacturer_Dodge,cat__Manufacturer_Eagle,...,cat__Origin_USA,cat__Origin_non-USA,con__MPG.highway,con__Horsepower,con__RPM,con__Fuel.tank.capacity,con__Passengers,con__Wheelbase,con__Width,con__Rear.seat.room
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.360925,-0.073484,1.717489,-1.062184,-0.083243,-0.286932,-0.366184,-0.452197
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,-0.770514,1.078322,0.369586,0.409445,-0.083243,1.629649,0.431983,0.73809
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,-0.581941,0.540813,0.369586,0.072197,-0.083243,-0.286932,-0.632239,0.057926
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,-0.581941,0.540813,0.369586,1.359872,0.884457,0.302785,0.165927,1.078172
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.172352,1.231897,0.706562,1.359872,-1.050944,0.745073,-0.100128,-0.282156


## Final model building

In [34]:
# Ordinary least squares on the selected, re-encoded features.
# NOTE(review): the model is fit on every row of X_sel_pre -- this notebook makes
# no train/test split, so the predictions and metrics below are in-sample.
model = LinearRegression()
model.fit(X_sel_pre,Y)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [36]:
model.intercept_

array([3076.35884896])

In [38]:
model.coef_

array([[-241.4945376 ,  135.67452245,  196.9822348 ,  -35.46778865,
        -142.1256961 ,    1.63033905,  -32.23766846,  181.72302522,
          77.40946888, -164.44905232,  -44.10939764,   33.4110582 ,
         -54.59909782,  -11.44126709, -128.46721722,    5.04904538,
          40.22148121,  -56.5068397 , -191.63241052,  -44.92020547,
         -12.85959976,   66.74956779,  -11.63404834,  169.69604497,
         -16.89098592,  -88.08342616,  -43.31501959,  107.68948866,
         131.2242809 ,   -7.97178603,   94.85249614,   85.89299076,
         -64.46950319,   64.46950319, -109.8153361 ,  215.39114976,
         -62.86604206,  -18.87129663,   63.53531886,  254.47061074,
         118.49924298,  -52.79312459]])

In [40]:
# Keep the predictions at full precision so the metrics computed from `ypreds`
# are not affected by rounding; round only the preview shown here.
ypreds = model.predict(X_sel_pre)
ypreds[:5].round(2)

array([[2658.12],
       [3606.88],
       [3276.02],
       [3503.98],
       [3640.  ]])

In [41]:
Y.head()

Unnamed: 0,Weight
0,2705
1,3560
2,3375
3,3405
4,3640


## Evaluation metrics

In [43]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

# Error metrics for the fitted model. These are in-sample figures: `ypreds` was
# produced from the same X_sel_pre rows the model was trained on.
mse = mean_squared_error(Y, ypreds)
rmse = mse ** 0.5  # root of the MSE, in the same units as Weight
mae = mean_absolute_error(Y, ypreds)
r2 = r2_score(Y, ypreds)

print(f"MSE: {mse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R2 score: {r2*100:.2f}%")

MSE: 8301.02
MAE: 68.77
RMSE: 91.11
R2 score: 97.59%


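## Conclusion

The model explains about 97.6% of the variance in Weight on the data it was trained on, so it can be considered for final model building, provided this performance is confirmed on held-out data (no train/test split was used above).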