In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw dataset and preview its first five rows.
df = pd.read_csv("data.csv")
df.head(5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450


In [3]:
# Column-by-column summary (dtype and non-null count) to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          201 non-null    int64  
 1   normalized-losses  164 non-null    float64
 2   make               201 non-null    object 
 3   fuel-type          201 non-null    object 
 4   aspiration         201 non-null    object 
 5   num-of-doors       199 non-null    object 
 6   body-style         201 non-null    object 
 7   drive-wheels       201 non-null    object 
 8   engine-location    201 non-null    object 
 9   wheel-base         201 non-null    float64
 10  length             201 non-null    float64
 11  width              201 non-null    float64
 12  height             201 non-null    float64
 13  curb-weight        201 non-null    int64  
 14  engine-type        201 non-null    object 
 15  num-of-cylinders   201 non-null    object 
 16  engine-size        201 non

In [4]:
from sklearn import set_config
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Show estimators and pipelines as a diagram when a cell displays them.
set_config(display='diagram')

In [5]:
# Separate the regression target (price) from the predictor columns.
y = df['price']
X = df.drop('price', axis=1)

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Numeric predictors, in their original column order.
# Select by dtype family instead of testing "dtype is not object": that test
# files text columns as numeric whenever pandas stores them under a dedicated
# string or categorical dtype rather than `object`.
num_col = X_train.select_dtypes(include='number').columns.tolist()
num_col

['symboling',
 'normalized-losses',
 'wheel-base',
 'length',
 'width',
 'height',
 'curb-weight',
 'engine-size',
 'bore',
 'stroke',
 'compression-ratio',
 'horsepower',
 'peak-rpm',
 'city-mpg',
 'highway-mpg']

In [8]:
# Categorical predictors: every column that is not numeric, in original order.
# Testing "dtype is object" misses text columns that pandas stores under a
# dedicated string or categorical dtype, so select by exclusion instead.
cat_col = X_train.select_dtypes(exclude='number').columns.tolist()
cat_col

['make',
 'fuel-type',
 'aspiration',
 'num-of-doors',
 'body-style',
 'drive-wheels',
 'engine-location',
 'engine-type',
 'num-of-cylinders',
 'fuel-system']

In [9]:
# Numeric branch: fill missing values with SimpleImputer's default strategy
# (the column mean), then standardise each feature.
num_pipe = Pipeline([
    ('num_imputer', SimpleImputer()),
    ('num_scalar', StandardScaler())
])

# OneHotEncoder's `sparse` flag was renamed to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4. Try the current
# name first and fall back, so this cell runs on either side of the rename.
_ohe_kwargs = dict(handle_unknown='ignore', drop='first')
try:
    _cat_ohe = OneHotEncoder(sparse_output=False, **_ohe_kwargs)
except TypeError:  # scikit-learn < 1.2 only accepts `sparse`
    _cat_ohe = OneHotEncoder(sparse=False, **_ohe_kwargs)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode to a dense array, dropping the first level per feature.
# NOTE: with drop='first', a category unseen during fit is encoded as all
# zeros, which is indistinguishable from the dropped baseline level.
cat_pipe = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('cat_ohe', _cat_ohe)
])

In [10]:
# Route numeric columns through num_pipe and categorical columns through
# cat_pipe; the two outputs are concatenated in that order.
col_trf = ColumnTransformer(
    transformers=[
        ('num_pipe', num_pipe, num_col),
        ('cat_pipe', cat_pipe, cat_col),
    ],
)

In [11]:
# Sanity check: fit the preprocessing on the training split and inspect the result.
col_trf.fit_transform(X_train)

array([[-1.43486011, -0.81257593,  1.75388872, ...,  1.        ,
         0.        ,  0.        ],
       [-0.61494005, -1.12097163, -0.38295763, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.20498002,  0.82192126, -0.85781238, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.20498002,  0.20512986, -0.72213959, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.02490008,  0.02009245, -0.24728485, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.84482014,  2.24054146, -1.26483073, ...,  1.        ,
         0.        ,  0.        ]])

In [12]:
# Apply the already-fitted preprocessing to the test split (transform only, no refit).
col_trf.transform(X_test)

array([[ 1.02490008,  1.43871265, -0.62038501, ...,  0.        ,
         0.        ,  0.        ],
       [-0.61494005,  0.        ,  0.80417923, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.02490008,  0.48268599, -2.06190834, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.02490008,  0.39016728, -0.06073477, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.20498002,  1.22283566, -0.97652606, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.84482014,  0.        , -0.48471222, ...,  0.        ,
         1.        ,  0.        ]])

In [13]:
# End-to-end model: preprocessing, then keep the 20 features ranked highest
# by f_regression, then fit a linear regression on them.
final_pipe = Pipeline(
    steps=[
        ('col_trf', col_trf),
        ('featute_selection', SelectKBest(score_func=f_regression, k=20)),
        ('model', LinearRegression()),
    ],
)

In [14]:
# Fit preprocessing, feature selection and the regressor on the training split.
final_pipe.fit(X_train, y_train)

In [15]:
# Predicted prices for the held-out test rows (k=20 pipeline).
final_pipe.predict(X_test)

array([ 7488.59068329, 26339.1001639 ,  6973.43634597,  7036.07170102,
       12358.81462386,  5743.81376925, 32265.2532727 , 11779.76415726,
       22633.24368798, 31967.00592027, 32690.0149026 ,  8014.19411475,
       15532.48574246,  9961.60384855, 16772.50697077,  8363.02572305,
       10258.85221606,  7237.72212237,  9999.90237565, 33889.01648987,
       24535.54271028, 28180.78289322,  6352.70269476,  6471.25989654,
       32158.21705018,  9957.75800186, 10790.21946008, 33127.72738179,
       23663.64581235, 13964.13053002, 12204.5964074 ,  5914.8743886 ,
       15022.21848264,  6977.42411106,  7464.69206913, 15089.48083978,
        6254.34422109,  7613.63118899, 16427.83491109,  8347.22173727,
       14701.8074328 ])

In [16]:
# Held-out score of the k=20 pipeline; score() delegates to the final
# LinearRegression step, whose score is R^2.
final_pipe.score(X_test, y_test)

0.832302662114066

In [17]:
# Comparison pipeline that keeps every feature (k="all") instead of the top 20.
# clone() gives this pipeline its own unfitted copy of the preprocessing step.
# Pipeline.fit fits its steps in place, so handing the same `col_trf` object
# to two pipelines would make them share, and refit, one transformer.
final_pipe_1 = Pipeline([
    ('col_trf', clone(col_trf)),
    ('featute_selection', SelectKBest(score_func=f_regression, k="all")),
    ('model', LinearRegression())
])

In [18]:
# Fit the all-features pipeline on the training split.
final_pipe_1.fit(X_train, y_train)

In [19]:
# Predicted prices for the held-out test rows (k="all" pipeline).
final_pipe_1.predict(X_test)

array([ 7336.63797409, 28028.70180871,  6611.21305803,  7451.83163669,
       12950.25422133,  5546.80588593, 43451.16323349,  9664.41584018,
       16375.81039077, 32342.37565344, 25089.2624309 , 10372.67747163,
       10590.65288478,  8137.22074311, 12278.20701751,  8324.57446504,
       10129.73012638,  7196.03296648,  9048.79805029, 42355.88951909,
       32528.        , 31202.99120283,  6784.82868948,  6986.44832993,
       25918.00056665, 10229.10082694, 11012.40494085, 27864.45217064,
       35665.55974908, 15357.25697481, 18245.80869764,  5652.75746405,
       19343.16778198,  6762.29566849,  5081.32472447, 13783.0448426 ,
        6484.26578837,  8115.63009491, 10961.15359863,  8659.64202142,
       12860.25634444])

In [20]:
# Held-out score of the k="all" pipeline, for comparison with the k=20 pipeline.
final_pipe_1.score(X_test, y_test)

0.9107122773565837