In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import loadmat

In [2]:
# Load the auto-mpg CSV into a DataFrame and preview the first five rows.
csv_path = 'input/auto-mpg[1].csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [3]:
# Turn the '?' placeholder into NaN so pandas treats those cells as missing.
data = data.replace('?', np.nan)

In [4]:
# Count the missing (NaN) entries in each column
data.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [5]:
## Fill missing horsepower values with the column median.
# Coerce to numeric first: any entry that cannot be parsed as a number becomes
# NaN, and the median below skips NaN values.
data['horsepower'] = pd.to_numeric(data['horsepower'], errors='coerce')

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the selected column. The chained in-place form raises a FutureWarning in
# pandas 2.x and leaves `data` unmodified under Copy-on-Write (pandas 3.0).
data['horsepower'] = data['horsepower'].fillna(data['horsepower'].median())

In [6]:
# Summary statistics for the numeric columns: count, mean, std, min,
# quartiles (25% / 50% / 75%) and max.
data.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.30402,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,38.222625,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,76.0,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,125.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [7]:
# Discard the car name column, then separate the frame into the predictor
# columns (everything except mpg) and the target column (mpg).
data = data.drop(columns=['car name'])
features = data.drop(columns=['mpg'])
target = data['mpg']

In [None]:
# Score each feature against the target using f_regression
from sklearn.feature_selection import SelectKBest, chi2, f_regression

# k='all' retains every column, so this step ranks the features rather than
# filtering any of them out.
select_k_best = SelectKBest(f_regression, k='all')
# `fit` receives the transformed feature array returned by fit_transform.
fit = select_k_best.fit_transform(features, target)

kept_mask = select_k_best.get_support()
print("Selected features:", features.columns[kept_mask])
print("Feature Scores:", select_k_best.scores_)

Selected features: Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'model year', 'origin'],
      dtype='object')
Feature Scores: [597.07704785 724.99430337 589.63820515 888.85068265  84.95770025
 199.98200802 184.19963937]


### **SelectKBest: Score Functions and Related Selectors**

| Function/Class              | Task           | Works With                     | Best Use Case                                      |
|-----------------------------|---------------|--------------------------------|----------------------------------------------------|
| **f_classif**               | Classification | Continuous Features            | ANOVA F-value test for significance               |
| **mutual_info_classif**     | Classification | Any                            | Captures non-linear relationships                 |
| **chi2**                    | Classification | Non-negative categorical/count data | Feature selection based on chi-square test  |
| **f_regression**            | Regression     | Continuous Features            | Univariate linear-regression F-test (captures linear dependence only) |
| **mutual_info_regression**  | Regression     | Any                            | Captures non-linear relationships in regression   |
| **SelectPercentile**        | Any           | Any                            | Selects top X% of features                        |
| **SelectFpr**               | Any           | Any                            | Selects features based on a false positive rate  |
| **SelectFdr**               | Any           | Any                            | Controls false discovery rate                     |
| **SelectFwe**               | Any           | Any                            | Strict control of family-wise error rate         |
| **GenericUnivariateSelect** | Any           | Any                            | Customizable selection strategy                   |
