### Feature selection in Regression using Mutual Information

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor,AdaBoostRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
# NOTE(review): no category or message is given, so this filter matches every
# warning raised from here on (unless a later filter overrides it), including
# deprecation notices from pandas / scikit-learn. Consider narrowing it.
warnings.filterwarnings('ignore')

In [4]:
# Read the car-price dataset from the notebook's working directory and preview
# the first ten rows.
csv_path = './CarPricesData.csv'
data = pd.read_csv(csv_path)
data.head(n=10)

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23.0,46986,Diesel,90,1,0,2000.0,3,1165.0
1,13750,23.0,72937,Diesel,90,1,0,2000.0,3,1165.0
2,13950,24.0,41711,Diesel,90,1,0,2000.0,3,1165.0
3,14950,26.0,48000,Diesel,90,0,0,2000.0,3,1165.0
4,13750,30.0,38500,Diesel,90,0,0,2000.0,3,1170.0
5,12950,32.0,61000,Diesel,90,0,0,2000.0,3,1170.0
6,16900,27.0,94612,Diesel,90,1,0,2000.0,3,1245.0
7,18600,30.0,75889,Diesel,90,1,0,2000.0,3,1245.0
8,21500,27.0,19700,Petrol,192,0,0,1800.0,3,1185.0
9,12950,23.0,71138,Diesel,69,0,0,1900.0,3,1105.0


In [5]:
# Share of missing values per column, expressed in percent.
data.isna().mean().mul(100)

Price        0.000000
Age          0.139276
KM           0.000000
FuelType     0.278552
HP           0.000000
MetColor     0.000000
Automatic    0.000000
CC           0.139276
Doors        0.000000
Weight       0.139276
dtype: float64

In [6]:
# Remove, in place, every row that has at least one missing value.
data.dropna(axis=0, how='any', inplace=True)

In [8]:
# Number of rows that repeat an earlier row exactly (first occurrence not counted).
data.duplicated(keep='first').sum()

1

In [9]:
# Drop repeated rows in place, keeping the first occurrence of each.
data.drop_duplicates(keep='first', inplace=True)

In [10]:
# Single LabelEncoder instance; the encoding loop that follows calls
# fit_transform on it once per object-dtype column, re-fitting it each time.
le = LabelEncoder()

In [11]:
# Replace every text (object-dtype) column with integer codes. The shared
# encoder is re-fitted for each column, so after the loop it only holds the
# classes of the last column it encoded.
object_columns = [name for name in data.columns if data[name].dtype == 'object']
for name in object_columns:
    data[name] = le.fit_transform(data[name])

In [12]:
# Spot-check ten random rows after encoding. The fixed seed keeps the displayed
# sample identical across "Restart Kernel -> Run All".
data.sample(10, random_state=47)

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
29,17950,30.0,11090,2,110,1,0,1600.0,3,1120.0
1430,8450,80.0,23000,2,86,0,0,1300.0,3,1015.0
680,7950,61.0,102106,2,110,0,0,1600.0,3,1050.0
476,10250,54.0,63792,2,110,1,0,1600.0,5,1075.0
1212,8250,72.0,86860,2,110,0,0,1600.0,5,1075.0
895,8950,61.0,60532,2,110,1,0,1600.0,3,1050.0
430,12200,50.0,82805,2,110,1,0,1600.0,3,1040.0
1176,9250,80.0,93841,2,110,0,0,1600.0,5,1070.0
1211,8250,80.0,87000,2,86,1,0,1300.0,3,1015.0
265,11950,38.0,49500,2,110,1,0,1600.0,5,1075.0


#### Feature selection is performed using mutual information (also known as information gain).

#### Selecting the features and targets

In [13]:
# Separate predictors from the target by column name rather than by position,
# so the split stays correct even if the column order of the CSV changes.
target_col = 'Price'
x = data.drop(columns=target_col).values
y = data[target_col].values

In [14]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffled
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=47,
)

#### Defining feature selection with all features included

In [15]:
def seeded_mutual_info(X, y):
    """Score features by mutual information with a fixed random seed.

    ``mutual_info_regression`` adds a small amount of random noise to
    continuous features before estimating; without ``random_state`` the
    scores change from run to run.
    """
    return mutual_info_regression(X, y, random_state=47)


# k='all' keeps every column, so transform() does not drop any feature.
fs = SelectKBest(score_func=seeded_mutual_info, k='all')

#### Fitting the feature selector on the training data

In [16]:
# Compute the feature scores from the training split only; the test split is
# not passed to fit.
fs.fit(X=x_train, y=y_train)

SelectKBest(k='all',
            score_func=<function mutual_info_regression at 0x0000028782324AF0>)

#### Transform the train and test data

In [17]:
# Apply the fitted selector to both splits.
x_train_fs, x_test_fs = (fs.transform(split) for split in (x_train, x_test))