<a href="https://colab.research.google.com/github/salmankhaliq22/MachineLearning-Templates/blob/main/RFE_for_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RFE for Regression

- This notebook uses the diabetes dataset from `sklearn.datasets` to build regression models. For details about the dataset, see the [scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes).

In [1]:
# Load the diabetes regression dataset that ships with scikit-learn
from sklearn.datasets import load_diabetes

# as_frame=True returns pandas objects: X holds the features, y the target
X, y = load_diabetes(as_frame = True, return_X_y = True)

# report the dimensions of the feature table and of the target
print(X.shape, y.shape)

(442, 10) (442,)


In [2]:
# Preview the first five rows of the feature table X
X.head(n = 5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [3]:
# Preview the first five values of the target y
y.head(n = 5)

0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: target, dtype: float64

## Preprocessing

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 22, test_size = 0.3)

# learn the scaling statistics from the training rows only, then standardize them
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)

In [5]:
# Fit a linear regression baseline and score it on the held-out rows
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# train on the standardized training features
lr = LinearRegression()
lr.fit(X_train_std, y_train)

# scale the test features with the scaler that was fitted on the training data
X_test_std = scaler.transform(X_test)
y_pred = lr.predict(X_test_std)

# R2 and mean absolute error of the predictions against the true test targets
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('R2:{} MAE:{}'.format(r2, mae))

R2:0.5189640022043543 MAE:41.97889352744687


In [6]:
# Show the fitted linear-regression weights (learned on the standardized features)
coefficients = lr.coef_
print(coefficients)

[  0.92916588 -13.13212515  22.28972267  14.42771481 -25.04044597
  14.44227429  -8.31652567   1.74232069  31.61608597   4.41066135]


In [7]:
# Map each feature name to the magnitude (absolute value) of its coefficient
d = {feature: magnitude for feature, magnitude in zip(X.columns, abs(lr.coef_))}

In [9]:
import pandas as pd

# one-row frame (a column per feature), transposed so the features become the index
coef_frame = pd.DataFrame([d]).T
# column 0 holds the coefficient magnitudes; list the largest first
df = coef_frame.sort_values(by = 0, ascending = False)
df

Unnamed: 0,0
s5,31.616086
s1,25.040446
bmi,22.289723
s2,14.442274
bp,14.427715
sex,13.132125
s3,8.316526
s6,4.410661
s4,1.742321
age,0.929166


In [None]:
# Evaluate RFE (keeping 4 features) followed by a decision tree, using repeated K-fold CV
import numpy as np
# RepeatedStratifiedKFold is not used in this cell any more; it stays imported only
# because other cells of this notebook may still reference the name.
from sklearn.model_selection import cross_val_score, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
# silence deprecation-style warnings only, so that genuine problems stay visible
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", category = FutureWarning)

# create pipeline: RFE selects 4 features, then a fresh tree is trained on them.
# Both trees are seeded so that repeated runs give the same scores.
rfe = RFE(estimator = DecisionTreeRegressor(random_state = 22), n_features_to_select = 4)
model = DecisionTreeRegressor(random_state = 22)
pipeline = make_pipeline(rfe, model)

# evaluate model
# Plain (non-stratified) repeated K-fold: stratification is defined for class labels,
# whereas y here is a continuous regression target.
cv = RepeatedKFold(n_splits = 4, n_repeats = 3, random_state = 22)
n_scores = cross_val_score(pipeline, X, y, scoring = 'neg_mean_absolute_error', cv=cv, n_jobs=-1)

# report performance
# The scorer returns the *negated* MAE (higher is better), so flip the sign to
# print an actual, positive mean absolute error. The spread is unaffected by the sign.
print('MAE: %.3f (%.3f)' % (-np.mean(n_scores), np.std(n_scores)))

In [None]:
# Let cross-validated RFE (RFECV) choose how many features to keep
import pandas as pd
import numpy as np
# RepeatedStratifiedKFold is not used in this cell any more; it stays imported only
# because other cells of this notebook may still reference the name.
from sklearn.model_selection import cross_val_score, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
# silence deprecation-style warnings only, so that genuine problems stay visible
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", category = FutureWarning)

# Create the RFECV object and compute a cross-validated score for each feature count.
min_features_to_select = 1  # Minimum number of features to consider
# Plain (non-stratified) repeated K-fold: stratification is defined for class labels,
# whereas y here is a continuous regression target.
cv = RepeatedKFold(n_splits = 4, n_repeats = 10, random_state = 22)
# The tree is seeded so that the selected features are the same on every run.
rfecv = RFECV(estimator = DecisionTreeRegressor(random_state = 22), step = 1, cv = cv, scoring = 'neg_mean_absolute_error', min_features_to_select = min_features_to_select)
rfecv.fit(X, y)

# report the number of features that gave the best cross-validated score
print("Optimal number of features : %d" % rfecv.n_features_)

In [None]:
# Boolean mask over the columns of X: True marks a feature that RFECV kept
selected_mask = rfecv.support_
print(selected_mask)

In [None]:
# Pair every feature name with its RFECV rank and list the best-ranked first
ranking_by_column = {'cols': X.columns, 'fea_rank': rfecv.ranking_}
fea_rank_ = pd.DataFrame(ranking_by_column)
fea_rank_[fea_rank_['fea_rank'] > 0].sort_values(by = 'fea_rank')

In [None]:
# Plot number of features VS. cross-validation scores
import matplotlib.pyplot as plt

# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the mean
# score per feature count now lives in `cv_results_["mean_test_score"]`.
# Fall back to the old attribute so the cell still runs on older installations.
if hasattr(rfecv, "cv_results_"):
    mean_scores = rfecv.cv_results_["mean_test_score"]
else:
    mean_scores = rfecv.grid_scores_

# x axis: the feature counts that were evaluated, starting at the configured minimum
n_features_tested = range(min_features_to_select, len(mean_scores) + min_features_to_select)

plt.figure()
plt.xlabel("Number of features selected")
# the scorer is neg_mean_absolute_error, so the plotted values are negated MAE
# (higher, i.e. closer to zero, is better)
plt.ylabel("Cross validation score (negative MAE)")
plt.plot(n_features_tested, mean_scores)
plt.show()

[Reference](https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html)

In [None]:
# Repeat the feature-count search with Yellowbrick's RFECV visualizer
from sklearn.model_selection import RepeatedKFold
from sklearn.tree import DecisionTreeRegressor
# Imported under an alias so it does not shadow scikit-learn's RFECV class,
# which has the same name and is used earlier in this notebook.
from yellowbrick.model_selection import RFECV as RFECVVisualizer

# Instantiate RFECV visualizer with a DecisionTreeRegressor
min_features_to_select = 1
# Plain (non-stratified) repeated K-fold: stratification is defined for class labels,
# whereas y here is a continuous regression target.
cv = RepeatedKFold(n_splits = 4, n_repeats = 10, random_state = 22)
# Bound only to `visualizer`, so the fitted scikit-learn selector named `rfecv`
# from the earlier cell is not overwritten. The tree is seeded for repeatable results.
# NOTE(review): Yellowbrick's RFECV does not document a `min_features_to_select`
# parameter; it is forwarded as an extra keyword exactly as before - confirm whether
# it has any effect.
visualizer = RFECVVisualizer(estimator = DecisionTreeRegressor(random_state = 22), step = 1, cv = cv, scoring = 'neg_mean_absolute_error', min_features_to_select = min_features_to_select)

visualizer.fit(X, y)        # Fit the data to the visualizer
visualizer.show()           # Finalize and render the figure