# Recursive Feature Elimination using Mobile data
Dataset: [https://raw.githubusercontent.com/subashgandyer/datasets/main/mobile_price_train.csv]

In [1]:
import pandas as pd
import numpy as np

from IPython.display import display, Markdown

In [2]:
url = "https://raw.githubusercontent.com/subashgandyer/datasets/main/mobile_price_train.csv"

In [3]:
mob_data = pd.read_csv(url)
mob_data.head(10)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
5,1859,0,0.5,1,3,0,22,0.7,164,1,...,1004,1654,1067,17,1,10,1,0,0,1
6,1821,0,1.7,0,4,1,10,0.8,139,8,...,381,1018,3220,13,8,18,1,0,1,3
7,1954,0,0.5,1,0,0,24,0.8,187,4,...,512,1149,700,16,3,5,1,1,1,0
8,1445,1,0.5,0,0,0,53,0.7,174,7,...,386,836,1099,17,1,20,1,0,0,0
9,509,1,0.6,1,2,1,9,0.1,93,5,...,1137,1224,513,19,10,12,1,0,0,0


In [4]:
mob_data.shape

(2000, 21)

In [5]:
mob_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [6]:
mob_data.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

### Understand the data
- Find how many features?
- Find how many samples?
- What are the data types of each feature column?
- What do you think could be the most important feature(s)?
- Run some feature selection methods
- Is your intuition right?

### Split the dataset into X and y

In [7]:
X = mob_data.iloc[:,0:20]
y = mob_data.iloc[:,-1] 

In [8]:
X.shape, y.shape

((2000, 20), (2000,))

In [9]:
X = mob_data.drop("price_range", axis=1)
y = mob_data.price_range

In [10]:
X.shape, y.shape

((2000, 20), (2000,))

### How many features

In [11]:
X.shape

(2000, 20)

### Make a feature list

In [12]:
feature_names = X.columns.to_list()
feature_names

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi']

---
## Recurisve Feature Elimination - Logistic Regression Estimator

### Import the Recursive Feature Elimination

In [13]:
from sklearn.feature_selection import RFE

### Import the Logistic Regression model

In [14]:
from sklearn.linear_model import LogisticRegression

### Build a Logistic Regression model

In [15]:
lr = LogisticRegression()

### Build RFE model with Logistic Regression as Learning Algorithm / Estimator

In [16]:
rfe_lr = RFE(estimator=lr, 
             n_features_to_select=5,
             step=1,
             verbose=5
)

In [17]:
[m for m in dir(rfe_lr) if not m.startswith("_")]

['classes_',
 'decision_function',
 'estimator',
 'fit',
 'fit_transform',
 'get_feature_names_out',
 'get_params',
 'get_support',
 'importance_getter',
 'inverse_transform',
 'n_features_to_select',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'score',
 'set_params',
 'step',
 'transform',
 'verbose']

The model contains a predict method because RFE is a meta-estimator that **wraps around the Logistic Regression model.** During the RFE training process, <span style="color:yellow">the logistic regression model is fitted and trained during the process. </span> Once the process is completed, the RFE retains the trained lr model, allowing it to make predictions with it.

In [18]:
rfe_lr.estimator

LogisticRegression()

After the training, the attribute estimator_ will be created. Is going to be the trained model.

In [19]:
# rfe_lr.predict(X)
# ↓
# estimator_.predict(X_reduced)

### Train the RFE model

In [20]:
rfe_lr = rfe_lr.fit(X, y)

Fitting estimator with 20 features.
Fitting estimator with 19 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fitting estimator with 18 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fitting estimator with 17 features.
Fitting estimator with 16 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.


In [21]:
[m for m in dir(rfe_lr) if not m.startswith("_")]

['classes_',
 'decision_function',
 'estimator',
 'estimator_',
 'feature_names_in_',
 'fit',
 'fit_transform',
 'get_feature_names_out',
 'get_params',
 'get_support',
 'importance_getter',
 'inverse_transform',
 'n_features_',
 'n_features_in_',
 'n_features_to_select',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'ranking_',
 'score',
 'set_params',
 'step',
 'support_',
 'transform',
 'verbose']

### Get Support from the model

In [22]:
rfe_lr_support = rfe_lr.get_support()
rfe_lr_support

array([False, False,  True, False, False, False, False, False, False,
        True, False, False, False, False,  True,  True,  True, False,
       False, False])

### Best features from the model

In [23]:
rfe_lr_feature = X.loc[:, rfe_lr_support].columns.tolist()
rfe_lr_feature

['clock_speed', 'n_cores', 'sc_h', 'sc_w', 'talk_time']

Another way to do it:

In [None]:
rfe_lr_feature = X.columns[rfe_lr_support].tolist()
rfe_lr_feature

['clock_speed', 'n_cores', 'sc_h', 'sc_w', 'talk_time']

### Feature Ranking

In [25]:
rfe_lr.ranking_

array([10, 13,  1, 12,  2, 15,  4, 16,  5,  1,  3,  7,  8,  6,  1,  1,  1,
        9, 11, 14])

In [26]:
top_features_df = pd.DataFrame({
    "Feature": X.columns,
    "Ranking": rfe_lr.ranking_
})
top_features_df.sort_values("Ranking", inplace=True)
top_features_df

Unnamed: 0,Feature,Ranking
9,n_cores,1
2,clock_speed,1
16,talk_time,1
15,sc_w,1
14,sc_h,1
4,fc,2
10,pc,3
6,int_memory,4
8,mobile_wt,5
13,ram,6


---
## Recursive Feature Elimination - Random Forest Classifier

### Import RandomForestClassifier

In [27]:
from sklearn.ensemble import RandomForestClassifier

### Build the Random Forest model

In [28]:
rf = RandomForestClassifier()

### Build the Recursive Feature Elimination with Random Forest as the Learning Algorithm

In [29]:
rfe_rf = RFE(estimator=rf, 
             n_features_to_select=5,
             step=1,
             verbose=5
)

### Train the model

In [30]:
rfe_rf = rfe_rf.fit(X, y)

Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.


### Get support

In [31]:
rfe_rf_support = rfe_rf.get_support()
rfe_rf_support

array([ True, False, False, False, False, False, False, False,  True,
       False, False,  True,  True,  True, False, False, False, False,
       False, False])

### Best features from the model

In [None]:
rfe_rf_support = X.columns[rfe_rf_support].tolist()
rfe_rf_support

['battery_power', 'mobile_wt', 'px_height', 'px_width', 'ram']

In [33]:
rfe_rf.ranking_

array([ 1, 12,  6, 14,  8, 11,  2,  9,  1, 10,  3,  1,  1,  1,  7,  5,  4,
       16, 13, 15])

In [35]:
top_features_df = pd.DataFrame({
    "Feature": X.columns,
    "Ranking": rfe_rf.ranking_
})
top_features_df.sort_values("Ranking", inplace=True)
top_features_df

Unnamed: 0,Feature,Ranking
0,battery_power,1
8,mobile_wt,1
13,ram,1
11,px_height,1
12,px_width,1
6,int_memory,2
10,pc,3
16,talk_time,4
15,sc_w,5
2,clock_speed,6


In [39]:
[m for m in dir(rfe_rf) if not m.startswith("_")]

['classes_',
 'decision_function',
 'estimator',
 'estimator_',
 'feature_names_in_',
 'fit',
 'fit_transform',
 'get_feature_names_out',
 'get_params',
 'get_support',
 'importance_getter',
 'inverse_transform',
 'n_features_',
 'n_features_in_',
 'n_features_to_select',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'ranking_',
 'score',
 'set_params',
 'step',
 'support_',
 'transform',
 'verbose']