# Embedded Feature Selection using Mobile data
Dataset: [https://raw.githubusercontent.com/subashgandyer/datasets/main/mobile_price_train.csv]

In [1]:
import pandas as pd

In [2]:
url = "https://raw.githubusercontent.com/subashgandyer/datasets/main/mobile_price_train.csv"

In [3]:
df = pd.read_csv(url)
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

### Understand the data
- Find how many features?
- Find how many samples?
- What are the data types of each feature column?
- What do you think could be the most important feature(s)?
- Run some feature selection methods
- Is your intuition right?

### Import the necessary libraries

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

### Read the mobile data

In [6]:
data = pd.read_csv("data/mobile_price_train.csv")

### Split the dataset into X and y

In [7]:
X = data.iloc[:,0:20]
y = data.iloc[:,-1] 

### Sanity check

In [8]:
X.shape, y.shape

((2000, 20), (2000,))

### How many features

In [9]:
X.shape[1]

20

In [10]:
y

0       1
1       2
2       2
3       2
4       1
       ..
1995    0
1996    2
1997    3
1998    0
1999    3
Name: price_range, Length: 2000, dtype: int64

## Embedded Feature Selection - Logistic Regression Estimator with Lasso Regularization (L1)

### Import the SelectFromModel

In [11]:
from sklearn.feature_selection import SelectFromModel

### Import the Logistic Regression model

In [12]:
from sklearn.linear_model import LogisticRegression

### Build a Logistic Regression model

In [13]:
logreg = LogisticRegression(penalty='l1', solver='liblinear')

### Build Embedded model with Logistic Regression as Learning Algorithm / Estimator

In [14]:
embedded_lr_selector = SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', max_iter=50000), max_features=5)

### Train the RFE model

In [15]:
embedded_lr_selector = embedded_lr_selector.fit(X, y)

### Get Support from the model

In [16]:
embedded_lr_support = embedded_lr_selector.get_support()
embedded_lr_support

array([False, False, False, False, False,  True, False,  True, False,
       False, False, False, False, False, False, False, False,  True,
        True,  True])

### Best features from the model

In [17]:
embedded_lr_feature = X.loc[:, embedded_lr_support].columns.tolist()
embedded_lr_feature

['four_g', 'm_dep', 'three_g', 'touch_screen', 'wifi']

## Embedded Feature Selection - Random Forest Classifier

### Import RandomForestClassifier

In [18]:
from sklearn.ensemble import RandomForestClassifier

### Build the Random Forest model

In [19]:
rf = RandomForestClassifier(n_estimators=100)

### Build the Embedded model with Random Forest as the Learning Algorithm

In [20]:
embedded_rf_selector = SelectFromModel(rf, 
                           max_features=5
)

### Train the model

In [21]:
embedded_rf_selector = embedded_rf_selector.fit(X, y)

### Get support

In [22]:
embedded_rf_support = embedded_rf_selector.get_support()
embedded_rf_support

array([ True, False, False, False, False, False, False, False, False,
       False, False,  True,  True,  True, False, False, False, False,
       False, False])

### Best Features

In [23]:
embedded_rf_feature = X.loc[:, embedded_rf_support].columns.tolist()
embedded_rf_feature

['battery_power', 'px_height', 'px_width', 'ram']

## Embedded Feature Selection - LightGBM

In [24]:
from lightgbm import LGBMClassifier

### Choose LightGBM as your learning algorithm

In [25]:
lgbmc = LGBMClassifier(n_estimators=500,
                      learning_rate=0.05,
                      num_leaves=32,
                      colsample_bytree=0.2,
                      reg_alpha=3,
                      reg_lambda=1,
                      min_split_gain=0.01,
                      min_child_weight=40
)

### Build a Embedded model with LGBM as learning algorithm

In [26]:
embedded_lgbm_selector = SelectFromModel(lgbmc,
                                         max_features=5
)

### Train the model

In [27]:
embedded_lgbm_selector = embedded_lgbm_selector.fit(X, y)

### Get the support

In [28]:
embedded_lgbm_support = embedded_lgbm_selector.get_support()
embedded_lgbm_support

array([ True, False, False, False, False, False,  True, False, False,
       False, False,  True,  True,  True, False, False, False, False,
       False, False])

### Get the feature names

In [29]:
embedded_lgbm_feature = X.loc[:, embedded_lgbm_support].columns.tolist()
embedded_lgbm_feature

['battery_power', 'int_memory', 'px_height', 'px_width', 'ram']

### Try some other learning algorithms you know of
- Anything of your choice

### Criteria:
- Estimator must have either a **feature_importances_** or **coef_** attribute after fitting.

### Summarize the list of features chosen by different algorithms
- Algorithm | Best Features 

Example: 

- Logistic Regression | ['battery_power', 'ram']  
- KNN | ???? 
- RF | ???? 
- Algorithms of your choice | ????? 