# Feature Selection from Mobile data using Pearson Correlation filter method
Dataset: [https://raw.githubusercontent.com/subashgandyer/datasets/main/mobile_price_train.csv]

In [1]:
import pandas as pd

In [2]:
url = "https://raw.githubusercontent.com/subashgandyer/datasets/main/mobile_price_train.csv"

In [3]:
df = pd.read_csv(url)
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

### Understand the data
- Find how many features?
- Find how many samples?
- What are the data types of each feature column?
- What do you think could be the most important feature(s)?
- Run some feature selection methods
- Is your intuition right?

### Import the necessary libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

### Read the mobile data

In [6]:
data = pd.read_csv("data/mobile_price_train.csv")

### Split the dataset into X and y

In [7]:
X = data.iloc[:,0:20]
y = data.iloc[:,-1] 

### Sanity check

In [8]:
X.shape, y.shape

((2000, 20), (2000,))

### How many features

In [9]:
X.shape[1]

20

### Make a feature list

In [10]:
feature_name = X.columns.tolist()

### Use corrcoef from Numpy to calculate correlation between two features (battery power and y)

In [11]:
cor = np.corrcoef(X['battery_power'], y)[0, 1]
cor

0.20072261211373102

### Use corrcoef from Numpy to calculate correlation between two features (ram and y)

In [12]:
cor = np.corrcoef(X['ram'], y)[0, 1]
cor

0.9170457362649905

### Which feature comparison has the highest correlation?

### Calculate the correlation with y for each feature and collect all correlation values in a list

In [13]:
cor_list = []
for i in X.columns.tolist():
    cor = np.corrcoef(X[i], y)[0, 1]
    cor_list.append(cor)

### Look at correlation values

In [14]:
cor_list

[0.20072261211373102,
 0.020572854061418504,
 -0.006605690881732071,
 0.017444479237224732,
 0.02199820777690427,
 0.0147717114172394,
 0.04443495938898743,
 0.0008530365050864312,
 -0.030302171314386415,
 0.004399274799457285,
 0.03359930021353949,
 0.1488575550004218,
 0.16581750172625515,
 0.9170457362649905,
 0.022986073167424414,
 0.038711271664484154,
 0.02185887116237479,
 0.023611216880045017,
 -0.030411071898218043,
 0.018784812012789004]

### Replace NaN with 0

In [15]:
cor_list = [0 if np.isnan(i) else i for i in cor_list]
cor_list

[0.20072261211373102,
 0.020572854061418504,
 -0.006605690881732071,
 0.017444479237224732,
 0.02199820777690427,
 0.0147717114172394,
 0.04443495938898743,
 0.0008530365050864312,
 -0.030302171314386415,
 0.004399274799457285,
 0.03359930021353949,
 0.1488575550004218,
 0.16581750172625515,
 0.9170457362649905,
 0.022986073167424414,
 0.038711271664484154,
 0.02185887116237479,
 0.023611216880045017,
 -0.030411071898218043,
 0.018784812012789004]

### Choose the feature

In [16]:
cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-10:]].columns.tolist()
cor_feature

['three_g',
 'mobile_wt',
 'touch_screen',
 'pc',
 'sc_w',
 'int_memory',
 'px_height',
 'px_width',
 'battery_power',
 'ram']

### Feature Selection or not

In [17]:
# feature selection? 0 for not select, 1 for select
cor_support = [True if i in cor_feature else False for i in feature_name]
cor_support

[True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 False]

### Can you turn the above notebook cells logic into a function like below ?

In [18]:
def cor_selector(X, y,num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

### Call the above function using the following line

In [19]:
cor_support, cor_feature = cor_selector(X, y,num_feats=10)
cor_support, cor_feature

([True,
  False,
  False,
  False,
  False,
  False,
  True,
  False,
  True,
  False,
  True,
  True,
  True,
  True,
  False,
  True,
  False,
  True,
  True,
  False],
 ['three_g',
  'mobile_wt',
  'touch_screen',
  'pc',
  'sc_w',
  'int_memory',
  'px_height',
  'px_width',
  'battery_power',
  'ram'])

### Create dataframes from Scores (cor_list) and Features

In [20]:
dfscores = pd.DataFrame(cor_list)
dfcolumns = pd.DataFrame(X.columns)

### Concatenate two dataframes together

In [21]:
featureScores = pd.concat([dfcolumns,dfscores], axis=1)
featureScores

Unnamed: 0,0,0.1
0,battery_power,0.200723
1,blue,0.020573
2,clock_speed,-0.006606
3,dual_sim,0.017444
4,fc,0.021998
5,four_g,0.014772
6,int_memory,0.044435
7,m_dep,0.000853
8,mobile_wt,-0.030302
9,n_cores,0.004399


### Add column names as Specs and Scores for the above dataframe

In [22]:
featureScores.columns = ['Specs', 'Score']

In [23]:
featureScores

Unnamed: 0,Specs,Score
0,battery_power,0.200723
1,blue,0.020573
2,clock_speed,-0.006606
3,dual_sim,0.017444
4,fc,0.021998
5,four_g,0.014772
6,int_memory,0.044435
7,m_dep,0.000853
8,mobile_wt,-0.030302
9,n_cores,0.004399


### Which are the best features?

In [24]:
print(featureScores.nlargest(10,'Score')) 

            Specs     Score
13            ram  0.917046
0   battery_power  0.200723
12       px_width  0.165818
11      px_height  0.148858
6      int_memory  0.044435
15           sc_w  0.038711
10             pc  0.033599
17        three_g  0.023611
14           sc_h  0.022986
4              fc  0.021998
