# Feature Selection using Fisher Score filter method
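Dataset: scikit-learn's built-in diabetes dataset, loaded below with `sklearn.datasets.load_diabetes()`. (The mobile price data at [https://raw.githubusercontent.com/subashgandyer/datasets/main/mobile_price_train.csv] is not used in this notebook.)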

In [1]:
import pandas as pd

### Understand the data
- How many features are there?
- How many samples are there?
- What is the data type of each feature column?
- Which feature(s) do you expect to be the most important?
- Run some feature selection methods.
- Was your intuition right?

### Import the libraries

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Load the diabetes data

In [3]:
from sklearn.datasets import load_diabetes
# load_diabetes() returns a dict-like bundle; the cells below read its
# `data`, `target` and `feature_names` entries.
db = load_diabetes()

### Split the dataset into X and y

In [4]:
# Take the feature matrix and the target vector straight from the bundle.
# No hold-out split is made here: the "train" arrays are the full dataset.
X_train, y_train = db.data, db.target

### Sanity check

In [5]:
# Sample counts (first entry of each shape) must match between X and y.
X_train.shape, y_train.shape

((442, 10), (442,))

### How many features

In [6]:
# Column count of the feature matrix, i.e. the number of features.
X_train.shape[1]

10

In [7]:
# List what the loaded bundle exposes before picking fields from it.
db.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])

In [8]:
# Shape of the target vector, read via key access on the bundle.
db["target"].shape

(442,)

In [9]:
# Names of the feature columns, in the same order as the columns of the data.
db["feature_names"]

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [10]:
# Shape of the feature matrix: (samples, features).
db["data"].shape

(442, 10)

### Import the Fisher score function

In [11]:
from skfeature.function.similarity_based import fisher_score

### Calculating Scores

In [12]:
# Compute one Fisher score per feature column of X_train against y_train.
# NOTE: despite its name, `ranks` holds raw scores, not rank positions --
# the displayed result is an array of floats, one per feature, and the
# later cells treat larger values as better.
# NOTE(review): Fisher score is a class-label criterion, while the diabetes
# target appears to be a continuous measurement, so every distinct target
# value would presumably be treated as its own class. Confirm this is
# intended, or bin the target / use a regression-oriented filter instead.
ranks = fisher_score.fisher_score(X_train, y_train)
ranks

array([1.01656037, 1.00650066, 1.89801535, 1.45142089, 0.86089456,
       0.83205219, 1.51317925, 1.43564244, 1.9187962 , 1.11003516])

### Create dataframes from Scores and Features

In [13]:
# Wrap the feature names and their scores in single-column frames so they
# can be placed side by side in the next step.
dfcolumns = pd.DataFrame(data=db.feature_names)
dfscores = pd.DataFrame(data=ranks)

### Concatenate two dataframes together

In [14]:
# Place names (left) and scores (right) side by side; rows align on the
# default integer index shared by both frames.
featureScores = pd.concat(objs=[dfcolumns, dfscores], axis="columns")
featureScores

Unnamed: 0,0,0.1
0,age,1.01656
1,sex,1.006501
2,bmi,1.898015
3,bp,1.451421
4,s1,0.860895
5,s2,0.832052
6,s3,1.513179
7,s4,1.435642
8,s5,1.918796
9,s6,1.110035


### Name the columns `Specs` and `Score` in the above dataframe

In [15]:
# Labels are assigned positionally: the first column holds the feature
# names and the second the scores. This relabels the existing frame in place.
featureScores.columns = ['Specs', 'Score']

In [16]:
# Re-display the frame to confirm the new column labels.
featureScores

Unnamed: 0,Specs,Score
0,age,1.01656
1,sex,1.006501
2,bmi,1.898015
3,bp,1.451421
4,s1,0.860895
5,s2,0.832052
6,s3,1.513179
7,s4,1.435642
8,s5,1.918796
9,s6,1.110035


### Which are the best features?

In [17]:
# Show every feature ordered from highest to lowest score. The row count is
# taken from the frame itself rather than a hard-coded 10, so the listing
# stays complete if the number of features changes.
print(featureScores.nlargest(len(featureScores), 'Score'))

  Specs     Score
8    s5  1.918796
2   bmi  1.898015
6    s3  1.513179
3    bp  1.451421
7    s4  1.435642
9    s6  1.110035
0   age  1.016560
1   sex  1.006501
4    s1  0.860895
5    s2  0.832052
