# Feature Selection - Dropping Constant Features.
In this step we remove features whose value is constant across all rows. Such features carry no information, so they are not useful for solving the problem statement.

In [1]:
import pandas as pd

## Build the toy DataFrame: A and B vary from row to row, C and D hold one constant value
df = pd.DataFrame(
    data=dict(
        A=list(range(1, 7)),
        B=[2, 5, 6, 7, 4, 3],
        C=[0] * 6,
        D=[1] * 6,
    )
)

In [2]:
# Display the toy frame; columns C and D contain a single repeated value
df

Unnamed: 0,A,B,C,D
0,1,2,0,1
1,2,5,0,1
2,3,6,0,1
3,4,7,0,1
4,5,4,0,1
5,6,3,0,1


### Variance Threshold
Feature Selector that removes all low-variance features.
This feature selection algorithm looks only at the features (x), not the desired output (y), and can thus be used for unsupervised learning.

It is a sklearn class **which removes all low-variance features; we provide the threshold value.**

In [3]:
from sklearn.feature_selection import VarianceThreshold
# With threshold=0 the selector discards the columns whose variance is zero,
# i.e. the columns that hold the same value in every row.
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(df)

# Apply the fitted selector; the result is a NumPy array of the surviving columns
df2 = var_thres.transform(df)
df2

array([[1, 2],
       [2, 5],
       [3, 6],
       [4, 7],
       [5, 4],
       [6, 3]], dtype=int64)

In [4]:
# Boolean mask over the input columns: True = column kept, False = column removed
var_thres.get_support()

array([ True,  True, False, False])

Here, `True` means the column has non-zero variance (A and B), whereas `False` means the column has zero variance (C and D).

In [5]:
df.columns[var_thres.get_support()] # the columns the selector keeps, i.e. the ones that must NOT be dropped

Index(['A', 'B'], dtype='object')

In [6]:
# get_support() returns a boolean mask in which True marks a kept column.
# Inverting the mask once selects the rejected (constant) columns directly, rather than
# rebuilding the kept-column index for every column inside a list comprehension.
constant_columns = df.columns[~var_thres.get_support()].tolist()
constant_columns

['C', 'D']

In [7]:
# Drop the constant columns.
# `columns=` already names the axis, so the extra `axis=1` was redundant; rebinding `df`
# to the returned frame avoids `inplace=True`, which mutates an already-displayed object.
df = df.drop(columns=constant_columns)

In [8]:
# Display the frame after the constant columns were dropped
df

Unnamed: 0,A,B
0,1,2
1,2,5
2,3,6
3,4,7
4,5,4
5,6,3


# Let's try with a bigger dataset

In [9]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

In [10]:
# Load the red-wine quality data; the file uses ';' as its field separator.
# NOTE(review): this is an absolute, machine-specific path — point it at your local copy of the CSV.
df = pd.read_csv(
    filepath_or_buffer="E:/Datasets/winequality-red.csv",
    sep=";",
)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [11]:
# Number of columns in the loaded frame (second entry of the shape tuple)
df.shape[1]

12

In [12]:
## Separate the target column (y) from the feature columns (x)
y = df["quality"]
x = df.drop(columns=["quality"])

In [13]:
## Train/test split: hold out 30% of the rows, with a fixed seed so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [14]:
# (rows, columns) of the training features
x_train.shape

(1119, 11)

In [15]:
# (rows, columns) of the held-out test features
x_test.shape

(480, 11)

## Applying Variance Threshold

In [16]:
## Fit a selector that rejects columns whose variance is not above 0.5.
## It is fitted on x_train only, so the test split has no influence on which columns are kept.
var_thres = VarianceThreshold(threshold=0.5)
var_thres.fit(x_train)

In [17]:
# get_support() returns a boolean mask in which True marks a kept column.
# Inverting it once yields the rejected (low-variance) columns directly, instead of
# recomputing the kept-column index for every column inside a list comprehension.
column_with_variance = x_train.columns[~var_thres.get_support()].tolist()

print("THESE MANY COLUMNS ARE NEEDED TO BE DROPPED!!! = ",len(column_with_variance))

THESE MANY COLUMNS ARE NEEDED TO BE DROPPED!!! =  6


In [18]:
# List the rejected column names, one per line (prints nothing when the list is empty)
if column_with_variance:
    print(*column_with_variance, sep="\n")

volatile acidity
citric acid
chlorides
density
pH
sulphates
