### Feature Selection - Dropping Constant Features 

### How to use Variance Threshold Function 

In [1]:

# In this step we will be removing the features which have constant features 
# which are actually not important for solving the problem statement.

import pandas as pd 

data = pd.DataFrame({"A":[1,2,4,1,2,4],
                    "B":[4,5,6,7,8,9],
                    "C":[0,0,0,0,0,0],
                    "D":[1,1,1,1,1,1]
                    })


In [2]:
data .head()

Unnamed: 0,A,B,C,D
0,1,4,0,1
1,2,5,0,1
2,4,6,0,1
3,1,7,0,1
4,2,8,0,1


In [4]:

# Feature selector that removes all low-variance features.

from sklearn.feature_selection import VarianceThreshold
var_thres=VarianceThreshold(threshold=0)
var_thres.fit(data)
var_thres

VarianceThreshold(threshold=0)

In [6]:
var_thres.get_support()

array([ True,  True, False, False])

In [9]:
constant_col= [columns for columns in data.columns 
               if columns not in data.columns[var_thres.get_support()]]
constant_col


['C', 'D']

In [11]:
data.drop(constant_col,axis=1,inplace=True)
data

Unnamed: 0,A,B
0,1,4
1,2,5
2,4,6
3,1,7
4,2,8
5,4,9


### Example - 1 using Variance Threshold 

In [19]:
df=pd.read_csv("D:\\Sandesh\\Data Science\\Feature Selection\\train.csv")
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [22]:
X=df.drop(columns=['TARGET'],axis=1)
X.head()
y=df[['TARGET']]

In [24]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=1)


In [25]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((60816, 370), (15204, 370), (60816, 1), (15204, 1))

### Lets Apply Variance Threshold 

In [37]:
from sklearn.feature_selection import VarianceThreshold
def_threshold=VarianceThreshold(threshold=0.5)
def_threshold.fit(df)
def_threshold

VarianceThreshold(threshold=0.5)

In [38]:
df.columns[def_threshold.get_support()]

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var29_ult3', 'saldo_medio_var33_hace2',
       'saldo_medio_var33_hace3', 'saldo_medio_var33_ult1',
       'saldo_medio_var33_ult3', 'saldo_medio_var44_hace2',
       'saldo_medio_var44_hace3', 'saldo_medio_var44_ult1',
       'saldo_medio_var44_ult3', 'var38'],
      dtype='object', length=191)

In [39]:
constant_col_df= [columns for columns in df.columns 
                if columns not in df.columns[def_threshold.get_support()]]
constant_col_df

['ind_var1_0',
 'ind_var1',
 'ind_var2_0',
 'ind_var2',
 'ind_var5_0',
 'ind_var5',
 'ind_var6_0',
 'ind_var6',
 'ind_var8_0',
 'ind_var8',
 'ind_var12_0',
 'ind_var12',
 'ind_var13_0',
 'ind_var13_corto_0',
 'ind_var13_corto',
 'ind_var13_largo_0',
 'ind_var13_largo',
 'ind_var13_medio_0',
 'ind_var13_medio',
 'ind_var13',
 'ind_var14_0',
 'ind_var14',
 'ind_var17_0',
 'ind_var17',
 'ind_var18_0',
 'ind_var18',
 'ind_var19',
 'ind_var20_0',
 'ind_var20',
 'ind_var24_0',
 'ind_var24',
 'ind_var25_cte',
 'ind_var26_0',
 'ind_var26_cte',
 'ind_var26',
 'ind_var25_0',
 'ind_var25',
 'ind_var27_0',
 'ind_var28_0',
 'ind_var28',
 'ind_var27',
 'ind_var29_0',
 'ind_var29',
 'ind_var30_0',
 'ind_var30',
 'ind_var31_0',
 'ind_var31',
 'ind_var32_cte',
 'ind_var32_0',
 'ind_var32',
 'ind_var33_0',
 'ind_var33',
 'ind_var34_0',
 'ind_var34',
 'ind_var37_cte',
 'ind_var37_0',
 'ind_var37',
 'ind_var39_0',
 'ind_var40_0',
 'ind_var40',
 'ind_var41_0',
 'ind_var41',
 'ind_var39',
 'ind_var44_0',
 '

In [40]:
len(constant_col_df)

180

In [45]:
df.drop(constant_col_df,axis=1,inplace=True)
df.shape


(76020, 191)

### THE END 