# Feature Selection using Filtering Method

## Univariate (each feature is evaluated on its own, without considering other features) (using Variance, Fisher Score, Mutual Information Gain, etc.)

## Using Variance

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the Santander training CSV (the path is relative to the working directory).
data = pd.read_csv('santander-train.csv')
# Peek at the first rows (head() defaults to 5) to confirm the load worked.
data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score

In [4]:
# Separate the label column from the feature matrix.
# NOTE(review): the data preview shows an `ID` column, which stays in X here --
# confirm whether a row identifier should really be used as a feature.
y = data['TARGET']
X = data.drop(columns='TARGET')

In [5]:
# Hold out the default 25% of the rows for testing. The fixed seed keeps the
# split -- and therefore every count and score reported below -- reproducible
# after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### Constant Feature Removal

In [6]:
# A threshold of 0.0 removes only the columns whose variance on the training
# rows is zero, i.e. columns holding a single value.
# fit() returns the estimator itself, so construction and fitting can be chained.
constant_filter = VarianceThreshold(threshold=0.0).fit(X_train)
constant_filter

VarianceThreshold(threshold=0.0)

In [7]:
# Number of feature columns before any filtering.
print(len(X.columns))

370


In [8]:
### Number of retained features
# get_support() is a boolean mask over the input columns; summing it counts
# the columns that survive the filter.
constant_mask = constant_filter.get_support()
constant_mask.sum()

336

In [9]:
### Filtered train and test set
# Apply the filter fitted on the training rows to both splits so that they
# keep exactly the same columns.
X_train_filter, X_test_filter = (
    constant_filter.transform(split) for split in (X_train, X_test)
)

In [10]:
# Feature count of the training matrix after the constant filter (size of axis 1).
X_train_filter.shape[1]

336

In [11]:
# Column count of the test split before and after the constant filter.
for split in (X_test, X_test_filter):
    print(split.shape[1])

370
336


### Quasi-Constant Features Removal

In [12]:
# Drop features whose training variance does not exceed 0.01. For a 0/1
# feature that corresponds to roughly 99% of the rows sharing one value; for
# other features the effect of the cutoff depends on the column's scale.
# fit() returns the estimator itself, so construction and fitting can be chained.
quasi_constant_filter = VarianceThreshold(threshold=0.01).fit(X_train_filter)
quasi_constant_filter

VarianceThreshold(threshold=0.01)

In [13]:
### Number of retained features
# Sum the boolean support mask to count the columns kept by the quasi-constant filter.
quasi_constant_mask = quasi_constant_filter.get_support()
quasi_constant_mask.sum()

272

In [14]:
### Filtered train and test set
# Both splits go through the filter fitted on the training rows, so the
# surviving columns are identical in each.
X_train_quasi_filter, X_test_quasi_filter = (
    quasi_constant_filter.transform(split)
    for split in (X_train_filter, X_test_filter)
)

In [15]:
# Column count of each split after the quasi-constant filter.
for split in (X_train_quasi_filter, X_test_quasi_filter):
    print(split.shape[1])

272
272


### Duplicate Features Removal (Removal of features having exactly same values)

In [16]:
### pandas can flag duplicated rows (DataFrame.duplicated), but it has no direct method for duplicated columns.
### So we transpose the data, which turns the columns into rows, and look for the duplicates among those rows.

In [17]:
# Swap the axes so that every feature becomes a row.
X_train_T = X_train_quasi_filter.transpose()
X_test_T = X_test_quasi_filter.transpose()

In [18]:
# After the transpose the shape reads (features, samples).
for transposed in (X_train_T, X_test_T):
    print(transposed.shape)

(272, 57015)
(272, 19005)


In [19]:
# Inspect the container type of the transposed training matrix.
type(X_train_T)

numpy.ndarray

In [20]:
### We need to convert these NumPy arrays to pandas DataFrames

In [21]:
# Wrap both transposed matrices in DataFrames so that duplicated() can be used on them.
X_train_T, X_test_T = pd.DataFrame(X_train_T), pd.DataFrame(X_test_T)

In [22]:
### Number of duplicate features
# Each row is one feature. keep='first' (the default) leaves the first
# occurrence unmarked, so only the redundant copies are counted.
X_train_T.duplicated(keep='first').sum()

17

In [23]:
### Removing these duplicated features

In [24]:
### Boolean flag per feature: True when the row repeats an earlier row
# keep='first' is the default, spelled out here: the first occurrence of a
# repeated row stays False.
duplicated_features = X_train_T.duplicated(keep='first')

In [25]:
### Selecting non-duplicated rows
# Negate the whole mask in one step and hand it back as a plain list of bools.
features_to_keep = (~duplicated_features).tolist()

In [26]:
# Keep the non-duplicated feature rows, then transpose back to (samples, features).
# The mask computed on the training data is reused for the test data so both
# splits end up with the same columns.
X_train_unique = X_train_T.loc[features_to_keep].T
X_test_unique = X_test_T.loc[features_to_keep].T

In [27]:
# (samples, features) of each split after dropping the duplicated columns.
for split in (X_train_unique, X_test_unique):
    print(split.shape)

(57015, 255)
(19005, 255)


In [28]:
### Building the Model and Performance Checking

In [29]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=4)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9607997895290713
Wall time: 4.37 s


In [30]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=4)
model.fit(X_train_unique, y_train)
y_pred = model.predict(X_test_unique)
print(accuracy_score(y_test, y_pred))

0.9607997895290713
Wall time: 3.54 s


## Multivariate (features are selected by considering their relationship to other features) (using Pearson's Correlation)

## Using Correlations 

In [31]:
# Pairwise Pearson correlation between the remaining feature columns of the
# training split; the result is shown as the cell output.
corrmat = X_train_unique.corr(method='pearson')
corrmat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,262,263,264,265,266,267,268,269,270,271
0,1.000000,0.001194,-0.001853,-0.003938,0.000109,-0.003770,-0.009275,-0.010554,-0.007310,-0.007864,...,-0.008809,-0.000128,0.001181,-0.002806,-0.002573,-0.003620,-0.006234,-0.001831,-0.000059,-0.008112
1,0.001194,1.000000,-0.001827,0.001988,0.007314,0.007741,0.001417,0.001574,0.000544,0.000618,...,0.000252,0.000687,0.000464,0.000585,0.000618,0.000582,0.000474,0.000725,0.000765,-0.000041
2,-0.001853,-0.001827,1.000000,0.040057,0.095506,0.101138,0.041632,0.048417,0.009907,0.010963,...,0.013472,0.028406,0.016710,0.027005,0.027922,0.030106,0.023319,0.035177,0.035580,0.007449
3,-0.003938,0.001988,0.040057,1.000000,0.038944,0.033910,0.011242,0.010576,0.000743,0.002513,...,0.008937,-0.000840,-0.000624,-0.000586,-0.000651,0.003535,-0.000551,0.007161,0.009017,0.000423
4,0.000109,0.007314,0.095506,0.038944,1.000000,0.888244,0.350951,0.309539,0.032660,0.057051,...,0.001278,0.018452,0.015337,0.014867,0.016131,0.011756,0.009377,0.015979,0.014475,0.009990
5,-0.003770,0.007741,0.101138,0.033910,0.888244,1.000000,0.316705,0.359242,0.029647,0.049465,...,0.000221,0.031368,0.028060,0.021555,0.023983,0.010169,0.007747,0.014444,0.012988,0.011707
6,-0.009275,0.001417,0.041632,0.011242,0.350951,0.316705,1.000000,0.858154,0.058951,0.120411,...,-0.000253,0.004228,0.008442,0.002438,0.003520,-0.000391,-0.000455,-0.000140,-0.000289,0.002612
7,-0.010554,0.001574,0.048417,0.010576,0.309539,0.359242,0.858154,1.000000,0.053506,0.098173,...,-0.000278,0.008017,0.014056,0.004587,0.006548,-0.000517,-0.000499,-0.000427,-0.000540,0.005096
8,-0.007310,0.000544,0.009907,0.000743,0.032660,0.029647,0.058951,0.053506,1.000000,0.950170,...,-0.000098,-0.000253,-0.000172,-0.000217,-0.000229,-0.000225,-0.000176,-0.000282,-0.000297,-0.000817
9,-0.007864,0.000618,0.010963,0.002513,0.057051,0.049465,0.120411,0.098173,0.950170,1.000000,...,-0.000111,-0.000286,-0.000195,-0.000246,-0.000260,-0.000255,-0.000200,-0.000319,-0.000337,0.000350


In [32]:
# Confirm the container type of the correlation matrix.
print(type(corrmat))

<class 'pandas.core.frame.DataFrame'>


In [33]:
# (rows, columns) of the correlation matrix.
corrmat.shape

(255, 255)

In [34]:
## Filtering out features whose absolute correlation with another feature is more than 0.85

In [35]:
# Flag one member of every feature pair whose absolute Pearson correlation
# exceeds the cutoff. For a pair (i, j) with j < i the earlier column j is the
# one flagged, so within a set of mutually correlated columns only the last
# one survives.
CORR_THRESHOLD = 0.85

# Vectorised replacement for a nested .iloc loop: compare the whole matrix at
# once and look strictly below the diagonal, so each pair is visited once and
# the self-correlations on the diagonal are ignored.
abs_corr = corrmat.abs().values
below_diagonal = np.tril(np.ones(abs_corr.shape, dtype=bool), k=-1)
is_high = (abs_corr > CORR_THRESHOLD) & below_diagonal

# A column is flagged when any later feature (any row below the diagonal) is
# highly correlated with it.
correlated_features = corrmat.columns[is_high.any(axis=0)].tolist()

# Every label appears once here, in column order.
list_of_correlated_features = list(correlated_features)
print(list_of_correlated_features)
print(len(list_of_correlated_features))

[4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 16, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 39, 41, 42, 43, 45, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59, 60, 65, 66, 67, 69, 70, 71, 75, 76, 78, 79, 80, 81, 82, 84, 85, 87, 88, 90, 94, 97, 104, 106, 108, 109, 110, 111, 113, 114, 116, 117, 119, 120, 124, 126, 127, 130, 131, 133, 134, 135, 136, 137, 138, 141, 144, 145, 146, 163, 170, 185, 186, 187, 197, 202, 204, 211, 212, 213, 215, 217, 219, 220, 221, 228, 230, 238, 240, 242, 244, 246, 248, 250, 252, 255, 256, 257, 261, 263, 265, 269]
127


In [36]:
# Remove the flagged columns from both splits using the same label list, then
# report the resulting (samples, features) shapes.
X_train_uncorr = X_train_unique.drop(columns=list_of_correlated_features)
X_test_uncorr = X_test_unique.drop(columns=list_of_correlated_features)
for split in (X_train_uncorr, X_test_uncorr):
    print(split.shape)

(57015, 128)
(19005, 128)


In [37]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=4)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9607997895290713
Wall time: 3.86 s


In [38]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=4)
model.fit(X_train_uncorr, y_train)
y_pred = model.predict(X_test_uncorr)
print(accuracy_score(y_test, y_pred))

0.9607997895290713
Wall time: 3.04 s
