# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# Loading the Data

In [2]:
# Read the training CSV into a DataFrame, then show its (rows, columns) shape.
data = pd.read_csv('train.csv')
data.shape

(76020, 371)

### Checking for null Values

In [3]:
# Names of the columns that contain at least one missing value.
data.columns[data.isnull().any()].tolist()

[]

In [4]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In all feature selection problems it is good practice to select features by examining only the training set.
This avoids overfitting, because no information from the test set leaks into the selection.

### Train Test Split

In [5]:
# Hold out 30% of the rows as a test set; TARGET is the label, every other column is a feature.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['TARGET']),
    data['TARGET'],
    test_size=0.3,
    random_state=0,
)

In [6]:
# Shapes of the training features and training labels.
X_train.shape, y_train.shape

((53214, 370), (53214,))

In [7]:
# Shapes of the test features and test labels.
X_test.shape, y_test.shape

((22806, 370), (22806,))

# Removing Constant Features 

### Using Variance threshold function

The `VarianceThreshold` class from sklearn is a baseline approach to feature selection. It removes all features whose variance doesn't meet a threshold. By default it removes features with zero variance, i.e. constant features.

In [8]:
# threshold=0.0 is the default: only zero-variance (constant) features are flagged.
sel = VarianceThreshold(threshold=0.0)
# Learn the per-feature variances from the training set only.
sel.fit(X_train)

VarianceThreshold(threshold=0.0)

In [9]:
# get_support() returns a boolean mask marking the retained features;
# summing the mask counts them.
sel.get_support().sum()

332

There are 332 variables which are non constant

In [10]:
# Number of constant features: the columns the selector did NOT retain.
# Inverting the support mask selects them directly, instead of rebuilding the
# retained-column index once per column inside a comprehension.
len(X_train.columns[~sel.get_support()])

38

In [11]:
# List the constant features: the columns the selector did NOT retain.
# Inverting the support mask selects them directly, instead of rebuilding the
# retained-column index once per column inside a comprehension.
X_train.columns[~sel.get_support()].tolist()

['ind_var2_0',
 'ind_var2',
 'ind_var27_0',
 'ind_var28_0',
 'ind_var28',
 'ind_var27',
 'ind_var41',
 'ind_var46_0',
 'ind_var46',
 'num_var27_0',
 'num_var28_0',
 'num_var28',
 'num_var27',
 'num_var41',
 'num_var46_0',
 'num_var46',
 'saldo_var28',
 'saldo_var27',
 'saldo_var41',
 'saldo_var46',
 'delta_imp_reemb_var33_1y3',
 'delta_num_reemb_var33_1y3',
 'imp_amort_var18_hace3',
 'imp_amort_var34_hace3',
 'imp_reemb_var13_hace3',
 'imp_reemb_var33_hace3',
 'imp_reemb_var33_ult1',
 'imp_trasp_var17_out_hace3',
 'imp_trasp_var33_out_hace3',
 'num_var2_0_ult1',
 'num_var2_ult1',
 'num_reemb_var13_hace3',
 'num_reemb_var33_hace3',
 'num_reemb_var33_ult1',
 'num_trasp_var17_out_hace3',
 'num_trasp_var33_out_hace3',
 'saldo_var2_ult1',
 'saldo_medio_var13_medio_hace3']

### Transforming the data

In [12]:
# Keep only the features retained by the fitted selector, in both splits.
# NOTE(review): transform presumably returns plain arrays here, so column labels
# are no longer available on X_train / X_test afterwards — confirm.
X_train, X_test = sel.transform(X_train), sel.transform(X_test)

  if np.issubdtype(mask.dtype, np.int):


In [13]:
# Shapes of both splits after the constant features were removed.
X_train.shape, X_test.shape

((53214, 332), (22806, 332))

##### The above code works for numerical features. For categorical features, one way is to do label encoding first, but that preprocessing takes time. Another way is shown below.


In [14]:
# Rebuild the original DataFrame splits (same 30% hold-out, same seed), since
# X_train / X_test were overwritten by the selector output above.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['TARGET']),
    data['TARGET'],
    test_size=0.3,
    random_state=0,
)

In [15]:
# Cast every training column to object dtype so the columns are handled as if categorical.
X_train = X_train.astype(object)

In [16]:
# Columns holding a single distinct value in the training set, i.e. constant features.
# dropna=False counts NaN as a value, matching len(Series.unique()).
unique_counts = X_train.nunique(dropna=False)
categorical_features = unique_counts.index[unique_counts == 1].tolist()

In [17]:
# Remove the constant columns found on the training set from both splits.
# Reassigning the result (rather than inplace=True) avoids mutating frames that
# came out of train_test_split and keeps the cell free of in-place side effects.
X_train = X_train.drop(columns=categorical_features)
X_test = X_test.drop(columns=categorical_features)

In [18]:
# Shapes of both splits after dropping the single-valued columns.
X_train.shape, X_test.shape

((53214, 332), (22806, 332))

# Removing Quasi Constant Features

In [19]:
# Features whose variance is at or below this value are treated as quasi-constant.
quasi_constant_threshold = 0.01
sel = VarianceThreshold(threshold=quasi_constant_threshold)
# Learn the per-feature variances from the training set only.
sel.fit(X_train)

VarianceThreshold(threshold=0.01)

In [20]:
# Number of features retained by the quasi-constant selector.
sel.get_support().sum()

268

268 features are non Quasi constant

In [21]:
# Number of quasi-constant features: the columns the selector did NOT retain.
# Inverting the support mask selects them directly, instead of rebuilding the
# retained-column index once per column inside a comprehension.
len(X_train.columns[~sel.get_support()])

64

64 features are Quasi constant

##### Without package

In [39]:
# Start with an empty list that will collect the quasi-constant feature names.
Quasi_constant_features = list()

In [40]:
# Number of observations in the 'var15' training column.
X_train['var15'].size

53214

In [41]:
# Share of training rows taken by the most frequent value of 'var15'.
# Plain true division replaces the `np.float` alias, which has been removed from
# NumPy; the largest count divided by the row count is the same number the
# original divide-sort-take-first chain produced.
X_train['var15'].value_counts().max() / len(X_train['var15'])

0.2657195474875033

In [42]:
# A feature is quasi-constant when its most frequent value covers more than this
# share of the training rows.
QUASI_CONSTANT_SHARE = 0.998
n_rows = len(X_train)

# Rebuild the list from scratch so re-running this cell never duplicates entries.
# Plain true division replaces the `np.float` alias, which has been removed from NumPy.
Quasi_constant_features = [
    feature
    for feature in X_train.columns
    if X_train[feature].value_counts().max() / n_rows > QUASI_CONSTANT_SHARE
]
                 

In [43]:
# Number of features flagged as quasi-constant by the frequency-based check.
len(Quasi_constant_features)

139

This method is more aggressive compared to VarianceThreshold.

In [44]:
# Standard deviation of one feature from the training set, shown to compare
# against the variance threshold used above.
np.std(X_train['imp_op_var40_efect_ult1'])

34.93264763224657

In [48]:
# Fraction of training rows taken by each distinct value of the example feature.
example_column = X_train['imp_op_var40_efect_ult1']
example_column.value_counts().div(len(example_column))

0.0       0.999493
900.0     0.000094
1800.0    0.000056
60.0      0.000056
120.0     0.000038
270.0     0.000038
600.0     0.000038
750.0     0.000019
870.0     0.000019
300.0     0.000019
930.0     0.000019
210.0     0.000019
150.0     0.000019
1200.0    0.000019
87.9      0.000019
1710.0    0.000019
6600.0    0.000019
Name: imp_op_var40_efect_ult1, dtype: float64

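So we see that the above feature is not flagged by VarianceThreshold: a handful of rare but large values push its variance well above the threshold, even though about 99.95% of its rows share the same value. The frequency-based check written without the package does capture it.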