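This notebook is a simple demonstration of feature selection. The filter-method section (i.e., constant feature elimination) is adapted from https://www.kaggle.com/code/raviprakash438/filter-method-feature-selection; for a more complete filter-method workflow, visit that link. The notebook also includes a short wrapper-method example (sequential forward selection) and an embedded-method example (ridge-classifier weights).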

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import collections

from sklearn.model_selection import train_test_split

In [22]:
# Read the training and test splits from the local datasets folder, in that order.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./datasets/train.csv', "./datasets/test.csv")
)

In [3]:
# Summarise the training frame: row range, column count, dtype breakdown and memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 371 entries, ID to TARGET
dtypes: float64(111), int64(260)
memory usage: 215.2 MB


In [4]:
# Summarise the test frame: row range, column count, dtype breakdown and memory usage.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75818 entries, 0 to 75817
Columns: 370 entries, ID to var38
dtypes: float64(110), int64(260)
memory usage: 214.0 MB


FILTER METHOD

In [5]:
# Split the training frame into the predictor matrix and the TARGET label column.
X_train = df_train.drop(columns=['TARGET'])
X_label = df_train['TARGET']

# Copy the test frame so that later column pruning of X_test can never alter
# df_test through a shared reference (a plain `X_test = df_test` is only an alias).
X_test = df_test.copy()

In [18]:
from sklearn.feature_selection import VarianceThreshold

# A variance threshold of 0 removes the features that have no variance at all,
# i.e. the columns that hold a single constant value.
model = VarianceThreshold(threshold=0)
# Fit on the training predictors; the fitted estimator is the cell's last
# expression, so its repr is shown as the cell output.
model.fit(X_train)

VarianceThreshold(threshold=0)

In [19]:
# Boolean mask over X_train's columns: True = feature kept (not constant),
# False = feature removed by the variance filter.
# NOTE: despite its name, constArr marks the retained columns, not the constant ones.
constArr=model.get_support()
# constArr

In [20]:
# Tally the mask values to see how many features survive the filter:
#   True  -> 336 features are kept because they are not constant.
#   False -> 34 features are excluded because they hold a constant value.
collections.Counter(constArr)

Counter({True: 336, False: 34})

In [21]:
# Names of all constant features, i.e. the columns where the support mask is False.
# Inverting the mask once replaces the original per-column membership test, which
# rebuilt X_train.columns[constArr] on every loop iteration. Column order is preserved.
constCol = X_train.columns[~constArr].tolist()

In [22]:
print('Shape before drop-->',X_train.shape, X_test.shape)

# Remove the constant features from both splits. The names are rebound to the new
# frames returned by drop() instead of using inplace=True, so any other reference to
# the same data (for example df_test, which X_test may alias) keeps all its columns.
X_train = X_train.drop(columns=constCol)
X_test = X_test.drop(columns=constCol)

print('Shape after drop-->',X_train.shape, X_test.shape)

Shape before drop--> (76020, 370) (75818, 370)
Shape after drop--> (76020, 336) (75818, 336)


WRAPPER METHOD

In [23]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

In [26]:
df_train = pd.read_csv('./datasets/train.csv')
# Reload the test set too, so this section starts from the complete, unpruned test
# frame (matching the freshly reloaded training data) when the notebook is run
# top to bottom after the filter section.
X_test = pd.read_csv('./datasets/test.csv')

# Reduce the training set to its first 1000 rows (presumably to keep the sequential
# search tractable). copy() detaches the subset from the full frame, so later edits
# act on an independent object rather than on a slice of df_train.
X_train = df_train.drop(columns=['TARGET']).head(1000).copy()
X_label = df_train['TARGET'].head(1000).copy()

In [27]:
# Base estimator handed to the selector: a 3-nearest-neighbours classifier.
knn = KNeighborsClassifier(n_neighbors=3)

# Forward selection: features are added one at a time until 5 have been selected.
sfs = SequentialFeatureSelector(knn, n_features_to_select=5, direction='forward')
# Fit on the reduced training set; the fitted selector is displayed as the cell output.
sfs.fit(X_train, X_label)

SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),
                          n_features_to_select=5)

In [None]:
# Boolean mask over X_train's columns; True marks a feature chosen by the selector.
sfs.get_support()

In [29]:
# Mask of the features chosen by the sequential selector (True = selected).
constArr = sfs.get_support()
# Columns to discard: everything the selector did NOT pick, in original column order.
# The constArr/constCol names are carried over from the filter section; in this
# section they have nothing to do with constant features.
constCol = X_train.columns[~constArr].tolist()

In [30]:
print('Shape before drop-->',X_train.shape, X_test.shape)

# Keep only the features picked by the sequential selector; constCol lists the
# rejected ones. Selecting the kept columns, rather than dropping constCol in place,
# also works when X_test no longer contains every rejected column (for example after
# constant features were removed from it earlier). A drop of those missing labels
# would raise KeyError.
unwanted = set(constCol)
keptCol = [col for col in X_train.columns if col not in unwanted]
X_train = X_train[keptCol]
X_test = X_test[keptCol]

print('Shape after drop-->',X_train.shape, X_test.shape)

Shape before drop--> (1000, 370) (75818, 370)
Shape after drop--> (1000, 5) (75818, 5)


In [33]:
# Names of the features that remain in X_train after the selection step.
X_train.columns

Index(['var3', 'var15', 'imp_ent_var16_ult1', 'saldo_var37', 'num_var22_ult3'], dtype='object')

EMBEDDED METHOD

In [74]:
df_train = pd.read_csv('./datasets/train.csv')

# Use the full training set here (no row subsampling): split it into the
# predictor matrix and the TARGET label column.
X_train = df_train.drop(columns=['TARGET'])
X_label = df_train['TARGET']

In [75]:
from sklearn import linear_model

# Alternative linear estimators, left disabled:
# model = linear_model.LinearRegression()
# model = linear_model.Perceptron(tol=1e-3, random_state=0)
# Ridge classifier with alpha=0.5. Note that this rebinds `model`, which earlier
# in the notebook held the VarianceThreshold filter.
model = linear_model.RidgeClassifier(alpha=0.5)

# Fit on the predictors and labels; the fitted estimator is displayed as the cell output.
model.fit(X_train, X_label)

RidgeClassifier(alpha=0.5)

In [68]:
# Create a mask that keeps the features whose absolute weight is above the average
# absolute weight. coef_[0] is the first row of the fitted coefficient matrix.
abs_weight = np.abs(model.coef_[0])
mean = abs_weight.mean()
# Vectorised comparison: yields a NumPy boolean array directly instead of a Python
# list built one element at a time. It indexes columns and coefficients the same way.
mask = abs_weight > mean

In [78]:
# Tabulate the masked-in feature names next to their signed coefficients.
model_coeff = pd.DataFrame(
    {
        "Features": X_train.columns[mask],
        "Weights": model.coef_[0][mask],
    }
)

In [79]:
# Display the table of selected features and their weights.
model_coeff

Unnamed: 0,Features,Weights
0,ind_var1_0,-0.795797
1,ind_var8_0,0.246297
2,ind_var12_0,-0.030674
3,ind_var12,0.078606
4,ind_var13_0,-0.031095
...,...,...
59,num_meses_var44_ult3,-0.059479
60,num_reemb_var17_ult1,0.094214
61,num_sal_var16_ult1,0.018886
62,num_trasp_var33_in_hace3,0.042184
