In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2 # chi square filter method

In [2]:
# Load the German credit dataset from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='german_credit.csv')

## Filter methods
- feature selection independent of the model used
- based on general statistical measures (e.g. correlation with the target) rather than on model performance
- find the best subset of variables
- helps reduce overfitting

## Types of filter methods
- information gain
- chi-square test
- Fisher score
- correlation coefficient
- variance threshold

## Let's select the top 15 of the 20 features with the highest chi-squared statistics

In [5]:
# Predictors: every column of `data` except the 'Creditability' target.
X = data.loc[:, data.columns != 'Creditability']
# Target: the 'Creditability' column on its own.
y = data['Creditability']

In [6]:
# Score every feature against the target with the chi-squared statistic and
# keep the 15 highest-scoring ones.
# NOTE(review): chi2 expects non-negative, count-like features; confirm that
# applying it to continuous columns such as 'Credit Amount' is intended.
chisq_selector = SelectKBest(score_func=chi2, k=15)
chisq_selector.fit(X, y)
# Reduce X to the selected columns only.
X_kbest = chisq_selector.transform(X)

In [7]:
# Report the number of columns before and after feature selection.
for label, matrix in (
    ('Original number of features:', X),
    ('Reduced number of features:', X_kbest),
):
    print(label, matrix.shape[1])


Original number of features: 20
Reduced number of features: 15


In [12]:
# Show the shape of the selected-feature matrix as (n_rows, n_columns).
X_kbest.shape

(1000, 15)

In [21]:
# Boolean mask over the input columns: True where the selector kept the feature.
chisq_selector.get_support(indices=False)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True, False,  True, False, False,
        True,  True])

In [22]:
# List all candidate feature names; their order matches the selector's support mask,
# since the selector was fitted on X.
X.columns

Index(['Account Balance', 'Duration of Credit (month)',
       'Payment Status of Previous Credit', 'Purpose', 'Credit Amount',
       'Value Savings/Stocks', 'Length of current employment',
       'Instalment per cent', 'Sex & Marital Status', 'Guarantors',
       'Duration in Current address', 'Most valuable available asset',
       'Age (years)', 'Concurrent Credits', 'Type of apartment',
       'No of Credits at this Bank', 'Occupation', 'No of dependents',
       'Telephone', 'Foreign Worker'],
      dtype='object')

In [24]:
# Use the selector's boolean support mask to pick out the names of the kept features.
selected_mask = chisq_selector.get_support()
print(X.columns[selected_mask])

Index(['Account Balance', 'Duration of Credit (month)',
       'Payment Status of Previous Credit', 'Purpose', 'Credit Amount',
       'Value Savings/Stocks', 'Length of current employment',
       'Instalment per cent', 'Sex & Marital Status',
       'Most valuable available asset', 'Age (years)', 'Concurrent Credits',
       'No of Credits at this Bank', 'Telephone', 'Foreign Worker'],
      dtype='object')


## Dropped columns (the 5 features with the lowest chi-squared scores):
1. Guarantors
2. Duration in Current address
3. Type of apartment
4. Occupation
5. No of dependents