# Feature Selection Techniques
# Sanjay Gupta
# Date: 30-August-2021

# Objective

In [1]:
import pandas as pd
import io
import requests
url = "https://raw.githubusercontent.com/sanjaygupta1963/Pythoncoding/main/bank-names.txt"
read_data=requests.get(url).content.decode('utf-8')
print(read_data)

Citation Request:
  This dataset is public available for research. The details are described in [Moro et al., 2011]. 
  Please include this citation if you plan to use this database:

  [Moro et al., 2011] S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. 
  In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimarães, Portugal, October, 2011. EUROSIS.

  Available at: [pdf] http://hdl.handle.net/1822/14838
                [bib] http://www3.dsi.uminho.pt/pcortez/bib/2011-esm-1.txt

1. Title: Bank Marketing

2. Sources
   Created by: Paulo Cortez (Univ. Minho) and Sérgio Moro (ISCTE-IUL) @ 2012
   
3. Past Usage:

  The full dataset was described and analyzed in:

  S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. 
  In P. Novais et al. (Eds.), Proceedin

# Importing Libraries

In [2]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans

# Load the Dataset

In [3]:
#specify URL where data is located
urltrain = 'https://raw.githubusercontent.com/sanjaygupta1963/Pythoncoding/main/bank-full.csv'
urltest = 'https://raw.githubusercontent.com/sanjaygupta1963/Pythoncoding/main/bank.csv'

# Load Dataset from the Github URL
dftrain = pd.read_csv(urltrain,sep=';')
dftest = pd.read_csv(urltest,sep=';')

In [4]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [5]:
dftest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [6]:
# Rename the Column Names 'y' as 'tdeposit' for avoiding any confusion
dftrain=dftrain.rename(columns={"y": "tdeposit"})
dftest=dftest.rename(columns={"y": "tdeposit"})
# Replace 
dftrain=dftrain.replace({'no': 0, 'yes': 1})
dftest=dftest.replace({'no': 0, 'yes': 1})

In [7]:
dftrain.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,tdeposit
0,58,management,married,tertiary,0,2143,1,0,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,0,29,1,0,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,0,2,1,1,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,0,1506,1,0,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,0,1,0,0,unknown,5,may,198,1,-1,0,unknown,0
5,35,management,married,tertiary,0,231,1,0,unknown,5,may,139,1,-1,0,unknown,0
6,28,management,single,tertiary,0,447,1,1,unknown,5,may,217,1,-1,0,unknown,0
7,42,entrepreneur,divorced,tertiary,1,2,1,0,unknown,5,may,380,1,-1,0,unknown,0
8,58,retired,married,primary,0,121,1,0,unknown,5,may,50,1,-1,0,unknown,0
9,43,technician,single,secondary,0,593,1,0,unknown,5,may,55,1,-1,0,unknown,0


In [8]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  int64 
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  int64 
 7   loan       45211 non-null  int64 
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  tdeposit   45211 non-null  int64 
dtypes: int64(11), object(6)
memory usage: 5.9+ MB


# Feature Selection with Univariate Statistical Tests

In [9]:
# Feature Selection with Univariate Statistical Tests
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# import necessary modules 
import pandas  as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Drop All object datatype columns before the regression model
dftrain = dftrain.select_dtypes(include=['float64','int32','int64', 'uint8'])
dftest = dftest.select_dtypes(include=['float64','int32','int64', 'uint8'])
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       45211 non-null  int64
 1   default   45211 non-null  int64
 2   balance   45211 non-null  int64
 3   housing   45211 non-null  int64
 4   loan      45211 non-null  int64
 5   day       45211 non-null  int64
 6   duration  45211 non-null  int64
 7   campaign  45211 non-null  int64
 8   pdays     45211 non-null  int64
 9   previous  45211 non-null  int64
 10  tdeposit  45211 non-null  int64
dtypes: int64(11)
memory usage: 3.8 MB


In [10]:
dftrain.columns

Index(['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration',
       'campaign', 'pdays', 'previous', 'tdeposit'],
      dtype='object')

In [11]:
predictors = ['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration', 'campaign', 'pdays', 'previous', 'tdeposit']

# Dropping the output variable price from the train dataset to the X for Model Building
X = dftrain.drop("tdeposit", axis=1)
# Assigning the output variable price from the train dataset to the Y for Model prediction 
y = dftrain["tdeposit"]

# split into 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train.head(10)

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous
13932,57,0,658,0,0,10,724,1,-1,0
9894,37,0,1699,0,0,9,63,1,-1,0
39946,35,0,2823,1,0,2,102,4,96,2
9217,35,0,214,1,1,5,247,1,-1,0
4124,38,0,323,1,0,19,138,1,-1,0
30085,30,0,57,1,0,4,153,2,-1,0
17266,39,0,643,0,1,28,24,14,-1,0
34553,40,0,47,1,0,5,159,1,365,1
5386,41,0,-762,1,1,23,145,1,-1,0
12146,52,0,945,0,0,20,16,8,-1,0


In [12]:
# Feature Extraction
# SelectKBest() - Select features based on the k highest scores.
selector = SelectKBest(score_func=f_classif, k=4)
fit = selector.fit(X_train, y_train)

# Summarize Scores
set_printoptions(precision=3)

# Get the raw p-values for each feature, and transform from p-values into scores
scores = -np.log10(selector.pvalues_)

# Select the Top 4 Features
# fit() - Fit to data, then transform it.
selectpredictors = pd.DataFrame(fit.transform(X_train))
# summarize selected features
print(selectpredictors)


# Print for knowing the Rank the features by importance. Highest Value of "Score_f_anova" should be at 1st Rank and 2nd Highest and so on ?
print(pd.DataFrame({'variable': X_train.columns, 'score_f_anova': selector.scores_, 'p-values':scores}))

# Results: Top 4 Selected Features
# ==================================
# 1st Rank = Duration
# 2nd Rank = Housing
# 3rd Rank = Pdays
# 4th Rank = Previous

       0    1    2  3
0      0  724   -1  0
1      0   63   -1  0
2      1  102   96  2
3      1  247   -1  0
4      1  138   -1  0
...   ..  ...  ... ..
36163  0   39   -1  0
36164  1  233   -1  0
36165  0  261   -1  0
36166  0  149  182  1
36167  1  194   -1  0

[36168 rows x 4 columns]
   variable  score_f_anova    p-values
0       age      14.543063    3.862584
1   default      18.166178    4.692601
2   balance     102.328504   23.295569
3   housing     740.545534  160.712206
4      loan     170.564449   38.166031
5       day      34.106383    8.278805
6  duration    6771.357059         inf
7  campaign     193.498226   43.148061
8     pdays     348.984310   76.786307
9  previous     269.064544   59.523338


  scores = -np.log10(selector.pvalues_)


# Feature Extraction with RFE - Recursive Feature Elimination

In [13]:
# Feature Extraction with RFE
# The Recursive Feature Elimination (or RFE) works by recursively removing attributes and building a model on those attributes that remain.
# It uses the model accuracy to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute.

from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# feature extraction
# lbfgs — Stands for Limited-memory Broyden–Fletcher–Goldfarb–Shanno. 
# It approximates the second derivative matrix updates with gradient evaluations.
# It stores only the last few updates, so it saves memory. It isn't super fast with large data sets. 
#It will be the default solver as of Scikit-learn version 0.22.

model = LogisticRegression(solver='lbfgs')
rfe = RFE(model, 4)
fit = rfe.fit(X_train, y_train)

print("Num Features: %d" % fit.n_features_)
print(pd.DataFrame({'variable': X_train.columns, 'Selected Features': fit.support_, 'Feature Ranking': fit.ranking_}))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Num Features: 4
   variable  Selected Features  Feature Ranking
0       age              False                4
1   default               True                1
2   balance              False                7
3   housing               True                1
4      loan               True                1
5       day              False                3
6  duration              False                5
7  campaign               True                1
8     pdays              False                6
9  previous              False                2


# Principal Component Analysis

In [14]:
# Principal Component Analysis (or PCA) uses linear algebra to transform the dataset into a compressed form.
# Generally this is called a data reduction technique. A property of PCA is that you can choose the number of dimensions or principal component in the transformed result.

In [15]:
# Feature Extraction with PCA
import numpy
from pandas import read_csv
from sklearn.decomposition import PCA

# feature extraction
pca = PCA(n_components=4)
fit = pca.fit(X_train, y_train)

# summarize components
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance: [9.920e-01 6.932e-03 1.042e-03 1.163e-05]
[[ 3.483e-04 -2.853e-06  1.000e+00 -1.185e-05 -1.033e-05  1.332e-05
   1.455e-03 -1.369e-05  5.167e-05  1.355e-05]
 [-4.438e-04 -4.000e-06 -1.454e-03  1.794e-05 -1.638e-05 -1.122e-03
   1.000e+00 -1.016e-03 -1.219e-03 -7.912e-06]
 [-2.357e-03 -3.674e-05 -5.267e-05  6.209e-04 -6.891e-05 -7.625e-03
   1.206e-03 -2.726e-03  9.999e-01  1.064e-02]
 [ 9.997e-01 -1.267e-04 -3.487e-04 -8.169e-03 -1.546e-04 -2.160e-02
   4.229e-04  1.129e-03  2.174e-03  2.373e-03]]


# Feature Importance with Extra Trees Classifier

In [16]:
# Feature Importance with Extra Trees Classifier
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier

# feature extraction
model = ExtraTreesClassifier(n_estimators=10)
model.fit(X_train, y_train)

# You can see that we are given an importance score for each attribute where the larger score the more important the attribute. 
print(model.feature_importances_)
print(pd.DataFrame({'variable': X_train.columns, 'Selected Features': model.feature_importances_}))

[0.148 0.002 0.165 0.028 0.009 0.129 0.337 0.065 0.078 0.039]
   variable  Selected Features
0       age           0.147989
1   default           0.002219
2   balance           0.164703
3   housing           0.027982
4      loan           0.008708
5       day           0.129353
6  duration           0.337061
7  campaign           0.064576
8     pdays           0.078107
9  previous           0.039302
