# Feature Selection

# Data treatment

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the census dataset from a CSV file in the working directory.
# NOTE(review): the array outputs further down show string values with a
# leading space (e.g. ' State-gov', ' <=50K'), which suggests the file uses
# ", " as its separator. Consider `skipinitialspace=True` if exact string
# matching is ever needed -- confirm against the raw file.
census_data = pd.read_csv('census.csv')
census_data

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
# Names of every column except the last one; the last column is the target
# ('income'), so `columns` holds the feature names in positional order.
columns = census_data.columns[:-1]
columns

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country'],
      dtype='object')

In [5]:
# Feature matrix: the first 14 columns by position, as a NumPy array.
# Numeric and string columns are mixed, so the array has dtype=object.
# The label-encoding cell later overwrites columns of this array in place.
x_census = census_data.iloc[:, :14].to_numpy()
x_census

array([[39, ' State-gov', 77516, ..., 0, 40, ' United-States'],
       [50, ' Self-emp-not-inc', 83311, ..., 0, 13, ' United-States'],
       [38, ' Private', 215646, ..., 0, 40, ' United-States'],
       ...,
       [58, ' Private', 151910, ..., 0, 40, ' United-States'],
       [22, ' Private', 201490, ..., 0, 20, ' United-States'],
       [52, ' Self-emp-inc', 287927, ..., 0, 40, ' United-States']],
      dtype=object)

In [6]:
# Target vector: the column at position 14 (income class labels as strings).
y_census = census_data.iloc[:, 14].to_numpy()
y_census

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

In [7]:
# One independent encoder per categorical column, so each one can be fitted
# to its own column. The generator yields eight distinct instances.
(
    label_encoder_workclass,
    label_encoder_education,
    label_encoder_marital,
    label_encoder_occupation,
    label_encoder_relationship,
    label_encoder_race,
    label_encoder_sex,
    label_encoder_country,
) = (LabelEncoder() for _ in range(8))

In [8]:
# (column position in x_census, encoder fitted on that column)
categorical_encoders = (
    (1, label_encoder_workclass),
    (3, label_encoder_education),
    (5, label_encoder_marital),
    (6, label_encoder_occupation),
    (7, label_encoder_relationship),
    (8, label_encoder_race),
    (9, label_encoder_sex),
    (13, label_encoder_country),
)

# Replace every categorical column with its integer codes, in place.
# Later cells rely on x_census holding these codes.
for column_position, encoder in categorical_encoders:
    x_census[:, column_position] = encoder.fit_transform(x_census[:, column_position])

In [9]:
x_census

array([[39, 7, 77516, ..., 0, 40, 39],
       [50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

In [10]:
from sklearn.preprocessing import MinMaxScaler  # min-max normalization (not standardization)

# Rescale every feature to the [0, 1] range so that the per-feature
# variances computed below are comparable with each other.
scaler = MinMaxScaler()
scaler.fit(x_census)
x_census_scaler = scaler.transform(x_census)
x_census_scaler

array([[0.30136986, 0.875     , 0.0443019 , ..., 0.        , 0.39795918,
        0.95121951],
       [0.45205479, 0.75      , 0.0482376 , ..., 0.        , 0.12244898,
        0.95121951],
       [0.28767123, 0.5       , 0.13811345, ..., 0.        , 0.39795918,
        0.95121951],
       ...,
       [0.56164384, 0.5       , 0.09482688, ..., 0.        , 0.39795918,
        0.95121951],
       [0.06849315, 0.5       , 0.12849934, ..., 0.        , 0.19387755,
        0.95121951],
       [0.47945205, 0.625     , 0.18720338, ..., 0.        , 0.39795918,
        0.95121951]])

# Feature Selection - Low Variance

In [11]:
# Print the variance of each scaled feature, one value per line, in column order.
for scaled_feature in x_census_scaler.T:
    print(scaled_feature.var())

0.034913808595952486
0.03312115190663569
0.005138537590667898
0.06657103564450892
0.029416385024073417
0.06301761677301636
0.09123816653931152
0.10326534394406342
0.04502805169292987
0.22136950173699113
0.00545419549240862
0.008557270623428908
0.015874043397822807
0.03641266114220053


In [13]:
from sklearn.feature_selection import VarianceThreshold

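Features with low variance take nearly the same value for every sample (their values stay close to the mean), so they carry little information for telling the classes apart.

We therefore keep only the features whose variance is above a chosen threshold.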

In [14]:
# Keep only the features whose variance on the min-max scaled data is above 0.05.
selection = VarianceThreshold(threshold=0.05)  # minimum variance
selection.fit(x_census_scaler)
x_census_variance = selection.transform(x_census_scaler)
x_census_variance.shape

(32561, 5)

In [15]:
x_census_variance

array([[0.6       , 0.66666667, 0.07142857, 0.2       , 1.        ],
       [0.6       , 0.33333333, 0.28571429, 0.        , 1.        ],
       [0.73333333, 0.        , 0.42857143, 0.2       , 1.        ],
       ...,
       [0.73333333, 1.        , 0.07142857, 0.8       , 0.        ],
       [0.73333333, 0.66666667, 0.07142857, 0.6       , 1.        ],
       [0.73333333, 0.33333333, 0.28571429, 1.        , 0.        ]])

In [16]:
selection.variances_

array([0.03491381, 0.03312115, 0.00513854, 0.06657104, 0.02941639,
       0.06301762, 0.09123817, 0.10326534, 0.04502805, 0.2213695 ,
       0.0054542 , 0.00855727, 0.01587404, 0.03641266])

In [17]:
# Positions of the features kept by the selector. The cut-off is read from
# the fitted selector instead of repeating the literal, so this cell cannot
# drift out of sync if the threshold above is changed.
# np.where returns a 1-tuple holding the array of positions.
index = np.where(selection.variances_ > selection.threshold)
index  # features selected

(array([3, 5, 6, 7, 9], dtype=int64),)

In [18]:
columns[index]

Index(['education', 'marital-status', 'occupation', 'relationship', 'sex'], dtype='object')

In [19]:
columns

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country'],
      dtype='object')

### Random forest

In [22]:
# Columns rejected by the low-variance filter above; everything else
# (the five selected features plus the 'income' target) is kept.
low_variance_columns = [
    'age', 'workclass', 'final-weight', 'education-num', 'race',
    'capital-gain', 'capital-loos', 'hour-per-week', 'native-country',
]
census_data_variance = census_data.drop(columns=low_variance_columns)

In [23]:
census_data_variance

Unnamed: 0,education,marital-status,occupation,relationship,sex,income
0,Bachelors,Never-married,Adm-clerical,Not-in-family,Male,<=50K
1,Bachelors,Married-civ-spouse,Exec-managerial,Husband,Male,<=50K
2,HS-grad,Divorced,Handlers-cleaners,Not-in-family,Male,<=50K
3,11th,Married-civ-spouse,Handlers-cleaners,Husband,Male,<=50K
4,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Female,<=50K
...,...,...,...,...,...,...
32556,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,Female,<=50K
32557,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,Male,>50K
32558,HS-grad,Widowed,Adm-clerical,Unmarried,Female,<=50K
32559,HS-grad,Never-married,Adm-clerical,Own-child,Male,<=50K


In [24]:
# Split the reduced frame into features (first five columns) and target
# (sixth column). Note that this rebinds `x_census_variance`, which earlier
# held the VarianceThreshold output.
x_census_variance = census_data_variance.iloc[:, :5].to_numpy()
y_census_variance = census_data_variance.iloc[:, 5].to_numpy()

In [25]:
x_census_variance

array([[' Bachelors', ' Never-married', ' Adm-clerical',
        ' Not-in-family', ' Male'],
       [' Bachelors', ' Married-civ-spouse', ' Exec-managerial',
        ' Husband', ' Male'],
       [' HS-grad', ' Divorced', ' Handlers-cleaners', ' Not-in-family',
        ' Male'],
       ...,
       [' HS-grad', ' Widowed', ' Adm-clerical', ' Unmarried', ' Female'],
       [' HS-grad', ' Never-married', ' Adm-clerical', ' Own-child',
        ' Male'],
       [' HS-grad', ' Married-civ-spouse', ' Exec-managerial', ' Wife',
        ' Female']], dtype=object)

In [26]:
y_census_variance

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

In [28]:
# Fresh encoders for the five selected categorical features
# (these rebind the names created earlier in the notebook).
(
    label_encoder_education,
    label_encoder_marital,
    label_encoder_occupation,
    label_encoder_relationship,
    label_encoder_sex,
) = (LabelEncoder() for _ in range(5))

# The encoders are listed in the same order as the columns of x_census_variance.
variance_feature_encoders = [
    label_encoder_education,
    label_encoder_marital,
    label_encoder_occupation,
    label_encoder_relationship,
    label_encoder_sex,
]

# Replace each string column with its integer codes, in place.
for column_position, column_encoder in enumerate(variance_feature_encoders):
    x_census_variance[:, column_position] = column_encoder.fit_transform(
        x_census_variance[:, column_position]
    )



In [29]:
x_census_variance

array([[9, 4, 1, 1, 1],
       [9, 2, 4, 0, 1],
       [11, 0, 6, 1, 1],
       ...,
       [11, 6, 1, 4, 0],
       [11, 4, 1, 3, 1],
       [11, 2, 4, 5, 0]], dtype=object)

In [30]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# All five columns are categorical codes, so every one is one-hot encoded;
# 'passthrough' would keep any remaining column unchanged (there are none here).
one_hot_columns = list(range(5))
onehotencoder = ColumnTransformer(
    transformers=[('OneHot', OneHotEncoder(), one_hot_columns)],
    remainder='passthrough',
)

# The transformer output is converted to a dense array with .toarray().
# This cell overwrites its own input, so re-run the cells above before
# executing it a second time.
encoded_features = onehotencoder.fit_transform(x_census_variance)
x_census_variance = encoded_features.toarray()

In [31]:
# Min-max scale the one-hot matrix (this rebinds `scaler` from the earlier cell).
scaler = MinMaxScaler()
scaler.fit(x_census_variance)
x_census_variance_scaler = scaler.transform(x_census_variance)
x_census_variance_scaler

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for testing; the fixed seed makes the split repeatable.
variance_split = train_test_split(
    x_census_variance_scaler,
    y_census_variance,
    test_size=0.15,
    random_state=0,
)
(
    x_census_variance_scaler_training,
    x_census_variance_scaler_test,
    y_census_variance_training,
    y_census_variance_test,
) = variance_split

In [33]:
x_census_variance_scaler_training.shape, x_census_variance_scaler_test.shape

((27676, 46), (4885, 46))

In [34]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the low-variance-filtered features.
# random_state is fixed (matching the seed used for the train/test split) so
# that a Restart & Run All reproduces the same model and metrics. Without it
# every run trains a different forest, and the scores recorded below, which
# came from an unseeded run, may shift slightly.
random_f_census = RandomForestClassifier(criterion='entropy',
                                         min_samples_leaf=1,
                                         min_samples_split=5,
                                         n_estimators=100,
                                         random_state=0)
random_f_census.fit(x_census_variance_scaler_training, y_census_variance_training)

In [35]:
from sklearn.metrics import accuracy_score

# Accuracy of the forest on the held-out rows of the variance-filtered data.
prediction = random_f_census.predict(x_census_variance_scaler_test)
accuracy_score(y_true=y_census_variance_test, y_pred=prediction)

0.8167860798362334

In [36]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the variance-filtered model.
variance_report = classification_report(y_census_variance_test, prediction)
print(variance_report)

              precision    recall  f1-score   support

       <=50K       0.85      0.92      0.88      3693
        >50K       0.67      0.50      0.57      1192

    accuracy                           0.82      4885
   macro avg       0.76      0.71      0.73      4885
weighted avg       0.81      0.82      0.81      4885



# Feature Selection - Extra tree

`ExtraTreesClassifier` is itself a classifier; here it is fitted only to obtain feature importances, which are then used to select features.

In [37]:
from sklearn.ensemble import ExtraTreesClassifier

In [38]:
x_census_scaler.shape

(32561, 14)

In [39]:
# Fit an extra-trees ensemble on all 14 scaled features only to obtain
# feature importances (this rebinds `selection`, which earlier held the
# VarianceThreshold selector).
# random_state is fixed because the importances are randomised: without a
# seed, features close to the cut-off used below can move in or out of the
# selection from one run to the next.
selection = ExtraTreesClassifier(random_state=0)
selection.fit(x_census_scaler, y_census)

In [41]:
columns

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country'],
      dtype='object')

In [40]:
# Per-feature importance scores from the fitted extra-trees model, in the
# same order as `columns`.
importance = selection.feature_importances_
importance # fractions that sum to 1 (not percentages)

array([0.15233586, 0.04489977, 0.1643126 , 0.03891031, 0.08471053,
       0.08153804, 0.07707233, 0.0872892 , 0.01463236, 0.02580958,
       0.08863506, 0.0279468 , 0.09436621, 0.01754135])

In [42]:
importance.sum()

0.9999999999999998

In [43]:
# Positions of the features whose importance exceeds the 0.029 cut-off
# (this rebinds `index`, which earlier held the low-variance selection).
index = [position for position, weight in enumerate(importance) if weight > 0.029]
index

[0, 1, 2, 3, 4, 5, 6, 7, 10, 12]

In [44]:
columns[index]

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'capital-gain',
       'hour-per-week'],
      dtype='object')

In [50]:
# Keep only the columns chosen by the importance filter. x_census already
# holds integer codes for the categorical columns at this point.
x_census_extra = np.take(x_census, index, axis=1)
x_census_extra

array([[39, 7, 77516, ..., 1, 2174, 40],
       [50, 6, 83311, ..., 0, 0, 13],
       [38, 4, 215646, ..., 1, 0, 40],
       ...,
       [58, 4, 151910, ..., 4, 0, 40],
       [22, 4, 201490, ..., 3, 0, 20],
       [52, 5, 287927, ..., 5, 15024, 40]], dtype=object)

In [51]:
# Positions in x_census of the columns that were label-encoded earlier
# (workclass, education, marital-status, occupation, relationship, race,
# sex, native-country).
label_encoded_columns = {1, 3, 5, 6, 7, 8, 9, 13}

# Work out where those categorical columns sit inside the selected subset,
# instead of hard-coding positions that are only valid for one particular
# `index`. For index == [0, 1, 2, 3, 4, 5, 6, 7, 10, 12] this gives
# [1, 3, 5, 6, 7], the same positions as before.
one_hot_positions = [position
                     for position, source_column in enumerate(index)
                     if source_column in label_encoded_columns]

onehotencoder = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), one_hot_positions)],
                                  remainder='passthrough')

# Encode a fresh slice of x_census rather than x_census_extra itself, so that
# re-running this cell does not one-hot encode an already-encoded matrix.
x_census_extra = onehotencoder.fit_transform(x_census[:, index]).toarray()

In [52]:
x_census_extra

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3000e+01, 2.1740e+03,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3000e+01, 0.0000e+00,
        1.3000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.0000e+00, 0.0000e+00,
        4.0000e+01],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.0000e+00, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.0000e+00, 0.0000e+00,
        2.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.0000e+00, 1.5024e+04,
        4.0000e+01]])

In [53]:
x_census_extra.shape

(32561, 58)

In [54]:

# Same 85/15 split and seed as for the variance-filtered data, so the two
# models are evaluated on the same held-out rows.
extra_split = train_test_split(
    x_census_extra,
    y_census,
    test_size=0.15,
    random_state=0,
)
(
    x_census_extra_training,
    x_census_extra_test,
    y_census_training,
    y_census_test,
) = extra_split
x_census_extra_training.shape, x_census_extra_test.shape

((27676, 58), (4885, 58))

In [55]:
# Random forest trained on the extra-trees-selected features (this rebinds
# `random_f_census`). Same hyper-parameters as the first model so the two
# feature sets can be compared. random_state is fixed so that re-running the
# notebook reproduces the same metrics; the scores recorded below came from
# an unseeded run and may shift slightly.
random_f_census = RandomForestClassifier(criterion='entropy',
                                         min_samples_leaf=1,
                                         min_samples_split=5,
                                         n_estimators=100,
                                         random_state=0)
random_f_census.fit(x_census_extra_training, y_census_training)

In [56]:
# Accuracy of the forest on the held-out rows of the extra-trees-selected data
# (this rebinds `prediction` from the first model).
prediction = random_f_census.predict(x_census_extra_test)
accuracy_score(y_true=y_census_test, y_pred=prediction)

0.8460593654042988

In [57]:
# Per-class precision / recall / F1 for the extra-trees-selected model.
extra_trees_report = classification_report(y_census_test, prediction)
print(extra_trees_report)

              precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90      3693
        >50K       0.72      0.60      0.66      1192

    accuracy                           0.85      4885
   macro avg       0.80      0.76      0.78      4885
weighted avg       0.84      0.85      0.84      4885

