# feature selection techniques
- SelectKBest
- RFE
- ExtraTreesClassifier
- PCA

In [1]:
import os 
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt 

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import pickle

In [2]:
df = pd.read_csv('../datasets/diabetes.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE

In [7]:
from sklearn.decomposition import PCA # principal component analysis

In [8]:
from sklearn.ensemble import ExtraTreesClassifier

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [11]:
x= df[df.columns.tolist()[:-1]]
y = df.Outcome

# univariate analysis
Univariate analysis is the simplest form of analyzing data. “Uni” means “one”, so in other words your data has only one variable. It doesn't deal with causes or relationships (unlike regression ) and it's major purpose is to describe; It takes data, summarizes that data and finds patterns in the data

In [12]:
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [13]:
test1 = SelectKBest(score_func=chi2,k=3)
test1.fit_transform(x,y)

array([[148.,   0.,  50.],
       [ 85.,   0.,  31.],
       [183.,   0.,  32.],
       ...,
       [121., 112.,  30.],
       [126.,   0.,  47.],
       [ 93.,   0.,  23.]])

In [14]:
for col,val in zip(df.columns, test1.scores_):
    print(col.ljust(30),'|',val)
print("select columns with higher chi2 score")

Pregnancies                    | 111.51969063588255
Glucose                        | 1411.887040644141
BloodPressure                  | 17.605373215320718
SkinThickness                  | 53.10803983632434
Insulin                        | 2175.5652729220137
BMI                            | 127.6693433310368
DiabetesPedigreeFunction       | 5.39268154697144
Age                            | 181.30368904430023
select columns with higher chi2 score


In [15]:
best_x = df[['Glucose','Insulin','Age']]

In [39]:
px.scatter_3d(df,x='Glucose',y='Insulin',z='Age',color='Outcome')

In [16]:
kclf = KNeighborsClassifier(n_neighbors=3)

In [17]:
kclf.fit(best_x,y)

KNeighborsClassifier(n_neighbors=3)

In [18]:
ypred = kclf.predict(best_x)

In [19]:
confusion_matrix(ypred,y)

array([[454,  70],
       [ 46, 198]], dtype=int64)

In [20]:
print(classification_report(ypred,y))

              precision    recall  f1-score   support

           0       0.91      0.87      0.89       524
           1       0.74      0.81      0.77       244

    accuracy                           0.85       768
   macro avg       0.82      0.84      0.83       768
weighted avg       0.85      0.85      0.85       768



## recursive feature elimination


In [51]:
model = DecisionTreeClassifier()
rfe = RFE(model,n_features_to_select=5)
out = rfe.fit(x,y)

In [52]:
out.ranking_

array([2, 1, 1, 4, 3, 1, 1, 1])

In [53]:
for col,rank in zip(x,out.ranking_):
    print(f'{rank}\t{col}')

2	Pregnancies
1	Glucose
1	BloodPressure
4	SkinThickness
3	Insulin
1	BMI
1	DiabetesPedigreeFunction
1	Age


In [54]:
x_best =  out.transform(x)

In [55]:
x_best

array([[148.   ,  72.   ,  33.6  ,   0.627,  50.   ],
       [ 85.   ,  66.   ,  26.6  ,   0.351,  31.   ],
       [183.   ,  64.   ,  23.3  ,   0.672,  32.   ],
       ...,
       [121.   ,  72.   ,  26.2  ,   0.245,  30.   ],
       [126.   ,  60.   ,  30.1  ,   0.349,  47.   ],
       [ 93.   ,  70.   ,  30.4  ,   0.315,  23.   ]])

In [56]:
clf = DecisionTreeClassifier()
clf.fit(x_best,y)

DecisionTreeClassifier()

In [57]:
ypred = clf.predict(x_best)
print(confusion_matrix(y,ypred))

[[500   0]
 [  0 268]]


In [58]:
print(classification_report(y,ypred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      1.00      1.00       268

    accuracy                           1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768



In [59]:
px.scatter_3d(df,x='Glucose',y='BMI',z='DiabetesPedigreeFunction',color='Outcome')

## Principal Component Analysis

expanded > 2 x 2 x 2 x 2 x 2 
reduced > 2**5

In [61]:
pca = PCA(n_components=3)
xp = pca.fit_transform(x)

In [66]:
clf = DecisionTreeClassifier()
clf.fit(xp,y)

DecisionTreeClassifier()

In [68]:
ypred = clf.predict(xp)
print(confusion_matrix(y,ypred))

[[500   0]
 [  0 268]]


In [69]:
kclf = KNeighborsClassifier(n_neighbors=3)
kclf.fit(xp,y)
ypred = clf.predict(xp)
print(confusion_matrix(y,ypred))

[[500   0]
 [  0 268]]


## feature importance using ExtratreeClassifier

In [72]:
feat = ExtraTreesClassifier(n_estimators=100)
feat.fit(x,y)
feat.feature_importances_

array([0.1060364 , 0.24434192, 0.09851986, 0.07918714, 0.07048312,
       0.14257544, 0.11838323, 0.1404729 ])

In [73]:
for col,rank in zip(x,feat.feature_importances_):
    print(f'{rank}\t{col}')

0.1060363968247983	Pregnancies
0.24434191860740515	Glucose
0.09851985592605876	BloodPressure
0.07918713935281425	SkinThickness
0.07048311624891618	Insulin
0.14257544009483256	BMI
0.11838323314676677	DiabetesPedigreeFunction
0.14047289979840807	Age


# task -> use these techinques to make a classifier on mushroom dataset.
- use label encoder to encoder each column to numeric 