In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Default every matplotlib figure in this notebook to an 8x8-inch canvas
plt.rcParams.update({'figure.figsize': (8, 8)})

## Selecting features for model performance


### Building a diabetes classifier
We'll be using the Pima Indians diabetes dataset to predict whether a person has diabetes using logistic regression. There are 8 features and one target in this dataset.

In [4]:
# Read the diabetes CSV into a DataFrame.
# NOTE(review): the absolute /content path presumably targets a Colab-style
# environment; adjust it when running elsewhere.
diabetes_df = pd.read_csv(filepath_or_buffer='/content/diabetes.csv')

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from pprint import pprint

# Feature matrix = every column except the last; target = the last column
X, y = diabetes_df.iloc[:, :-1], diabetes_df.iloc[:, -1]

# Hold out 30% of the rows for testing. The split is seeded so that a
# Restart & Run All reproduces the same partition, and therefore the same
# accuracy and coefficients, on every run.
# NOTE: saved cell outputs in this notebook may predate the seed; re-run to refresh.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Unfitted scaler and classifier; both are fitted on the training data later
scaler = StandardScaler()
lr = LogisticRegression()

In [6]:
# Standardize the training features (the scaler is fitted on the training
# rows only), then train the classifier on the scaled values
X_train_std = scaler.fit_transform(X_train)
lr.fit(X_train_std, y_train)

# Scale the test features with the already-fitted scaler and predict on them
X_test_std = scaler.transform(X_test)
y_pred = lr.predict(X_test_std)

# Report the test accuracy, then the absolute coefficient of each feature
test_accuracy = accuracy_score(y_test, y_pred)
print("{0:.1%} accuracy on test set.".format(test_accuracy))

coef_magnitudes = np.abs(lr.coef_[0]).round(2)
pprint(dict(zip(X.columns, coef_magnitudes)))

74.0% accuracy on test set.
{'Age': 0.11,
 'BMI': 0.62,
 'BloodPressure': 0.35,
 'DiabetesPedigreeFunction': 0.37,
 'Glucose': 1.18,
 'Insulin': 0.23,
 'Pregnancies': 0.52,
 'SkinThickness': 0.02}


We get almost 75% accuracy on the test set. Take a look at the differences in model coefficients for the different features.

### Automatic Recursive Feature Elimination
Now let's automate the recursive process. We will wrap a Recursive Feature Eliminator (RFE) around our logistic regression estimator and pass it the desired number of features.

In [8]:
# Rebuild the feature matrix / target and redo the 70/30 split for the RFE
# experiment. The split is seeded so repeated runs yield the same partition.
# NOTE: saved cell outputs in this notebook may predate the seed; re-run to refresh.
X, y = diabetes_df.iloc[:, :-1], diabetes_df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

lr = LogisticRegression()

# Fit the scaler on the training features and transform them in one go
# (this refits the existing `scaler` object on the new training rows)
X_train_std = scaler.fit_transform(X_train)

# Fit the logistic regression model on the scaled training data
lr.fit(X_train_std, y_train)

# Scale the test features using the statistics learned from the training set
X_test_std = scaler.transform(X_test)

In [9]:
from sklearn.feature_selection import RFE

# Build an RFE wrapper around a fresh LogisticRegression that keeps 3 features
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=3, verbose=1)

# Fit the eliminator on the scaled training data
rfe.fit(X_train_std, y_train)

# Show each feature's ranking (1 = kept; higher = dropped earlier)
feature_ranking = dict(zip(X.columns, rfe.ranking_))
print(feature_ranking)

# Show the names of the features that were not eliminated
print(X.columns[rfe.support_])

# Calculate the accuracy of the fitted eliminator on the scaled test set
acc = accuracy_score(y_test, rfe.predict(X_test_std))
print("{0:.1%} accuracy on test set.".format(acc))

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
{'Pregnancies': 1, 'Glucose': 1, 'BloodPressure': 3, 'SkinThickness': 5, 'Insulin': 6, 'BMI': 1, 'DiabetesPedigreeFunction': 2, 'Age': 4}
Index(['Pregnancies', 'Glucose', 'BMI'], dtype='object')
80.1% accuracy on test set.


## Tree-based feature selection
- Random forest classifier
![rf classifier](https://github.com/goodboychan/chans_jupyter/blob/main/_notebooks/image/rfc.png?raw=1)

### Building a random forest model
You'll again work on the Pima Indians dataset to predict whether an individual has diabetes, this time using a random forest classifier. You'll fit the model on the training data after performing the train-test split and consult the feature importance values.

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Hold out 25% of the rows for testing (seeded for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Train a seeded random forest on the unscaled training features
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# Score the forest on the held-out rows
rf_test_pred = rf.predict(X_test)
acc = accuracy_score(y_test, rf_test_pred)

# Show the rounded importance of every feature, followed by the accuracy
importance_by_feature = dict(zip(X.columns, rf.feature_importances_.round(2)))
pprint(importance_by_feature)
print("{0:.1%} accuracy on test set.".format(acc))

{'Age': 0.14,
 'BMI': 0.17,
 'BloodPressure': 0.09,
 'DiabetesPedigreeFunction': 0.13,
 'Glucose': 0.24,
 'Insulin': 0.08,
 'Pregnancies': 0.08,
 'SkinThickness': 0.07}
77.1% accuracy on test set.


### Random forest for feature selection
Now let's use the fitted random forest model to select the most important features from our input dataset `X`.

In [11]:
# Boolean mask: True where a feature's importance exceeds the 0.15 threshold
mask = rf.feature_importances_ > 0.15
print(mask)

# Keep only the columns of X flagged by the mask and show their names
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

[False  True False False False  True False False]
Index(['Glucose', 'BMI'], dtype='object')


### Recursive Feature Elimination with random forests
You'll wrap a Recursive Feature Eliminator around a random forest model to remove features step by step. This method is more conservative than selecting features after applying a single importance threshold, since dropping one feature can influence the relative importances of the others.

In [12]:
# Wrap the feature eliminator around a random forest model. The forest is
# seeded (matching the standalone forest fitted earlier) so the elimination
# order, and hence the selected features, is reproducible across runs.
rfe = RFE(estimator=RandomForestClassifier(random_state=0), n_features_to_select=2, verbose=1)

# Fit the eliminator to the training data; by default one feature is dropped
# per iteration until only 2 remain
rfe.fit(X_train, y_train)

# Boolean mask of the features that survived elimination
mask = rfe.support_

# Apply the mask to the feature dataset X and print the surviving column names
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Index(['Glucose', 'BMI'], dtype='object')


In [13]:
# Wrap the feature eliminator around a random forest model, this time
# dropping 2 features per iteration (step=2). The forest is seeded so the
# selected features are reproducible across runs.
rfe = RFE(estimator=RandomForestClassifier(random_state=0), n_features_to_select=2, step=2, verbose=1)

# Fit the eliminator to the training data
rfe.fit(X_train, y_train)

# Boolean mask of the features that survived elimination
mask = rfe.support_

# Apply the mask to the feature dataset X and print the surviving column names
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

Fitting estimator with 8 features.
Fitting estimator with 6 features.
Fitting estimator with 4 features.
Index(['Glucose', 'BMI'], dtype='object')
