In [1]:
# Import needed packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

In [2]:
# Read the Wisconsin Breast Cancer Database from the CSV file in the working directory
DATA_FILE = 'WisconsinBreastCancerDatabase.csv'
wbcd = pd.read_csv(DATA_FILE)

In [3]:
# Select and scale input features, create dataframe for output feature
X = wbcd[['Radius mean', 'Texture mean', 'Area mean', 'Smoothness mean',
         'Compactness mean', 'Concavity mean', 'Concave points mean',
          'Fractal dimension mean', 'Symmetry mean']]
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
y = wbcd[['Diagnosis']]

In [4]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=123)

In [5]:
# Both selectors score features with the ANOVA F-test (f_classif) on the
# training set; flatten the single-column target once and reuse it.
target_train = y_train.to_numpy().ravel()

# SelectKBest: keep the 5 highest-scoring features
model_kbest = SelectKBest(score_func=f_classif, k=5)
X_new_kbest = model_kbest.fit_transform(X_train, target_train)

# SelectPercentile: keep the top 30% highest-scoring features
model_percent = SelectPercentile(score_func=f_classif, percentile=30)
X_new_percent = model_percent.fit_transform(X_train, target_train)

In [6]:
# Names of all input features, as an array that can be indexed with a boolean mask
features = X_train.columns.to_numpy(copy=True)

# Boolean masks marking which input features each selector kept
filter_kbest = model_kbest.get_support()
filter_percent = model_percent.get_support()

In [7]:
# Names of the features kept by SelectKBest (mask applied to the full name array)
kbest_feature_names = features[filter_kbest]
kbest_feature_names

array(['Radius mean', 'Area mean', 'Compactness mean', 'Concavity mean',
       'Concave points mean'], dtype=object)

In [8]:
# Names of the features kept by SelectPercentile (mask applied to the full name array)
percent_feature_names = features[filter_percent]
percent_feature_names

array(['Radius mean', 'Area mean', 'Concave points mean'], dtype=object)

In [9]:
# Tabulate the F-statistic and p-value that the fitted SelectKBest model
# computed for each input feature, one row per feature
feature_scores = pd.DataFrame(
    {'F-statistic': model_kbest.scores_, 'p-value': model_kbest.pvalues_},
    index=X_train.columns,
)
feature_scores

Unnamed: 0,F-statistic,p-value
Radius mean,527.940342,5.088147999999999e-78
Texture mean,92.258507,5.256179e-20
Area mean,482.986271,2.138621e-73
Smoothness mean,58.955037,1.002751e-13
Compactness mean,240.920646,7.098448999999999e-44
Concavity mean,406.240946,5.813829e-65
Concave points mean,683.93047,1.465195e-92
Fractal dimension mean,0.07387,0.7859076
Symmetry mean,59.678172,7.24584e-14


In [10]:
# Baseline: train an MLP classifier on every input feature,
# then report its accuracy on the held-out test set
clf = MLPClassifier(random_state=1, max_iter=1000)
clf.fit(X_train, np.ravel(y_train))
clf.score(X_test, y_test)

0.956140350877193

In [11]:
# Train an MLP classifier on only the 5 features kept by SelectKBest,
# then report its accuracy on the same columns of the test set
kbest_columns = features[filter_kbest]
clf_reduced_kbest = MLPClassifier(random_state=1, max_iter=1000)
clf_reduced_kbest.fit(X_train[kbest_columns], np.ravel(y_train))
clf_reduced_kbest.score(X_test[kbest_columns], y_test)

0.9385964912280702

In [12]:
# Train an MLP classifier on only the top 30% of features kept by SelectPercentile,
# then report its accuracy on the same columns of the test set
percent_columns = features[filter_percent]
clf_reduced_percent = MLPClassifier(random_state=1, max_iter=1000)
clf_reduced_percent.fit(X_train[percent_columns], np.ravel(y_train))
clf_reduced_percent.score(X_test[percent_columns], y_test)

0.956140350877193