In [9]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
#load data and create dataframe
data = load_breast_cancer()
train_X = data.data
train_y = data.target
feature_names = data.feature_names
df = pd.DataFrame(data=train_X, columns=feature_names)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [15]:
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import mutual_info_classif

#compute mutual information
mutual_info = mutual_info_classif(df, y)
mutual_info_s = sorted(mutual_info, reverse=True)
feat_list = [feature_names[list(mutual_info).index(m)] for m in mutual_info_s]
print('All features: ')
print(feat_list)
print()
#top n features
top5 = [feature_names[list(mutual_info).index(m)] for m in mutual_info_s[:5]]
top3 = [feature_names[list(mutual_info).index(m)] for m in mutual_info_s[:3]]
print('Top 5 features: ', top5)
print()
print('Top 3 features: ', top3)

All features: 
['worst perimeter', 'worst area', 'worst radius', 'mean concave points', 'worst concave points', 'mean perimeter', 'mean concavity', 'mean radius', 'mean area', 'area error', 'worst concavity', 'perimeter error', 'radius error', 'worst compactness', 'mean compactness', 'concave points error', 'worst texture', 'concavity error', 'worst smoothness', 'worst symmetry', 'mean texture', 'mean smoothness', 'compactness error', 'worst fractal dimension', 'mean symmetry', 'fractal dimension error', 'smoothness error', 'symmetry error', 'mean fractal dimension', 'texture error']

Top 5 features:  ['worst perimeter', 'worst area', 'worst radius', 'mean concave points', 'worst concave points']

Top 3 features:  ['worst perimeter', 'worst area', 'worst radius']


In [17]:
#create dataframe to store scores
result = pd.DataFrame(columns = ['model', 'all features', 'top5 features', 'top3 features'])
#models
models = [SVC(kernel='linear'), SVC(kernel='rbf'), KNeighborsClassifier(), DecisionTreeClassifier(),
             MLPClassifier(), GaussianNB(), RandomForestClassifier(), AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=100)]
result['model'] = ['linear svm', 'rbf svm', 'KNN', 'DecisionTree',
             'MLP', 'GaussianNB', 'RF', 'AdaBoost']
feat_set = [feat_list, top5, top3]

In [18]:
import warnings
warnings.filterwarnings('ignore')
#consider every feature set
for num, feat in enumerate(feat_set):
    res = []
    #consider every model
    for model in models:
        #append to res mean cv accuracy for each model
        res.append(cross_val_score(model, df[feat], train_y, cv=10).mean())
    #put accuracies into dataframe
    result.iloc[:, num+1] = res
result

Unnamed: 0,model,all features,top5 features,top3 features
0,linear svm,0.954318,0.917532,0.917562
1,rbf svm,0.627427,0.67123,0.64134
2,KNN,0.929843,0.901738,0.901738
3,DecisionTree,0.917533,0.920885,0.887607
4,MLP,0.929721,0.898227,0.889516
5,GaussianNB,0.93868,0.947391,0.922887
6,RF,0.95441,0.936863,0.891241
7,AdaBoost,0.917533,0.913961,0.884036


Consider svm models: linear svc performs much better and the best accuracy obtained with all_features set is 0.95. KNN with all_features set also outperforms other feature sets. Random Forest(ensemble of Decision Trees) shows better result than single
Decision Tree: that's why we should use ensembles. Simple neural network shows moderate result, but it can be better due to further hyperparameter tuning. Another type of ensembles(boosting) - Adaboost performs like single Decision Tree, so boosting is not good model for this dataset. The highest accuracy showed Random Forest(all features set).