In [1]:
import pandas as pd
df = pd.read_csv('heart.csv')
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


# define col

In [3]:
x_col = df.drop('output',axis=1).columns.tolist()
y_col = 'output'
x_col,y_col

(['age',
  'sex',
  'cp',
  'trtbps',
  'chol',
  'fbs',
  'restecg',
  'thalachh',
  'exng',
  'oldpeak',
  'slp',
  'caa',
  'thall'],
 'output')

# LabelEncoder

In [4]:
from sklearn import preprocessing
le_dict = {}
for col in df.columns:
    le_dict[col] = preprocessing.LabelEncoder()
    le_dict[col].fit(df[col])
    df[col] = le_dict[col].transform(df[col])
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,29,1,3,31,64,1,0,49,0,22,0,0,1,1
1,3,1,2,22,80,0,1,84,0,32,0,0,2,1
2,7,0,1,22,35,0,0,71,0,14,2,0,2,1
3,22,1,1,14,67,0,1,76,0,8,2,0,2,1
4,23,0,0,14,145,0,1,62,1,6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,23,0,0,28,71,0,1,23,1,2,1,0,3,0
299,11,1,3,8,93,0,1,32,0,12,1,0,3,0
300,34,1,0,30,26,1,1,40,0,31,1,2,3,0
301,23,1,0,22,1,0,1,16,1,12,1,1,3,0


# 特徵擷取

In [5]:
X = df[x_col]
y = df[y_col]

# Tree-based feature selection

In [6]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(X, y)
print(clf.feature_importances_ )
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
X.shape,X_new.shape   

[0.07902916 0.05916331 0.11312197 0.0618024  0.05551995 0.02070818
 0.03445092 0.09339607 0.1010026  0.08706948 0.08067673 0.12025924
 0.09379999]




((303, 13), (303, 8))

# Univariate feature selection

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
X.shape,X_new.shape

((303, 13), (303, 2))

# 學習演算法

In [8]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X, y)
clf.score(X,y)

0.9174917491749175

In [10]:
from xgboost import XGBClassifier
clf = XGBClassifier()
clf.fit(X,y)
clf.score(X,y)

1.0

# 參數最佳化

In [12]:
import optuna

def objective(trial):
    x1 = trial.suggest_float("C", 1, 100)
    x2 = trial.suggest_float("gamma", 0.01, 1)
    clf = make_pipeline(StandardScaler(), SVC(C = x1,gamma=x2))
    clf.fit(X, y)
    return clf.score(X,y)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials = 50)

[32m[I 2022-06-28 09:54:28,719][0m A new study created in memory with name: no-name-1f8fb9e6-4491-439a-ba7b-fe8c0ea3367a[0m
[32m[I 2022-06-28 09:54:28,736][0m Trial 0 finished with value: 1.0 and parameters: {'C': 59.39575756303299, 'gamma': 0.3712816931641527}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-06-28 09:54:28,752][0m Trial 1 finished with value: 1.0 and parameters: {'C': 11.936951582884634, 'gamma': 0.15750302960548895}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-06-28 09:54:28,770][0m Trial 2 finished with value: 1.0 and parameters: {'C': 52.85849078678958, 'gamma': 0.89878715353868}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-06-28 09:54:28,788][0m Trial 3 finished with value: 1.0 and parameters: {'C': 66.90887523973393, 'gamma': 0.3578273554537114}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-06-28 09:54:28,806][0m Trial 4 finished with value: 1.0 and parameters: {'C': 10.422426301918579, 'gamma': 0.6094872490596704}. Best is trial 0 wit

In [17]:
study.best_params['C']
study.best_params['gamma']

0.3712816931641527

# 集成學習

In [18]:
import numpy as np
from sklearn.ensemble import VotingClassifier
clf1 = SVC(C = study.best_params['C'],gamma =study.best_params['gamma'])
clf2 = XGBClassifier()
eclf1 = VotingClassifier(estimators=[('SVC', clf1), ('XGB', clf2)], voting='hard')
eclf1 = eclf1.fit(X, y)
eclf1.score(X,y)

1.0