In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.datasets import load_breast_cancer

def make_dataset():
    iris = load_breast_cancer()
    df = pd.DataFrame(iris.data, columns = iris.feature_names)
    df['target'] = iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop('target', axis = 1), df['target'], test_size = 0.5, random_state = 1004)
    return X_train, X_test, y_train, y_test
    
X_train, X_test, y_train, y_test = make_dataset()
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((284, 30), (285, 30), (284,), (285,))

In [3]:
y_train.value_counts()

1    190
0     94
Name: target, dtype: int64

In [4]:
#의사결정나무
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state = 0)

model.fit(X_train, y_train)
pred = model.predict(X_test)
accuracy_score(y_test, pred)


0.9263157894736842

In [5]:
#의사결정나무 하이퍼파라미터
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(
    criterion = "entropy", 
    max_depth = 5, 
    min_samples_split = 8,
    min_samples_leaf = 2, 
    random_state = 0)

model.fit(X_train, y_train)
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.9403508771929825

In [10]:
#랜덤포레스트
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(
    random_state = 0,
    n_estimators = 150,
    max_depth = 5)
model.fit(X_train, y_train)
pred = model.predict(X_test)
accuracy_score(y_test, pred)
from sklearn.metrics import precision_score
precision_score(y_test, pred)

0.9371428571428572

In [57]:
# xgboost
from xgboost import XGBClassifier
model = XGBClassifier(
    random_state = 0,
    use_label_encoder = False,
    eval_metric = 'logloss')
model.fit(X_train, y_train)
pred = model.predict(X_test)
accuracy_score(pred, y_test)


0.9508771929824561

In [64]:
# xgboost
from xgboost import XGBClassifier
model = XGBClassifier(
    random_state = 0,
    use_label_encoder = False,
    eval_metric = 'logloss', 
# xgboost 하이퍼파라미터
    booster = 'gbtree',
    objective = 'binary:logistic', #multi:softmax
    max_depth = 5, 
    learning_rate = 0.05,
    n_estimators = 500,
    subsample = 1,
    colsample_bytree = 1,
    n_jobs = -1
)
model.fit(X_train, y_train)
pred = model.predict(X_test)
accuracy_score(pred, y_test)
 

0.9649122807017544

In [12]:
# xgboost
from xgboost import XGBClassifier
model = XGBClassifier(
    random_state = 0,
    use_label_encoder = False,
    eval_metric = 'logloss',
    learning_rate = 0.05,
    n_estimators = 500
)
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set = eval_set, early_stopping_rounds = 10)
#model.fit(X_train, y_train, eval_set = eval_set)
pred = model.predict(X_test)
accuracy_score(pred, y_test)


[0]	validation_0-logloss:0.65391
[1]	validation_0-logloss:0.61861
[2]	validation_0-logloss:0.58697
[3]	validation_0-logloss:0.55756
[4]	validation_0-logloss:0.53038
[5]	validation_0-logloss:0.50611
[6]	validation_0-logloss:0.48363
[7]	validation_0-logloss:0.46304
[8]	validation_0-logloss:0.44332
[9]	validation_0-logloss:0.42512
[10]	validation_0-logloss:0.40821
[11]	validation_0-logloss:0.39260
[12]	validation_0-logloss:0.37838
[13]	validation_0-logloss:0.36512
[14]	validation_0-logloss:0.35276
[15]	validation_0-logloss:0.34090
[16]	validation_0-logloss:0.33018
[17]	validation_0-logloss:0.31967
[18]	validation_0-logloss:0.30998
[19]	validation_0-logloss:0.30105
[20]	validation_0-logloss:0.29259
[21]	validation_0-logloss:0.28478
[22]	validation_0-logloss:0.27725
[23]	validation_0-logloss:0.27027
[24]	validation_0-logloss:0.26359
[25]	validation_0-logloss:0.25755
[26]	validation_0-logloss:0.25139
[27]	validation_0-logloss:0.24593
[28]	validation_0-logloss:0.24103
[29]	validation_0-loglos

0.9473684210526315

In [13]:
# 데이터셋 로드
def make_dataset2():
    bc = load_breast_cancer()
    df = pd.DataFrame(bc.data, columns=bc.feature_names)
    df['target'] = bc.target
    return df.drop('target', axis=1), df['target']
X, y = make_dataset2()

In [14]:
#KFold
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state = 0)

kfold = KFold(n_splits = 5)
for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print(accuracy_score(y_test, pred))


0.8771929824561403
0.9122807017543859
0.9473684210526315
0.9385964912280702
0.8407079646017699


In [15]:
#Stratified KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state = 0)

kfold = StratifiedKFold(n_splits = 5)
for train_idx, test_idx in kfold.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print(accuracy_score(y_test, pred))


0.9035087719298246
0.9210526315789473
0.9122807017543859
0.9473684210526315
0.9026548672566371


In [20]:
#교차검증
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv = 5)
scores

array([0.90350877, 0.92105263, 0.9122807 , 0.94736842, 0.90265487])

In [17]:
scores.mean()

0.9173730787144851

In [18]:
#교차검증
from sklearn.model_selection import cross_val_score
kfold = StratifiedKFold(n_splits = 5)
scores = cross_val_score(model, X, y, cv = kfold)
print(scores)
print(scores.mean())

[0.90350877 0.92105263 0.9122807  0.94736842 0.90265487]
0.9173730787144851


In [21]:
#교차검증
from sklearn.model_selection import cross_val_score
kfold = StratifiedKFold(n_splits = 5)
scores = cross_val_score(model, X_train, y_train, cv = kfold)
print(scores)
print(scores.mean())

[0.9673913  0.94505495 0.91208791 0.95604396 0.89010989]
0.9341376015289058


In [22]:
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.9026548672566371

In [23]:
#정확도
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.9026548672566371

In [24]:
#정밀도
from sklearn.metrics import precision_score
precision_score(y_test, pred)

0.9545454545454546

In [25]:
#재현율
from sklearn.metrics import recall_score
recall_score(y_test, pred)

0.8873239436619719

In [26]:
#f1
from sklearn.metrics import f1_score
f1_score(y_test, pred)

0.9197080291970803

In [97]:
from xgboost import XGBClassifier
model = XGBClassifier(
    random_state = 0,
    use_label_encoder = False,
    eval_metric = 'logloss')
model.fit(X_train, y_train)
pred = model.predict_proba(X_test)
pred
#accuracy_score(pred, y_test)

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, pred[:,1])

0.999664654594232