In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('IRIS.csv')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
y = df.iloc[:, 4].values
len(y), np.unique(y)

(150,
 array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object))

In [4]:
from sklearn.preprocessing import LabelEncoder

y_i = LabelEncoder().fit_transform(y)
len(y_i), np.unique(y_i)

(150, array([0, 1, 2]))

In [5]:
y_i[:50], y_i[-50:]

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]),
 array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2]))

In [6]:
x = df.iloc[:, [2,3]].values
x.shape, x[:10]

((150, 2),
 array([[1.4, 0.2],
        [1.4, 0.2],
        [1.3, 0.2],
        [1.5, 0.2],
        [1.4, 0.2],
        [1.7, 0.4],
        [1.4, 0.3],
        [1.5, 0.2],
        [1.4, 0.2],
        [1.5, 0.1]]))

# Hold Out

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [31]:
print("留出法")

n = 10
scores = []
for i in range(n):
    # split
    x_train, x_test, y_train, y_test = train_test_split(
        x, y_i, test_size=0.3, stratify=y, random_state=i
    )
    
    # standardization
    sc = StandardScaler()
    sc.fit(x_train)
    x_train_std = sc.transform(x_train)
    x_test_std = sc.transform(x_test)
    
    # train
    lr = LogisticRegression(C=100.0, random_state=1)
    lr.fit(x_train_std, y_train)
    
    # score (准确率)
    score = lr.score(x_test_std, y_test)
    scores.append(score)
    
    print(f"Score: {score: {0}.{3}}")
    
print(f"Acc: {np.mean(scores): {0}.{3}} +/- {np.std(scores): {0}.{3}}")

留出法
Score:  0.978
Score:  0.978
Score:  1.0
Score:  0.911
Score:  0.956
Score:  0.911
Score:  0.933
Score:  0.978
Score:  0.889
Score:  0.933
Acc:  0.947 +/-  0.0347


# Cross-validation

In [19]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [20]:
print("交叉验证")

# split
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1).split(x, y_i)

lr = LogisticRegression(C=100.0, random_state=1)
scores = []
for k, (train, test) in enumerate(kfold): # k is an index
#     print(train)
#     print(test)

#     x_train = [x[i] for i in train]
#     x_test = [x[i] for i in test]
    sc = StandardScaler()
    sc.fit(x_train)
    x_train_std = sc.transform(x[train])
    x_test_std = sc.transform(x[test])
    
    lr.fit(x_train_std, y[train])
    score = lr.score(x_train_std, y[test])
    scores.append(score)
    print(f"Fold: {k+1}")
    print(f"Score: {score: {0}.{3}}")
    
print(f"Acc: {np.mean(scores): {0}.{3}} +/- {np.std(scores): {0}.{3}}")

交叉验证
Fold: 1
Score:  1.0
Fold: 2
Score:  0.933
Fold: 3
Score:  1.0
Fold: 4
Score:  1.0
Fold: 5
Score:  0.933
Fold: 6
Score:  0.933
Fold: 7
Score:  1.0
Fold: 8
Score:  1.0
Fold: 9
Score:  0.867
Fold: 10
Score:  0.933
Acc:  0.96 +/-  0.0442


# Bootstrapping

In [7]:
import random
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [17]:
print("自助法")

num = len(y)

def boot_strap(num:int):
    chosen_idxes = []
    for i in range(num):
        rand_idx = random.randrange(0, num)
        chosen_idxes.append(rand_idx)
    return chosen_idxes

def get_data_by_index(train_index, test_index, x, y):
    x_train = [x[i] for i in train_index]
    x_test = [x[i] for i in test_index]
    y_train = [y[i] for i in train_index]
    y_test = [y[i] for i in test_index]
    return x_train, x_test, y_train, y_test
        

n = 10
scores = []
all_index = set(np.arange(num))
lr = LogisticRegression(C=100.0, random_state=1)
for i in range(n):
    train_index = boot_strap(num)
    test_index = list(set(all_index) - set(nodup_train))
    x_train, x_test, y_train, y_test = get_data_by_index(train_index, test_index, x, y)
    
    sc = StandardScaler()
    sc.fit(x_train)
    x_train_std = sc.transform(x_train)
    x_test_std = sc.transform(x_test)
    
    lr = 



自助法
[128, 130, 3, 5, 135, 9, 10, 11, 12, 14, 15, 145, 19, 20, 147, 148, 23, 24, 26, 27, 31, 32, 34, 40, 44, 46, 50, 52, 58, 60, 64, 65, 67, 68, 69, 70, 75, 78, 81, 82, 84, 86, 87, 91, 93, 96, 99, 100, 101, 102, 104, 105, 106, 109, 112, 118, 119, 124, 126, 127]
[128, 130, 3, 5, 135, 9, 10, 11, 12, 14, 15, 145, 19, 20, 147, 148, 23, 24, 26, 27, 31, 32, 34, 40, 44, 46, 50, 52, 58, 60, 64, 65, 67, 68, 69, 70, 75, 78, 81, 82, 84, 86, 87, 91, 93, 96, 99, 100, 101, 102, 104, 105, 106, 109, 112, 118, 119, 124, 126, 127]
[128, 130, 3, 5, 135, 9, 10, 11, 12, 14, 15, 145, 19, 20, 147, 148, 23, 24, 26, 27, 31, 32, 34, 40, 44, 46, 50, 52, 58, 60, 64, 65, 67, 68, 69, 70, 75, 78, 81, 82, 84, 86, 87, 91, 93, 96, 99, 100, 101, 102, 104, 105, 106, 109, 112, 118, 119, 124, 126, 127]
[128, 130, 3, 5, 135, 9, 10, 11, 12, 14, 15, 145, 19, 20, 147, 148, 23, 24, 26, 27, 31, 32, 34, 40, 44, 46, 50, 52, 58, 60, 64, 65, 67, 68, 69, 70, 75, 78, 81, 82, 84, 86, 87, 91, 93, 96, 99, 100, 101, 102, 104, 105, 106, 109

[145,
 80,
 47,
 113,
 132,
 125,
 104,
 130,
 29,
 104,
 67,
 129,
 132,
 79,
 146,
 4,
 105,
 70,
 37,
 89,
 134,
 32,
 80,
 132,
 98,
 13,
 27,
 104,
 109,
 68,
 39,
 70,
 13,
 80,
 64,
 91,
 33,
 84,
 51,
 33,
 21,
 61,
 143,
 134,
 144,
 82,
 103,
 124,
 80,
 126,
 112,
 111,
 54,
 20,
 149,
 66,
 79,
 133,
 41,
 94,
 51,
 133,
 27,
 8,
 131,
 23,
 40,
 78,
 49,
 53,
 27,
 90,
 49,
 145,
 131,
 45,
 31,
 68,
 35,
 136,
 24,
 84,
 114,
 101,
 58,
 49,
 137,
 13,
 70,
 84,
 45,
 78,
 134,
 132,
 143,
 15,
 37,
 16,
 132,
 24,
 116,
 96,
 90,
 91,
 31,
 81,
 53,
 127,
 141,
 2,
 97,
 123,
 65,
 39,
 11,
 4,
 123,
 44,
 45,
 114,
 72,
 95,
 4,
 96,
 15,
 24,
 135,
 30,
 105,
 143,
 24,
 35,
 149,
 95,
 107,
 95,
 85,
 51,
 120,
 15,
 0,
 41,
 61,
 72,
 118,
 58,
 30,
 113,
 94,
 54]