In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the Iris table from the CSV next to the notebook, then preview the
# first five rows to confirm the columns were parsed as expected.
df = pd.read_csv('IRIS.csv')
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Target vector: the species name of every row.
# The column is selected by label instead of by position so that a reordered
# or extended CSV cannot silently change which column is being predicted.
# to_numpy() is the pandas-recommended way to obtain a NumPy array.
y = df["species"].to_numpy()

# Sanity check: number of samples and the distinct class names.
len(y), np.unique(y)

(150,
 array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object))

In [4]:
from sklearn.preprocessing import LabelEncoder

# Encode the species strings in y as integer class labels.
y_i = LabelEncoder().fit(y).transform(y)

# Sanity check: number of labels and the distinct integer codes.
len(y_i), np.unique(y_i)

(150, array([0, 1, 2]))

In [5]:
# Show the first 50 and the last 50 encoded labels side by side.
leading, trailing = slice(None, 50), slice(-50, None)
y_i[leading], y_i[trailing]

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]),
 array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2]))

In [6]:
# Feature matrix: only the two petal measurements are used as predictors.
# Columns are selected by label instead of by position so that a reordered
# or extended CSV cannot silently swap in different features.
# to_numpy() is the pandas-recommended way to obtain a NumPy array.
x = df[["petal_length", "petal_width"]].to_numpy()

# Sanity check: (n_samples, n_features) and the first ten rows.
x.shape, x[:10]

((150, 2),
 array([[1.4, 0.2],
        [1.4, 0.2],
        [1.3, 0.2],
        [1.5, 0.2],
        [1.4, 0.2],
        [1.7, 0.4],
        [1.4, 0.3],
        [1.5, 0.2],
        [1.4, 0.2],
        [1.5, 0.1]]))

# Hold Out

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [8]:
print("留出法")

# Repeated hold-out: n stratified 70/30 splits, one model fitted per split.
n = 10
scores = []
for i in range(n):
    # The loop index doubles as the split seed: every repetition gets a
    # different, but repeatable, partition.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y_i, test_size=0.3, random_state=i, stratify=y_i
    )

    # The scaler is fitted on the training part only and then applied to
    # both parts, so no test-set statistics reach the model.
    sc = StandardScaler().fit(x_train)
    x_train_std, x_test_std = sc.transform(x_train), sc.transform(x_test)

    lr = LogisticRegression(C=100.0, random_state=1).fit(x_train_std, y_train)

    # Accuracy on the held-out part.
    score = lr.score(x_test_std, y_test)
    scores.append(score)

    # The nested fields {0} and {3} evaluate to the format spec " 0.3":
    # a leading space for the sign and three significant digits.
    print(f"Score: {score: {0}.{3}}")

# Mean and standard deviation of the accuracy over all repetitions.
print(f"Acc: {np.mean(scores): {0}.{3}} +/- {np.std(scores): {0}.{3}}")

留出法
Score:  0.978
Score:  0.978
Score:  1.0
Score:  0.933
Score:  0.956
Score:  0.933
Score:  0.956
Score:  0.933
Score:  0.889
Score:  0.933
Acc:  0.949 +/-  0.0299


# Cross-validation

In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [8]:
print("交叉验证")

# Stratified, shuffled 10-fold split; kfold is the iterator returned by
# .split() and is consumed by the loop below.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1).split(x, y_i)

# A single estimator object is reused and refitted on every fold.
lr = LogisticRegression(C=100.0, random_state=1)
scores = []
for k, (train, test) in enumerate(kfold):  # k is the zero-based fold number
    # Scaling statistics come from this fold's training rows only.
    sc = StandardScaler().fit(x[train])
    x_train_std, x_test_std = sc.transform(x[train]), sc.transform(x[test])

    # Fit on the training rows, then measure accuracy on the held-out fold.
    score = lr.fit(x_train_std, y_i[train]).score(x_test_std, y_i[test])
    scores.append(score)
    print(f"Fold: {k+1}")
    # The nested fields {0} and {3} evaluate to the format spec " 0.3":
    # a leading space for the sign and three significant digits.
    print(f"Score: {score: {0}.{3}}")

# Mean and standard deviation of the accuracy over the folds.
print(f"Acc: {np.mean(scores): {0}.{3}} +/- {np.std(scores): {0}.{3}}")

交叉验证
Fold: 1
Score:  1.0
Fold: 2
Score:  0.933
Fold: 3
Score:  1.0
Fold: 4
Score:  1.0
Fold: 5
Score:  0.933
Fold: 6
Score:  0.933
Fold: 7
Score:  1.0
Fold: 8
Score:  1.0
Fold: 9
Score:  0.867
Fold: 10
Score:  0.933
Acc:  0.96 +/-  0.0442


# Bootstrapping

- Make each bootstrap training sample the same size as the original dataset.

In [11]:
import random
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [12]:
print("自助法")

# Total number of samples; y is the 1-D label array, so its first (only)
# dimension is the sample count.
num = y.shape[0]

def boot_strap(num:int, rng=None):
    """Draw ``num`` indices from ``range(num)`` uniformly, with replacement.

    Args:
        num: Size of the index range and also the number of draws.
        rng: Optional ``random.Random`` instance (or any object with a
            ``randrange(start, stop)`` method) to draw from, which lets the
            caller make the resample reproducible. When omitted, the
            module-level ``random`` generator is used.

    Returns:
        A list of ``num`` integers, each in ``[0, num)``; repeated values
        are expected. The list is empty when ``num`` is 0 or negative.
    """
    # Fall back to the shared module-level generator so existing
    # single-argument calls keep drawing from the same source.
    source = random if rng is None else rng
    return [source.randrange(0, num) for _ in range(num)]

# Bootstrap evaluation: n resamples, each scored on the rows it left out.
n = 10
scores = []
all_index = np.arange(num)
for i in range(n):
    # Seed the module-level generator that boot_strap draws from, so every
    # repetition resamples the same rows each time the notebook is run.
    random.seed(i)
    train = boot_strap(num)
    # Rows that were never drawn form the evaluation set; sorting gives the
    # index list a deterministic order.
    test = sorted(set(all_index) - set(train))

    # The scaler is fitted on the resampled training rows only.
    sc = StandardScaler()
    sc.fit(x[train])
    x_train_std = sc.transform(x[train])
    x_test_std = sc.transform(x[test])

    lr = LogisticRegression(C=100.0, random_state=i)
    lr.fit(x_train_std, y_i[train])
    score = lr.score(x_test_std, y_i[test])
    scores.append(score)

    # The nested fields {0} and {3} evaluate to the format spec " 0.3":
    # a leading space for the sign and three significant digits.
    print(f"Score: {score: {0}.{3}}")

# Mean and standard deviation of the accuracy over all repetitions.
print(f"Acc: {np.mean(scores): {0}.{3}} +/- {np.std(scores): {0}.{3}}")

自助法
Score:  1.0
Score:  0.941
Score:  0.982
Score:  0.957
Score:  0.982
Score:  1.0
Score:  0.945
Score:  0.933
Score:  0.965
Score:  0.942
Acc:  0.965 +/-  0.0236


In [15]:
# Sanity check that indexing y_i with the index list `train` works:
# print all labels, the index list, and the labels picked by those indices,
# one per line.
print(y_i, train, y_i[train], sep="\n")

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
[23, 132, 101, 74, 71, 145, 8, 124, 109, 39, 88, 86, 21, 42, 54, 68, 100, 52, 30, 86, 73, 83, 60, 131, 136, 87, 137, 141, 66, 116, 65, 9, 86, 112, 28, 43, 77, 35, 47, 1, 98, 81, 37, 31, 59, 135, 13, 109, 13, 122, 86, 24, 94, 50, 125, 38, 39, 5, 32, 60, 44, 78, 26, 120, 40, 62, 62, 70, 105, 77, 28, 128, 74, 52, 99, 70, 99, 100, 51, 89, 38, 146, 80, 19, 23, 3, 31, 58, 65, 116, 37, 112, 31, 41, 144, 95, 46, 121, 112, 77, 124, 91, 81, 0, 49, 139, 93, 52, 83, 110, 44, 131, 101, 120, 11, 15, 29, 122, 143, 110, 140, 79, 22, 79, 93, 4, 18, 103, 60, 115, 85, 39, 76, 81, 14, 35, 118, 9, 7, 31, 94, 13, 47, 18, 98, 8, 96, 89, 145, 5]
[0 2 2 1 1 2 0 2 2 0 1 1 0 0 1 1 2 1 0 1 1 1 1 2 2 1 2 2 1 2 1 0