In [56]:
from tqdm import tqdm

import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix, accuracy_score

In [43]:
# List the files in the dataset directory (IPython shell escape, not Python).
!ls ./dataset_0914/

[31mBostonHousing.csv[m[m             [31miris.csv[m[m
[31mBostonHousing_description.txt[m[m [31mliver.csv[m[m
[31mPimaIndiansDiabetes.csv[m[m       [31mprestige.csv[m[m
[31mcars.csv[m[m                      [31mucla_admit.csv[m[m
[31mcredit.csv[m[m


In [44]:
# Load the Pima Indians Diabetes CSV into a DataFrame.
pmd_dataset = pd.read_csv(
    './dataset_0914/PimaIndiansDiabetes.csv',
)

In [45]:
# Separate the feature columns from the 'diabetes' target column.
pmd_X = pmd_dataset.drop(columns=['diabetes'])
pmd_y = pmd_dataset['diabetes']

# Summarise the target (count, number of distinct labels, most frequent label).
pmd_y.describe()

count     768
unique      2
top       neg
freq      500
Name: diabetes, dtype: object

In [46]:
# Hold out 30% of the rows as a test set, with a fixed seed for repeatability.
# NOTE(review): the k-fold loops in the later cells reassign all four names,
# so no visible cell consumes this split -- confirm it is still needed.
train_X, test_X, train_y, test_y = train_test_split(
    pmd_X,
    pmd_y,
    test_size=0.3,
    random_state=1234,
)

In [47]:
# The three classifiers compared by cross-validation: a decision tree,
# a 10-tree random forest and an SVC, all built with the same seed.
pmd_dct = DecisionTreeClassifier(random_state=1234)
pmd_rfc = RandomForestClassifier(random_state=1234, n_estimators=10)
pmd_svm = svm.SVC(random_state=1234)

In [48]:
# K-fold CV: 10 shuffled folds with a fixed seed, plus one list per model
# to collect that model's accuracy on each fold.
kf = KFold(n_splits=10, shuffle=True, random_state=1234)
DCT_acc, RFC_acc, SVM_acc = [], [], []
fold = 0

In [49]:
# Compare the decision tree, random forest and SVC with the shared K-fold
# splitter `kf`, printing each model's accuracy per fold and its mean.

# Start from empty accumulators so this cell can be re-run on its own without
# appending to (and skewing the means with) results left by an earlier run.
DCT_acc, RFC_acc, SVM_acc = [], [], []

# (label used in the printed output, estimator, per-fold accuracy list)
cv_models = [
    ("DCT", pmd_dct, DCT_acc),
    ("RFC", pmd_rfc, RFC_acc),
    ("SVM", pmd_svm, SVM_acc),
]

# enumerate() restarts the fold number at 0 on every run of this cell.
for fold, (train_index, test_index) in enumerate(kf.split(pmd_X)):
    print(f"fold: {fold}")

    # split train/test set
    train_X, test_X = pmd_X.iloc[train_index], pmd_X.iloc[test_index]
    train_y, test_y = pmd_y.iloc[train_index], pmd_y.iloc[test_index]

    # fit each model on the training folds and score it on the held-out fold
    for label, model, fold_accs in cv_models:
        model.fit(train_X, train_y)
        acc = accuracy_score(test_y, model.predict(test_X))

        print(f"{label} acc: {acc:.3f}")
        fold_accs.append(acc)

print(f"DCT mean acc: {np.mean(DCT_acc)}")
print(f"RFC mean acc: {np.mean(RFC_acc)}")
print(f"SVM mean acc: {np.mean(SVM_acc)}")

fold: 0
DCT acc: 0.701
RFC acc: 0.753
SVM acc: 0.740
fold: 1
DCT acc: 0.623
RFC acc: 0.727
SVM acc: 0.740
fold: 2
DCT acc: 0.727
RFC acc: 0.714
SVM acc: 0.701
fold: 3
DCT acc: 0.714
RFC acc: 0.753
SVM acc: 0.844
fold: 4
DCT acc: 0.636
RFC acc: 0.610
SVM acc: 0.779
fold: 5
DCT acc: 0.662
RFC acc: 0.727
SVM acc: 0.714
fold: 6
DCT acc: 0.714
RFC acc: 0.753
SVM acc: 0.727
fold: 7
DCT acc: 0.792
RFC acc: 0.766
SVM acc: 0.792
fold: 8
DCT acc: 0.697
RFC acc: 0.750
SVM acc: 0.803
fold: 9
DCT acc: 0.737
RFC acc: 0.684
SVM acc: 0.763
DCT mean acc: 0.700563909774436
RFC mean acc: 0.7239405331510594
SVM mean acc: 0.760475051264525


In [59]:
# Problem 2
# Compare the SVC kernels (linear, poly, rbf, sigmoid, precomputed) using the
# same seeded 10-fold split and collect each model's per-fold accuracy.

linear_svm = svm.SVC(kernel='linear')
poly_svm = svm.SVC(kernel='poly')
rbf_svm = svm.SVC(kernel='rbf')
sigmoid_svm = svm.SVC(kernel='sigmoid')
precom_svm = svm.SVC(kernel='precomputed')

svm_models = [linear_svm, poly_svm, rbf_svm, sigmoid_svm, precom_svm]
# One accuracy list per model, sized from svm_models so the two lists cannot
# drift apart when a model is added or removed.
svm_acc = [[] for _ in svm_models]

kf = KFold(n_splits=10, random_state=1234, shuffle=True)

# zip() pairs each model with its own accuracy list, avoiding a list lookup
# on every fold.
for model, model_acc in zip(svm_models, svm_acc):
    print(f"model: SVC_{model.kernel}")

    for train_index, test_index in tqdm(kf.split(pmd_X), total=kf.get_n_splits(), desc="k-fold"):

        # KFold data split (the labels are split the same way for every kernel)
        train_y, test_y = pmd_y.iloc[train_index], pmd_y.iloc[test_index]

        if model.kernel == "precomputed":
            # A precomputed kernel is given kernel matrices instead of raw
            # features: train-vs-train for fit, test-vs-train for predict.
            # The plain dot product used here is the linear kernel.
            train_pre_X, test_pre_X = pmd_X.iloc[train_index], pmd_X.iloc[test_index]
            train_X = np.dot(train_pre_X, train_pre_X.T)
            test_X = np.dot(test_pre_X, train_pre_X.T)
        else:
            train_X, test_X = pmd_X.iloc[train_index], pmd_X.iloc[test_index]

        # fit model and predict
        model.fit(train_X, train_y)
        pred_y = model.predict(test_X)

        acc = accuracy_score(test_y, pred_y)
        model_acc.append(acc)

print(svm_acc)

k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

model: SVC_linear


k-fold: 100%|██████████| 10/10 [00:33<00:00,  3.33s/it]
k-fold:  50%|█████     | 5/10 [00:00<00:00, 40.89it/s]

model: SVC_poly


k-fold: 100%|██████████| 10/10 [00:00<00:00, 43.32it/s]
k-fold: 100%|██████████| 10/10 [00:00<00:00, 74.31it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

model: SVC_rbf
model: SVC_sigmoid


k-fold: 100%|██████████| 10/10 [00:00<00:00, 67.64it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

model: SVC_precomputed


k-fold: 100%|██████████| 10/10 [00:33<00:00,  3.36s/it]

[[0.7922077922077922, 0.7272727272727273, 0.7532467532467533, 0.8571428571428571, 0.7662337662337663, 0.7272727272727273, 0.7272727272727273, 0.7662337662337663, 0.8157894736842105, 0.8157894736842105], [0.7402597402597403, 0.7402597402597403, 0.6753246753246753, 0.8571428571428571, 0.7272727272727273, 0.7402597402597403, 0.7402597402597403, 0.7922077922077922, 0.8157894736842105, 0.8026315789473685], [0.7402597402597403, 0.7402597402597403, 0.7012987012987013, 0.8441558441558441, 0.7792207792207793, 0.7142857142857143, 0.7272727272727273, 0.7922077922077922, 0.8026315789473685, 0.7631578947368421], [0.5064935064935064, 0.4675324675324675, 0.5454545454545454, 0.5324675324675324, 0.5844155844155844, 0.5064935064935064, 0.4805194805194805, 0.4155844155844156, 0.5131578947368421, 0.40789473684210525], [0.7922077922077922, 0.7272727272727273, 0.7532467532467533, 0.8571428571428571, 0.7662337662337663, 0.7272727272727273, 0.7272727272727273, 0.7662337662337663, 0.8157894736842105, 0.8157894




In [66]:
# Mean cross-validated accuracy for each kernel, in svm_models order.
means = [np.mean(fold_accs, axis=0) for fold_accs in svm_acc]

# list.index() returns the first match, so on a tie the earlier model in
# svm_models is the one reported.
highest_model = means.index(max(means))

print(f"Model: {svm_models[highest_model]}\nHighest mean: {max(means)}")
print(means)

Model: SVC(kernel='linear')
Highest mean: 0.7748462064251538
[0.7748462064251538, 0.7631408065618592, 0.760475051264525, 0.4960013670539986, 0.7748462064251538]


In [71]:
# Problem 3
# Grid search over n_estimators x max_features for a random forest, scoring
# every combination by its mean accuracy over the same seeded 10-fold split.
estimators = [100, 200, 300, 400, 500]
features = [1, 2, 3, 4, 5]

# Best combination found so far.
max_acc = 0
max_est = None
max_feat = None

kf = KFold(n_splits=10, random_state=1234, shuffle=True)

# Iterate over the parameter lists directly so the search always covers every
# entry, whatever the lengths of the two lists.
for n_est in estimators:
    for n_feat in features:
        accs = []
        # apply each params to model
        rfc_model = RandomForestClassifier(
            n_estimators=n_est,
            max_features=n_feat,
            random_state=1234)

        for train_index, test_index in tqdm(kf.split(pmd_X), total=kf.get_n_splits(), desc="k-fold"):

            # split train/test set
            train_X, test_X = pmd_X.iloc[train_index], pmd_X.iloc[test_index]
            train_y, test_y = pmd_y.iloc[train_index], pmd_y.iloc[test_index]

            rfc_model.fit(train_X, train_y)

            pred_y = rfc_model.predict(test_X)

            acc = accuracy_score(test_y, pred_y)
            accs.append(acc)

        # Compute the fold average once and reuse it for reporting and comparison.
        mean_acc = np.mean(accs, axis=0)
        print(f"est: {n_est} feat: {n_feat} \nmean acc: {mean_acc}")

        # Strict '<' keeps the first combination encountered when means tie.
        if max_acc < mean_acc:
            max_acc = mean_acc
            max_est = n_est
            max_feat = n_feat


print(f"Max Accuracy\nest: {max_est} feat: {max_feat} \nmean acc: {max_acc}")

k-fold: 100%|██████████| 10/10 [00:01<00:00,  5.50it/s]
k-fold:  10%|█         | 1/10 [00:00<00:01,  5.19it/s]

est: 100 feat: 1 
mean acc: 0.7656698564593302


k-fold: 100%|██████████| 10/10 [00:01<00:00,  5.25it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 100 feat: 2 
mean acc: 0.7643198906356802


k-fold: 100%|██████████| 10/10 [00:02<00:00,  4.86it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 100 feat: 3 
mean acc: 0.7694976076555023


k-fold: 100%|██████████| 10/10 [00:02<00:00,  4.52it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 100 feat: 4 
mean acc: 0.7734449760765549


k-fold: 100%|██████████| 10/10 [00:02<00:00,  4.29it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 100 feat: 5 
mean acc: 0.7695146958304853


k-fold: 100%|██████████| 10/10 [00:03<00:00,  2.97it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 200 feat: 1 
mean acc: 0.7643540669856459


k-fold: 100%|██████████| 10/10 [00:03<00:00,  2.71it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 200 feat: 2 
mean acc: 0.7708304853041694


k-fold: 100%|██████████| 10/10 [00:04<00:00,  2.49it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 200 feat: 3 
mean acc: 0.7682672590567327


k-fold: 100%|██████████| 10/10 [00:04<00:00,  2.35it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 200 feat: 4 
mean acc: 0.7682159945317839


k-fold: 100%|██████████| 10/10 [00:04<00:00,  2.10it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 200 feat: 5 
mean acc: 0.7786568694463432


k-fold: 100%|██████████| 10/10 [00:05<00:00,  1.93it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 300 feat: 1 
mean acc: 0.7734791524265209


k-fold: 100%|██████████| 10/10 [00:05<00:00,  1.81it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 300 feat: 2 
mean acc: 0.768215994531784


k-fold: 100%|██████████| 10/10 [00:06<00:00,  1.66it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 300 feat: 3 
mean acc: 0.7669514695830486


k-fold: 100%|██████████| 10/10 [00:07<00:00,  1.39it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 300 feat: 4 
mean acc: 0.7721291866028708


k-fold: 100%|██████████| 10/10 [00:07<00:00,  1.41it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 300 feat: 5 
mean acc: 0.7681989063568011


k-fold: 100%|██████████| 10/10 [00:06<00:00,  1.44it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 400 feat: 1 
mean acc: 0.7708475734791523


k-fold: 100%|██████████| 10/10 [00:07<00:00,  1.38it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 400 feat: 2 
mean acc: 0.7630041011619959


k-fold: 100%|██████████| 10/10 [00:07<00:00,  1.27it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 400 feat: 3 
mean acc: 0.7669514695830486


k-fold: 100%|██████████| 10/10 [00:08<00:00,  1.18it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 400 feat: 4 
mean acc: 0.7682330827067668


k-fold: 100%|██████████| 10/10 [00:09<00:00,  1.10it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 400 feat: 5 
mean acc: 0.7682159945317839


k-fold: 100%|██████████| 10/10 [00:08<00:00,  1.22it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 500 feat: 1 
mean acc: 0.7682672590567327


k-fold: 100%|██████████| 10/10 [00:08<00:00,  1.11it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 500 feat: 2 
mean acc: 0.7682159945317839


k-fold: 100%|██████████| 10/10 [00:09<00:00,  1.02it/s]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 500 feat: 3 
mean acc: 0.7669514695830485


k-fold: 100%|██████████| 10/10 [00:10<00:00,  1.05s/it]
k-fold:   0%|          | 0/10 [00:00<?, ?it/s]

est: 500 feat: 4 
mean acc: 0.7708304853041694


k-fold: 100%|██████████| 10/10 [00:11<00:00,  1.13s/it]

est: 500 feat: 5 
mean acc: 0.7708304853041694
Max Accuracy
est: 200 feat: 5 
mean acc: 0.7786568694463432



