In [1]:
# conda install graphviz
# conda install python-graphviz

In [2]:
import numpy as np
import pandas
from sklearn.model_selection import train_test_split

def load_rawdata():
    ret = []
    
    f = open("crx.data", 'r')
    while True:
        line = f.readline()
        if not line: 
            break
        line = line.replace('\n', '')
        ret.append(line.split(','))
        
    f.close()
    
    return ret

def to_onehot_vec(data_range, data):
    ret = []
    found = False
    
    for i in range(len(data_range)):
        if (data_range[i] == data):
            found = True
            ret.append(1)
        else:
            ret.append(0)
    
    if (found == False):
        print("Error!: " + str(data_range) + " real: "+data)
        
    return ret

def to_continuous(data):
    if (data == '?'):
        return 999
    
    return float(data)

def preprocess(data):
    ret = []
    
    for d in data:
        vec = []
        
        # A1: b, a.
        vec.extend(to_onehot_vec(['b','a','?'], d[0]))
        
        # A2:	continuous.
        vec.append(to_continuous(d[1]))
        
        # A3:	continuous.
        vec.append(to_continuous(d[2]))
        
        # A4:	u, y, l, t.
        vec.extend(to_onehot_vec(['u', 'y', 'l', 't','?'], d[3]))
        
        # A5:	g, p, gg.
        vec.extend(to_onehot_vec(['g', 'p', 'gg','?'], d[4]))
        
        # A6:	c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.
        vec.extend(to_onehot_vec(['c', 'd', 'cc', 'i', 'j', 'k', 'm', 'r', 'q', 'w', 'x', 'e', 'aa', 'ff','?'], d[5]))
        
        # A7:	v, h, bb, j, n, z, dd, ff, o.
        vec.extend(to_onehot_vec(['v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o','?'], d[6]))
        
        # A8:	continuous.
        vec.append(to_continuous(d[7]))
        
        # A9:	t, f.
        vec.extend(to_onehot_vec(['t', 'f','?'], d[8]))
        
        # A10:	t, f.
        vec.extend(to_onehot_vec(['t', 'f','?'], d[9]))
        
        # A11:	continuous.
        vec.append(to_continuous(d[10]))
        
        # A12:	t, f..
        vec.extend(to_onehot_vec(['t', 'f','?'], d[11]))
        
        # A13:	g, p, s.
        vec.extend(to_onehot_vec(['g', 'p', 's','?'], d[12]))
        
        # A14:	continuous.
        vec.append(to_continuous(d[13]))
        
        # A15:	continuous.
        vec.append(to_continuous(d[14]))
        
        # A16: +,-         (class attribute)
        if (d[15] == '+'):
            vec.append(1)
        else:
            vec.append(0)
        
        ret.append(np.array(vec))
        
    return np.array(ret)

def load():
    return preprocess(load_rawdata())

data = load()
X, y = np.split(data,[-1],axis=1)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [15]:
# 2.1. CanonicalModels
# decision tree
# ref: https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier

def CanonicalModels_2_1_decision_tree():
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_train, y_train)
    print("훈련 세트 정확도: {:.3f}".format(clf.score(X_train, y_train)))
    print("테스트 세트 정확도: {:.3f}".format(clf.score(X_test, y_test)))
    
    return clf

decision_tree_model = CanonicalModels_2_1_decision_tree()

훈련 세트 정확도: 1.000
테스트 세트 정확도: 0.807


In [16]:
# 2.1. CanonicalModels
# Support Vector Machine
# ref: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC

def CanonicalModels_2_1_SVM():
    clf = SVC(gamma='auto')
    clf.fit(X_train, y_train)
    print("훈련 세트 정확도: {:.3f}".format(clf.score(X_train, y_train)))
    print("테스트 세트 정확도: {:.3f}".format(clf.score(X_test, y_test)))
    
    return clf

svm_model = CanonicalModels_2_1_SVM()

훈련 세트 정확도: 0.976
테스트 세트 정확도: 0.526


  y = column_or_1d(y, warn=True)


In [18]:
# 2.2. Committee Machines
# Random Forest
# ref: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier

def CommitteeMachines_2_2_RandomForest():
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    clf.fit(X_train, y_train)
    print("훈련 세트 정확도: {:.3f}".format(clf.score(X_train, y_train)))
    print("테스트 세트 정확도: {:.3f}".format(clf.score(X_test, y_test)))
    
    return clf

random_forest_model = CommitteeMachines_2_2_RandomForest()

훈련 세트 정확도: 0.874
테스트 세트 정확도: 0.864


  


In [21]:
# 2.2. Committee Machines
# Ada Boost
# ref: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
from sklearn.ensemble import AdaBoostClassifier

def CommitteeMachines_2_2_AdaBoost():
    clf = AdaBoostClassifier(n_estimators=100, learning_rate=1)
    clf.fit(X_train, y_train)
    print("훈련 세트 정확도: {:.3f}".format(clf.score(X_train, y_train)))
    print("테스트 세트 정확도: {:.3f}".format(clf.score(X_test, y_test)))
    
    return clf

ada_boost_model = CommitteeMachines_2_2_AdaBoost()

  y = column_or_1d(y, warn=True)


훈련 세트 정확도: 0.937
테스트 세트 정확도: 0.825


In [51]:
# 2.3. Deep Learning Model
# KerasClassification
# ref: https://keras.io/scikit-learn-api/

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import Callback
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

class NBatchLogger(Callback):
    def __init__(self, display):
        self.seen = 0
        self.display = display

    def on_batch_end(self, batch, logs={}):
        self.seen += logs.get('size', 0)
        if self.seen % self.display == 0:
            metrics_log = ''
            for k in self.params['metrics']:
                if k in logs:
                    val = logs[k]
                    if abs(val) > 1e-3:
                        metrics_log += ' - %s: %.4f' % (k, val)
                    else:
                        metrics_log += ' - %s: %.4e' % (k, val)
            print('{}/{} ... {}'.format(self.seen,
                                        self.params['samples'],
                                        metrics_log))
            
def DeepLearningModel_2_3_baseline_model():
    model = Sequential()
    model.add(Dense(10, input_dim=56, activation='relu'))
    model.add(Dense(10, input_dim=10, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    return model

def DeepLearningModel_2_3_KerasClassifier():
    # create model
    clf = KerasClassifier(build_fn=DeepLearningModel_2_3_baseline_model, epochs=1500, batch_size=128, verbose=0)
    clf.fit(X_train, y_train,batch_size=128,callbacks=[NBatchLogger(display=1000)])
    print("훈련 세트 정확도: {:.3f}".format(clf.score(X_train, y_train)))
    print("테스트 세트 정확도: {:.3f}".format(clf.score(X_test, y_test)))

DeepLearningModel_2_3_KerasClassifier()

26000/462 ...  - loss: 140.0463 - acc: 0.1094
52000/462 ...  - loss: 33.7205 - acc: 0.2266
훈련 세트 정확도: 0.266
테스트 세트 정확도: 0.272
