In [1]:
# conda install graphviz
# conda install python-graphviz

In [47]:
import numpy as np
import pandas
from sklearn.model_selection import train_test_split

def load_rawdata():
    ret = []
    
    f = open("crx.data", 'r')
    while True:
        line = f.readline()
        if not line: 
            break
        line = line.replace('\n', '')
        ret.append(line.split(','))
        
    f.close()
    
    return ret

def to_onehot_vec(data_range, data):
    ret = []
    found = False
    
    for i in range(len(data_range)):
        if (data_range[i] == data):
            found = True
            ret.append(1)
        else:
            ret.append(0)
    
    if (found == False):
        print("Error!: " + str(data_range) + " real: "+data)
        
    return ret

def to_continuous(data):
    if (data == '?'):
        return 999
    
    return float(data)

def preprocess(data):
    ret = []
    
    for d in data:
        vec = []
        
        # A1: b, a.
        vec.extend(to_onehot_vec(['b','a','?'], d[0]))
        
        # A2:	continuous.
        vec.append(to_continuous(d[1]))
        
        # A3:	continuous.
        vec.append(to_continuous(d[2]))
        
        # A4:	u, y, l, t.
        vec.extend(to_onehot_vec(['u', 'y', 'l', 't','?'], d[3]))
        
        # A5:	g, p, gg.
        vec.extend(to_onehot_vec(['g', 'p', 'gg','?'], d[4]))
        
        # A6:	c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.
        vec.extend(to_onehot_vec(['c', 'd', 'cc', 'i', 'j', 'k', 'm', 'r', 'q', 'w', 'x', 'e', 'aa', 'ff','?'], d[5]))
        
        # A7:	v, h, bb, j, n, z, dd, ff, o.
        vec.extend(to_onehot_vec(['v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o','?'], d[6]))
        
        # A8:	continuous.
        vec.append(to_continuous(d[7]))
        
        # A9:	t, f.
        vec.extend(to_onehot_vec(['t', 'f','?'], d[8]))
        
        # A10:	t, f.
        vec.extend(to_onehot_vec(['t', 'f','?'], d[9]))
        
        # A11:	continuous.
        vec.append(to_continuous(d[10]))
        
        # A12:	t, f..
        vec.extend(to_onehot_vec(['t', 'f','?'], d[11]))
        
        # A13:	g, p, s.
        vec.extend(to_onehot_vec(['g', 'p', 's','?'], d[12]))
        
        # A14:	continuous.
        vec.append(to_continuous(d[13]))
        
        # A15:	continuous.
        vec.append(to_continuous(d[14]))
        
        # A16: +,-         (class attribute)
        if (d[15] == '+'):
            vec.append(1)
        else:
            vec.append(0)
        
        ret.append(np.array(vec))
        
    return np.array(ret)

def load():
    return preprocess(load_rawdata())

def get_tp(y_test, y_pred):
    tp = 0
    
    for i in range(len(y_test)):
        if (y_test[i] == 1 and y_pred[i] == 1):
            tp = tp + 1
            
    return tp

def get_tn(y_test, y_pred):
    tn = 0
    
    for i in range(len(y_test)):
        if (y_test[i] == 0 and y_pred[i] == 0):
            tn = tn + 1
            
    return tn

def get_fp(y_test, y_pred):
    fp = 0
    
    for i in range(len(y_test)):
        if (y_test[i] == 0 and y_pred[i] == 1):
            fp = fp + 1
            
    return fp

def get_fn(y_test, y_pred):
    fn = 0
    
    for i in range(len(y_test)):
        if (y_test[i] == 1 and y_pred[i] == 0):
            fn = fn + 1
            
    return fn

def get_accuracy(y_test, y_pred):
    tp = get_tp(y_test, y_pred)
    tn = get_tn(y_test, y_pred)
    fp = get_fp(y_test, y_pred)
    fn = get_fn(y_test, y_pred)
    
    return (tp + tn) / (tp + tn + fp + fn)

def get_precision(y_test, y_pred):
    tp = get_tp(y_test, y_pred)
    fp = get_fp(y_test, y_pred)
    
    if (tp + fp == 0):
        return 0
    
    return tp / (tp + fp)

def get_recall(y_test, y_pred):
    tp = get_tp(y_test, y_pred)
    fn = get_fn(y_test, y_pred)
    
    if (tp + fn == 0):
        return 0
    
    return tp / (tp + fn)

def get_f1(y_test, y_pred):
    precision = get_precision(y_test, y_pred)
    recall = get_recall(y_test, y_pred)
    
    if (precision + recall == 0):
        return 0
    
    return (2 * precision * recall) / (precision + recall)
    
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

def evaluation(y_test, y_pred):
    print("My Accuracy :\t" + str(get_accuracy(y_test, y_pred)) + "\t\t" + "sklearn Accuracy :\t"+str(accuracy_score(y_test, y_pred)))
    print("My Precision:\t" + str(get_precision(y_test, y_pred)) + "\t\t" + "sklearn Precision:\t"+str(precision_score(y_test, y_pred)))
    print("My Recall   :\t" + str(get_recall(y_test, y_pred)) + "\t\t" + "sklearn Recall   :\t"+str(recall_score(y_test, y_pred)))
    print("My F1       :\t" + str(get_f1(y_test, y_pred)) + "\t\t" + "sklearn F1       :\t"+str(f1_score(y_test, y_pred)))
    
data = load()
X, y = np.split(data,[-1],axis=1)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [49]:
# 2.1. CanonicalModels
# decision tree
# ref: https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier

def CanonicalModels_2_1_decision_tree():
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_train, y_train)
    
    return clf.predict(X_test)

evaluation(y_test, CanonicalModels_2_1_decision_tree())

My Accuracy :	0.8070175438596491		sklearn Accuracy :	0.8070175438596491
My Precision:	0.797979797979798		sklearn Precision:	0.797979797979798
My Recall   :	0.7669902912621359		sklearn Recall   :	0.7669902912621359
My F1       :	0.7821782178217821		sklearn F1       :	0.7821782178217821


In [50]:
# 2.1. CanonicalModels
# Support Vector Machine
# ref: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC

def CanonicalModels_2_1_SVM():
    clf = SVC(gamma='auto')
    clf.fit(X_train, y_train)
    
    return clf.predict(X_test)

evaluation(y_test, CanonicalModels_2_1_SVM())

My Accuracy :	0.5263157894736842		sklearn Accuracy :	0.5263157894736842
My Precision:	0.4358974358974359		sklearn Precision:	0.4358974358974359
My Recall   :	0.1650485436893204		sklearn Recall   :	0.1650485436893204
My F1       :	0.23943661971830985		sklearn F1       :	0.23943661971830985


  y = column_or_1d(y, warn=True)


In [51]:
# 2.2. Committee Machines
# Random Forest
# ref: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier

def CommitteeMachines_2_2_RandomForest():
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    clf.fit(X_train, y_train)
    
    return clf.predict(X_test)

evaluation(y_test, CommitteeMachines_2_2_RandomForest())

My Accuracy :	0.8640350877192983		sklearn Accuracy :	0.8640350877192983
My Precision:	0.9		sklearn Precision:	0.9
My Recall   :	0.7864077669902912		sklearn Recall   :	0.7864077669902912
My F1       :	0.8393782383419689		sklearn F1       :	0.8393782383419689


  


In [52]:
# 2.2. Committee Machines
# Ada Boost
# ref: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
from sklearn.ensemble import AdaBoostClassifier

def CommitteeMachines_2_2_AdaBoost():
    clf = AdaBoostClassifier(n_estimators=100, learning_rate=1)
    clf.fit(X_train, y_train)
    
    return clf.predict(X_test)

evaluation(y_test, CommitteeMachines_2_2_AdaBoost())

My Accuracy :	0.8245614035087719		sklearn Accuracy :	0.8245614035087719
My Precision:	0.8181818181818182		sklearn Precision:	0.8181818181818182
My Recall   :	0.7864077669902912		sklearn Recall   :	0.7864077669902912
My F1       :	0.801980198019802		sklearn F1       :	0.801980198019802


  y = column_or_1d(y, warn=True)


In [53]:
# 2.3. Deep Learning Model
# KerasClassification
# ref: https://keras.io/scikit-learn-api/

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import Callback
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

class NBatchLogger(Callback):
    def __init__(self, display):
        self.seen = 0
        self.display = display

    def on_batch_end(self, batch, logs={}):
        self.seen += logs.get('size', 0)
        if self.seen % self.display == 0:
            metrics_log = ''
            for k in self.params['metrics']:
                if k in logs:
                    val = logs[k]
                    if abs(val) > 1e-3:
                        metrics_log += ' - %s: %.4f' % (k, val)
                    else:
                        metrics_log += ' - %s: %.4e' % (k, val)
            print('{}/{} ... {}'.format(self.seen,
                                        self.params['samples'],
                                        metrics_log))
            
def DeepLearningModel_2_3_baseline_model():
    model = Sequential()
    model.add(Dense(10, input_dim=56, activation='relu'))
    model.add(Dense(10, input_dim=10, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    return model

def DeepLearningModel_2_3_KerasClassifier():
    # create model
    clf = KerasClassifier(build_fn=DeepLearningModel_2_3_baseline_model, epochs=1500, batch_size=128, verbose=0)
    clf.fit(X_train, y_train,batch_size=128,callbacks=[NBatchLogger(display=1000)])
    
    return clf.predict(X_test)

evaluation(y_test, DeepLearningModel_2_3_KerasClassifier())

Using TensorFlow backend.


26000/462 ...  - loss: 6.0595 - acc: 0.3906
52000/462 ...  - loss: 1.0678 - acc: 0.5312
78000/462 ...  - loss: 0.5167 - acc: 0.5859
231000/462 ...  - loss: 0.2287 - acc: 0.7051
257000/462 ...  - loss: 0.2047 - acc: 0.7188
283000/462 ...  - loss: 0.2259 - acc: 0.7891
309000/462 ...  - loss: 0.6441 - acc: 0.7734
462000/462 ...  - loss: 0.1876 - acc: 0.8205
488000/462 ...  - loss: 0.1105 - acc: 0.8359
514000/462 ...  - loss: 0.2799 - acc: 0.7812
540000/462 ...  - loss: 0.1255 - acc: 0.8359
693000/462 ...  - loss: 0.9759 - acc: 0.5641
My Accuracy :	0.6710526315789473		sklearn Accuracy :	0.6710526315789473
My Precision:	0.6666666666666666		sklearn Precision:	0.6666666666666666
My Recall   :	0.5436893203883495		sklearn Recall   :	0.5436893203883495
My F1       :	0.5989304812834224		sklearn F1       :	0.5989304812834224
