In [2]:
from srlearn.rdn import BoostedRDN
from srlearn import Database
from srlearn import Background
from sklearn.metrics import roc_auc_score, log_loss, precision_recall_curve, auc, precision_score, recall_score
import numpy as np
from get_datasets import *
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


import sys
sys.path.append('src')
from boost import VectorBoostedRDN

In [3]:
def cll_score(y_true, y_pred):
    def cll(posProb, negProb):
        llSum = 0
        for prob in posProb:
            if prob == 0:
                prob = 1e-6
            llSum += math.log(prob)
        for prob in negProb:
            if prob == 1:
                prob = 1 - 1e-6
            llSum += math.log(1 - prob)
        return llSum/(len(posProb) + len(negProb))
    posProb = [prob for true, prob in zip(list(y_true), list(y_pred)) if true == 1.0]
    negProb = [prob for true, prob in zip(list(y_true), list(y_pred)) if true == 0.0]
    return cll(posProb, negProb)

def pr_auc_score(y_true, y_pred):
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

In [4]:
modes = [
        'professor(+person).',
        'student(+person).',
        'advisedby(+person,+person).',
        'advisedby(+person,-person).',
        'advisedby(-person,+person).',
        'tempadvisedby(+person,+person).',
        'tempadvisedby(+person,-person).',
        'tempadvisedby(-person,+person).',
        'ta(+course,+person,+quarter).',
        'ta(-course,-person,+quarter).',
        'ta(+course,-person,-quarter).',
        'ta(-course,+person,-quarter).',
        'hasposition(+person,+faculty).',
        'hasposition(+person,-faculty).',
        'hasposition(-person,+faculty).',
        'publication(+title,+person).',
        'publication(+title,-person).',
        'publication(-title,+person).',
        'inphase(+person,+prequals).',
        'inphase(+person,-prequals).',
        'inphase(-person,+prequals).',
        'courselevel(+course,+level).',
        'courselevel(+course,-level).',
        'courselevel(-course,+level).',
        'yearsinprogram(+person,+year).',
        'yearsinprogram(-person,+year).',
        'yearsinprogram(+person,-year).',
        'projectmember(+project,+person).',
        'projectmember(+project,-person).',
        'projectmember(-project,+person).',
        'sameproject(+project,+project).',
        'sameproject(+project,-project).',
        'sameproject(-project,+project).',
        'samecourse(+course,+course).',
        'samecourse(+course,-course).',
        'samecourse(-course,+course).',
        'sameperson(+person,+person).',
        'sameperson(+person,-person).',
        'sameperson(-person,+person).',
]

facts, pos, neg = datasets.load('uwcse', modes, target='advisedby')

In [51]:
for fold in range(len(pos)):
    test_db = Database()
    test_db.pos = pos[fold]
    test_db.neg = neg[fold]
    test_db.facts = facts[fold]
    
    train_db = Database()
    for i in range(len(pos)):
        if fold == i:
            continue
        train_db.pos.extend(pos[i])
        train_db.neg.extend(neg[i])
        train_db.facts.extend(facts[i])
        
    bk = Background(
        modes=modes,
    )

    clf = BoostedRDN(
        background=bk,
        target="advisedby",
    )
    
    clf.fit(train_db)
    
    y_true = np.array([1.0 for _ in range(len(test_db.pos))] + [0.0 for _ in range(len(test_db.neg))])
    y_pred = clf.predict_proba(test_db)
    y_label = np.array([1.0 if i > 0.5 else 0.0 for i in y_pred])
    
    print('AUC ROC: {}'.format(roc_auc_score(y_true, y_pred)))
    print('AUC PR: {}'.format(pr_auc_score(y_true, y_pred)))
    print('Log loss: {}'.format(log_loss(y_true, y_pred)))
    print('Precision: {}'.format(precision_score(y_true, y_label)))
    print('Recall: {}'.format(recall_score(y_true, y_label)))
    print('\n')

AUC ROC: 0.9240816326530613
AUC PR: 0.8790245200792635
Log loss: 0.2520968575359174
Precision: 0.875
Recall: 1.0


AUC ROC: 1.0
AUC PR: 1.0000000000000002
Log loss: 0.15403280474754083
Precision: 1.0
Recall: 1.0


AUC ROC: 0.9259259259259258
AUC PR: 0.8987654320987655
Log loss: 0.23189491507313972
Precision: 0.9
Recall: 1.0


AUC ROC: 0.9265381083562902
AUC PR: 0.8886354413926365
Log loss: 0.2858761596616291
Precision: 0.8611111111111112
Recall: 0.9393939393939394


AUC ROC: 0.96875
AUC PR: 0.9593162785947712
Log loss: 0.21654744954421234
Precision: 0.8888888888888888
Recall: 1.0




In [17]:
for fold in range(len(pos)):
    test_db = Database()
    test_db.pos = pos[fold]
    test_db.neg = neg[fold]
    test_db.facts = facts[fold]
    
    train_db = Database()
    for i in range(len(pos)):
        if fold == i:
            continue
        train_db.pos.extend(pos[i])
        train_db.neg.extend(neg[i])
        train_db.facts.extend(facts[i])
        
    bk = Background(
        modes=modes,
    )

    clf = VectorBoostedRDN(
        background=bk,
        target="advisedby",
        n_estimators=10,
    )
    
    clf.fit(train_db)
    
    y_true = np.array([1.0 for _ in range(len(test_db.pos))] + [0.0 for _ in range(len(test_db.neg))])
    y_pred = clf.predict_proba(test_db)
    y_label = np.array([1.0 if i > 0.5 else 0.0 for i in y_pred])

    print('Original RDNBoost')
    print('AUC ROC: {}'.format(roc_auc_score(y_true, y_pred)))
    print('AUC PR: {}'.format(pr_auc_score(y_true, y_pred)))
    print('Log loss: {}'.format(log_loss(y_true, y_pred)))
    print('Precision: {}'.format(precision_score(y_true, y_label)))
    print('Recall: {}'.format(recall_score(y_true, y_label)))
    print('\n')
    
    X_train = clf.get_proved_vector(train_db)
    # Create the model
    model = Sequential()
    model.add(Dense(5, input_shape=(len(X_train[0]),)))
    model.add(Dense(5))
    model.add(Dense(2, activation='softmax'))
    
    # Configure the model and start training
    Y_train = np.array([[0.0, 1.0] for _ in range(len(train_db.pos))] + [[1.0, 0.0] for _ in range(len(train_db.neg))])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, Y_train, epochs=25, batch_size=32)
    
    X_test = clf.get_proved_vector(test_db)
    y_pred = model.predict(X_test)[:, 1]
    
    print('Ensemble')
    print('AUC ROC: {}'.format(roc_auc_score(y_true, y_pred)))
    print('AUC PR: {}'.format(pr_auc_score(y_true, y_pred)))
    print('Log loss: {}'.format(log_loss(y_true, y_pred)))
    print('Precision: {}'.format(precision_score(y_true, y_label)))
    print('Recall: {}'.format(recall_score(y_true, y_label)))
    print('\n')

Original RDNBoost
AUC ROC: 0.9453061224489796
AUC PR: 0.9343455187160757
Log loss: 0.29891661710718237
Precision: 0.8571428571428571
Recall: 0.8571428571428571


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Ensemble
AUC ROC: 0.9583673469387755
AUC PR: 0.9573082638468996
Log loss: 0.24545211528560945
Precision: 0.8571428571428571
Recall: 0.8571428571428571


Original RDNBoost
AUC ROC: 0.96375
AUC PR: 0.9589689805808226
Log loss: 0.21258742449374451
Precision: 0.9090909090909091
Recall: 1.0


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epo

Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Ensemble
AUC ROC: 0.9178145087235995
AUC PR: 0.9181015463092351
Log loss: 0.47865612909310695
Precision: 0.7619047619047619
Recall: 0.9696969696969697


Original RDNBoost
AUC ROC: 0.84375
AUC PR: 0.8113029970760234
Log loss: 0.39207527645096474
Precision: 0.7619047619047619
Recall: 1.0


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Ensemble
AUC ROC: 0.87890625
AUC PR: 0.8505254271496048
Log loss: 0.43801809096476063
Precision: 0.7619047619047619
Recall: 1.0


