In [1]:
from srlearn.rdn import BoostedRDN
from srlearn import Database
from srlearn import Background
from sklearn.metrics import roc_auc_score, log_loss, precision_recall_curve, auc, precision_score, recall_score
import numpy as np
from get_datasets import *
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.linear_model import LogisticRegression
import random

import sys
sys.path.append('src')
from boost import VectorBoostedRDN

In [2]:
def cll_score(y_true, y_pred):
    """Return the mean conditional log-likelihood of the predictions.

    Each example labelled ``1.0`` contributes ``log(p)`` and each example
    labelled ``0.0`` contributes ``log(1 - p)``; the sum is divided by the
    number of examples that carried one of those two labels.

    Args:
        y_true: Iterable of labels. Entries equal to neither 1 nor 0 are
            ignored.
        y_pred: Iterable of predicted probabilities of the positive class,
            aligned with ``y_true``.

    Returns:
        The average log-likelihood as a float (at most 0 for probabilities
        in [0, 1]).

    Raises:
        ZeroDivisionError: If no entry of ``y_true`` equals 1 or 0.
    """
    # Imported here so the metric does not rely on ``math`` happening to be
    # pulled into the namespace by a wildcard import elsewhere in the notebook.
    import math

    # Split the probabilities by label in a single pass over the inputs.
    posProb, negProb = [], []
    for true, prob in zip(y_true, y_pred):
        if true == 1.0:
            posProb.append(prob)
        elif true == 0.0:
            negProb.append(prob)

    llSum = 0
    for prob in posProb:
        # log(0) is undefined; nudge an exact zero up to a tiny probability.
        if prob == 0:
            prob = 1e-6
        llSum += math.log(prob)
    for prob in negProb:
        # log(1 - 1) is undefined; nudge an exact one just below 1.
        if prob == 1:
            prob = 1 - 1e-6
        llSum += math.log(1 - prob)
    return llSum / (len(posProb) + len(negProb))

def pr_auc_score(y_true, y_pred):
    """Return the area under the precision-recall curve.

    The curve points come from ``precision_recall_curve``; the area is then
    integrated by ``auc`` with recall on the x-axis and precision on the
    y-axis.
    """
    precisions, recalls, _thresholds = precision_recall_curve(y_true, y_pred)
    area = auc(recalls, precisions)
    return area

In [3]:
# Mode declarations, one string per predicate/argument pattern. The list is
# handed to `datasets.load(...)` and to `Background(modes=...)` further down.
# Commented-out entries are currently disabled.
modes = [
    'countryhascompanyoffice(+country,+company).',
    'countryhascompanyoffice(+country,-company).',
    'countryhascompanyoffice(-country,+company).',
    # Target relation of this notebook.
    'companyeconomicsector(+company,+sector).',
    'companyeconomicsector(+company,-sector).',
    'companyeconomicsector(-company,+sector).',
    # NOTE(review): these two entries use a backtick argument marker instead
    # of '+'/'-'; presumably a mode type understood by the learner - confirm.
    'economicsectorcompany(+sector,`company).',
    'economicsectorcompany(`sector,+company).',
    #'economicsectorcompany(+sector,+company).',
    #'economicsectorcompany(+sector,-company).',
    #'economicsectorcompany(-sector,+company).',
    #'ceoeconomicsector(+person,+sector).',
    #'ceoeconomicsector(+person,-sector).',
    #'ceoeconomicsector(-person,+sector).',
    'companyceo(+company,+person).',
    'companyceo(+company,-person).',
    'companyceo(-company,+person).',
    'companyalsoknownas(+company,+company).',
    'companyalsoknownas(+company,-company).',
    'companyalsoknownas(-company,+company).',
    'cityhascompanyoffice(+city,+company).',
    'cityhascompanyoffice(+city,-company).',
    'cityhascompanyoffice(-city,+company).',
    'acquired(+company,+company).',
    'acquired(+company,-company).',
    'acquired(-company,+company).',
    #'ceoof(+person,+company).',
    #'ceoof(+person,-company).',
    #'ceoof(-person,+company).',
    'bankbankincountry(+person,+country).',
    'bankbankincountry(+person,-country).',
    'bankbankincountry(-person,+country).',
    'bankboughtbank(+company,+company).',
    'bankboughtbank(+company,-company).',
    'bankboughtbank(-company,+company).',
    'bankchiefexecutiveceo(+company,+person).',
    'bankchiefexecutiveceo(+company,-person).',
    'bankchiefexecutiveceo(-company,+person).',
]

# Load the 'nell_finances' data for the target relation and keep only the
# first element of each of the three returned collections (facts, positive
# examples, negative examples).
facts, pos, neg = (
    group[0]
    for group in datasets.load('nell_finances', modes, target='companyeconomicsector')
)

# Partition the positive and the negative examples into 3 folds each.
pos = datasets.split_into_folds(pos, 3)
neg = datasets.split_into_folds(neg, 3)

In [18]:
# Cross-validate BoostedRDN: every fold is held out once as the test set while
# the remaining folds form the training set. `metrics` ends up mapping each
# metric name to a list with one averaged score per held-out fold.
metrics = {}
for fold in range(len(pos)):
    # Scores of the repeated runs for this held-out fold, keyed by metric.
    results = {'roc': [], 'pr': [], 'log_loss': [], 'precision': [], 'recall': []}

    # The same split is trained and scored len(pos) times; the repetitions are
    # averaged once the inner loop finishes.
    for _ in range(len(pos)):
        # Held-out fold.
        test_db = Database()
        test_db.pos = pos[fold]
        test_db.neg = neg[fold]
        test_db.facts = facts

        # Every other fold goes into the training database.
        train_db = Database()
        train_db.facts = facts
        for i in range(len(pos)):
            if i != fold:
                train_db.pos.extend(pos[i])
                train_db.neg.extend(neg[i])

        bk = Background(modes=modes, number_of_clauses=8)
        clf = BoostedRDN(background=bk, target="companyeconomicsector", n_estimators=20)
        clf.fit(train_db)

        # Ground truth: all positive test examples first, then all negatives.
        y_true = np.array([1.0] * len(test_db.pos) + [0.0] * len(test_db.neg))
        y_pred = clf.predict_proba(test_db)
        # Hard labels: positive only when the probability is strictly above 0.5.
        y_label = np.array([1.0 if prob > 0.5 else 0.0 for prob in y_pred])

        results['roc'].append(roc_auc_score(y_true, y_pred))
        results['pr'].append(pr_auc_score(y_true, y_pred))
        results['log_loss'].append(log_loss(y_true, y_pred))
        results['precision'].append(precision_score(y_true, y_label))
        results['recall'].append(recall_score(y_true, y_label))

    # Collapse the repetitions into one mean score per metric for this fold.
    for key, value in results.items():
        metrics.setdefault(key, []).append(np.mean(value))

In [19]:
# Display the per-fold mean scores gathered by the BoostedRDN evaluation loop.
metrics

{'roc': [0.7255399408284023, 0.7672559171597634, 0.7177052661098092],
 'pr': [0.7836532362406764, 0.7892470181226905, 0.7536113999622445],
 'log_loss': [0.5937989449572965, 0.5784891284744239, 0.5944896073782946],
 'precision': [0.5863178455991922, 0.8696135340148285, 0.7905473843692015],
 'recall': [0.826923076923077, 0.3525641025641026, 0.35529715762273906]}

In [4]:
from model import NeuralRDN

In [5]:
# Cross-validate NeuralRDN: every fold is held out once as the test set while
# the remaining folds form the training set. `metrics` ends up mapping each
# metric name to a list with one averaged score per held-out fold.
metrics = {}
for fold in range(len(pos)):
    # Scores of the repeated runs for this held-out fold, keyed by metric.
    results = {'roc': [], 'pr': [], 'log_loss': [], 'precision': [], 'recall': []}

    # The same split is trained and scored len(pos) times; the repetitions are
    # averaged once the inner loop finishes.
    for _ in range(len(pos)):
        # Held-out fold.
        test_db = Database()
        test_db.pos = pos[fold]
        test_db.neg = neg[fold]
        test_db.facts = facts

        # Every other fold goes into the training database.
        train_db = Database()
        train_db.facts = facts
        for i in range(len(pos)):
            if i != fold:
                train_db.pos.extend(pos[i])
                train_db.neg.extend(neg[i])

        bk = Background(modes=modes, number_of_clauses=4)
        clf = NeuralRDN(
            background=bk,
            target="companyeconomicsector",
            max_tree_depth=2,
            n_estimators=50,
            n_boost_estimators=1,
            predicate_prob=0.5,
            sample_prob=0.5,
        )
        clf.fit(train_db)

        # Ground truth: all positive test examples first, then all negatives.
        y_true = np.array([1.0] * len(test_db.pos) + [0.0] * len(test_db.neg))
        y_pred = clf.predict_proba(test_db)
        # Hard labels: positive only when the probability is strictly above 0.5.
        y_label = np.array([1.0 if prob > 0.5 else 0.0 for prob in y_pred])

        results['roc'].append(roc_auc_score(y_true, y_pred))
        results['pr'].append(pr_auc_score(y_true, y_pred))
        results['log_loss'].append(log_loss(y_true, y_pred))
        results['precision'].append(precision_score(y_true, y_label))
        results['recall'].append(recall_score(y_true, y_label))

    # Report and store one mean score per metric for this fold.
    for key, value in results.items():
        print('{}: {}'.format(key, np.mean(value)))
        metrics.setdefault(key, []).append(np.mean(value))

roc: 0.8222115384615384
pr: 0.8562793387638684
log_loss: 0.5049083447148008
precision: 0.7166848987121878
recall: 0.8051282051282053
roc: 0.7368811637080869
pr: 0.7562234736890758
log_loss: 0.5679390615795572
precision: 0.6941173188391527
recall: 0.7384615384615385
roc: 0.7980289646054924
pr: 0.8310661481797679
log_loss: 0.49986087123534984
precision: 0.8778599462077482
recall: 0.42635658914728686


In [6]:
# Display the per-fold mean scores gathered by the NeuralRDN evaluation loop.
metrics

{'roc': [0.8222115384615384, 0.7368811637080869, 0.7980289646054924],
 'pr': [0.8562793387638684, 0.7562234736890758, 0.8310661481797679],
 'log_loss': [0.5049083447148008, 0.5679390615795572, 0.49986087123534984],
 'precision': [0.7166848987121878, 0.6941173188391527, 0.8778599462077482],
 'recall': [0.8051282051282053, 0.7384615384615385, 0.42635658914728686]}

In [7]:
# Recorded results, one score per held-out fold for each metric. The numbers
# are the same as the `metrics` outputs shown earlier in this notebook for the
# BoostedRDN run and the NeuralRDN run respectively; they are stored under
# separate names because both runs write to the same `metrics` variable.
rdnboost = {
    'roc': [0.7255399408284023, 0.7672559171597634, 0.7177052661098092],
    'pr': [0.7836532362406764, 0.7892470181226905, 0.7536113999622445],
    'log_loss': [0.5937989449572965, 0.5784891284744239, 0.5944896073782946],
    'precision': [0.5863178455991922, 0.8696135340148285, 0.7905473843692015],
    'recall': [0.826923076923077, 0.3525641025641026, 0.35529715762273906],
}

baggingrdnmlp = {
    'roc': [0.8222115384615384, 0.7368811637080869, 0.7980289646054924],
    'pr': [0.8562793387638684, 0.7562234736890758, 0.8310661481797679],
    'log_loss': [0.5049083447148008, 0.5679390615795572, 0.49986087123534984],
    'precision': [0.7166848987121878, 0.6941173188391527, 0.8778599462077482],
    'recall': [0.8051282051282053, 0.7384615384615385, 0.42635658914728686],
}

In [8]:
from IPython.display import display, Markdown, Latex

# `pd.DataFrame` is used below; import pandas explicitly so this cell does not
# depend on `pd` being leaked into the namespace by a wildcard import.
import pandas as pd


def _summarize(values):
    """Format per-fold scores as 'mean +/- 2*std', three decimals each.

    The spread uses NumPy's default ``std`` (population standard deviation).
    """
    scores = np.array(values)
    return '%.3f +/- %.3f' % (scores.mean(), 2 * scores.std())


# One heading plus a one-row comparison table per metric, in the key order of
# `rdnboost`; `baggingrdnmlp` must provide the same metric names.
for metric in rdnboost:
    display(Markdown('# Results for ' + metric))
    table = [[_summarize(rdnboost[metric]), _summarize(baggingrdnmlp[metric])]]
    display(pd.DataFrame(table, columns=['RDNBoost', 'Bagging RDN+MLP']))

# Results for roc

Unnamed: 0,RDNBoost,Bagging RDN+MLP
0,0.737 +/- 0.043,0.786 +/- 0.072


# Results for pr

Unnamed: 0,RDNBoost,Bagging RDN+MLP
0,0.776 +/- 0.031,0.815 +/- 0.085


# Results for log_loss

Unnamed: 0,RDNBoost,Bagging RDN+MLP
0,0.589 +/- 0.015,0.524 +/- 0.062


# Results for precision

Unnamed: 0,RDNBoost,Bagging RDN+MLP
0,0.749 +/- 0.239,0.763 +/- 0.164


# Results for recall

Unnamed: 0,RDNBoost,Bagging RDN+MLP
0,0.512 +/- 0.446,0.657 +/- 0.330
