In [44]:
from srlearn.rdn import BoostedRDN
from srlearn import Database
from srlearn import Background
from sklearn.metrics import roc_auc_score, log_loss, precision_recall_curve, auc, precision_score, recall_score
import numpy as np
from get_datasets import *
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.linear_model import LogisticRegression
import random

import sys
sys.path.append('src')
from boost import VectorBoostedRDN

In [2]:
def cll_score(y_true, y_pred):
    def cll(posProb, negProb):
        llSum = 0
        for prob in posProb:
            if prob == 0:
                prob = 1e-6
            llSum += math.log(prob)
        for prob in negProb:
            if prob == 1:
                prob = 1 - 1e-6
            llSum += math.log(1 - prob)
        return llSum/(len(posProb) + len(negProb))
    posProb = [prob for true, prob in zip(list(y_true), list(y_pred)) if true == 1.0]
    negProb = [prob for true, prob in zip(list(y_true), list(y_pred)) if true == 0.0]
    return cll(posProb, negProb)

def pr_auc_score(y_true, y_pred):
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

In [55]:
modes = [
    'athleteledsportsteam(+athlete,+sportsteam).',
    'athleteledsportsteam(+athlete,-sportsteam).',
    'athleteledsportsteam(-athlete,+sportsteam).',
    'athleteplaysforteam(+athlete,+sportsteam).',
    'athleteplaysforteam(+athlete,-sportsteam).',
    'athleteplaysforteam(-athlete,+sportsteam).',
    'athleteplaysinleague(+athlete,+sportsleague).',
    'athleteplaysinleague(+athlete,-sportsleague).',
    'athleteplaysinleague(-athlete,+sportsleague).',
    'athleteplayssport(+athlete,+sport).',
    'athleteplayssport(+athlete,-sport).',
    'athleteplayssport(-athlete,+sport).',
    'teamalsoknownas(+sportsteam,+sportsteam).',
    'teamalsoknownas(+sportsteam,-sportsteam).',
    'teamalsoknownas(-sportsteam,+sportsteam).',
    'teamplaysagainstteam(+sportsteam,+sportsteam).',
    'teamplaysagainstteam(+sportsteam,-sportsteam).',
    'teamplaysagainstteam(-sportsteam,+sportsteam).',
    'teamplaysinleague(+sportsteam,+sportsleague).',
    'teamplaysinleague(+sportsteam,-sportsleague).',
    'teamplaysinleague(-sportsteam,+sportsleague).',
    'teamplayssport(+sportsteam,+sport).',
    'teamplayssport(+sportsteam,-sport).',
    'teamplayssport(-sportsteam,+sport).',
]

facts, pos, neg = datasets.load('nell_sports', modes, target='teamplayssport')

facts, pos, neg = facts[0], pos[0], neg[0]
pos, neg = datasets.split_into_folds(pos, 3), datasets.split_into_folds(neg, 3)

In [None]:
metrics = {}
for fold in range(len(pos)):
    results = {}
    for _ in range(len(pos)):
        test_db = Database()
        test_db.pos = pos[fold]
        test_db.neg = neg[fold]
        test_db.facts = facts

        train_db = Database()
        train_db.facts = facts
        for i in range(len(pos)):
            if fold == i:
                continue
            train_db.pos.extend(pos[i])
            train_db.neg.extend(neg[i])

        bk = Background(
            modes=modes,
        )

        clf = BoostedRDN(
            background=bk,
            target="teamplayssport",
            n_estimators=10,
        )

        clf.fit(train_db)

        y_true = np.array([1.0 for _ in range(len(test_db.pos))] + [0.0 for _ in range(len(test_db.neg))])
        y_pred = clf.predict_proba(test_db)
        y_label = np.array([1.0 if i > 0.5 else 0.0 for i in y_pred])

        results.setdefault('roc', []).append(roc_auc_score(y_true, y_pred))
        results.setdefault('pr', []).append(pr_auc_score(y_true, y_pred))
        results.setdefault('log_loss', []).append(log_loss(y_true, y_pred))
        results.setdefault('precision', []).append(precision_score(y_true, y_label))
        results.setdefault('recall', []).append(recall_score(y_true, y_label))
    for key, value in results.items():
        metrics.setdefault(key, []).append(np.mean(value))

In [22]:
metrics

{'roc': [0.9337959183673469,
  0.9804999999999999,
  1.0,
  0.9916437098255282,
  0.891015625],
 'pr': [0.9303855664365361,
  0.9803909019089225,
  1.0,
  0.9919522567091352,
  0.8699783790341066],
 'log_loss': [0.3092268185953405,
  0.2291802806775261,
  0.21881786172520182,
  0.20021028887455397,
  0.3833369362392089],
 'precision': [0.8468421052631581,
  0.8635234330886504,
  1.0,
  0.9122689075630251,
  0.7619047619047619],
 'recall': [0.9485714285714286,
  0.95,
  0.888888888888889,
  0.9454545454545455,
  1.0]}

In [4]:
from model import NeuralRDN

In [11]:
metrics = {}
for fold in range(len(pos)):
    results = {}
    for _ in range(len(pos)):
        test_db = Database()
        test_db.pos = pos[fold]
        test_db.neg = neg[fold]
        test_db.facts = facts[fold]

        train_db = Database()
        for i in range(len(pos)):
            if fold == i:
                continue
            train_db.pos.extend(pos[i])
            train_db.neg.extend(neg[i])
            train_db.facts.extend(facts[i])

        bk = Background(
            modes=modes,
        )

        clf = NeuralRDN(
            background=bk,
            target="advisedby",
            n_estimators=1,
            n_boost_estimators=10,
        )

        clf.fit(train_db)

        y_true = np.array([1.0 for _ in range(len(test_db.pos))] + [0.0 for _ in range(len(test_db.neg))])
        y_pred = clf.predict_proba(test_db)
        y_label = np.array([1.0 if i > 0.5 else 0.0 for i in y_pred])
        
        results.setdefault('roc', []).append(roc_auc_score(y_true, y_pred))
        results.setdefault('pr', []).append(pr_auc_score(y_true, y_pred))
        results.setdefault('log_loss', []).append(log_loss(y_true, y_pred))
        results.setdefault('precision', []).append(precision_score(y_true, y_label))
        results.setdefault('recall', []).append(recall_score(y_true, y_label))
    for key, value in results.items():
        metrics.setdefault(key, []).append(np.mean(value))



In [12]:
metrics

{'roc': [0.9012244897959183,
  0.9780000000000001,
  1.0,
  0.9934802571166207,
  0.876953125],
 'pr': [0.8687635544917756,
  0.9786785789431922,
  1.0000000000000002,
  0.9936771893776373,
  0.8442381193368036],
 'log_loss': [0.34772562357730097,
  0.20886873151175678,
  0.15615490740941215,
  0.14372346808174347,
  0.5810681204486172],
 'precision': [0.7966405075952199,
  0.8899999999999999,
  1.0,
  0.9244727568446732,
  0.7619047619047619],
 'recall': [0.937142857142857,
  0.95,
  0.888888888888889,
  0.9515151515151515,
  1.0]}

In [13]:
metrics = {}
for fold in range(len(pos)):
    results = {}
    for _ in range(len(pos)):
        test_db = Database()
        test_db.pos = pos[fold]
        test_db.neg = neg[fold]
        test_db.facts = facts[fold]

        train_db = Database()
        for i in range(len(pos)):
            if fold == i:
                continue
            train_db.pos.extend(pos[i])
            train_db.neg.extend(neg[i])
            train_db.facts.extend(facts[i])

        bk = Background(
            modes=modes,
        )

        clf = NeuralRDN(
            background=bk,
            target="advisedby",
            n_estimators=10,
            n_boost_estimators=1,
        )

        clf.fit(train_db)

        y_true = np.array([1.0 for _ in range(len(test_db.pos))] + [0.0 for _ in range(len(test_db.neg))])
        y_pred = clf.predict_proba(test_db)
        y_label = np.array([1.0 if i > 0.5 else 0.0 for i in y_pred])
        
        results.setdefault('roc', []).append(roc_auc_score(y_true, y_pred))
        results.setdefault('pr', []).append(pr_auc_score(y_true, y_pred))
        results.setdefault('log_loss', []).append(log_loss(y_true, y_pred))
        results.setdefault('precision', []).append(precision_score(y_true, y_label))
        results.setdefault('recall', []).append(recall_score(y_true, y_label))
    for key, value in results.items():
        metrics.setdefault(key, []).append(np.mean(value))

In [14]:
metrics

{'roc': [0.9517551020408164,
  0.9735000000000001,
  1.0,
  0.9804407713498623,
  0.901953125],
 'pr': [0.956262796027502,
  0.9733332551157208,
  1.0000000000000002,
  0.9833127997249527,
  0.8918796992481204],
 'log_loss': [0.24647344813016908,
  0.18871770443511196,
  0.18403350477003388,
  0.17861312601828214,
  0.486909983921214],
 'precision': [0.85,
  0.8658949745906268,
  1.0,
  0.9142857142857143,
  0.7619047619047619],
 'recall': [0.9714285714285713,
  0.97,
  0.8666666666666666,
  0.9696969696969697,
  1.0]}

In [23]:
rdnboost = {'roc': [0.9337959183673469,
  0.9804999999999999,
  1.0,
  0.9916437098255282,
  0.891015625],
 'pr': [0.9303855664365361,
  0.9803909019089225,
  1.0,
  0.9919522567091352,
  0.8699783790341066],
 'log_loss': [0.3092268185953405,
  0.2291802806775261,
  0.21881786172520182,
  0.20021028887455397,
  0.3833369362392089],
 'precision': [0.8468421052631581,
  0.8635234330886504,
  1.0,
  0.9122689075630251,
  0.7619047619047619],
 'recall': [0.9485714285714286,
  0.95,
  0.888888888888889,
  0.9454545454545455,
  1.0]}

boostingrdnmlp = {'roc': [0.9012244897959183,
  0.9780000000000001,
  1.0,
  0.9934802571166207,
  0.876953125],
 'pr': [0.8687635544917756,
  0.9786785789431922,
  1.0000000000000002,
  0.9936771893776373,
  0.8442381193368036],
 'log_loss': [0.34772562357730097,
  0.20886873151175678,
  0.15615490740941215,
  0.14372346808174347,
  0.5810681204486172],
 'precision': [0.7966405075952199,
  0.8899999999999999,
  1.0,
  0.9244727568446732,
  0.7619047619047619],
 'recall': [0.937142857142857,
  0.95,
  0.888888888888889,
  0.9515151515151515,
  1.0]}

baggingrdnmlp = {'roc': [0.9517551020408164,
  0.9735000000000001,
  1.0,
  0.9804407713498623,
  0.901953125],
 'pr': [0.956262796027502,
  0.9733332551157208,
  1.0000000000000002,
  0.9833127997249527,
  0.8918796992481204],
 'log_loss': [0.24647344813016908,
  0.18871770443511196,
  0.18403350477003388,
  0.17861312601828214,
  0.486909983921214],
 'precision': [0.85,
  0.8658949745906268,
  1.0,
  0.9142857142857143,
  0.7619047619047619],
 'recall': [0.9714285714285713,
  0.97,
  0.8666666666666666,
  0.9696969696969697,
  1.0]}

In [24]:
from IPython.display import display, Markdown, Latex

for metric, _ in rdnboost.items():
    display(Markdown('# Results for ' + metric))
    table = []
    table.append([
        '%.3f +/- %.3f' % (np.array(rdnboost[metric]).mean(), 2 * np.array(rdnboost[metric]).std()),
        '%.3f +/- %.3f' % (np.array(boostingrdnmlp[metric]).mean(), 2 * np.array(boostingrdnmlp[metric]).std()),
        '%.3f +/- %.3f' % (np.array(baggingrdnmlp[metric]).mean(), 2 * np.array(baggingrdnmlp[metric]).std()),
    ])
    display(pd.DataFrame(table, columns=['RDNBoost', 'Boosting RDN+MLP', 'Bagging RDN+MLP']))

# Results for roc

Unnamed: 0,RDNBoost,Boosting RDN+MLP,Bagging RDN+MLP
0,0.959 +/- 0.082,0.950 +/- 0.102,0.962 +/- 0.067


# Results for pr

Unnamed: 0,RDNBoost,Boosting RDN+MLP,Bagging RDN+MLP
0,0.955 +/- 0.097,0.937 +/- 0.133,0.961 +/- 0.075


# Results for log_loss

Unnamed: 0,RDNBoost,Boosting RDN+MLP,Bagging RDN+MLP
0,0.268 +/- 0.137,0.288 +/- 0.327,0.257 +/- 0.235


# Results for precision

Unnamed: 0,RDNBoost,Boosting RDN+MLP,Bagging RDN+MLP
0,0.877 +/- 0.157,0.875 +/- 0.173,0.878 +/- 0.156


# Results for recall

Unnamed: 0,RDNBoost,Boosting RDN+MLP,Bagging RDN+MLP
0,0.947 +/- 0.070,0.946 +/- 0.071,0.956 +/- 0.092
