References:

Logistic regression: https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a

ROC curves: https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

Controlling the threshold in Logistic Regression in Scikit Learn: https://stackoverflow.com/questions/28716241/controlling-the-threshold-in-logistic-regression-in-scikit-learn

sklearn.metrics.precision_recall_curve: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html


# Preprocessing

In [1]:
import pickle
import numpy as np
import pandas as pd
import itertools
from scipy import interp
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, classification_report, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
%matplotlib notebook


  import pandas.util.testing as tm


In [2]:
# Load and preprocess the KGE/curation dataset
with open('kge_dataset.pkl', 'rb') as f:
    kge_dataset = pickle.load(f)
kge_df = pd.DataFrame.from_records(kge_dataset)
kge_df = kge_df.fillna(0)
# Every column except agent names and stmt type should be int
dtype_dict = {col: 'int64' for col in kge_df.columns
              if col not in ('agA_name', 'stmt_type', 'agB_name')}
kge_df = kge_df.astype(dtype_dict)
kge_df.head()

Unnamed: 0,stmt_num,stmt_hash,agA_name,stmt_type,agB_name,correct,medscan,reach,sparser,rlimsp,trips,hprd,isi,biopax,bel,signor,trrust
0,0,28616404731074564,TGFB1,Activation,MAPK3,1,17,3,3,0,0,0,0,0,0,0,0
1,1,34133827417313284,TP53,IncreaseAmount,NKX2-1,0,0,6,0,0,0,0,0,0,0,0,0
2,2,2587683760588810,DACT2,Complex,CTNNB1,1,1,4,2,0,0,0,0,0,0,0,0
3,4,-20297417063899124,LRRK2,Phosphorylation,EIF4EBP1,1,1,11,13,10,0,0,0,0,0,0,0
4,5,-31679163966597107,IGF1,Activation,ERK,1,16,98,29,0,8,0,0,0,0,0,0


# Visualizing the data

In [3]:
plt.figure()

def rand_jitter(arr):
    stdev = .01*(max(arr)-min(arr))
    return arr + np.random.randn(len(arr)) * stdev
reader1 = 'reach'
reader2 = 'sparser'

kge_cp = kge_df.copy()
kge_cp  = kge_cp[(kge_df[reader1] > 0) | (kge_df[reader2] > 0)]
kge_cp[reader1] = np.log(kge_df[reader1] + 1)
kge_cp[reader2] = np.log(kge_df[reader2] + 1)

kge_incorr = kge_cp[kge_cp['correct'] == 0]
kge_corr = kge_cp[kge_cp['correct'] == 1]

plt.plot(rand_jitter(kge_corr[reader1]),
         rand_jitter(kge_corr[reader2]), linestyle='', marker='.',
         color='blue', alpha=0.5, label='Correct')
plt.plot(rand_jitter(kge_incorr[reader1]),
         rand_jitter(kge_incorr[reader2]), linestyle='', marker='.',
                     color='red', alpha=0.5, label='Incorrect')
plt.xlabel(f'log({reader1} + 1)')
plt.ylabel(f'log({reader2} + 1)')
plt.legend(loc='upper left')

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7fc7382a9e10>

In [4]:
source_cols = ['medscan', 'reach', 'sparser', 'rlimsp', 'trips', 'hprd', 'isi',
               'biopax', 'bel', 'signor', 'trrust']
reader = 'reach'
source_cols.pop(source_cols.index(reader))
# Reader only dataframe
ro_df = kge_df[~kge_df[source_cols].any(axis=1)]
x = list(range(1, 11))
r_only = []
r_all = []
r_only_cts = []
r_all_cts = []

def stderrp(p, n):
    pa = np.array(p)
    na = np.array(n)
    return(np.sqrt((pa*(1-pa))/na))

for i in x:
    lb = i
    if i < 10:
        ub = i+1
    else:
        ub = 10000
        
    r_only_rows = ro_df[(ro_df[reader] >= lb) & (ro_df[reader] < ub)]
    r_only.append(r_only_rows['correct'].mean())
    r_only_cts.append(len(r_only_rows['correct']))
    r_all_rows = kge_df[(kge_df[reader] >= lb) & (kge_df[reader] < ub)]
    r_all.append(r_all_rows['correct'].mean())
    r_all_cts.append(len(r_all_rows['correct']))
    
plt.figure()
plt.errorbar(x, r_only, linestyle='', yerr=stderrp(r_only, r_only_cts), marker='o', color='r', label=f'{reader} only')
plt.errorbar(x, r_all, linestyle='', yerr=stderrp(r_all, r_all_cts), marker='o', color='b', label=f'{reader} any')
plt.legend(loc='lower left')
plt.ylabel('Precision')
plt.xlabel(f'Num. of {reader}-only evidences')
plt.ylim([0, 1])


<IPython.core.display.Javascript object>

(0, 1)

# Examining probabilities and assumptions

In [154]:
r1 = 'reach'
r2 = 'sparser'
r3 = 'sparser'

# Convert counts and correct into boolean variables
dtype_dict = {col: 'bool' for col in kge_df.columns
              if col not in ('stmt_num', 'stmt_hash', 'agA_name', 'stmt_type', 'agB_name')}
bin_df = kge_df.astype(dtype_dict)
dtype_dict = {col: 'int64' for col in kge_df.columns
              if col not in ('stmt_num', 'stmt_hash', 'agA_name', 'stmt_type', 'agB_name')}
bin_df = bin_df.astype(dtype_dict)
#bin_df_sub = bin_df[(bin_df[r1] > 0) | (bin_df[r2] > 0)]
#bin_df_sub = bin_df[(bin_df[r1] > 0) | (bin_df[r2] > 0) | (bin_df[r3] > 0)]
bin_df_sub = bin_df
bin_df_sub.head()

Unnamed: 0,stmt_num,stmt_hash,agA_name,stmt_type,agB_name,correct,medscan,reach,sparser,rlimsp,trips,hprd,isi,biopax,bel,signor,trrust
0,0,28616404731074564,TGFB1,Activation,MAPK3,1,1,1,1,0,0,0,0,0,0,0,0
1,1,34133827417313284,TP53,IncreaseAmount,NKX2-1,0,0,1,0,0,0,0,0,0,0,0,0
2,2,2587683760588810,DACT2,Complex,CTNNB1,1,1,1,1,0,0,0,0,0,0,0,0
3,4,-20297417063899124,LRRK2,Phosphorylation,EIF4EBP1,1,1,1,1,1,0,0,0,0,0,0,0
4,5,-31679163966597107,IGF1,Activation,ERK,1,1,1,1,0,1,0,0,0,0,0,0


In [155]:
bin_df_sub.corr()

Unnamed: 0,stmt_num,stmt_hash,correct,medscan,reach,sparser,rlimsp,trips,hprd,isi,biopax,bel,signor,trrust
stmt_num,1.0,0.002233,0.00064,-0.032201,0.044611,0.008207,-0.017594,-0.073077,-0.001898,-0.013813,-0.009868,-0.029854,-0.064363,0.010059
stmt_hash,0.002233,1.0,0.071221,0.011844,0.026335,0.109752,0.106569,-0.002421,-0.008343,-0.013807,0.04359,-0.010486,0.025688,0.004797
correct,0.00064,0.071221,1.0,0.369241,0.161226,0.300249,0.135362,0.198552,0.157005,0.07349,0.106446,0.081996,0.130308,0.031781
medscan,-0.032201,0.011844,0.369241,1.0,0.228863,0.333141,-0.043731,0.322052,0.236793,0.138013,0.01648,0.107998,0.124881,0.039329
reach,0.044611,0.026335,0.161226,0.228863,1.0,0.1433,-0.142208,-0.139737,0.071675,0.035355,-0.03028,0.035122,0.032901,0.037085
sparser,0.008207,0.109752,0.300249,0.333141,0.1433,1.0,0.131491,0.232273,0.222626,0.124464,0.098908,0.102867,0.118098,-0.071283
rlimsp,-0.017594,0.106569,0.135362,-0.043731,-0.142208,0.131491,1.0,-0.150048,-0.091077,-0.084016,0.370785,-0.047833,0.122008,-0.036332
trips,-0.073077,-0.002421,0.198552,0.322052,-0.139737,0.232273,-0.150048,1.0,0.201432,0.165541,-0.070246,0.148282,0.06867,0.011475
hprd,-0.001898,-0.008343,0.157005,0.236793,0.071675,0.222626,-0.091077,0.201432,1.0,0.301695,0.058452,0.366944,0.298231,-0.022353
isi,-0.013813,-0.013807,0.07349,0.138013,0.035355,0.124464,-0.084016,0.165541,0.301695,1.0,-0.03317,0.193928,0.103341,-0.009903


In [125]:
#rs_only_df = kge_df[(kge_df[r1] > 0) | (kge_df[r2] > 0)]
#rs_only_df = kge_df[(kge_df[r1] > 0) | (kge_df[r2] > 0) | (kge_df[r3] > 0)]
# Readers are same / 100% dependent
#rs_only_df[r2] = kge_df[r1]
# Readers are independent
#rs_only_df[r2] = rs_only_df[r2].sample(frac=1).values
#rs_only_df = kge_df

def prob(v, c=None, df=bin_df_sub):
    """Probability of variables v: vals conditioned on variables c: vals"""
    if c is None:
        c = []
    if not v:
        raise ValueError("v variables must be specified")
    c_df = df
    for c_var, c_val in c:
        c_df = c_df[c_df[c_var] == c_val]
    vc_df = None
    for v_var, v_val in v:
        if vc_df is None:
            vc_df = c_df[c_df[v_var] == v_val]
        else:
            vc_df = vc_df[vc_df[v_var] == v_val]
    return len(vc_df) / len(c_df)

In [126]:
# Test for conditional independence--is R1 independent of R2 given corr?
print(f"p({reader1} | {reader2}, corr)", prob([(r1, 1)], [(r2, 1), ('correct', 1)]))
print(f"p({reader1} | corr)", prob([(r1, 1)], [('correct', 1)]))
print()

# Are r1 and r2 independent?
print(f"p({reader1} and {reader2})", prob([(r1, 1), (r2, 1)]))
print(f"p({reader1}) * p({reader2})", prob([(r1, 1)]) * prob([(r2, 1)]))
print()

print(f"Ground truth: p(corr, {reader1}, {reader2})", prob([('correct', 1), (r1, 1), (r2, 1)]))
print(f"Ground truth: p({reader1}, {reader2})", prob([(r1, 1), (r2, 1)]))
print(f"Ground truth: p(corr | {reader1}, {reader2})", prob([('correct', 1)], [(r1, 1), (r2, 1)]))
print(f"Assume corr CI of {reader1} given {reader2} = p(corr | {reader2})",
      prob([('correct', 1)], [(reader2, 1)]))


p(reach | sparser, corr) 0.8565310492505354
p(reach | corr) 0.7905138339920948

p(reach and sparser) 0.44136460554371004
p(reach) * p(sparser) 0.4107432681248039

Ground truth: p(corr, reach, sparser) 0.42643923240938164
Ground truth: p(reach, sparser) 0.44136460554371004
Ground truth: p(corr | reach, sparser) 0.966183574879227
Assume corr CI of reach given sparser = p(corr | sparser) 0.9174852652259332


In [127]:
# Both readers > 0
df = bin_df
print(r1, "only", len(df[(df[r1] > 0) & (df[r2] == 0)]))
print(r2, "only", len(df[(df[r2] > 0) & (df[r1] == 0)]))
print(r1, "and", r2, len(df[(df[r2] > 0) & (df[r1] > 0)]))
print("total", len(df))

reach only 296
sparser only 95
reach and sparser 414
total 938


In [128]:
# 1. The joint event
prob([('correct', 1), (r1, 1), (r2, 1)])

0.42643923240938164

In [129]:
# 2. By chain rule (exact vs above)
prob([(r1, 1)]) * prob([('correct', 1)], [(r1, 1)]) * prob([(r2, 1)], [('correct', 1), (r1, 1)])

0.42643923240938164

In [130]:
# 3. What we want to know: p(c|r,s) (exact)
prob([('correct', 1)], [(r1, 1), (r2, 1)])

0.966183574879227

In [131]:
# 4. What we want to know, by defn of cond prob vs. above (exact)
prob([('correct', 1), (r1, 1), (r2, 1)]) / prob([(r1, 1), (r2, 1)]) #

0.9661835748792269

In [132]:
# 5 Applying chain rule (combining 2 and 4)
((prob([(r1, 1)]) * prob([('correct', 1)], [(r1, 1)]) * prob([(r2, 1)], [('correct', 1), (r1, 1)])) / 
     prob([(r1, 1), (r2, 1)]))

0.9661835748792269

In [133]:
# 6. Applying Bayes rule for p(r2|c,r1)
((prob([(r1, 1)]) * prob([('correct', 1)], [(r1, 1)]) * prob([('correct', 1), (r1, 1)], [(r2, 1)]) * prob([(r2, 1)])) / 
     (prob([(r1, 1), (r2, 1)])*prob([('correct', 1)], [(r1, 1)])*prob([(r1, 1)])))

0.9661835748792269

In [134]:
# 7. Cancel terms * (Exact)
# p(c,r1|r2) * p(r2)
# ------------------
#      p(r1,r2)
exact = (prob([('correct', 1), (r1, 1)], [(r2, 1)]) * prob([(r2, 1)])) / (prob([(r1, 1), (r2, 1)]))
print("Exact:", exact)

# 8. Complete identity/correlation
redundant = prob([('correct', 1)], [(r2, 1)])
print("Redundant:", redundant)

# 9. Complete independence
indep = (prob([('correct', 1)], [(r1, 1)]) * prob([('correct', 1)], [(r2, 1)])) / prob([('correct', 1)])
print("Independent", indep)

# 10. Like coin flips
coins = 1 - (1 - prob([('correct', 1)], [(r1, 1)])) * (1 - prob([('correct', 1)], [(r2, 1)])) 
print("Joint belief:", coins)

# 8. Substituting approximation: p(c,r1|r2) = p(c|r2)*p(r1|r2)
# p(c|r2) * p(r1|r2) * p(r2)
# --------------------------
#      p(r1,r2)
#(prob(['correct'], [r2]) * prob([r1], [r2]) * prob([r2])) / (prob([r1, r2]))

# Full independence:
# p(c|r2) * p(r1) * p(r2)
# -------------------------- = p(c|r2)
#      p(r1) * p(r2)

# Full dependence:
# p(c|r2) * 1 * p(r2)
# --------------------------
#      p(r1,r2)

#((prob([r1]) * prob(['correct'], [r1]) * prob([r1], [r2]) * prob([r2])) / 
#     (prob([r1, r2])*prob(['correct'], [r1])*prob([r1])))

Exact: 0.9661835748792268
Redundant: 0.9174852652259332
Independent 0.9581931512352339
Joint belief: 0.9872160270068348


In [135]:
# An expression that we need.
# If only one reader, all rows will have at least one evidence from that reader, whether correct or not
prob([(r1, 1), (r2, 1)], [('correct', 1)])

0.5270092226613966

In [136]:
# Approximation of above - if readers are the same, this should be quite wrong; if independent, quite right
prob([(r1, 1)], [('correct', 1)]) * prob([(r2, 1)], [('correct', 1)])

0.4863899347487593

In [137]:
prob([(r1, 1)], [('correct', 1)])

0.7905138339920948

In [138]:
# Bayes rule vs p(r1,r2|c) - exact
(prob(['correct'], [r1, r2]) * prob([r1, r2])) / prob(['correct'])

ValueError: too many values to unpack (expected 2)

In [139]:
# Rearrange from above - exact
(prob([(r1, 1), (r2, 1)], [('correct', 1)]) * prob([('correct', 1)])) / prob([(r1, 1), (r2, 1)])

0.9661835748792271

In [140]:
# Now, apply our assumption that p(r,s|c) ~= p(r|c)p(s|c)
# A hint in the fact that effectiveness of this changes when we filter dataset to statements with at least
# one of the two readers
(prob([(r1, 1)], [('correct', 1)]) * prob([(r2, 1)], [('correct', 1)]) * prob([('correct', 1)])) / prob([(r1, 1), (r2, 1)])

0.8917148803727254

In [141]:
prob([('correct', 1)])

0.8091684434968017

# Approximating the joint distribution

## Bahadur Lazarsfeld second order model

The notation/nomenclature in the functions below here is based on page 3 of "Estimation in second order dependency model for multivariate binary data", E.H.S. Ip. Link [here](https://statistics.stanford.edu/sites/g/files/sbiybj6031/f/OLK%20NSF%20302.pdf). 

In [142]:
# alpha(Yi) = E(Yi), i.e., probability Yi = 1, i.e., p in the usual notation for bernoulli variables.
def alpha(col, df=bin_df_sub):
    return df[col].mean()

# This function returns the probability  p(Y1 = y1, Y2=y2, Y3=y3...) etc. as the product of
# the independent probabilities i.e., p(Y1=y1)*p(Y2=y2)*p(Y3=y3)*...
def p1y(col_vals):
    prod = 1
    for col, val in col_vals:
        if val not in (0, 1):
            raise ValueError("Values must be 0 or 1")
        prod *= (alpha(col)**val) * (1 - alpha(col))**(1 - val)
    return prod

# W has two senses in the notation used by Ip. If no value is given (val=None),
# evaluates to (Yi - p)/sqrt(pq) = a standardized version of Yi (offset by the mean and divided by the SD).
# If given a specific value yi (0 or 1), gives the standardized probability of yi.
def W(col, val=None, df=bin_df_sub):
    a = alpha(col, df)
    denom = np.sqrt(a * (1 - a))
    if val is None:
        numer = (df[col] - a).values
    else:
        numer = val - a
    return numer / denom

# Returns E(W1*W2*...*Wi). For the case of two variables Yi and Yj, r_ij = E(Wi*Wj) is
# equivalent to the correlation between Yi and Yj.
def r(cols):
    return np.mean(np.prod(np.stack([W(col) for col in cols]), axis=0))

# For a set of values associated with variables, i.e., y1, y2, ... yn,
# returns the product W(y1)*W(y2)*...*W(yn), i.e., the product of the normalized
# probabilities of each Yi taking a specific value yi.
def W_prod(col_vals):
    return np.prod([W(col, val) for col, val in col_vals])
    
# Generates a term in the expansion of the given order. For order 2,
# returns sum_(j<k) {rjk * Wj * Wk}, i.e., based on pairwise correlations.
def term(col_vals, order):
    total = 0
    for col_val_combo in itertools.combinations(col_vals, order):
        r_term = r([col for col, val in col_val_combo])
        W_term = W_prod(col_val_combo)
        total += r_term * W_term
    return total

def bahadur_laz(col_vals, max_order):
    fy = 1
    if max_order < 1 or max_order > len(col_vals):
        raise ValueError('max_order must be between 1 and len(col_vals)')
    for order in range(2, max_order+1):
        fy += term(col_vals, order)
    return p1y(col_vals) * fy    

In [144]:
bin_df_sub.corr()

Unnamed: 0,stmt_num,stmt_hash,correct,medscan,reach,sparser,rlimsp,trips,hprd,isi,biopax,bel,signor,trrust
stmt_num,1.0,0.002233,0.00064,-0.032201,0.044611,0.008207,-0.017594,-0.073077,-0.001898,-0.013813,-0.009868,-0.029854,-0.064363,0.010059
stmt_hash,0.002233,1.0,0.071221,0.011844,0.026335,0.109752,0.106569,-0.002421,-0.008343,-0.013807,0.04359,-0.010486,0.025688,0.004797
correct,0.00064,0.071221,1.0,0.369241,0.161226,0.300249,0.135362,0.198552,0.157005,0.07349,0.106446,0.081996,0.130308,0.031781
medscan,-0.032201,0.011844,0.369241,1.0,0.228863,0.333141,-0.043731,0.322052,0.236793,0.138013,0.01648,0.107998,0.124881,0.039329
reach,0.044611,0.026335,0.161226,0.228863,1.0,0.1433,-0.142208,-0.139737,0.071675,0.035355,-0.03028,0.035122,0.032901,0.037085
sparser,0.008207,0.109752,0.300249,0.333141,0.1433,1.0,0.131491,0.232273,0.222626,0.124464,0.098908,0.102867,0.118098,-0.071283
rlimsp,-0.017594,0.106569,0.135362,-0.043731,-0.142208,0.131491,1.0,-0.150048,-0.091077,-0.084016,0.370785,-0.047833,0.122008,-0.036332
trips,-0.073077,-0.002421,0.198552,0.322052,-0.139737,0.232273,-0.150048,1.0,0.201432,0.165541,-0.070246,0.148282,0.06867,0.011475
hprd,-0.001898,-0.008343,0.157005,0.236793,0.071675,0.222626,-0.091077,0.201432,1.0,0.301695,0.058452,0.366944,0.298231,-0.022353
isi,-0.013813,-0.013807,0.07349,0.138013,0.035355,0.124464,-0.084016,0.165541,0.301695,1.0,-0.03317,0.193928,0.103341,-0.009903


In [159]:
# p(R1=1, R2=1, corr=1)
r1 = 'reach'
r2 = 'trips'
r3 = 'medscan'
r4 = 'trips'
r5 = 'rlimsp'
#Y = [(r1, 1), (r2, 1), (r3, 1), (r4, 1), (r5, 1), ('correct', 1)]
Y = [(r1, 0), (r2, 0), (r3, 0), (r4, 0), (r5, 1), ('correct', 1)]
Y = [(r1, 1), (r2, 0), ('correct', 1)]

#Y = [(r1, 1), ('correct', 1)]
#Y = [(r1, 1), (r2, 0), ('correct', 1)]

given_vars = [r1, r2, r3, r4, r5]
given_vals = [(col, val) for col, val in Y if col in given_vars]

def belief(cols):
    prod = 1
    for col in cols:
        col_df = bin_df_sub[bin_df_sub[col] == 1]
        err = prob([('correct', 0)], df=col_df)
        prod *= err
    return 1 - prod

print(f"Ground truth: p(corr=1 | {str(given_vals)})", prob([('correct', 1)], given_vals))
print("Belief estimate:", belief_mult([r1]))


#print(f"Ground truth: p({str(Y)})", prob(Y))
for order in range(1, len(Y)+1):
    if order == 2:
        print(f"Bahadur-Lazarsfeld Order {order}:", bahadur_laz(Y, order) / prob(given_vals))

Ground truth: p(corr=1 | [('reach', 1), ('trips', 0)]) 0.7725225225225225
Belief estimate: 0.8450704225352113
Bahadur-Lazarsfeld Order 2: 0.7813861331109845


In [64]:
last_term = np.mean(W(r1) * W(r2) * W('correct')) * W(r1, 1) * W(r2, 1) * W('correct', 1)
p2y = p1y * ((1 + ft(r1, 1, 'correct', 1) + ft(r2, 1, 'correct', 1) + ft(r1, 1, r2, 1)))
p3y = p1y * ((1 + ft(r1, 1, 'correct', 1) + ft(r2, 1, 'correct', 1) + ft(r1, 1, r2, 1)) + last_term)

NameError: name 'ft' is not defined

In [441]:
p2y

0.48490898168611224

In [40]:
print(f"Ground truth: p(corr, {r1}, {r2})", prob([('correct', 1), (r1, 1), (r2, 1)]))
print(f"Ground truth: p({r1}, {r2})", prob([(r1, 1), (r2, 1)]))
print(f"Ground truth: p(corr | {r1}, {r2})", prob([('correct', 1)], [(r1, 1), (r2, 1)]))

Ground truth: p(corr, reach, sparser) 0.4968944099378882
Ground truth: p(reach, sparser) 0.5142857142857142
Ground truth: p(corr | reach, sparser) 0.966183574879227


In [443]:
p2y / prob([r1, r2])

0.9428785755007739

In [444]:
p3y / prob([r1, r2])

0.9661835748792272

# Working with Knowledge Graph Embeddings

In [3]:
# Load previous curation dataset and the KGE scores
with open('curation_dataset.pkl', 'rb') as f:
    data_dict = pickle.load(f)

kge_df_scores = pd.read_csv('kge_dataset_w_scores.csv', index_col=0)
kge_df_scores.head()

Unnamed: 0,subject,relation,object,correct,score
0,TP53,IncreaseAmount,NKX2-1,0,-7.389051
1,DACT2,Complex,CTNNB1,1,-8.082224
2,LRRK2,Phosphorylation,EIF4EBP1,1,-7.293999
3,TSC2,Complex,CDK1,1,-7.195977
4,BCAS2,DecreaseAmount,TP53,1,-7.502889


In [4]:
# Prepare the scores data for the join
kds = kge_df_scores
kds = kds.rename(columns={'subject': 'agA_name', 'object': 'agB_name', 'relation': 'stmt_type'})
kds = kds.set_index([kds.agA_name, kds.stmt_type, kds.agB_name, kds.correct])
kds.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,agA_name,stmt_type,agB_name,correct,score
agA_name,stmt_type,agB_name,correct,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TP53,IncreaseAmount,NKX2-1,0,TP53,IncreaseAmount,NKX2-1,0,-7.389051
DACT2,Complex,CTNNB1,1,DACT2,Complex,CTNNB1,1,-8.082224
LRRK2,Phosphorylation,EIF4EBP1,1,LRRK2,Phosphorylation,EIF4EBP1,1,-7.293999
TSC2,Complex,CDK1,1,TSC2,Complex,CDK1,1,-7.195977
BCAS2,DecreaseAmount,TP53,1,BCAS2,DecreaseAmount,TP53,1,-7.502889


In [5]:
# Set the multiindex on the main dataframe and do the join
kge_join = kge_df.set_index([kge_df.agA_name, kge_df.stmt_type, kge_df.agB_name, kge_df.correct]).join(kds, rsuffix='sc')
kge_join = kge_join.set_index(kge_join.stmt_num)
kge_join = kge_join.drop(columns=['agA_namesc', 'stmt_typesc', 'agB_namesc', 'correctsc'])
kge_join = kge_join[~pd.isna(kge_join.score)] # Keep only the rows with KGE scores
kge_join.head()

Unnamed: 0_level_0,stmt_num,stmt_hash,agA_name,stmt_type,agB_name,correct,medscan,reach,sparser,rlimsp,trips,hprd,isi,biopax,bel,signor,trrust,score
stmt_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
425,425,35492552341195618,ABL1,Phosphorylation,BCAR1,1,1,4,4,2,0,0,0,0,0,0,0,-6.827783
525,525,17253596610577440,ABL1,Phosphorylation,DGCR8,1,0,0,0,1,0,0,0,0,0,0,0,-7.676037
704,704,35535328953779602,ABL1,Phosphorylation,ENAH,1,0,1,0,2,0,0,0,0,0,0,0,-7.470222
567,567,31602217128375424,ABL1,Phosphorylation,PRKCD,1,2,1,1,3,0,0,0,0,0,0,0,-6.561732
986,986,34962009800728561,ABL1,Phosphorylation,SORBS1,1,0,0,4,2,0,0,0,0,0,0,0,-7.506958


In [6]:
# Old version: Load the data from curation_dataset dictionary
# df = pd.DataFrame.from_dict(data_dict)
# Preprocessing: replace missing values with 0s and encode stmt type as int
# df = df.fillna(0).sample(frac=1)

# New version: drop irrelevant columns
df = kge_join.drop(columns=['stmt_hash', 'stmt_num', 'agA_name', 'agB_name'])
le = LabelEncoder()
le.fit(df.stmt_type)
df.stmt_type = le.transform(df.stmt_type)
df.head()
x_df = df.drop('correct', axis=1)
x = x_df.values
y = df.correct.values
test_split = 0.2
test_size = int(len(y) * test_split)

In [7]:
test_size

102

# Train Models

In [21]:
num_folds = 10
models = {
    'Logistic Regression': LogisticRegression(),
    #'Random Forest Classifier': RandomForestClassifier(),
}
predictors = {'All features': x,
              'Without KGE': x[:, :-1], # Drop the score column
              'Only KGE': x[:, -1:]} # Only the score column

clf_results = {}
roc_results = {}
roc_aucs= {}
# For each model...
base_fpr = np.linspace(0, 1, 101)
for clf_name, clf in models.items():
    # Try different predictors...
    for pred_name, pred_x in predictors.items():
        # Fold the data multiple times
        # Save the data in a matrix (num_folds, test_size)
        tpr_arr = np.zeros((num_folds, len(base_fpr)))
        roc_auc_arr = np.zeros(num_folds)
        for fold_ix in range(num_folds):
            # Split the data
            x_train, x_test, y_train, y_test = train_test_split(pred_x, y, test_size=test_size)
            print(clf_name, pred_name, x_train.shape)
            # Train the model
            clf.fit(x_train, y_train)
            # Save the data
            clf_result = {}
            clf_result['y_preds'] = clf.predict(x_test)
            clf_result['y_probs'] = clf.predict_proba(x_test)
            #clf_results[clf_name] = clf_result
            
            # Compute ROC curve and ROC area for each class
            #roc_results = {}
            fpr, tpr, thresholds = roc_curve(y_test, clf_result['y_probs'][:, 1])
            roc_auc = auc(fpr, tpr)
            tpr = interp(base_fpr, fpr, tpr)
            tpr[0] = 0.0
            tpr_arr[fold_ix, :] = tpr
            roc_auc_arr[fold_ix] = roc_auc
            #roc_result = {}
            #roc_result['fpr'] = fpr
            #roc_result['tpr'] = tpr
            #roc_result['roc_auc'] = roc_auc
            #roc_results[clf_name] = roc_result
        model_key = '%s %s' % (clf_name, pred_name)
        roc_results[model_key] = tpr_arr.mean(axis=0)
        roc_aucs[model_key] = roc_auc_arr

Logistic Regression All features (412, 13)
Logistic Regression All features (412, 13)
Logistic Regression All features (412, 13)
Logistic Regression All features (412, 13)
Logistic Regression All features (412, 13)
Logistic Regression All features (412, 13)
Logistic Regression All features (412, 13)
Logistic Regression All features (412, 13)
Logistic Regression All features (412, 13)
Logistic Regression All features (412, 13)
Logistic Regression Without KGE (412, 12)
Logistic Regression Without KGE (412, 12)
Logistic Regression Without KGE (412, 12)
Logistic Regression Without KGE (412, 12)
Logistic Regression Without KGE (412, 12)
Logistic Regression Without KGE (412, 12)
Logistic Regression Without KGE (412, 12)
Logistic Regression Without KGE (412, 12)
Logistic Regression Without KGE (412, 12)
Logistic Regression Without KGE (412, 12)
Logistic Regression Only KGE (412, 1)
Logistic Regression Only KGE (412, 1)
Logistic Regression Only KGE (412, 1)
Logistic Regression Only KGE (412, 1

In [22]:
# Correctness prediction for a single Sparser evidence
#sparser_ex = np.array([[0, 0, 1,0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
#models['Logistic Regression'].predict_proba(sparser_ex)
#models['Random Forest Classifier'].predict_proba(sparser_ex)

# ROC Curves

In [23]:
# Compute micro-average ROC curve and ROC area
#fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
#roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [24]:
#with open('belief_fpr_tpr_auc.pkl', 'rb') as f:
#    bel_fpr, bel_tpr, bel_roc_auc = pickle.load(f)


In [25]:
plt.figure()
lw = 2
colors = ['r', 'g', 'b', 'orange', 'k', 'y']

# Plot ROC curve for Belief Model (REACH)
#plt.plot(bel_fpr, bel_tpr, color=colors[0],
#         lw=lw, label='Orig. Belief (area = %0.2f)' % bel_roc_auc)
for i, (clf_name, roc_result) in enumerate(roc_results.items()):
    #fpr = roc_result['fpr']
    #tpr = roc_result['tpr']
    roc_auc_arr = roc_aucs[clf_name]
    plt.plot(base_fpr, roc_result, color=colors[i],
         lw=lw, label='%s (area = %0.2f +/- %0.2f)' %
                      (clf_name, roc_auc_arr.mean(), roc_auc_arr.std()))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('bel_roc.pdf')
plt.show()


<IPython.core.display.Javascript object>

In [None]:
print(classification_report(y_test, y_preds, labels=[0, 1]))

In [None]:
with open('belief_prec_rec_thresh_auc.pkl', 'rb') as f:
    bel_prec, bel_rec, bel_thresh, bel_pr_auc = pickle.load(f)


In [None]:
pr_results = {}
for clf_name, clf_result in clf_results.items():
    precision, recall, thresholds = precision_recall_curve(y_test, clf_result['y_probs'][:, 1])
    pr_auc = metrics.auc(recall, precision)
    pr_results[clf_name] = {'precision': precision, 'recall': recall,
                            'thresholds': thresholds, 'pr_auc': pr_auc}

"""
plt.figure()
plt.plot(bel_thresh, bel_prec[: -1], color=colors[0],
             linestyle='-', label="Orig. Belief Precision (area = %0.2f)" % bel_pr_auc)
plt.plot(bel_thresh, bel_rec[: -1], color=colors[0],
             linestyle='--', label="Orig. Belief Recall")
""" 

for i, (clf_name, pr_result) in list(enumerate(pr_results.items()))[0:1]:
    plt.plot(pr_result['thresholds'], pr_result['precision'][: -1], color=colors[i],
             linestyle='-', label="%s Precision (area = %0.2f)" % (clf_name, pr_result['pr_auc']))
    plt.plot(pr_result['thresholds'], pr_result['recall'][: -1], color=colors[i+1],
             linestyle='--', label="%s Recall" % clf_name)
    
plt.title("Precision-Recall vs Threshold Chart")
plt.ylabel("Precision, Recall")
plt.xlabel("Threshold")
plt.legend(loc="lower left")
plt.ylim([0,1])
plt.savefig('prec_rec.pdf')
plt.show()

In [None]:
# Associate precision values for the statements at each threshold
# So, if a statement has probability value, check if it is above the threshold, and if so,
# assign it that precision.
list(zip(thresholds, precision))

In [None]:
cm = metrics.confusion_matrix(y_test, predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);

# Logistic Regression

In [None]:
logisticRegr.coef_

In [None]:
sorted(list(zip(x_df.columns, logisticRegr.coef_[0])), key=lambda x: x[1], reverse=True)