# Optimize Dimensionality Parameters

In [29]:
import numpy as np
import scipy
from sklearn.decomposition import IncrementalPCA
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
import time

import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '../../'))
from retrofit_identity import retrofit_identity
from retrofit_linear   import retrofit_linear
from retrofit_neural   import retrofit_neural
from utils import *

In [32]:
def make_set(rep, indices):
    return np.array([np.hstack((rep[i], rep[j])) for (i,j) in indices])

def evaluate(X_retrofitted, train_indices, test_indices, rf):
    X_train = make_set(X_retrofitted, train_indices)
    X_test  = make_set(X_retrofitted, test_indices)
    rf.fit(X_train, Y_train)
    return rf.score(X_test, Y_test)

In [34]:
in_edges = load_obj("in_edges_filtered")
out_edges = load_obj("out_edges_filtered")
X_cooccur_raw = load_obj("X_cooccur")

dummy = DummyClassifier(strategy='most_frequent')
rf = RandomForestClassifier(n_estimators=100)
subgraph = ['has_finding_site', 'has_pathological_process', 'due_to', 'cause_of']

n_iters = 3
scores = {e : {'Raw': [], 'Identity' : [], 'Linear'   : [], 'Neural'   : []} for e in subgraph}
p_train=0.7

# Reduce dimensionality
pca = IncrementalPCA(n_components=25)
X_reduced_25 = pca.fit_transform(X_cooccur_raw)
pca = IncrementalPCA(n_components=10)
X_reduced_10 = pca.fit_transform(X_cooccur_raw)

for edge_type in subgraph:
    print("-"*20 + "\nLeaving out relations of type: '{}'".format(edge_type))
    rels_to_fit = set(subgraph) - set([edge_type])
    in_edges_to_fit = {r: in_edges[r] for r in rels_to_fit}
    out_edges_to_fit = {r: out_edges[r] for r in rels_to_fit}

    # Measure accuracy.
    for iteration in range(1, n_iters+1):
        print("="*60 + "\nIteration: {}\n".format(iteration) + "="*60)

        # Retrofitting.
        t = time.time()
        X_raw = X_reduced_25
        X_ide = retrofit_identity(X_reduced_25, in_edges_to_fit, n_iter=20)
        (X_lin, A, B) = retrofit_linear(X_reduced_25, in_edges_to_fit,
                                        out_edges_to_fit, lam=1e-3, tol=1e-5,
                                        lr=0.5, lr_decay=0.99, n_iter=20)
        (X_neu, A_neu, u_neu, b_neu) = retrofit_neural(X_reduced_10, in_edges_to_fit,
                                           out_edges_to_fit, k=10, lam=1e-5, tol=1e-5,
                                           lr=5e-4, lr_decay=0.99, batch_size=512,
                                           n_iter=200, verbose=10, patience=100)
  
        # Train/test split of evaluation set.
        train_indices, test_indices, neg_train_indices, neg_test_indices = get_train_test_indices(
            in_edges[edge_type], p_train, neg_sampling=1.0)
        Y_train = np.ones(len(train_indices))
        Y_train = np.concatenate((Y_train, np.zeros((len(neg_train_indices), ))))
        Y_test  = np.ones(len(test_indices))
        Y_test  = np.concatenate((Y_test,  np.zeros((len(neg_test_indices),  ))))
        train_indices.extend(neg_train_indices)
        test_indices.extend(neg_test_indices)

        scores[edge_type]["Raw"].append(evaluate(X_raw, train_indices, test_indices, rf))
        scores[edge_type]["Identity"].append(evaluate(X_ide, train_indices, test_indices, rf))
        scores[edge_type]["Linear"].append(evaluate(X_lin, train_indices, test_indices, rf))
        scores[edge_type]["Neural"].append(evaluate(X_neu, train_indices, test_indices, rf))
        print(scores)
    
    print("Sizes for edges of type: '{}' (# train: {:d}, # testing: {:d})".format(
        edge_type, len(Y_train), len(Y_test)))

--------------------
Leaving out relations of type: 'has_finding_site'
Iteration: 1
Loss reached local minimum at iteration 1
Iteration 10 of 200	Changes: 0.06832	Loss: 2043.907	Patience: 96
Iteration 20 of 200	Changes: 0.11414	Loss: 2632.676	Patience: 93
Iteration 30 of 200	Changes: 0.02685	Loss: 14224.726	Patience: 87
Iteration 40 of 200	Changes: 0.03193	Loss: 1844.953	Patience: 83
Iteration 50 of 200	Changes: 0.03850	Loss: 1778.633	Patience: 77
Iteration 60 of 200	Changes: 0.01255	Loss: 2421.690	Patience: 71
Iteration 70 of 200	Changes: 0.00599	Loss: 763.212	Patience: 66
Iteration 80 of 200	Changes: 0.02031	Loss: 3745.459	Patience: 61
Iteration 90 of 200	Changes: 0.00534	Loss: 269.114	Patience: 58
Iteration 100 of 200	Changes: 0.00303	Loss: 322.118	Patience: 53
Iteration 110 of 200	Changes: 0.00294	Loss: 673.118	Patience: 48
Iteration 120 of 200	Changes: 0.00473	Loss: 1083.127	Patience: 41
Iteration 130 of 200	Changes: 0.00086	Loss: 150.531	Patience: 37
Iteration 140 of 200	Changes:

Loss reached local minimum at iteration 1
Iteration 10 of 200	Changes: 0.06838	Loss: 20134.108	Patience: 98
Iteration 20 of 200	Changes: 0.03939	Loss: 14972.829	Patience: 93
Iteration 30 of 200	Changes: 0.02372	Loss: 13160.949	Patience: 88
Iteration 40 of 200	Changes: 0.02345	Loss: 12639.933	Patience: 83
Iteration 50 of 200	Changes: 0.01702	Loss: 12069.869	Patience: 78
Iteration 60 of 200	Changes: 0.01646	Loss: 8068.967	Patience: 74
Iteration 70 of 200	Changes: 0.01446	Loss: 6816.200	Patience: 70
Iteration 80 of 200	Changes: 0.00952	Loss: 6266.390	Patience: 64
Iteration 90 of 200	Changes: 0.00583	Loss: 3170.638	Patience: 61
Iteration 100 of 200	Changes: 0.01007	Loss: 1546.499	Patience: 57
Iteration 110 of 200	Changes: 0.00344	Loss: 3936.116	Patience: 52
Iteration 120 of 200	Changes: 0.00495	Loss: 1242.184	Patience: 50
Iteration 130 of 200	Changes: 0.00123	Loss: 997.245	Patience: 45
Iteration 140 of 200	Changes: 0.00249	Loss: 733.285	Patience: 40
Iteration 150 of 200	Changes: 0.00407	Lo

Iteration 180 of 200	Changes: 0.00075	Loss: 320.798	Patience: 8
Iteration 190 of 200	Changes: 0.00291	Loss: 127.825	Patience: 4
Loss reached local minimum (and patience expired) at iteration 198
{'cause_of': {'Raw': [], 'Identity': [], 'Neural': [], 'Linear': []}, 'due_to': {'Raw': [0.92337164750957856, 0.90263405456255874], 'Identity': [0.94827586206896552, 0.93885230479774229], 'Neural': [0.92241379310344829, 0.9063969896519285], 'Linear': [0.93821839080459768, 0.92850423330197551]}, 'has_finding_site': {'Raw': [0.95205563509920232, 0.95266162819354305, 0.95188506215610358], 'Identity': [0.95226017590509304, 0.95276431446644216, 0.95133482779702461], 'Neural': [0.9515647371650644, 0.9526821654481229, 0.95153861830038722], 'Linear': [0.95232153814686027, 0.95296968701224016, 0.95214998981047483]}, 'has_pathological_process': {'Raw': [0.98751189343482393, 0.98698453608247427, 0.98670605612998519], 'Identity': [0.99107992388201716, 0.98943298969072169, 0.98953717380600692], 'Neural': [0

Loss reached local minimum at iteration 1
Iteration 10 of 200	Changes: 0.08764	Loss: 36623.350	Patience: 95
Iteration 20 of 200	Changes: 0.06775	Loss: 14782.703	Patience: 91
Iteration 30 of 200	Changes: 0.02467	Loss: 7861.191	Patience: 86
Iteration 40 of 200	Changes: 0.01535	Loss: 5518.504	Patience: 81
Iteration 50 of 200	Changes: 0.12343	Loss: 5445.009	Patience: 75
Iteration 60 of 200	Changes: 0.01730	Loss: 7399.395	Patience: 70
Iteration 70 of 200	Changes: 0.01162	Loss: 5497.138	Patience: 65
Iteration 80 of 200	Changes: 0.01062	Loss: 4654.133	Patience: 59
Iteration 90 of 200	Changes: 0.00603	Loss: 2879.173	Patience: 55
Iteration 100 of 200	Changes: 0.03617	Loss: 3868.330	Patience: 52
Iteration 110 of 200	Changes: 0.00200	Loss: 1743.926	Patience: 48
Iteration 120 of 200	Changes: 0.00388	Loss: 1906.446	Patience: 44
Iteration 130 of 200	Changes: 0.00124	Loss: 2725.260	Patience: 39
Iteration 140 of 200	Changes: 0.00319	Loss: 1015.454	Patience: 34
Iteration 150 of 200	Changes: 0.00035	Los

In [36]:
for edge_type, method_scores in scores.items():
    print("="*40+"\n{}".format(edge_type))
    for method_name, accs in method_scores.items():
        print("Method: {}\t\t{:.4f}+-{:.4f}".format(method_name, np.mean(accs), np.std(accs)))

cause_of
Method: Raw		0.7961+-0.0110
Method: Identity		0.8667+-0.0127
Method: Neural		0.8029+-0.0080
Method: Linear		0.8085+-0.0049
due_to
Method: Raw		0.9147+-0.0088
Method: Identity		0.9469+-0.0061
Method: Neural		0.9157+-0.0068
Method: Linear		0.9350+-0.0046
has_finding_site
Method: Raw		0.9522+-0.0003
Method: Identity		0.9521+-0.0006
Method: Neural		0.9519+-0.0005
Method: Linear		0.9525+-0.0004
has_pathological_process
Method: Raw		0.9871+-0.0003
Method: Identity		0.9900+-0.0008
Method: Neural		0.9894+-0.0031
Method: Linear		0.9932+-0.0007
