In [2]:
import pandas as pd
import os

In [40]:
# Show what is in the current working directory.
os.listdir('.')

['disease_cause.csv',
 'Untitled.ipynb',
 'disease_specialty.csv',
 'embeddigs_diseases.csv',
 '.ipynb_checkpoints',
 'disease_symptoms.csv']

In [344]:
# Load the triples table (columns: subject, predicate, object) from the working directory.
final_df = pd.read_csv('embeddigs_diseases.csv')

In [345]:
# Preview the first rows of the freshly loaded table.
final_df.head()

Unnamed: 0,subject,predicate,object
0,encephalitis,Cause,rabies
1,bacterial vaginosis,Cause,Gardnerella vaginalis
2,obsolete adenovirus infectious disease,Cause,Adenoviridae
3,spinal stenosis,Cause,spondylosis
4,endemic goiter,Cause,iodine deficiency


In [346]:
def func(x):
    """Clean one table cell: drop double quotes and surrounding whitespace.

    Args:
        x: Cell value. Strings are cleaned; any other value (e.g. the float
            NaN pandas uses for a missing cell) is returned unchanged.

    Returns:
        The cleaned string, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        # Missing cells arrive as NaN; leave them alone instead of raising AttributeError.
        return x
    # Remove quotes first so whitespace that was inside the quotes is stripped too.
    return x.replace('"', '').strip()

In [347]:
# Run every subject value through the cell-cleaning helper.
final_df['subject'] = final_df['subject'].apply(func)

In [None]:
# Self-assignment: the predicate column is left exactly as it was.
# NOTE(review): this cell was never executed and the column is cleaned with func in a
# later cell, so this looks like a leftover — confirm it can be removed.
final_df['predicate'] = final_df['predicate']

In [348]:
# Preview the first rows again to check the table after cleaning.
final_df.head()

Unnamed: 0,subject,predicate,object
0,encephalitis,Cause,rabies
1,bacterial vaginosis,Cause,Gardnerella vaginalis
2,obsolete adenovirus infectious disease,Cause,Adenoviridae
3,spinal stenosis,Cause,spondylosis
4,endemic goiter,Cause,iodine deficiency


In [349]:
# Run every object value through the cell-cleaning helper.
final_df['object'] = final_df['object'].apply(func)

In [387]:
# Run every predicate value through the cell-cleaning helper.
final_df['predicate'] = final_df['predicate'].apply(func)

In [388]:
# Write the cleaned table back to the same CSV it was read from, without the row index.
# NOTE(review): this overwrites the input file in place — confirm that is intended,
# since the uncleaned data cannot be recovered from this notebook afterwards.
final_df.to_csv('embeddigs_diseases.csv',index=False)

In [389]:
import ampligraph
import numpy as np

In [390]:
from ampligraph.evaluation import train_test_split_no_unseen
from ampligraph.datasets import load_from_csv


In [391]:
# Read the cleaned CSV from the working directory as an array of triples.
X = load_from_csv(directory_path='.',
                  file_name='embeddigs_diseases.csv',
                  sep=',')

In [392]:
# Drop the CSV header line, which was loaded as the first "triple".
# Guarded so that re-running this cell cannot discard a real triple.
if len(X) > 0 and tuple(X[0]) == ('subject', 'predicate', 'object'):
    X = X[1:]

In [393]:
# Display the triples array (numpy abbreviates the middle rows).
X

array([['encephalitis', 'Cause', 'rabies'],
       ['bacterial vaginosis', 'Cause', 'Gardnerella vaginalis'],
       ['obsolete adenovirus infectious disease', 'Cause',
        'Adenoviridae'],
       ...,
       ['acrodermatitis enteropathica', 'Symptom', 'alopecia'],
       ['Lassa fever', 'Symptom', 'alopecia'],
       ['mercury poisoning', 'Symptom', 'proteinuria']], dtype=object)

In [394]:
# (number of triples, 3)
X.shape

(9552, 3)

In [395]:
# Hold out one tenth of the triples (rounded down) for validation.
n_valid = X.shape[0] // 10
X_train, X_valid = train_test_split_no_unseen(np.array(X), test_size=n_valid)

In [396]:
# Confirm the container type of the loaded triples.
type(X)

numpy.ndarray

In [397]:
# Report the shape of each split.
for label, split in (('Train set size: ', X_train), ('Test set size: ', X_valid)):
    print(label, split.shape)


Train set size:  (8597, 3)
Test set size:  (955, 3)


In [398]:
# Use the full set of triples (train + validation) as the filter passed to evaluation.
# This is an alias of X, not a copy.
positives_filter = X

In [399]:
from ampligraph.latent_features import ComplEx

# Hyperparameters gathered in one mapping, then handed to the ComplEx constructor.
complex_settings = {
    'batches_count': 50,
    'epochs': 300,
    'k': 100,
    'eta': 20,
    'optimizer': 'adam',
    'optimizer_params': {'lr': 1e-4},
    'loss': 'multiclass_nll',
    'regularizer': 'LP',
    'regularizer_params': {'p': 3, 'lambda': 1e-5},
    'seed': 0,
    'verbose': True,
}
model = ComplEx(**complex_settings)

In [400]:
import tensorflow as tf
# Set TensorFlow's logging verbosity to ERROR before training.
tf.logging.set_verbosity(tf.logging.ERROR)

# Train the model on the training split.
model.fit(X_train)


Average Loss:   0.185902: 100%|██████████| 300/300 [05:56<00:00,  1.19s/epoch]


In [401]:
from ampligraph.evaluation import evaluate_performance

# Rank the held-out triples with the trained model, filtering with all known triples.
validation_eval_options = {
    'model': model,
    'filter_triples': positives_filter,
    'use_default_protocol': True,
    'verbose': True,
}
ranks = evaluate_performance(X_valid, **validation_eval_options)



100%|██████████| 955/955 [00:12<00:00, 78.58it/s]


In [402]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

# Compute every ranking metric first ...
mr = mr_score(ranks)
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

# ... then print them in a fixed order, two decimals each.
metric_report = [
    ("MRR: %.2f", mrr),
    ("MR: %.2f", mr),
    ("Hits@10: %.2f", hits_10),
    ("Hits@3: %.2f", hits_3),
    ("Hits@1: %.2f", hits_1),
]
for template, value in metric_report:
    print(template % (value))


MRR: 0.09
MR: 1756.10
Hits@10: 0.17
Hits@3: 0.10
Hits@1: 0.05


In [403]:
from ampligraph.discovery import find_clusters
from sklearn.cluster import KMeans

In [404]:
# Distinct subjects of the training triples. Sorted so the order (and therefore the
# row order of the embedding matrix) is the same on every kernel run; iterating a
# set of strings directly can yield a different order in each Python process.
diseases = np.array(sorted(set(X_train[:, 0])))

# Map each subject name to its learned embedding vector.
disease_embeddings = dict(zip(diseases, model.get_embeddings(diseases)))
len(diseases)

6150

In [405]:
# Sanity check: one embedding per distinct subject. Reuses the mapping built in the
# previous cell instead of looking up every embedding a second time.
len(disease_embeddings)

6150

In [415]:
# Candidate (subject, predicate, object) statements that are ranked and scored
# by the model in the cells that follow.
unseen_statements = [
    ('gonorrhea', 'Symptom', 'fever'),
    ('cystinosis', 'Symptom', 'frequent urination'),
    ('sarcoma', 'Symptom', 'weight loss'),
    ('delusional disorder', 'Symptom', 'hallucination'),
    ('sarcoma', 'Symptom', 'diarrhea'),
    # Tagged "fp" by the author — presumably a deliberately false statement; confirm.
    ('cholera', 'Symptom', 'blurred vision'),
    ('tuberculosis', 'Cause', 'infection'),
    ('taeniasis', 'Symptom', 'nausea'),
    ('lissencephaly', 'Cause', 'sex chromosome aberration'),
    ('Brucella suis brucellosis', 'Specialty', 'infectious disease'),
]
X_unseen = np.array([list(statement) for statement in unseen_statements])

In [416]:
# Stack the known triples with the candidate statements, then de-duplicate rows
# by passing them through a set of tuples.
stacked_triples = np.vstack((positives_filter, X_unseen))
unseen_filter = np.array(list(set(map(tuple, stacked_triples))))

In [417]:
# Rank the candidate statements with the trained model, using the combined filter.
unseen_eval_options = {
    'model': model,
    'filter_triples': unseen_filter,
    'corrupt_side': 's+o',
    'use_default_protocol': False,
    'verbose': True,
    'filter_unseen': True,
}
ranks_unseen = evaluate_performance(X_unseen, **unseen_eval_options)

100%|██████████| 10/10 [00:00<00:00, 43.76it/s]


In [418]:
# Raw model score for each candidate statement.
scores = model.predict(X_unseen)

In [419]:
from scipy.special import expit
# Squash the raw scores into (0, 1) with the logistic sigmoid.
probs = expit(scores)

In [420]:
# One row per candidate statement: readable text, rank, raw score and squashed score.
statement_texts = [' '.join(triple) for triple in X_unseen]
result_rows = list(zip(statement_texts,
                       ranks_unseen,
                       np.squeeze(scores),
                       np.squeeze(probs)))
results_table = pd.DataFrame(result_rows, columns=['statement', 'rank', 'score', 'prob'])

# Highest-scoring statements first.
results_table.sort_values("score", ascending=False)

Unnamed: 0,statement,rank,score,prob
9,Brucella suis brucellosis Specialty infectious...,3,5.968642,0.997449
7,taeniasis Symptom nausea,145,2.883371,0.947018
5,cholera Symptom blurred vision,432,2.595669,0.930582
3,delusional disorder Symptom hallucination,40,2.582599,0.929733
6,tuberculosis Cause infection,307,1.853493,0.864537
0,gonorrhea Symptom fever,923,0.86321,0.703331
2,sarcoma Symptom weight loss,1956,0.561586,0.636819
4,sarcoma Symptom diarrhea,9776,-0.221001,0.444973
8,lissencephaly Cause sex chromosome aberration,13331,-0.475799,0.383245
1,cystinosis Symptom frequent urination,13641,-0.605908,0.352993


In [275]:
# NOTE(review): the recorded output (5,) does not match the 10 statements in X_unseen,
# so this cell appears to have been run against an earlier version of the data — re-run to confirm.
probs.shape

(5,)

In [276]:
# Shape after removing any length-1 axes.
# NOTE(review): recorded output looks stale for the same reason as the previous cell — re-run to confirm.
np.squeeze(probs).shape

(5,)

In [183]:
# NOTE(review): the recorded output holds a single score although X_unseen has 10 rows,
# so it appears to come from an earlier experiment — re-run to confirm.
scores

array([6.6812215], dtype=float32)

In [186]:
# NOTE(review): the recorded output shows entity names without spaces (e.g. 'livercirrhosis'),
# unlike the triples displayed for X; presumably it predates the current preprocessing — re-run to confirm.
X_valid

array([['Toluenetoxicity', 'Cause', 'intoxication'],
       ['livercirrhosis', 'Cause', 'hepatitis'],
       ['polymyositis', 'Cause', 'autoimmunedisease'],
       ...,
       ['borderlinepersonalitydisorder', 'Symptom',
        'rejectionsensitivedysphoria'],
       ['peripheralarterydisease', 'Symptom', 'intermittentclaudication'],
       ['congenitaldisorderofglycosylationtypeIIc', 'Symptom',
        'intellectualdisability']], dtype=object)