In [2]:
import os
import random
from tqdm.notebook import tqdm
import pandas as pd
import pickle
import numpy as np
from glob import glob
from collections import Counter
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, CoherenceModel
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm
import itertools
import re
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
#
import networkx as nx
import community
#
from lda.lda_prop5 import MultiLDA
from lda.lda_prop4 import Lda, MyCorpus
from lda.lda_prop4 import MyCorpora
from lda.related_tools import *

In [3]:
# Load the original (observed) disease-element associations.
# Only the three DisGeNet association types listed below are kept.
target_db = [
    'DisGeNet_AlteredExpression',
    'DisGeNet_Biomarker',
    'DisGeNet_GeneticVariation',
]

# NOTE: pd.read_pickle can execute arbitrary code while unpickling;
# only load files from a trusted source.
df_comb = (
    pd.read_pickle('data/20211013_data_SeRfSnpsBmHtDtMbPtBmDisGeNet.pkl')
    .query('element_name in @target_db')
    # one row per (disease, element) pair
    .explode('element')
    .assign(
        # strip the 'DisGeNet_' prefix from the association-type names
        element_name=lambda df: (
            df.element_name
            .str.replace('DisGeNet_AlteredExpression', 'AlteredExpression')
            .str.replace('DisGeNet_Biomarker', 'Biomarker')
            .str.replace('DisGeNet_GeneticVariation', 'GeneticVariation')
        ),
        # each Disease entry is an iterable of names; collapse it to one '|'-joined string
        Disease=lambda df: df.Disease.map('|'.join),
        # every row loaded here is an observed association
        Obs=1,
    )
    .drop(columns='N')
)

df_comb

Unnamed: 0,key_c,element_name,Disease,element,Obs
0,(idiopathic) normal pressure hydrocephalus,AlteredExpression,(idiopathic) normal pressure hydrocephalus,TNR,1
1,(non-specific) purulent meningitis,AlteredExpression,(non-specific) purulent meningitis,FCGR1A,1
1,(non-specific) purulent meningitis,AlteredExpression,(non-specific) purulent meningitis,UROD,1
1,(non-specific) purulent meningitis,AlteredExpression,(non-specific) purulent meningitis,CALCA,1
1,(non-specific) purulent meningitis,AlteredExpression,(non-specific) purulent meningitis,CRP,1
...,...,...,...,...,...
58723,zoonoses,GeneticVariation,zoonoses,PRNP,1
58723,zoonoses,GeneticVariation,zoonoses,BST2,1
58723,zoonoses,GeneticVariation,zoonoses,SAMD9,1
58724,zoonotic form of cutaneous leishmaniasis,GeneticVariation,zoonotic form of cutaneous leishmaniasis,GZMB,1


In [4]:
# Collect the per-group prediction files.
# Each '*' matches within a single path component (it does not cross '/'),
# and glob returns matches in no guaranteed order.
flist = glob(pathname='result/*/predicted_w_group_*.csv')
flist

['result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_0.csv',
 'result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_1.csv',
 'result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_2.csv',
 'result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_3.csv',
 'result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_4.csv',
 'result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_5.csv',
 'result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_6.csv',
 'result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_7.csv',
 'result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_8.csv',
 'result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_9.csv',
 'result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_10.csv',
 'result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_11.csv',
 'result/20220621_predict_comp_AeBmGv_remove300/Ae/predicted_w_group_12.cs

In [15]:
# Merge each prediction file with the observed associations and compute
# one ROC AUC per (element_name, key_c) group.

def _summarize_group(df):
    """Build the one-row summary for a single (element_name, key_c) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the merged prediction/observation table for one group.
        Reads the columns ``Disease``, ``Obs`` and ``freq``.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with columns ``Disease`` and ``AUC``.
        ``Disease`` is the first non-null disease name in the group, or NaN
        when the group has none. ``AUC`` is ``roc_auc_score(Obs, freq)``, or
        NaN when ``Obs`` holds fewer than two distinct values (the AUC is
        undefined there and would otherwise abort the whole loop).
    """
    disease_names = df.Disease.dropna().unique()
    has_both_classes = df.Obs.nunique() > 1
    return pd.DataFrame(
        {
            'Disease': disease_names[0] if len(disease_names) else np.nan,
            'AUC': roc_auc_score(df.Obs, df.freq) if has_both_classes else np.nan,
        },
        index=[0]
    )


li_merge = []
for f in flist:
    # load predicted result (fixing the misspelled DB name found in some files)
    df_res = pd.read_csv(f, index_col=[0]).assign(
        element_name = lambda df: df.element_name.str.replace('GeneticVatiation', 'GeneticVariation')
    )

    # extract counterpart in original data
    # NOTE(review): only the first DB name is used — assumes one DB per file; confirm.
    # NOTE(review): this rebinds `target_db` (a list in the loading cell) to a single string.
    target_db = df_res.element_name.unique()[0] # one DB
    target_diseases = df_res.key_c.unique() # some Diseases

    df_comb_obs = (
        df_comb
        .query('element_name == @target_db')
        .query('key_c in @target_diseases')
        # align the column name with the prediction files before merging
        .rename(columns={'element': 'word'})
        .reset_index(drop=True)
    )

    # merge data and calculate auc
    # The merge has no `on=`, so it joins on every column the two frames share
    # (presumably element_name, key_c and word — verify against the CSV schema).
    # Rows present on only one side get 0 for the missing score / observation.
    df_merge_temp = (
        pd.merge(df_res, df_comb_obs, how='outer')
        .fillna({'w_count': 0, 'freq': 0, 'Obs': 0})
        .groupby(['element_name', 'key_c'])
        .apply(_summarize_group)
        .reset_index()
        # 'level_2' is the unnamed index level contributed by the one-row frames
        .drop('level_2', axis=1)
    )

    li_merge.append(df_merge_temp)

df_merge = pd.concat(li_merge)
df_merge

  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar

Unnamed: 0,element_name,key_c,Disease,AUC
0,AlteredExpression,C0000814|C0020217|C0153572|C0334529|C0477800|C...,hydatidiform mole,0.833795
1,AlteredExpression,C0001309|C0009763|C0009768|C0009770|C0009774|C...,conjunctivitis,1.000000
2,AlteredExpression,C0003502|C0003507|C0005694|C0007785|C0008297|C...,stenosis,0.992983
3,AlteredExpression,C0004352|C0236792|C0338986|C0349329,autistic disorder,0.840028
4,AlteredExpression,C0004767|C0005937|C0008340|C0010093|C0014511|C...,cyst,0.834985
...,...,...,...,...
280,GeneticVariation,tuberous sclerosis 2 (disorder),tuberous sclerosis 2 (disorder),0.919823
281,GeneticVariation,type ii endometrial adenocarcinoma,type ii endometrial adenocarcinoma,0.999810
282,GeneticVariation,"vohwinkel syndrome, variant form","vohwinkel syndrome, variant form",0.577880
283,GeneticVariation,xerocytosis,xerocytosis,0.793577


In [17]:
# Write the per-disease AUC table to disk; the row index is not saved.
# NOTE(review): the output name says "remove290" while the prediction files
# listed earlier live under a "...remove300" directory — confirm the intended name.
df_merge.to_csv('result/AUC_remove290.csv', index=False)