In [None]:
from datasets import load_dataset
import pickle

In [None]:
# Load the pickled availability collections for every task family.
# Keys follow the pattern "<task>_<status>" (e.g. "kb_available") and files
# follow "../pkl/<status>_<task>_datasets.pkl"; insertion order matches the
# original hand-written sequence (kb, qa, t2t, text; available first).
dataset_availability = {}
for task in ("kb", "qa", "t2t", "text"):
    for status in ("available", "unavailable"):
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # files that were produced locally by a trusted process.
        with open(f"../pkl/{status}_{task}_datasets.pkl", "rb") as pkl:
            dataset_availability[f"{task}_{status}"] = pickle.load(pkl)

In [84]:
def get_entities(df):
    """Index the entity annotations in ``df`` by id and print the distinct entity types.

    Args:
        df: DataFrame with an ``"entities"`` column; each cell is a sized
            collection of mappings providing ``"id"``, ``"type"`` and
            ``"text"`` (only the first element of ``"text"`` is kept).

    Returns:
        dict mapping entity id -> ``(type, first text)``. If several entities
        share an id, the last one wins. When the entities cannot be extracted
        a one-line notice is printed and an empty dict is returned, so the
        result can always be used as a mapping.
    """
    try:
        entities = df[df["entities"].apply(len) != 0]["entities"].tolist()
        new_entities = {
            entity["id"]: (entity["type"], entity["text"][0])
            for sublist in entities
            for entity in sublist
        }
    except Exception as err:
        # Best-effort by design: a dataset with an unexpected layout must not
        # abort the surrounding exploration loop. `Exception` (rather than a
        # bare except) still lets KeyboardInterrupt stop a long run.
        print(f"Could not extract entities ({type(err).__name__}: {err})")
        return {}
    # Sorted for reproducible output; key=str keeps the sort from failing if
    # the types are not mutually comparable (e.g. None mixed with strings).
    entity_types = {entity_type for entity_type, _ in new_entities.values()}
    for entity_type in sorted(entity_types, key=str):
        print(entity_type)
    print()
    return new_entities
    
def get_relations(df,entities):
    """Print one example for each "<arg1 type> - <RELATION> - <arg2 type>" pattern in ``df``.

    Args:
        df: DataFrame with a ``"relations"`` column; each cell is a sized
            collection of mappings providing ``"arg1_id"``, ``"arg2_id"``
            and ``"type"``.
        entities: mapping from entity id to ``(type, text)``.

    Returns:
        None. Output is printed: one line per pattern, sorted by pattern,
        showing the first concrete example seen for it. Relations whose
        arguments cannot be resolved in ``entities`` are skipped and counted
        in a trailing notice instead of suppressing the whole listing.
    """
    try:
        relations = df[df["relations"].apply(len) != 0]["relations"].tolist()
        relations = [item for sublist in relations for item in sublist]
    except Exception as err:
        # Best-effort by design: an unexpected layout must not abort the
        # caller's loop. `Exception` still lets KeyboardInterrupt through.
        print(f"Could not read relations ({type(err).__name__}: {err})")
        return

    new_relations = {}
    skipped = 0
    for relation in relations:
        try:
            arg1 = entities[relation['arg1_id']]
            arg2 = entities[relation['arg2_id']]
            rel_type = relation['type'].upper()
            general = f"{arg1[0]} - {rel_type} - {arg2[0]}"
            specific = f"{arg1[1]} - {rel_type} - {arg2[1]}"
        except (KeyError, IndexError, TypeError, AttributeError):
            # Unknown entity id or malformed record: drop this relation only.
            skipped += 1
            continue
        new_relations.setdefault(general, []).append(specific)

    for pattern in sorted(new_relations):
        print(f"{pattern} ---------- {new_relations[pattern][0]}")
    if skipped:
        print(f"({skipped} relation(s) skipped: arguments could not be resolved)")

def select_largest_split(dataset_dict):
    """Return ``(split name, row count, dataset)`` for the split with the most rows.

    Splits are compared through their ``num_rows`` attribute. Ties keep the
    split encountered first, and a split must have more than zero rows to be
    chosen; otherwise ``(None, 0, None)`` is returned.
    """
    best = (None, 0, None)
    for name, candidate in dataset_dict.items():
        size = candidate.num_rows
        if size > best[1]:
            best = (name, size, candidate)
    return best

def get_qa(df):
    """Print the sorted distinct values of ``df["type"]`` and the first five question/answer pairs.

    Reads the ``"type"``, ``"question"`` and ``"answer"`` columns of ``df``;
    returns None.
    """
    kinds = df["type"].unique().tolist()
    # Both columns are read before anything is printed.
    first_pairs = list(zip(df['question'].head(5), df['answer'].head(5)))
    print(sorted(kinds))
    print("\nSAMPLES:\n")
    for question, answer in first_pairs:
        print(f"QUESTION: {question}")
        print(f"ANSWER: {answer}", end="\n\n")

def get_text_class(df):
    """Print the set of all labels in ``df["labels"]`` and the first five text/labels pairs.

    Each cell of ``"labels"`` is iterated to collect the individual labels.
    Newlines inside a sample text are replaced by spaces before printing.
    Returns None.
    """
    labels_list = df['labels'].tolist()
    preview = list(zip(df['text'].head(5), df['labels'].head(5)))
    # Union of the labels of every row.
    labels = {label for row in labels_list for label in row}
    print(labels,"\n\nSAMPLES:\n")
    for text, row_labels in preview:
        print("TEXT: ",text.replace("\n"," "))
        print(f"LABELS: {row_labels}", end="\n\n")


def get_t2t(df):
    """Print the first five ``text_1`` / ``text_2`` pairs of ``df``.

    Newlines in each text are replaced by spaces and surrounding whitespace
    is stripped before printing. Returns None.
    """
    leading_pairs = list(zip(df['text_1'].head(5), df['text_2'].head(5)))
    for pair in leading_pairs:
        # Clean both sides before printing either of them.
        first, second = (part.replace('\n', ' ').strip() for part in pair)
        print(f"TEXT1: {first}")
        print(f"TEXT2: {second}", end="\n\n")

In [86]:
# Scratch exploration: load the first available KB dataset, keep its largest
# split as a DataFrame and flatten all of its entity annotations.
elem = dataset_availability["kb_available"][0]
# NOTE(review): trust_remote_code=True presumably lets the dataset repository
# run its own loading script locally -- confirm the source is trusted.
dataset = load_dataset(
    f"bigbio/{elem[0]}",
    name=elem[1],
    trust_remote_code=True,
)
split, rows, df = select_largest_split(dataset)
df = df.to_pandas()
# Rows without entities are dropped before flattening to one list of entities.
has_entities = df["entities"].apply(len) != 0
entities = [item for sublist in df[has_entities]["entities"].tolist() for item in sublist]

In [87]:
# Index every flattened entity by id, keeping its type and the first element
# of its text field.
new_entities = {}
for entity in entities:
    new_entities[entity["id"]] = (entity['type'], entity['text'][0])
# Show each distinct entity type once (set iteration order is arbitrary).
for entity in {entity_type for entity_type, _ in new_entities.values()}:
    print(entity)

Multi-tissue_structure
Tissue
Immaterial_anatomical_entity
Cell
Cellular_component
Developing_anatomical_structure
Anatomical_system
Organ
Pathological_formation
Organism_substance
Organism_subdivision


In [88]:
# Preview only the first entries: the mapping holds one entry per entity id,
# and displaying all of it floods the notebook output.
PREVIEW_SIZE = 20
dict(list(new_entities.items())[:PREVIEW_SIZE])

{'PMID-7511490_T1': ('Organism_substance', 'urinary'),
 'PMC-2719750-sec-10_T1': ('Tissue', 'muscle'),
 'PMID-16165242_T1': ('Organ', 'skin'),
 'PMID-16165242_T2': ('Organ', 'skin'),
 'PMID-16165242_T3': ('Organ', 'skin'),
 'PMID-16165242_T4': ('Organ', 'skin'),
 'PMID-16165242_T5': ('Organ', 'skin'),
 'PMID-21596785_T1': ('Cell', 'cellular'),
 'PMID-10823148_T1': ('Tissue', 'tissue specimens'),
 'PMID-10823148_T2': ('Pathological_formation', 'tumour'),
 'PMID-10823148_T3': ('Tissue', 'tissue fragments'),
 'PMID-10823148_T4': ('Tissue', 'specimens'),
 'PMID-10823148_T5': ('Tissue', 'endometrial biopsy'),
 'PMID-10823148_T6': ('Pathological_formation', 'fragment'),
 'PMID-10823148_T7': ('Pathological_formation', 'fragment'),
 'PMC-3041925-caption-05_T1': ('Cellular_component', 'membrane'),
 'PMC-3041925-caption-05_T2': ('Cellular_component', 'membrane'),
 'PMID-16101297_T1': ('Organism_substance', 'venom'),
 'PMID-7828172_T1': ('Cell', 'CHO cells'),
 'PMID-7828172_T2': ('Cell', 'CHO-k1 

In [80]:
# Flatten the relation annotations of every row that has at least one.
relations = df[df["relations"].apply(len) != 0]["relations"].tolist()
relations = [item for sublist in relations for item in sublist]

# Group concrete examples under their
# "<arg1 type> - <RELATION> - <arg2 type>" pattern.
new_relations = {}
for relation in relations:
    arg1 = new_entities[relation['arg1_id']]
    arg2 = new_entities[relation['arg2_id']]
    rel_type = relation['type'].upper()
    general = f"{arg1[0]} - {rel_type} - {arg2[0]}"
    specific = f"{arg1[1]} - {rel_type} - {arg2[1]}"
    # setdefault replaces a bare `except:` that was used as control flow and
    # would also have hidden unrelated errors (or a keyboard interrupt).
    new_relations.setdefault(general, []).append(specific)
# One line per pattern, in sorted order, with the first example seen for it.
for relation in sorted(new_relations.keys()):
    print(f"{relation} ---------- {new_relations[relation][0]}")

Cell - FRAG - Cell ---------- PAM212 - FRAG - mKSA) cell lines
Cell - PART-OF - Cell ---------- slices - PART-OF - cells
Cell - PART-OF - Multi-tissue_structure ---------- endothelial cells - PART-OF - ventral
Cell - PART-OF - Pathological_formation ---------- cells - PART-OF - inflammation
Cellular_component - FRAG - Cellular_component ---------- conjugative - FRAG - nonconjugative plasmids
Multi-tissue_structure - FRAG - Multi-tissue_structure ---------- internal thoracic - FRAG - radial artery
Multi-tissue_structure - PART-OF - Developing_anatomical_structure ---------- liver slices - PART-OF - fetal
Multi-tissue_structure - PART-OF - Multi-tissue_structure ---------- ventral part - PART-OF - gray matter
Multi-tissue_structure - PART-OF - Organ ---------- lateral white matter - PART-OF - spinal cord
Organ - FRAG - Organ ---------- vascular - FRAG - non-vascular smooth muscle
Organ - PART-OF - Developing_anatomical_structure ---------- liver - PART-OF - fetal
Organism_substance - FRA

In [89]:
# For every available KB dataset: load it, take the largest split and print
# its entity types followed by one example per relation pattern.
for elem in dataset_availability["kb_available"]:
    repo_name, config_name = elem[0], elem[1]
    dataset = load_dataset(f"bigbio/{repo_name}", name=config_name, trust_remote_code=True)
    split, rows, largest_split = select_largest_split(dataset)
    df = largest_split.to_pandas()
    print(f"DATASET\n{elem} - {split} - {rows}\n\nENTITIES:")
    entities = get_entities(df)
    print("RELATIONS:")
    get_relations(df, entities)
    print("\n--------------------------------------------------------------------\n")

DATASET
('an_em', 'an_em_bigbio_kb') - train - 250

ENTITIES:
Multi-tissue_structure
Tissue
Immaterial_anatomical_entity
Cell
Cellular_component
Developing_anatomical_structure
Anatomical_system
Organ
Pathological_formation
Organism_substance
Organism_subdivision

RELATIONS:
Cell - FRAG - Cell ---------- PAM212 - FRAG - mKSA) cell lines
Cell - PART-OF - Cell ---------- slices - PART-OF - cells
Cell - PART-OF - Multi-tissue_structure ---------- endothelial cells - PART-OF - ventral
Cell - PART-OF - Pathological_formation ---------- cells - PART-OF - inflammation
Cellular_component - FRAG - Cellular_component ---------- conjugative - FRAG - nonconjugative plasmids
Multi-tissue_structure - FRAG - Multi-tissue_structure ---------- internal thoracic - FRAG - radial artery
Multi-tissue_structure - PART-OF - Developing_anatomical_structure ---------- liver slices - PART-OF - fetal
Multi-tissue_structure - PART-OF - Multi-tissue_structure ---------- ventral part - PART-OF - gray matter
Multi-t

In [None]:
# Record, for every available KB dataset config, the set of entity types
# found in its largest split.
entities_dict = {}
unique_entities = []
for elem in dataset_availability["kb_available"]:
    dataset = load_dataset(f"bigbio/{elem[0]}", name=elem[1], trust_remote_code=True)
    split, rows, largest_split = select_largest_split(dataset)
    df = largest_split.to_pandas()
    has_entities = df["entities"].apply(len) != 0
    entities = {
        item["type"]
        for sublist in df[has_entities]["entities"].tolist()
        for item in sublist
    }
    entities_dict[elem[1]] = entities
    unique_entities.extend(entities)

# Invert the mapping: entity type -> list of dataset configs that use it.
db_entity_counts = {
    entity: [db for db, db_types in entities_dict.items() if entity in db_types]
    for entity in set(unique_entities)
}

In [69]:
# One "<entity type> ---------- <number of dataset configs>" line per entity
# type, printed in sorted order.
temp = [
    f"{entity} ---------- {len(dbs)}"
    for entity, dbs in db_entity_counts.items()
]
for line in sorted(temp):
    print(line)

 ---------- 4
(AND (AND other_name other_name) (AND other_name other_name)) ---------- 1
(AND (OR other_name other_name) (OR other_name other_name)) ---------- 1
(AND DNA_domain_or_region DNA_domain_or_region DNA_domain_or_region DNA_domain_or_region DNA_domain_or_region DNA_domain_or_region DNA_domain_or_region) ---------- 1
(AND DNA_domain_or_region DNA_domain_or_region DNA_domain_or_region DNA_domain_or_region DNA_domain_or_region) ---------- 1
(AND DNA_domain_or_region DNA_domain_or_region DNA_domain_or_region DNA_domain_or_region) ---------- 1
(AND DNA_domain_or_region DNA_domain_or_region DNA_domain_or_region) ---------- 1
(AND DNA_domain_or_region DNA_domain_or_region) ---------- 1
(AND DNA_family_or_group DNA_family_or_group DNA_family_or_group) ---------- 1
(AND DNA_family_or_group DNA_family_or_group) ---------- 1
(AND DNA_molecule DNA_molecule) ---------- 1
(AND RNA_N/A RNA_N/A) ---------- 1
(AND RNA_family_or_group RNA_family_or_group) ---------- 1
(AND RNA_molecule RNA_mol

In [None]:
# For every available QA dataset: load it, take the largest split and print
# its question types with a few sample question/answer pairs.
for elem in dataset_availability["qa_available"]:
    repo_name, config_name = elem[0], elem[1]
    dataset = load_dataset(f"bigbio/{repo_name}", name=config_name, trust_remote_code=True)
    split, rows, largest_split = select_largest_split(dataset)
    df = largest_split.to_pandas()
    print(f"DATASET\n{elem} - {split} - {rows}\n\nQUESTION TYPES:")
    get_qa(df)
    print("\n")

In [None]:
# For every available text-classification dataset: load it, take the largest
# split and print its label set with a few sample texts.
for elem in dataset_availability["text_available"]:
    repo_name, config_name = elem[0], elem[1]
    dataset = load_dataset(f"bigbio/{repo_name}", name=config_name, trust_remote_code=True)
    split, rows, largest_split = select_largest_split(dataset)
    df = largest_split.to_pandas()
    print(f"DATASET\n{elem} - {split} - {rows}\n\nTEXT LABELS:")
    get_text_class(df)
    print("\n")

In [None]:
# For every available text-to-text dataset: load it, take the largest split
# and print a few sample text pairs.
for elem in dataset_availability["t2t_available"]:
    repo_name, config_name = elem[0], elem[1]
    dataset = load_dataset(f"bigbio/{repo_name}", name=config_name, trust_remote_code=True)
    split, rows, largest_split = select_largest_split(dataset)
    df = largest_split.to_pandas()
    print(f"DATASET\n{elem} - {split} - {rows}\n\nT2T SAMPLES:\n")
    get_t2t(df)
    print()