In [None]:
from datasets import load_dataset
import pickle

In [None]:
# Load the eight pickled dataset lists from ../pkl into one dict keyed
# "<task>_<status>" (e.g. "kb_available"); the files themselves are named
# "<status>_<task>_datasets.pkl".
# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# run this cell against pickle files you created yourself or otherwise trust.
dataset_availability = {}
for task in ("kb", "qa", "t2t", "text"):
    for status in ("available", "unavailable"):
        with open(f"../pkl/{status}_{task}_datasets.pkl", "rb") as pkl:
            dataset_availability[f"{task}_{status}"] = pickle.load(pkl)

In [None]:
def get_entities(df):
    """Print the distinct entity types found in ``df`` and return an id -> type lookup.

    Args:
        df: frame with an "entities" column; each cell is iterated as a
            sequence of items that are indexed with "id" and "type".

    Returns:
        dict mapping every entity id to its type rendered as a string.
        An empty dict is returned (after printing the reason) when the column
        is missing or its content cannot be processed.
    """
    id_to_type = {}
    try:
        for row_entities in df["entities"]:
            for entity in row_entities:
                id_to_type[entity["id"]] = str(entity["type"])
    except Exception as err:
        # Best effort: a dataset without usable entities must not abort the
        # survey, but the reason is reported instead of being swallowed.
        # ``Exception`` (not a bare ``except``) keeps Ctrl-C / kernel interrupt working.
        print(f"(entities unavailable: {err!r})")
        print()
        return {}
    # Sorted so repeated runs print the types in the same order.
    for entity_type in sorted(set(id_to_type.values())):
        print(entity_type)
    print()
    return id_to_type
    
def get_relations(df,entities):
    """Print each distinct "<arg1 type> - <RELATION TYPE> - <arg2 type>" pattern in ``df``.

    Args:
        df: frame with a "relations" column; each cell is iterated as a
            sequence of items indexed with "arg1_id", "arg2_id" and "type".
        entities: mapping from entity id to entity type. Anything that is not
            a dict (for example an empty list) is treated as "no known entities".

    Returns:
        None. The patterns are printed in sorted order, one per line.
    """
    unresolved = "<unresolved>"
    # Tolerate a non-mapping ``entities`` instead of failing on the first lookup.
    type_of = entities if isinstance(entities, dict) else {}
    try:
        patterns = {
            f"{type_of.get(relation['arg1_id'], unresolved)} - "
            f"{relation['type'].upper()} - "
            f"{type_of.get(relation['arg2_id'], unresolved)}"
            for row_relations in df["relations"]
            for relation in row_relations
        }
    except Exception as err:
        # Best effort: report why nothing is listed rather than passing silently.
        print(f"(relations unavailable: {err!r})")
        return
    # An argument id missing from ``entities`` only affects its own line (shown
    # with the placeholder) instead of suppressing every relation.
    for pattern in sorted(patterns):
        print(pattern)

def select_largest_split(dataset_dict):
    """Return ``(split name, row count, split object)`` for the split with the most rows.

    Ties keep the split encountered first. When ``dataset_dict`` is empty or no
    split has more than zero rows the result is ``(None, 0, None)``.
    """
    best = (None, 0, None)
    for name, candidate in dataset_dict.items():
        # Strict comparison: a later split must be strictly larger to win.
        if candidate.num_rows > best[1]:
            best = (name, candidate.num_rows, candidate)
    return best

def get_qa(df):
    """Print the sorted distinct values of the "type" column, then up to five
    question/answer pairs taken from the first rows of ``df``."""
    distinct_types = df["type"].unique().tolist()
    first_questions = df['question'].head(5)
    first_answers = df['answer'].head(5)
    preview = list(zip(first_questions, first_answers))
    print(sorted(distinct_types))
    print("\nSAMPLES:\n")
    for question, answer in preview:
        print(f"QUESTION: {question}")
        print(f"ANSWER: {answer}")
        print()

def get_text_class(df):
    """Print the set of all labels in the "labels" column, then up to five
    text/labels samples from the first rows, with newlines in the text
    replaced by spaces."""
    per_row_labels = df['labels'].tolist()
    preview = list(zip(df['text'].head(5), df['labels'].head(5)))
    # Flatten the per-row label sequences into one set of distinct labels.
    distinct_labels = set(label for row_labels in per_row_labels for label in row_labels)
    print(distinct_labels,"\n\nSAMPLES:\n")
    for text, text_labels in preview:
        print("TEXT: ",text.replace("\n"," "))
        print(f"LABELS: {text_labels}")
        print()


def get_t2t(df):
    """Print up to five "text_1"/"text_2" pairs from the first rows of ``df``,
    each with newlines replaced by spaces and surrounding whitespace stripped."""
    first_texts = df['text_1'].head(5)
    second_texts = df['text_2'].head(5)
    for raw_first, raw_second in list(zip(first_texts, second_texts)):
        # Clean both sides before printing anything for this pair.
        flat_first = raw_first.replace('\n', ' ').strip()
        flat_second = raw_second.replace('\n', ' ').strip()
        print(f"TEXT1: {flat_first}")
        print(f"TEXT2: {flat_second}")
        print()

In [None]:
# Survey every entry of dataset_availability["kb_available"]: load it from the
# "bigbio/" namespace, keep its largest split, and print the entity types and
# relation patterns found in that split.
# NOTE(review): elem is indexed as elem[0] (repository name) and elem[1]
# (passed as `name=`) — presumably a (dataset, config) pair; confirm against
# the pickle contents.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run locally — only use with repositories you trust.
for elem in dataset_availability["kb_available"]:
    dataset = load_dataset(f"bigbio/{elem[0]}",name=elem[1],trust_remote_code=True)
    # select_largest_split yields (split name, row count, split object); the
    # object is None when no split has rows, in which case to_pandas() fails.
    split, rows, df = select_largest_split(dataset)
    # df is rebound here: first the selected split, then its to_pandas() result.
    df = df.to_pandas()
    print(f"DATASET\n{elem} - {split} - {rows}\n\nENTITIES:")
    # The id -> type lookup returned here is what get_relations needs below.
    entities = get_entities(df)
    print(f"RELATIONS:")
    get_relations(df,entities)
    print("\n--------------------------------------------------------------------\n")

In [None]:
# Survey every entry of dataset_availability["qa_available"]: load it from the
# "bigbio/" namespace, keep its largest split, and print its question types
# plus a few question/answer samples via get_qa.
# NOTE(review): elem[0] / elem[1] are presumably (dataset, config) — confirm
# against the pickle contents. trust_remote_code=True presumably allows the
# dataset's loading script to run locally — only use with trusted repositories.
for elem in dataset_availability["qa_available"]:
    dataset = load_dataset(f"bigbio/{elem[0]}",name=elem[1],trust_remote_code=True)
    # select_largest_split yields (split name, row count, split object); the
    # object is None when no split has rows, in which case to_pandas() fails.
    split, rows, df = select_largest_split(dataset)
    # df is rebound here: first the selected split, then its to_pandas() result.
    df = df.to_pandas()
    print(f"DATASET\n{elem} - {split} - {rows}\n\nQUESTION TYPES:")
    get_qa(df)
    print("\n")

In [None]:
# Survey every entry of dataset_availability["text_available"]: load it from
# the "bigbio/" namespace, keep its largest split, and print its label set
# plus a few text/labels samples via get_text_class.
# NOTE(review): elem[0] / elem[1] are presumably (dataset, config) — confirm
# against the pickle contents. trust_remote_code=True presumably allows the
# dataset's loading script to run locally — only use with trusted repositories.
for elem in dataset_availability["text_available"]:
    dataset = load_dataset(f"bigbio/{elem[0]}",name=elem[1],trust_remote_code=True)
    # select_largest_split yields (split name, row count, split object); the
    # object is None when no split has rows, in which case to_pandas() fails.
    split, rows, df = select_largest_split(dataset)
    # df is rebound here: first the selected split, then its to_pandas() result.
    df = df.to_pandas()
    print(f"DATASET\n{elem} - {split} - {rows}\n\nTEXT LABELS:")
    get_text_class(df)
    print("\n")

In [None]:
# Survey every entry of dataset_availability["t2t_available"]: load it from
# the "bigbio/" namespace, keep its largest split, and print a few
# text_1/text_2 sample pairs via get_t2t.
# NOTE(review): elem[0] / elem[1] are presumably (dataset, config) — confirm
# against the pickle contents. trust_remote_code=True presumably allows the
# dataset's loading script to run locally — only use with trusted repositories.
for elem in dataset_availability["t2t_available"]:
    dataset = load_dataset(f"bigbio/{elem[0]}",name=elem[1],trust_remote_code=True)
    # select_largest_split yields (split name, row count, split object); the
    # object is None when no split has rows, in which case to_pandas() fails.
    split, rows, df = select_largest_split(dataset)
    # df is rebound here: first the selected split, then its to_pandas() result.
    df = df.to_pandas()
    print(f"DATASET\n{elem} - {split} - {rows}\n\nT2T SAMPLES:\n")
    get_t2t(df)
    print()