In [1]:
import json
import numpy as np
import pandas as pd

from tqdm import tqdm
from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

In [2]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
dir_root = "Data/"

In [4]:
data = pd.read_json(f"{dir_root}qaa.json")
data = data.reset_index(drop=True)
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7871 entries, 0 to 7870
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   question_focus  7871 non-null   object
 1   question_type   7871 non-null   object
 2   question        7871 non-null   object
 3   answer          7871 non-null   object
 4   wc              7871 non-null   int64 
 5   t_opt_count     7871 non-null   int64 
 6   t_gptj_count    7871 non-null   int64 
 7   t_llama_count   7871 non-null   int64 
 8   t_llama2_count  7871 non-null   int64 
 9   name            7871 non-null   object
dtypes: int64(5), object(5)
memory usage: 615.0+ KB


Unnamed: 0,question_focus,question_type,question,answer,wc,t_opt_count,t_gptj_count,t_llama_count,t_llama2_count,name
0,keratoderma with woolly hair,frequency,How many people are affected by keratoderma wi...,Keratoderma with woolly hair is rare; its prev...,82,111,110,120,120,MedQuAD Super Expert
1,keratoderma with woolly hair,inheritance,Is keratoderma with woolly hair inherited ?,Most cases of keratoderma with woolly hair hav...,56,69,68,79,79,MedQuAD Super Expert
2,Knobloch syndrome,frequency,How many people are affected by Knobloch syndr...,Knobloch syndrome is a rare condition. However...,15,21,20,25,25,MedQuAD Super Expert
3,Knobloch syndrome,inheritance,Is Knobloch syndrome inherited ?,This condition is inherited in an autosomal re...,51,60,59,69,69,MedQuAD Super Expert
4,Knobloch syndrome,treatment,What are the treatments for Knobloch syndrome ?,These resources address the diagnosis or manag...,90,119,118,148,148,MedQuAD Super Expert


In [5]:
data_train = pd.read_json(f"{dir_root}qaa_train.json")
data_train = data_train.reset_index(drop=True)
data_train = data_train.rename(columns={"output" : "answer", "instruction" : "question"})
data_train.info()
data_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6292 entries, 0 to 6291
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  6292 non-null   object
 1   answer    6292 non-null   object
 2   input     6292 non-null   object
dtypes: object(3)
memory usage: 147.6+ KB


Unnamed: 0,question,answer,input
0,How to diagnose Konigsmark Knox Hussels syndro...,Is genetic testing available for Konigsmark Kn...,MedQuAD Super Expert
1,What are the genetic changes related to SADDAN ?,Mutations in the FGFR3 gene cause SADDAN. The ...,MedQuAD Super Expert
2,What are the treatments for mal de Meleda ?,These resources address the diagnosis or manag...,MedQuAD Super Expert
3,What are the treatments for systemic lupus ery...,These resources address the diagnosis or manag...,MedQuAD Super Expert
4,What are the treatments for hypochondroplasia ?,These resources address the diagnosis or manag...,MedQuAD Super Expert


In [6]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



## Embedding the sentences

In [7]:
def generate_sentences(data):
    sentences = []
    for i in tqdm(range(len(data))):
        tmp = sent_tokenize(data["answer"].values[i])
        ls = len(tmp)
        sentences += tmp
    sentences = pd.DataFrame({
        "sentence" : sentences
    })
    sentences["sentences"] = sentences["sentence"].apply(lambda x: x.strip())
    sentences["wc"] = sentences["sentence"].apply(lambda x: len(x.split(" ")))
    sentences = sentences[(sentences["wc"] >= 3)]
    sentences.reset_index()
    return sentences

data_sentences = generate_sentences(data)
data_train_sentences = generate_sentences(data_train)

print(data_sentences.info())
print(data_train_sentences.info())

100% 7871/7871 [00:00<00:00, 11263.21it/s]
100% 6292/6292 [00:00<00:00, 11702.42it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 32310 entries, 0 to 32419
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentence   32310 non-null  object
 1   sentences  32310 non-null  object
 2   wc         32310 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1009.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 25851 entries, 0 to 25936
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentence   25851 non-null  object
 1   sentences  25851 non-null  object
 2   wc         25851 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 807.8+ KB
None





In [8]:
data_sentences = data_sentences.drop(columns=["sentence","wc"])
data_sentences.to_json(f"{dir_root}sentences.json",orient="records")
data_sentences.head()

Unnamed: 0,sentences
0,Keratoderma with woolly hair is rare; its prev...
1,Type I (Naxos disease) was first described in ...
2,"Since then, affected families have been found ..."
3,This form of the condition may affect up to 1 ...
4,"Type II (Carvajal syndrome), type III, and typ..."


In [9]:
data_train_sentences = data_train_sentences.drop(columns=["sentence","wc"])
data_train_sentences.to_json(f"{dir_root}sentences_train.json",orient="records")
data_train_sentences.head()

Unnamed: 0,sentences
0,Is genetic testing available for Konigsmark Kn...
1,GeneTests lists the names of laboratories that...
2,To view the contact information for the clinic...
3,Please note: Most of the laboratories listed ...
4,"Below, we provide a list of online resources t..."


In [10]:
embbeded_vectors = model.encode(data_sentences["sentences"].values, show_progress_bar=True)

with open(f"{dir_root}sentences.emb", "wb") as f:
    np.save(f, embbeded_vectors)
    
len(embbeded_vectors)

Batches:   0%|          | 0/1010 [00:00<?, ?it/s]

32310

In [11]:
embbeded_vectors_train = model.encode(data_train_sentences["sentences"].values, show_progress_bar=True)

with open(f"{dir_root}sentences_train.emb", "wb") as f:
    np.save(f, embbeded_vectors_train)
    
len(embbeded_vectors_train)

Batches:   0%|          | 0/808 [00:00<?, ?it/s]

25851

## Embeding the questions

In [12]:
def emb_question(data, out):
    indexes = []
    questions = []
    
    for i in tqdm(range(len(data["question"].values))):
        indexes.append(i)
        questions.append(data["question"].values[i])
    
    with open(f"{dir_root}{out}.idx", 'w') as f:
        for item in tqdm(indexes):
            f.write(f"{str(item)}\n")
            
    print(len(indexes), max(indexes), len(questions))
    
    embbeded_vectors = model.encode(questions, show_progress_bar=True)
    
    print(len(embbeded_vectors))
    
    with open(f"{dir_root}{out}.emb", "wb") as f:
        np.save(f, embbeded_vectors)

emb_question(data, "qaa_q")
emb_question(data_train, "qaa_q_train")

100% 7871/7871 [00:00<00:00, 227002.08it/s]
100% 7871/7871 [00:00<00:00, 1907845.98it/s]

7871 7870 7871





Batches:   0%|          | 0/246 [00:00<?, ?it/s]

7871


100% 6292/6292 [00:00<00:00, 228165.73it/s]
100% 6292/6292 [00:00<00:00, 1852359.15it/s]

6292 6291 6292





Batches:   0%|          | 0/197 [00:00<?, ?it/s]

6292


In [5]:
data.to_json(f"{dir_root}qaa_q.json",orient="records")

## To Embedding Projector

In [35]:
with open(f"{dir_root}qaa_emb.tsv", "w") as f:
    np.savetxt(f, embbeded_vectors, delimiter='\t')

In [36]:
with open(f"{dir_root}qaa_meta.tsv", "w") as f:
    for q in tqdm(questions):
        f.write(f"{q}\n")

100% 650232/650232 [00:00<00:00, 2690709.32it/s]
