In [44]:
import json
import pandas as pd

from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

In [None]:
# Label stored in the "name" field of every generated question record.
name = "COVID Super Expert"
# Folder holding the input/output JSON files; the trailing "/" is relied on
# when paths are built by plain string concatenation.
dir_root = "Data/"

In [None]:
# Load the paragraph table (one row per paragraph). dir_root already ends with
# "/", so no extra separator is inserted; this keeps the path free of a doubled
# slash and consistent with how the output paths are built from dir_root.
df_paragraphs = pd.read_json(f"{dir_root}covid_subset_paragraphs.json", orient="records")

In [65]:
# Seq2seq checkpoint that writes a question from a context passage alone.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "voidful/context-only-question-generator"
)

In [66]:
# Tokenizer belonging to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(
    "voidful/context-only-question-generator"
)

In [67]:
def generate(number_of_questions, text, model, tokenizer, device):
    """Sample questions about ``text`` from a seq2seq question-generation model.

    Args:
        number_of_questions: How many questions to produce. Each question comes
            from its own ``model.generate`` call.
        text: Context passage to ask questions about. It is explicitly
            truncated to the first 256 tokens.
        model: Model object exposing ``generate``.
        tokenizer: Tokenizer paired with ``model``; used to encode ``text`` and
            to decode the generated ids.
        device: Device the encoded input is moved to. The value ``"auto"`` is
            resolved to the device ``model`` already lives on; any other value
            is passed through unchanged.

    Returns:
        A list with ``number_of_questions`` decoded question strings. Sampling
        is enabled, so repeated calls can return different (and duplicate)
        questions.
    """
    # "auto" is not a device string torch understands, so map it to wherever
    # the model's weights are; input and model then always share a device.
    if isinstance(device, str) and device == "auto":
        device = getattr(model, "device", "cpu")

    inputs = tokenizer(
        text,
        max_length=256,
        # Ask for truncation explicitly; passing max_length alone only triggers
        # a warning and an implicit fallback strategy.
        truncation=True,
        return_tensors="pt"
    ).to(device)

    questions = []
    for _ in range(number_of_questions):
        # One sampled sequence per call; diversity comes from do_sample plus
        # temperature rather than from num_return_sequences.
        summary_ids = model.generate(
            inputs["input_ids"],
            num_beams=2,
            min_length=0,
            max_length=36,
            do_sample=True,
            num_return_sequences=1,
            no_repeat_ngram_size=3,
            temperature=0.8)

        question = tokenizer.batch_decode(
            summary_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False)

        questions.append(question[0])

    return questions

In [68]:
# Short sample passage (about alcohol/bioethanol as a fuel) used to try out
# question generation before running it over the whole paragraph table.
test_text = """Plant organs of high starch or sugar content have long since served for manufacturing alcohol, which was also used as fuel, 
but larger volumes were first consumed before and during World War II. Subsequently, the cheap petrol distilled from oil crowded out the expensive alcohol, 
but the years of oil crisis and the lead-contamination of environment concentrated attention upon bioalcohol or bioethanol as a fuel."""

In [69]:
# Smoke test: sample five questions for the sample passage.
generate(
    number_of_questions = 5,
    text = test_text,
    model = model,
    tokenizer = tokenizer,
    # "auto" is not a valid torch device string; send the input to the device
    # the model is actually on.
    device = model.device
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


['How many years did the oil crisis last?',
 'What was the main source of fuel?',
 'What was used as fuel before World War II?',
 'How many years after the oil crisis was petroleum first distilled?',
 'What was primarily used as fuel?']

In [72]:
# Generate 5 questions for every paragraph and collect one flat record per
# (question, paragraph) pair in `qaa`. The paragraph text itself is stored as
# the answer under the (misspelled, but relied upon downstream) key "answare".

# Pull the column arrays once instead of re-extracting them on every iteration.
paragraphs = df_paragraphs["paragraph"].values
titles = df_paragraphs["title"].values
cord_uids = df_paragraphs["cord_uid"].values
arxiv_ids = df_paragraphs["arxiv_id"].values
# NOTE(review): "pmc_json_files" used to be filled from the arxiv_id column,
# which looks like a copy-paste slip. Use the real column when the table has
# one; otherwise keep the previous behaviour. Confirm the input schema.
if "pmc_json_files" in df_paragraphs.columns:
    pmc_json_files = df_paragraphs["pmc_json_files"].values
else:
    pmc_json_files = arxiv_ids

pbar = tqdm(range(0, len(paragraphs)))
qaa = []

for i in pbar:
    text = paragraphs[i].strip()

    questions = generate(
        number_of_questions = 5,
        text = text,
        model = model,
        tokenizer = tokenizer,
        # "auto" is not a valid torch device string; send the input to the
        # device the model is actually on.
        device = model.device
    )

    for question in questions:
        qaa.append({
            "question" : question.strip(),
            "name" : name,
            "answare" : text,
            "title" : titles[i],
            "cord_uid" : cord_uids[i],
            "arxiv_id" : arxiv_ids[i],
            "pmc_json_files" : pmc_json_files[i]})

print(len(qaa))

100% 18239/18239 [2:39:58<00:00,  1.90it/s]  

91195





In [73]:
# Path stem (no extension) for the flat per-question table; ".json" is
# appended where the file is written and read.
output_qaa_duplicated = f"{dir_root}qaa_duplicated"
# Final file: questions regrouped per paragraph.
output_qaa = f"{dir_root}qaa.json"

In [74]:
# Tabulate the generated records, drop rows that are exact duplicates (the
# sampler can emit the same question twice for one paragraph), and save the
# result as a list of JSON records.
df_qaa_duplicated = pd.DataFrame(qaa).drop_duplicates()
df_qaa_duplicated.to_json(f"{output_qaa_duplicated}.json", orient="records")

In [75]:
# Read the saved table back from disk and summarise its columns and dtypes.
df_qaa_duplicated = pd.read_json(f"{output_qaa_duplicated}.json", orient="records")
df_qaa_duplicated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58290 entries, 0 to 58289
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   question        58290 non-null  object 
 1   name            58290 non-null  object 
 2   answare         58290 non-null  object 
 3   title           58290 non-null  object 
 4   cord_uid        58290 non-null  object 
 5   arxiv_id        58290 non-null  float64
 6   pmc_json_files  58290 non-null  float64
dtypes: float64(2), object(5)
memory usage: 3.1+ MB


In [76]:
# Read the saved table back from disk and preview its first rows.
df_qaa_duplicated = pd.read_json(f"{output_qaa_duplicated}.json", orient="records")
df_qaa_duplicated.head()

Unnamed: 0,question,name,answare,title,cord_uid,arxiv_id,pmc_json_files
0,"Which group had a lower median age, those with...",COVID Super Expert,A prevalent symptom of Parkinson’s disease (PD...,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772
1,How many people had no PD?,COVID Super Expert,A prevalent symptom of Parkinson’s disease (PD...,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772
2,How many more people had PD than had no PD?,COVID Super Expert,A prevalent symptom of Parkinson’s disease (PD...,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772
3,How many total videos were in the dataset?,COVID Super Expert,The dataset consists of 1812 videos from 604 (...,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772
4,How many total videos are in the dataset?,COVID Super Expert,The dataset consists of 1812 videos from 604 (...,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772


In [77]:
# Regroup the flat question rows by paragraph: one entry per distinct
# "answare" text, carrying every question that was generated for it. The
# remaining fields are taken from the first row seen for that paragraph.
col = {
    field: df_qaa_duplicated[field].values
    for field in (
        "question", "name", "answare", "title",
        "cord_uid", "pmc_json_files", "arxiv_id",
    )
}

restruct = {}
for row in tqdm(range(len(df_qaa_duplicated))):
    paragraph = col["answare"][row]
    if paragraph not in restruct:
        restruct[paragraph] = {
            "questions" : [],
            "name" : col["name"][row],
            "answare" : paragraph,
            "title" : col["title"][row],
            "cord_uid" : col["cord_uid"][row],
            "pmc_json_files" : col["pmc_json_files"][row],
            # Stored as text, capped at the first 10 characters.
            "arxiv_id" : str(col["arxiv_id"][row])[0:10]
        }
    restruct[paragraph]["questions"].append(col["question"][row])

100% 58290/58290 [00:01<00:00, 50665.08it/s]


In [78]:
# Flatten the paragraph-keyed dict into a plain list of entries (insertion
# order is kept). The isinstance guard makes the cell safe to re-run: once
# `restruct` is already a list there is nothing left to convert, whereas
# calling .keys() on it again would raise.
if isinstance(restruct, dict):
    restruct = list(tqdm(restruct.values()))
restruct[0]

100% 18004/18004 [00:00<00:00, 2085625.69it/s]


{'questions': ['Which group had a lower median age, those with PD or those without PD?',
  'How many people had no PD?',
  'How many more people had PD than had no PD?'],
 'name': 'COVID Super Expert',
 'answare': 'A prevalent symptom of Parkinson’s disease (PD) is hypomimia — reduced facial expressions.In this paper, we present a method for diagnosing PD that utilizes the study of micro-expressions.We analyzed the facial action units (AU) from 1812 videos of 604 individuals (61 with PD and 543 without PD, with a mean age 63.9 y/o, sd.7.8) collected online through a web-based tool (www.parktest.net).In these videos, participants were asked to make three facial expressions (a smiling, disgusted, and surprised face) followed by a neutral face.Using techniques from computer vision and machine learning, we objectively measured the variance of the facial muscle movements and used it to distinguish between individuals with and without PD.The prediction accuracy using the facial micro-express

In [79]:
# Persist the per-paragraph question list as the final JSON file.
with open(output_qaa, 'w') as qaa_file:
    json.dump(restruct, qaa_file)