### Data Preprocessing Notebook — the pipeline consists of the following steps:
1. Data Collection: Fetch data from Wikipedia
2. Data Preprocessing: Extract the HTML content and convert it to plain text using bs4. Chunk the text (one chunk per paragraph) and summarize each chunk with T5 for efficient analysis.
3. Query Generation: Generate queries from the preprocessed data using a T5-based doc2query model to build a set of question-context pairs.
4. Dataset split: Split the question-context pairs into train/test sets per context (i.e., per chunk).


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Project root on the mounted Google Drive; edit this to match your own Drive layout.
ProjectRoot = "/content/drive/MyDrive/UMich Capstone/NoteBooks/"
# Folder where this notebook writes raw_knowledge.json and the train/test CSV files.
# Paths are built by plain string concatenation, so keep the trailing "/".
DatasetRoot = ProjectRoot + "Dataset/"
# Alternative location: write directly to the Drive root instead.
# DatasetRoot = "/content/drive/MyDrive/"

In [3]:
import functools
import json
import logging

import pandas as pd
import regex as re
import requests
from bs4 import BeautifulSoup
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Set logging level to ERROR to suppress the notice
logging.getLogger("transformers").setLevel(logging.ERROR)

#### Helper Methods

In [4]:
# Get raw dataset from wikipedia
def get_data(page_title, based_url):
    """Download a web page and return its non-empty paragraphs as a dict.

    The page at ``based_url + page_title`` is fetched and every ``<p>``
    element is converted to text: newlines are removed and numeric citation
    markers such as ``[12]`` are stripped. The resulting dict is also written
    to ``DatasetRoot + 'raw_knowledge.json'``.

    Args:
        page_title: Page name appended to ``based_url``.
        based_url: URL prefix (e.g. ``"https://en.wikipedia.org/wiki/"``).

    Returns:
        dict mapping a 0-based paragraph index (int) to the paragraph text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection errors or timeouts.
    """
    url = based_url + page_title

    # Fail loudly instead of hanging forever or silently parsing an error page
    # (which would otherwise be chunked and saved as if it were the article).
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # parse text and chunk it by paragraphs
    soup = BeautifulSoup(response.text, 'html.parser')
    text_list = {}
    for paragraph in soup.find_all('p'):
        raw_para = paragraph.text.replace('\n', '')
        # Skip empty and whitespace-only paragraphs: they carry no content
        # to summarize or generate questions from.
        if not raw_para.strip():
            continue
        # Keys are consecutive indices over the paragraphs actually kept.
        text_list[len(text_list)] = re.sub(r'\[\d+]', '', raw_para)

    # Write JSON output to a file (JSON turns the int keys into strings).
    with open(DatasetRoot + 'raw_knowledge.json', 'w', encoding='utf-8') as f:
        json.dump(text_list, f, indent=4)

    return text_list

@functools.lru_cache(maxsize=None)
def _load_summarizer(model_name='t5-small'):
    """Load the summarization model and tokenizer once and reuse them on later calls."""
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    return model, tokenizer


# summarize paragraph in shorter form
def generate_meaning(text):
    """Summarize ``text`` with the ``t5-small`` model.

    The model and tokenizer are loaded on the first call only and cached;
    reloading them for every paragraph is by far the most expensive part of
    summarizing a whole document.

    Args:
        text: Paragraph to summarize (str).

    Returns:
        The decoded summary string (special tokens removed).
    """
    model, tokenizer = _load_summarizer()

    # T5 expects a task prefix; truncate to the tokenizer's 512-token limit so
    # an unusually long paragraph cannot exceed the model's supported input.
    input_tokens = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )

    # Beam search (4 beams), summary capped at 150 tokens.
    output = model.generate(input_tokens, max_length=150, num_beams=4, early_stopping=True)

    # A single sequence is returned; decode it back to text.
    return tokenizer.decode(output[0], skip_special_tokens=True)

# summarize entire raw data
def summarize_data(text_list):
    """Summarize every paragraph and return the summaries as a new dict.

    The input dict is left untouched, so the raw paragraphs stay available
    and the cell calling this function can be re-run safely.

    Args:
        text_list: dict mapping paragraph id -> raw paragraph text.

    Returns:
        A new dict mapping the same paragraph ids to the result of
        ``generate_meaning`` for each paragraph.
    """
    return {
        para_id: generate_meaning(raw_text)
        for para_id, raw_text in text_list.items()
    }

In [5]:
# Number of questions text2question samples for each paragraph.
# With the 80/20 per-paragraph split used below, 5 gives 4 train and 1 test question.
NUM_QUESTIONS_GEN = 5

@functools.lru_cache(maxsize=None)
def _load_doc2query(model_name='doc2query/all-with_prefix-t5-base-v1'):
    """Load the doc2query tokenizer and model once and reuse them on later calls."""
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    return tokenizer, model


# generate questions from a paragraph
def text2question(text, num_questions=None):
    """Generate questions for ``text`` with the doc2query T5 model.

    The tokenizer and model are loaded on the first call only and cached, so
    processing many paragraphs does not reload the weights each time.

    Questions are drawn with nucleus sampling (``do_sample=True``,
    ``top_p=0.95``), so results differ between runs and the returned list may
    contain duplicates.

    Args:
        text: Context paragraph (str). Input is truncated to 384 tokens.
        num_questions: How many questions to sample. Defaults to the
            module-level ``NUM_QUESTIONS_GEN`` when omitted.

    Returns:
        dict with keys ``'text'`` (the input paragraph) and ``'question'``
        (list of generated question strings).
    """
    if num_questions is None:
        num_questions = NUM_QUESTIONS_GEN

    tokenizer, model = _load_doc2query()

    # Task prefix passed to this "with_prefix" checkpoint to request questions.
    prefix = "answer2question"
    input_ids = tokenizer.encode(
        prefix + ": " + text,
        max_length=384,
        truncation=True,
        return_tensors='pt',
    )

    outputs = model.generate(
        input_ids=input_ids,
        max_length=64,
        do_sample=True,
        top_p=0.95,
        num_return_sequences=num_questions)

    questions = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]
    return {'text': text, 'question': questions}

# generate question-context set from documents:
def query_context_dataset(text_list):
    """Build the question-context set for every paragraph as a new dict.

    The input dict is not modified, so the paragraphs passed in remain plain
    strings and the cell calling this function can be re-run safely.

    Args:
        text_list: dict mapping paragraph id -> paragraph text.

    Returns:
        A new dict mapping each paragraph id to the value returned by
        ``text2question`` for that paragraph.
    """
    return {
        para_id: text2question(raw_text)
        for para_id, raw_text in text_list.items()
    }

# split question-context dataset into train/test set and store in csv
def split_store_dataset(list_t2q, train_frac=0.8):
    """Split each paragraph's questions into train/test rows and save both as CSV.

    For every paragraph, the first ``round(n * train_frac)`` of its ``n``
    questions go to the train set and the remaining ones to the test set, so
    every context is represented on both sides whenever it has enough
    questions. Note that Python's ``round`` rounds halves to the nearest even
    integer (e.g. ``round(2.5) == 2``).

    The frames are written to ``DatasetRoot + 'q_a_trainset.csv'`` and
    ``DatasetRoot + 'q_a_testset.csv'`` without the index column.

    Args:
        list_t2q: dict mapping paragraph id -> dict with keys ``'text'``
            (context string) and ``'question'`` (list of question strings).
        train_frac: Fraction of each paragraph's questions assigned to the
            train set; must lie in ``[0, 1]``. Defaults to 0.8.

    Returns:
        ``(train_df, test_df)``: DataFrames with columns ``raw_para_id``,
        ``paragraph`` and ``question``.

    Raises:
        ValueError: if ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError("train_frac must be between 0 and 1, got %r" % (train_frac,))

    columns = ['raw_para_id', 'paragraph', 'question']
    context_query_train = []
    context_query_test = []

    # Split per context so each paragraph contributes to both sets.
    for para_id, entry in list_t2q.items():
        questions = entry['question']
        split_idx = round(len(questions) * train_frac)
        rows = [[para_id, entry['text'], question] for question in questions]
        context_query_train.extend(rows[:split_idx])
        context_query_test.extend(rows[split_idx:])

    train_df = pd.DataFrame(context_query_train, columns=columns)
    train_df.to_csv(DatasetRoot + 'q_a_trainset.csv', index=False)

    test_df = pd.DataFrame(context_query_test, columns=columns)
    test_df.to_csv(DatasetRoot + 'q_a_testset.csv', index=False)

    return train_df, test_df

#### Data Collection

In [6]:
# Wikipedia article used as the knowledge source; the page title is appended to the base URL.
page_title = "Data_science"
based_url="https://en.wikipedia.org/wiki/"

In [7]:
# Fetch the page, extract the text of its <p> paragraphs (citation markers removed),
# and save the result to raw_knowledge.json under DatasetRoot.
text_list = get_data(page_title, based_url)

#### Data Preprocessing

In [8]:
# Summarize each paragraph with t5-small (one generate_meaning call per paragraph).
summarized_text_list = summarize_data(text_list)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

#### Query Generation

In [9]:
# Create the question-context pair set: for each summarized paragraph, doc2query generates
# NUM_QUESTIONS_GEN questions, giving {para_id: {'text': ..., 'question': [...]}}.
q_c_set = query_context_dataset(summarized_text_list)

tokenizer_config.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [10]:
q_c_set[8]

{'text': 'the term "data science" dates back to 1974, when Peter Naur proposed it as an alternative name to computer science. in 1997, C. F. Jeff Wu suggested that statistics should be renamed data science.',
 'question': ['Term used to refer to computer science',
  'why is data science called data science?',
  'What is the origin of the term data science?',
  'What is the real name of statistical science?',
  'When was the term data science made?']}

#### Dataset split

In [11]:
# For every paragraph (context), store 80% of its questions in the train set and 20% in the test set.
# Both sets are also written to q_a_trainset.csv / q_a_testset.csv under DatasetRoot.
train_df, test_df = split_store_dataset(q_c_set)

In [12]:
train_df

Unnamed: 0,raw_para_id,paragraph,question
0,0,data science is an interdisciplinary academic ...,what is data science?
1,0,data science is an interdisciplinary academic ...,What is data science?
2,0,data science is an interdisciplinary academic ...,How is data science (based on machine learning...
3,0,data science is an interdisciplinary academic ...,what does data science mean in linux?
4,1,data science is multifaceted and can be descri...,What is Data Science?
...,...,...,...
91,22,"data science involves collecting, processing, ...",what are the pitfalls of data science?
92,23,machine learning models can amplify existing b...,What are the negative effects of machine learn...
93,23,machine learning models can amplify existing b...,why is machine learning an evil?
94,23,machine learning models can amplify existing b...,If a machine learning model is used to predict...


In [13]:
len(train_df)

96

In [14]:
len(test_df)

24