### Data preprocessing notebook with the following steps:
1. Data Collection: Fetch data from Wikipedia
2. Data Preprocessing: Extract and convert HTML content into a text format using bs4. Chunk the text (per paragraph) and summarize the text with T5 for efficient analysis.
3. Query Generation: Create queries from the preprocessed data using a T5-based doc2query model to generate a set of question-context pairs.
4. Dataset split: Split the question-context pairs into train/test sets per context (i.e., per chunk).


In [2]:
# Root folder of the project; replace the placeholder before running the notebook.
ProjectRoot = "<PROVIDE PROJECT ROOT PATH>"
# Folder under the project root where the notebook writes its JSON/CSV outputs.
DatasetRoot = f"{ProjectRoot}/Dataset/"

In [3]:
import pandas as pd
import regex as re
import requests
import json
from bs4 import BeautifulSoup
import logging
from transformers import T5ForConditionalGeneration, T5Tokenizer

# viz
import altair as alt

# Raise the threshold of the "transformers" logger to ERROR so that
# lower-severity notices are not printed in the cell outputs.
logging.getLogger("transformers").setLevel(logging.ERROR)

#### Helper Methods

In [4]:
# Get raw dataset from wikipedia
def get_data(page_title, based_url):
    """Download a web page, chunk it by paragraph and persist the chunks.

    Args:
        page_title: Page name appended to `based_url` to build the full URL.
        based_url: URL prefix of the site hosting the page.

    Returns:
        dict mapping a running integer index (starting at 0) to the text of
        each non-blank `<p>` element, with numeric citation markers such as
        "[12]" removed. The same dict is also written to
        `DatasetRoot + 'raw_knowledge.json'`.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # build full url and get response; a timeout keeps the cell from hanging
    # forever, and raise_for_status() stops an error page being parsed as data
    url = based_url + page_title
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # parse text and chunk it by paragraphs
    soup = BeautifulSoup(response.text, 'html.parser')
    text_list = {}
    for paragraph in soup.find_all('p'):
        raw_para = paragraph.text.replace('\n', '')
        # skip paragraphs that contain nothing but whitespace
        if not raw_para.strip():
            continue
        # drop numeric citation markers, e.g. "[12]"
        text_list[len(text_list)] = re.sub(r'\[\d+]', '', raw_para)

    # Write JSON output to a file
    with open(DatasetRoot + 'raw_knowledge.json', 'w', encoding='utf-8') as f:
        json.dump(text_list, f, indent=4)

    return text_list

# Cache of already loaded (model, tokenizer) pairs, keyed by model name, so
# the weights are loaded once instead of once per summarized paragraph.
_SUMMARIZER_CACHE = {}


def _load_summarizer(model_name='t5-small'):
    """Return the (model, tokenizer) pair for `model_name`, loading it on first use."""
    if model_name not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[model_name] = (
            T5ForConditionalGeneration.from_pretrained(model_name),
            T5Tokenizer.from_pretrained(model_name),
        )
    return _SUMMARIZER_CACHE[model_name]


# summarize paragraph in shorter form
def generate_meaning(text):
    """Summarize `text` with the 't5-small' model.

    Args:
        text: Raw paragraph to summarize (str).

    Returns:
        The decoded summary string (special tokens stripped), generated with
        beam search (4 beams) and a maximum output length of 150 tokens.
    """
    # reuse the cached T5 model and tokenizer
    model, tokenizer = _load_summarizer()

    # tokenize raw text; the "summarize: " prefix selects T5's summarization task
    input_tokens = tokenizer.encode("summarize: " + text, return_tensors='pt')

    # Generate the main meaning/summary using the T5 model
    output = model.generate(input_tokens, max_length=150, num_beams=4, early_stopping=True)

    # decode summary
    return tokenizer.decode(output[0], skip_special_tokens=True)

# summarize entire raw data
def summarize_data(text_list):
    """Summarize every chunk and return the summaries in a new dict.

    The input dict is left untouched so the raw chunks stay available to the
    caller (and re-running a cell never summarizes a summary).

    Args:
        text_list: dict mapping chunk id -> raw text.

    Returns:
        A new dict with the same keys, mapping chunk id -> summary produced
        by generate_meaning().
    """
    return {
        para_id: generate_meaning(raw_text)
        for para_id, raw_text in text_list.items()
    }

In [5]:
# set no. of queries to generate
NUM_QUESTIONS_GEN = 5

# Cache of already loaded (tokenizer, model) pairs, keyed by model name, so
# the weights are loaded once instead of once per paragraph.
_DOC2QUERY_CACHE = {}


def _load_doc2query(model_name):
    """Return the (tokenizer, model) pair for `model_name`, loading it on first use."""
    if model_name not in _DOC2QUERY_CACHE:
        _DOC2QUERY_CACHE[model_name] = (
            T5Tokenizer.from_pretrained(model_name),
            T5ForConditionalGeneration.from_pretrained(model_name),
        )
    return _DOC2QUERY_CACHE[model_name]


# generate questions from a paragraph
def text2question(text):
    """Generate NUM_QUESTIONS_GEN questions for `text` with a doc2query T5 model.

    Args:
        text: Context paragraph (str).

    Returns:
        dict with two keys: 'text' (the unchanged input) and 'question'
        (list of generated question strings). Generation uses nucleus
        sampling (top_p=0.95), so the questions differ between runs.
    """
    model_name = 'doc2query/all-with_prefix-t5-base-v1'
    tokenizer, model = _load_doc2query(model_name)

    # the task prefix is prepended to the input; input is truncated to 384 tokens
    prefix = "answer2question"
    prompt = prefix + ": " + text
    input_ids = tokenizer.encode(prompt, max_length=384, truncation=True, return_tensors='pt')
    outputs = model.generate(
        input_ids=input_ids,
        max_length=64,
        do_sample=True,
        top_p=0.95,
        num_return_sequences=NUM_QUESTIONS_GEN)

    questions = [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in outputs]
    return {'text': text, 'question': questions}

# generate question-context set from documents:
def query_context_dataset(text_list):
    """Build the question-context set for every chunk, in a new dict.

    The input dict is not modified, so the chunks passed in remain plain
    strings for any later use.

    Args:
        text_list: dict mapping chunk id -> context text.

    Returns:
        A new dict with the same keys, mapping chunk id -> the dict returned
        by text2question(), i.e. {'text': <context>, 'question': [<questions>]}.
    """
    return {
        para_id: text2question(raw_text)
        for para_id, raw_text in text_list.items()
    }

# split question-context dataset into train/test set and store in csv
def split_store_dataset(list_t2q, train_ratio=0.8):
    """Split each chunk's questions into train/test rows and save both as CSV.

    For every chunk the first round(len(questions) * train_ratio) questions go
    to the train set and the remaining ones to the test set, so both sets are
    split per context rather than across contexts.

    Args:
        list_t2q: dict mapping chunk id -> {'text': <context>, 'question': [<questions>]}.
        train_ratio: Fraction of each chunk's questions assigned to the train
            set; must lie in [0, 1]. Defaults to 0.8.

    Returns:
        (train_df, test_df): DataFrames with the columns 'raw_para_id',
        'paragraph' and 'question'. They are also written to
        'q_a_trainset.csv' and 'q_a_testset.csv' under DatasetRoot.

    Raises:
        ValueError: if `train_ratio` is outside [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}")

    columns = ['raw_para_id', 'paragraph', 'question']
    context_query_train = []
    context_query_test = []

    for para_id, entry in list_t2q.items():
        context = entry['text']
        questions = entry['question']
        split_idx = round(len(questions) * train_ratio)
        context_query_train.extend([para_id, context, q] for q in questions[:split_idx])
        context_query_test.extend([para_id, context, q] for q in questions[split_idx:])

    train_df = pd.DataFrame(context_query_train, columns=columns)
    train_df.to_csv(DatasetRoot + 'q_a_trainset.csv', index=False)

    test_df = pd.DataFrame(context_query_test, columns=columns)
    test_df.to_csv(DatasetRoot + 'q_a_testset.csv', index=False)

    return train_df, test_df

#### Data Collection (Fetch wikipedia articles)

In [6]:
# Source article for the raw documents: get_data() concatenates `based_url`
# and `page_title` to build the URL of the page to download.
page_title = "Data_science"
based_url="https://en.wikipedia.org/wiki/"

In [7]:
# download the page, convert it to plain text and chunk it per paragraph
text_list = get_data(page_title=page_title, based_url=based_url)

#### Optional - Data Collection (upload a document from the local drive)

In [8]:
try:
    import PyPDF2
except ImportError:
    !pip install PyPDF2

try:
    import docx
except ImportError:
    !pip install python-docx

In [9]:
import ipywidgets as widgets
from IPython.display import display, clear_output
from PyPDF2 import PdfReader
from docx import Document
import io

# Area in which the processing status messages are printed
output_area = widgets.Output()

# Button that triggers processing of the selected local document
fetch_button_local = widgets.Button(description='Fetch Local Documents')

# File picker for a single local file, limited to the supported file types
upload_widget = widgets.FileUpload(accept='.txt,.html,.json,.pdf,.docx', multiple=False)

def _extract_chunks(filename, content):
    """Split the raw bytes of one uploaded file into a list of text chunks.

    The parser is chosen from the (case-insensitive) file extension; an
    unsupported extension yields an empty list. Blank chunks are dropped.
    """
    file_extension = filename.rsplit('.', 1)[-1].lower()

    if file_extension == 'txt':
        chunks = content.decode('utf-8').splitlines()

    elif file_extension == 'docx':
        chunks = [paragraph.text for paragraph in Document(io.BytesIO(content)).paragraphs]

    elif file_extension == 'pdf':
        # one chunk per page; `or ''` guards against a page without extractable text
        chunks = [
            (page.extract_text() or '').replace("\n", " ").replace(" -", "")
            for page in PdfReader(io.BytesIO(content)).pages
        ]

    elif file_extension == 'html':
        # extract paragraphs from html
        soup = BeautifulSoup(content, 'html.parser')
        chunks = [paragraph.text.replace('\n', '') for paragraph in soup.find_all('p')]

    elif file_extension == 'json':
        json_data = json.loads(content.decode('utf-8'))
        if isinstance(json_data, dict):
            # use the values only: storing (key, value) tuples would break the
            # string concatenation done later by the summarizer
            candidates = json_data.values()
        elif isinstance(json_data, list):
            candidates = json_data
        else:
            # handle unexpected structure
            candidates = []
        # downstream steps expect plain strings, so other item types are skipped
        chunks = [item for item in candidates if isinstance(item, str)]

    else:
        chunks = []

    return [chunk for chunk in chunks if chunk.strip()]


def process_uploaded_file(b):
    """Append the chunks of the uploaded file(s) to the global `text_list`.

    Used as the callback of both the upload widget and the fetch button.

    Args:
        b: Event payload passed by the widget framework; not used.
    """
    # use the global dict holding raw text chunks
    global text_list
    clear_output(wait=True)  # Clear previous outputs

    uploaded_files = upload_widget.value
    if not uploaded_files:
        return

    # ipywidgets 7 exposes `value` as {filename: {'content': ...}}, while
    # ipywidgets 8 exposes a tuple of {'name': ..., 'content': ...} entries
    if isinstance(uploaded_files, dict):
        files = [(name, info['content']) for name, info in uploaded_files.items()]
    else:
        files = [(info['name'], info['content']) for info in uploaded_files]

    for filename, content in files:
        # bytes() also converts the memoryview content delivered by ipywidgets 8
        chunks = _extract_chunks(filename, bytes(content))

        # add the new chunks to the raw chunk dict under keys that continue
        # after the entries already present
        org_len = len(text_list)
        for idx, item in enumerate(chunks):
            text_list[idx + org_len] = item

        with output_area:
            clear_output()  # Clear previous output
            print(f"Processed {filename}...")

# Set up the event handlers.
# Observe only the `value` trait: without `names`, the callback fires for
# every trait change of the widget, so a single upload would be processed
# (and its chunks appended) several times.
# NOTE: clicking the button processes the currently selected file again.
upload_widget.observe(process_uploaded_file, names='value')
fetch_button_local.on_click(process_uploaded_file)

# Display the widgets
display(upload_widget)
display(fetch_button_local)
display(output_area)


FileUpload(value={}, accept='.txt,.html,.json,.pdf,.docx', description='Upload')

Button(description='Fetch Local Documents', style=ButtonStyle())

Output()

#### Data Preprocessing

In [10]:
# snapshot the raw chunks as an (id, text) table for the later visualization
df_text_temp = pd.DataFrame(
    [(chunk_id, chunk) for chunk_id, chunk in text_list.items()],
    columns=['id', 'text'],
)

In [11]:
# run the summarizer over every raw chunk
summarized_text_list = summarize_data(text_list=text_list)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
# snapshot the summaries as an (id, summary) table for the later visualization
df_summary_temp = pd.DataFrame(
    [(chunk_id, summary) for chunk_id, summary in summarized_text_list.items()],
    columns=['id', 'summary'],
)

In [13]:
# visualize how much each raw chunk shrinks when it is summarized
df_merged_viz = df_text_temp.merge(df_summary_temp, on='id')

# character counts of the raw text and of its summary
df_merged_viz['text_length'] = df_merged_viz['text'].map(len)
df_merged_viz['summary_length'] = df_merged_viz['summary'].map(len)

# both charts share one Y scale so the areas are directly comparable
longest = max(df_merged_viz['text_length'].max(), df_merged_viz['summary_length'].max())
y_scale = alt.Scale(domain=[0, longest])


def _length_chart(column, title):
    """Area chart of `column` (a length column of df_merged_viz) per chunk id."""
    return (
        alt.Chart(df_merged_viz, width=600, height=200, title=title)
        .mark_area()
        .encode(
            x=alt.X('id', title='chunk ID'),
            y=alt.Y(column, scale=y_scale, title='text length (# chars)'),
        )
    )


upper = _length_chart('text_length', 'Original Text Lengths')
lower = _length_chart('summary_length', 'Summarized Text Lengths')

# stack the two charts vertically
upper & lower

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [14]:
# share of characters removed by summarization, per chunk, in percent
length_ratio = df_merged_viz['summary_length'] / df_merged_viz['text_length']
df_merged_viz['compression_percentage'] = (1 - length_ratio) * 100

avg_compression = df_merged_viz['compression_percentage'].mean()
print(f"Achieved avg compression ratio of : {avg_compression:.2f} %")

Achieved avg compression ratio of : 43.91 %


#### Query Generation

In [15]:
# generate the questions for every summarized chunk (question-context pairs)
q_c_set = query_context_dataset(text_list=summarized_text_list)

In [16]:
# Inspect the questions generated for one sample context chunk.
# The chunk id is hard-coded to 8, which assumes that id exists in q_c_set.
q_c_set[8]

{'text': 'the term "data science" dates back to 1974, when Peter Naur proposed it as an alternative name to computer science. in 1997, C. F. Jeff Wu suggested that statistics should be renamed data science.',
 'question': ['how come the term data science is used in the uk?',
  'Is there a name for a particular field of studies of data science?',
  'What is the origin of the name "Data Science"?',
  'How can "data science" be called "data science"?',
  'when did the term "data science" start?']}

#### Dataset split

In [17]:
# per paragraph (context): 80% of its questions go to the train set, 20% to the test set
train_df, test_df = split_store_dataset(list_t2q=q_c_set)

In [18]:
# preview the first rows of the train set
train_df.head(n=5)

Unnamed: 0,raw_para_id,paragraph,question
0,0,data science is an interdisciplinary academic ...,what is data science?
1,0,data science is an interdisciplinary academic ...,what is the definition of data science?
2,0,data science is an interdisciplinary academic ...,what is data science?
3,0,data science is an interdisciplinary academic ...,What is Data Science?
4,1,data science is multifaceted and can be descri...,what is data science in a multi-faceted enviro...


In [19]:
# preview the first rows of the test set
test_df.head(n=5)

Unnamed: 0,raw_para_id,paragraph,question
0,0,data science is an interdisciplinary academic ...,What is data science and how is it related to ...
1,1,data science is multifaceted and can be descri...,what is the data science?
2,2,"data science is a concept to unify statistics,...",data science questions?
3,3,data scientist creates programming code and co...,what is data scientist?
4,4,the field includes preparing data for analysis...,what subjects and techniques are needed to bec...


In [20]:
# number of question-context rows in the train set
train_size = len(train_df)
print(f"Length of Train set: {train_size}")

Length of Train set: 96


In [21]:
# number of question-context rows in the test set
# (the label previously said "Train set" although the test set is measured)
print(f"Length of Test set: {len(test_df)}")

Length of Train set: 24
