### Data preprocessing notebook with the following steps:
1. Data Collection: Fetch the page content from Wikipedia.
2. Data Preprocessing: Extract the text from the HTML content using bs4, chunk it per paragraph, and summarize each chunk with T5 for efficient analysis.
3. Query Generation: Generate queries from the preprocessed data with a T5-based doc2query model to build a set of question-context pairs.
4. Dataset Split: Split the question-context pairs into train/test sets per context (i.e., per chunk).


In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# Root of the local project checkout; edit this to match your machine.
ProjectRoot = "/home/sangram/Tutorbot_capstone/git_hub/Tutorbot/"
# Folder that receives the generated artefacts (raw JSON, train/test CSVs).
DatasetRoot = f"{ProjectRoot}Dataset/"
# Colab alternative (needs the Drive mount cell above to be enabled):
# DatasetRoot = "/content/drive/MyDrive/"

In [3]:
import pandas as pd
import regex as re
import requests
import json
from bs4 import BeautifulSoup
import logging
from transformers import T5ForConditionalGeneration, T5Tokenizer

# viz
import altair as alt

# Raise the threshold of the "transformers" logger to ERROR so that
# lower-severity messages (notices, warnings) do not clutter the cell outputs.
logging.getLogger("transformers").setLevel(logging.ERROR)

#### Helper Methods

In [4]:
# Get raw dataset from wikipedia
def get_data(page_title, based_url, timeout=30):
    """Download a page, chunk it into paragraphs and persist them as JSON.

    Args:
        page_title: Page slug appended to ``based_url`` to form the request URL.
        based_url: URL prefix the slug is appended to.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        dict mapping a consecutive integer paragraph id (starting at 0) to the
        paragraph text with newlines and ``[n]`` citation markers removed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.

    Side effects:
        Writes the returned dict to ``DatasetRoot + 'raw_knowledge.json'``.
    """
    # build full url and get response
    url = based_url + page_title
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of silently parsing an error page
    # into an empty or meaningless dataset.
    response.raise_for_status()

    # parse text and chunk it by paragraphs
    soup = BeautifulSoup(response.text, 'html.parser')
    text_list = {}
    for paragraph in soup.find_all('p'):
        # Drop newlines and numeric citation markers such as "[12]".
        cleaned = re.sub(r'\[\d+]', '', paragraph.text.replace('\n', ''))
        # Skip paragraphs with no real content (empty, whitespace-only or
        # citation-only); they would otherwise be summarized into noise.
        if cleaned.strip():
            text_list[len(text_list)] = cleaned

    # Write JSON output to a file
    with open(DatasetRoot + 'raw_knowledge.json', 'w') as f:
        json.dump(text_list, f, indent=4)

    return text_list

# Loaded (model, tokenizer) pairs keyed by checkpoint name, so the weights are
# loaded once per kernel session instead of once per summarized paragraph.
_SUMMARIZER_CACHE = {}


def _load_summarizer(model_name='t5-small'):
    """Return the (model, tokenizer) pair for ``model_name``, loading it on first use."""
    if model_name not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[model_name] = (
            T5ForConditionalGeneration.from_pretrained(model_name),
            T5Tokenizer.from_pretrained(model_name),
        )
    return _SUMMARIZER_CACHE[model_name]


# summarize paragraph in shorter form
def generate_meaning(text):
    """Summarize one paragraph with the ``t5-small`` checkpoint.

    Args:
        text: Raw paragraph text.

    Returns:
        The decoded summary string (special tokens stripped), generated with
        4-beam search and capped at 150 output tokens.
    """
    # init T5 model and tokenizer (cached after the first call)
    model, tokenizer = _load_summarizer()

    # tokenize raw text; "summarize: " is the task prefix T5 expects.
    # Truncate over-long paragraphs, mirroring what text2question does for its
    # own input, so the model never receives more than 512 tokens.
    input_tokens = tokenizer.encode(
        "summarize: " + text,
        max_length=512,
        truncation=True,
        return_tensors='pt',
    )

    # Generate the main meaning/summary using the T5 model
    output = model.generate(input_tokens, max_length=150, num_beams=4, early_stopping=True)

    # decode summary
    return tokenizer.decode(output[0], skip_special_tokens=True)

# summarize entire raw data
def summarize_data(text_list):
    """Summarize every paragraph and return the summaries in a new dict.

    The input mapping is left untouched, so the raw paragraphs stay available
    and re-running the calling cell always starts from the same data.

    Args:
        text_list: dict mapping paragraph id to raw paragraph text.

    Returns:
        A new dict with the same keys (in the same order) mapping each
        paragraph id to the summary produced by ``generate_meaning``.
    """
    return {
        para_id: generate_meaning(raw_text)
        for para_id, raw_text in text_list.items()
    }

In [5]:
# set no. of queries to generate
NUM_QUESTIONS_GEN = 5

# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# loaded once per kernel session instead of once per paragraph.
_DOC2QUERY_CACHE = {}


def _load_doc2query(model_name):
    """Return the (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _DOC2QUERY_CACHE:
        _DOC2QUERY_CACHE[model_name] = (
            T5Tokenizer.from_pretrained(model_name),
            T5ForConditionalGeneration.from_pretrained(model_name),
        )
    return _DOC2QUERY_CACHE[model_name]


# generate questions from a paragraph
def text2question(text):
    """Generate ``NUM_QUESTIONS_GEN`` questions for one paragraph.

    Uses the ``doc2query/all-with_prefix-t5-base-v1`` checkpoint with the
    ``answer2question`` prefix. Generation samples with ``top_p=0.95``, so the
    questions can differ between runs.

    Args:
        text: Context paragraph the questions should be answerable from.

    Returns:
        dict with keys ``'text'`` (the input paragraph) and ``'question'``
        (list of generated question strings).
    """
    model_name = 'doc2query/all-with_prefix-t5-base-v1'
    tokenizer, model = _load_doc2query(model_name)
    prefix = "answer2question"

    # Input is cut to 384 tokens; each generated question is capped at 64.
    input_ids = tokenizer.encode(
        prefix + ": " + text,
        max_length=384,
        truncation=True,
        return_tensors='pt',
    )
    outputs = model.generate(
        input_ids=input_ids,
        max_length=64,
        do_sample=True,
        top_p=0.95,
        num_return_sequences=NUM_QUESTIONS_GEN)

    questions = [
        tokenizer.decode(sequence, skip_special_tokens=True)
        for sequence in outputs
    ]
    return {'text': text, 'question': questions}

# generate question-context set from documents:
def query_context_dataset(text_list):
    """Build the question-context set for every paragraph, in a new dict.

    The input mapping is left untouched, so the paragraph texts passed in are
    still plain strings afterwards and earlier cells can be re-run safely.

    Args:
        text_list: dict mapping paragraph id to paragraph text.

    Returns:
        A new dict with the same keys (in the same order) mapping each
        paragraph id to the dict returned by ``text2question`` for its text.
    """
    return {
        para_id: text2question(raw_text)
        for para_id, raw_text in text_list.items()
    }

# split question-context dataset into train/test set and store in csv
def split_store_dataset(list_t2q):
    """Split each paragraph's questions 80/20 into train/test and save both as CSV.

    Args:
        list_t2q: dict mapping paragraph id to a dict with keys ``'text'``
            (the context) and ``'question'`` (list of questions).

    Returns:
        ``(train_df, test_df)`` dataframes with columns ``raw_para_id``,
        ``paragraph`` and ``question``.

    Side effects:
        Writes ``q_a_trainset.csv`` and ``q_a_testset.csv`` under ``DatasetRoot``.
    """
    columns = ['raw_para_id', 'paragraph', 'question']
    train_rows, test_rows = [], []

    # The split is done per paragraph: the first 80% (rounded) of its questions
    # go to the train rows, the remainder to the test rows.
    for para_id, entry in list_t2q.items():
        questions = entry['question']
        cutoff = round(len(questions) * 0.8)
        train_rows.extend([para_id, entry['text'], q] for q in questions[:cutoff])
        test_rows.extend([para_id, entry['text'], q] for q in questions[cutoff:])

    frames = []
    for rows, filename in ((train_rows, 'q_a_trainset.csv'), (test_rows, 'q_a_testset.csv')):
        frame = pd.DataFrame(rows, columns=columns)
        frame.to_csv(DatasetRoot + filename, index=False)
        frames.append(frame)

    return frames[0], frames[1]

#### Data Collection

In [6]:
# Wikipedia article to scrape: the page slug and the base URL it is appended to
# (get_data builds the request URL as based_url + page_title).
page_title = "Data_science"
based_url="https://en.wikipedia.org/wiki/"

In [7]:
# Fetch the raw HTML, convert it to text and chunk it by paragraph.
# The result maps paragraph index -> text; get_data also writes it to
# raw_knowledge.json under DatasetRoot.
text_list = get_data(page_title, based_url)

In [8]:
# Snapshot the raw paragraphs into a dataframe (columns: id, text) so their
# lengths can later be compared with the summaries in the visualization cell.
df_text_temp = pd.DataFrame(list(text_list.items()), columns=['id', 'text'])

#### Data Preprocessing

In [9]:
# Summarize every raw paragraph; the result maps paragraph id -> summary text.
summarized_text_list = summarize_data(text_list)

In [10]:
# Snapshot the summaries into a dataframe (columns: id, summary) for the
# length-comparison visualization.
df_summary_temp = pd.DataFrame(list(summarized_text_list.items()), columns=['id', 'summary'])

In [11]:
# visualize the compression of raw text into summary
# Join raw paragraphs and summaries on paragraph id and attach character counts.
df_merged_viz = pd.merge(df_text_temp, df_summary_temp, on='id').assign(
    text_length=lambda frame: frame['text'].map(len),
    summary_length=lambda frame: frame['summary'].map(len),
)

# Both panels share one Y scale so the two areas are directly comparable.
longest_doc = df_merged_viz[['text_length', 'summary_length']].max().max()
y_scale = alt.Scale(domain=[0, longest_doc])


def _length_area_chart(length_column, chart_title):
    """Area chart of ``length_column`` per document id, drawn on the shared Y scale."""
    return alt.Chart(df_merged_viz, width=600, height=200, title=chart_title).mark_area().encode(
        x=alt.X('id', title='Document ID'),
        y=alt.Y(length_column, scale=y_scale, title='Doc length (# chars)'),
    )


upper = _length_area_chart('text_length', 'Original Text Lengths')
lower = _length_area_chart('summary_length', 'Summarized Text Lengths')

# Stack the two panels vertically; as the last expression it is rendered as the cell output.
upper & lower

#### Query Generation

In [12]:
# Create the question-context pair set: for each paragraph id, a dict holding
# the summarized text ('text') and the questions generated from it ('question').
q_c_set = query_context_dataset(summarized_text_list)

In [13]:
# Spot-check the questions generated for a single paragraph (id 8, chosen as an
# arbitrary example).
q_c_set[8]

{'text': 'the term "data science" dates back to 1974, when Peter Naur proposed it as an alternative name to computer science. in 1997, C. F. Jeff Wu suggested that statistics should be renamed data science.',
 'question': ['What is data science?',
  'Why does a scientist use data science as the name for statistics?',
  'Who introduced "data science" in science?',
  'When did the term "data science" first become the name for statistics, and then when did it change?',
  'who coined the word data science?']}

#### Dataset split

In [14]:
# For every paragraph (context), put 80% of its questions in the train set and
# 20% in the test set. The call also writes q_a_trainset.csv and
# q_a_testset.csv under DatasetRoot.
train_df, test_df = split_store_dataset(q_c_set)

In [15]:
# Preview the first rows of the train split.
train_df.head()

Unnamed: 0,raw_para_id,paragraph,question
0,0,data science is an interdisciplinary academic ...,What is the basic concept for data science?
1,0,data science is an interdisciplinary academic ...,what is data science?
2,0,data science is an interdisciplinary academic ...,which is data science?
3,0,data science is an interdisciplinary academic ...,Is data science a unified field or a sub-field...
4,1,data science is multifaceted and can be descri...,Can anyone explain the different aspects of da...


In [16]:
# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,raw_para_id,paragraph,question
0,0,data science is an interdisciplinary academic ...,What exactly are the three key concepts of dat...
1,1,data science is multifaceted and can be descri...,what is the difference between data science an...
2,2,"data science is a concept to unify statistics,...",Data science how to do data?
3,3,data scientist creates programming code and co...,What do data scientists do?
4,4,the field includes preparing data for analysis...,what are the majors in data science?


In [17]:
# Number of (paragraph, question) rows in the train split.
print(f"Length of Train set: {len(train_df)}")

Length of Train set: 96


In [18]:
# Number of (paragraph, question) rows in the test split.
# The label previously said "Train set" although the value printed is len(test_df).
print(f"Length of Test set: {len(test_df)}")

Length of Train set: 24
