# Extract raw data into prompt-completion pairs

INPUTS: raw data of some type

OUTPUTS: formatted prompt-completion dataframes, ready to be fed to the fine-tuning scripts in the second notebook

------

This notebook contains routines that extract prompt-completion pairs
from two sources:

1) extracts QA pairs from the fastai forums
2) formats a log of a GPT QA Chat to a prompt-completion sequence

### 1) Custom web scraper for fast.ai forums 

In [None]:
# write a scraper that reads the main body of a forum post and returns the text of the post
import re
import pandas as pd 
import requests
from bs4 import BeautifulSoup
import numpy as np


def find_answer_nodes(soup):
    """
    Collect the html that follows a question element, up to the next question.

    Walks the siblings after ``soup`` and concatenates their inner html. The walk
    stops at the first sibling that carries a 'start' attribute, or when there are
    no further siblings.

    Args:
        soup: The soup of the question

    Returns:
        The concatenated inner html of the answer nodes, stripped of surrounding whitespace

    """
    collected = []
    try:
        node = soup.find_next_sibling()
        while 'start' not in node.attrs:
            collected.append(node.decode_contents())
            node = node.find_next_sibling()
    except AttributeError:
        # Running past the last sibling yields None, whose attribute access
        # lands here and ends the walk.
        pass

    return ''.join(collected).strip()


def extract_qa(item):
    """
    Build one question/answer record from a question element.

    The question is taken from the first ``<li>`` inside ``item``; the answer is the
    html returned by ``find_answer_nodes(item)``. Both are reduced to plain text:
    inline ``<code>`` tags become single quotes, ``<pre><code>`` blocks in the answer
    become triple-backtick fences, and the remaining markup is dropped by parsing
    the html and reading its text.

    Args: 
        item: The soup of the qa post 

    Returns:
        A dictionary with the keys 'title' (question text) and 'answer' (answer text),
        both stripped of surrounding whitespace
    """
    # Question: mark inline code with quotes, then convert to plain text the same
    # way the answer is handled below, so entities and nested tags do not leak
    # into the title as raw html.
    title_html = item.select_one('li').decode_contents()
    title_html = title_html.replace('<code>', '\'').replace('</code>', '\'')
    title = BeautifulSoup(title_html, "html.parser").text

    # Answer: original html of everything between this question and the next one
    answer_inner_html = find_answer_nodes(item)

    # Replace the code blocks with fenced blocks before the tags are stripped
    answer_inner_html = re.sub(r'<pre><code( class="[\w-]+")?>([\s\S]*?)</code></pre>', r'```\2```', answer_inner_html)
    answer_inner_html = answer_inner_html.replace('<code>', '\'').replace('</code>', '\'')

    # Drop the remaining markup
    answer = BeautifulSoup(answer_inner_html, "html.parser").text

    return {'title': title.strip(), 'answer': answer.strip()}


def get_qa_post(post_url, timeout=30):
    """
    Download a forum post and extract its question/answer pairs.
    
    Args:
        post_url: The URL of the forum post.
        timeout: Seconds to wait for the server before giving up.
    
    Returns:
        A list of dictionaries as produced by ``extract_qa`` (keys 'title' and
        'answer'), one per question list found in the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the server
            answers with an HTTP error status.
    """

    # Read the HTML of the forum post; fail loudly instead of parsing an error page
    response = requests.get(post_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Only the first <ol> and those carrying a 'start' attribute are treated as
    # questions; any other ordered list on the page is skipped.
    return [
        extract_qa(item)
        for i, item in enumerate(soup.find_all('ol'))
        if i == 0 or 'start' in item.attrs
    ]



In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# # get the forum post for chapter 4
# url = "https://forums.fast.ai/t/fastbook-chapter-4-questionnaire-solutions-wiki/67253"
# qa_dict = get_qa_post(url)
# # print(qa_dict)
# pd.DataFrame(qa_dict).to_csv('fastai-1.csv', index=False)

In [None]:

# Loop through the listed forum pages and scrape the questions and answers.
# NOTE(review): forum_urls_df is not defined in any cell shown above; it has to be
# loaded (with a 'URL' column) before this cell for Restart & Run All to work.
qa_dict = []  # list of {'title': ..., 'answer': ...} records
for url in forum_urls_df['URL']:
    # Skip missing values (NaN), non-string cells, and anything that is not an https link
    if isinstance(url, str) and url.startswith('https'):
        qa_dict.extend(get_qa_post(url))

# save the questions and answers to a csv file
qadict = pd.DataFrame(qa_dict)
qadict.to_csv('fastai-2.csv', index=False)


0 https://forums.fast.ai/t/fastbook-chapter-1-questionnaire-solutions-wiki/65647
https://forums.fast.ai/t/fastbook-chapter-1-questionnaire-solutions-wiki/65647
1 https://forums.fast.ai/t/fastbook-chapter-2-questionnaire-solutions-wiki/66392
https://forums.fast.ai/t/fastbook-chapter-2-questionnaire-solutions-wiki/66392
2 https://forums.fast.ai/t/fastbook-chapter-3-questionnaire-solutions-wiki/68042
https://forums.fast.ai/t/fastbook-chapter-3-questionnaire-solutions-wiki/68042
3 https://forums.fast.ai/t/fastbook-chapter-4-questionnaire-solutions-wiki/67253
https://forums.fast.ai/t/fastbook-chapter-4-questionnaire-solutions-wiki/67253
4 https://forums.fast.ai/t/fastbook-chapter-5-questionnaire-solutions-wiki/69301
https://forums.fast.ai/t/fastbook-chapter-5-questionnaire-solutions-wiki/69301
5 https://forums.fast.ai/t/fastbook-chapter-6-questionnaire-solutions-wiki/69922
https://forums.fast.ai/t/fastbook-chapter-6-questionnaire-solutions-wiki/69922
6 nan
7 https://forums.fast.ai/t/fastboo



11 https://forums.fast.ai/t/fastbook-chapter-12-questionnaire-wiki/70516
https://forums.fast.ai/t/fastbook-chapter-12-questionnaire-wiki/70516
12 nan
13 nan
14 nan
15 nan
16 nan
17 nan
18 nan
19 nan


In [None]:
# Sanity check: number of QA records collected into qadict
# print(qadict.head())
print(len(qadict))

297


## Format extracted content

In [None]:
# Load the cropped QA csv and drop every row that has a missing value, so each
# remaining row has both a question and an answer.
qa_df = pd.read_csv('fastai-qa-cropped.csv').dropna()
# Terminate the answers: anything not already ending in '.', ' ' or '`' gets a '.'
qa_df['answer'] = qa_df['answer'].apply(
    lambda text: text if text.endswith(('.', ' ', '`')) else text + '.'
)
# Save the cleaned questions and answers (without the index) and preview them
qa_df.to_csv('fastai-qa-cleaned.csv', index=False)
qa_df.head()

Unnamed: 0,question,answer
0,"Based on the book of the same name, what are t...",- A set of processing units\n- A state of acti...
1,What were the two theoretical misunderstanding...,"In 1969, Marvin Minsky and Seymour Papert demo..."
2,What is a GPU?,GPU stands for Graphics Processing Unit (also ...
6,Why is it hard to use a traditional computer p...,"For us humans, it is easy to identify images i..."
7,What term do we normally use in deep learning ...,We instead use the term parameters. In deep le...


### 2) Extract qa pairs from a raw GPT txt chat log 

In [None]:
# Parse the cleaned GPT chat log into prompt/completion rows.
#
# Every line starting with "Q: " opens a new record whose prompt is the rest of the
# line; every other line is appended to the completion of the most recent record.
# The records are collected in a plain list and turned into a DataFrame once,
# instead of growing the frame row by row with DataFrame.append (removed in pandas 2.0).

# Placeholder record: receives any text that appears before the first question.
# Its prompt stays empty, so it is filtered out below.
records = [{'prompt': '', 'completion': '', 'question_id': 0}]

with open('gpt-chat-log-cleaned.txt', 'r') as f:
    for line in f:
        if line.startswith("Q: "):
            # New question; question_id counts questions from 0
            records.append({
                'prompt': line[3:].strip(),
                'completion': '',
                'question_id': len(records) - 1,
            })
        else:
            # Answer text: join the lines with single spaces. A blank line
            # contributes a lone ' ', so a paragraph break ends up as two spaces.
            records[-1]['completion'] += line.strip() + ' '

gpt_df = pd.DataFrame(records, columns=['prompt', 'completion', 'question_id'])
# Number of parsed rows (includes the placeholder record)
print(len(gpt_df))

# drop the question_id column
gpt_df = gpt_df.drop(columns=['question_id'])
# remove rows with an empty question or an empty answer
gpt_df = gpt_df[(gpt_df['prompt'] != '') & (gpt_df['completion'] != '')].dropna()

# Turn every double space into ' \n\n' and put each code fence on its own line.
# .assign builds a new frame rather than writing into the filtered one.
gpt_df = gpt_df.assign(
    completion=gpt_df['completion']
    .str.replace('  ', ' \n\n', regex=False)
    .str.replace('```', '\n```\n', regex=False)
)

# save to csv without the indices 
gpt_df.to_csv('gpt-qa-df.csv', index=False)


57
