In [70]:
# write a scraper that reads the main body of a forum post and returns the text of the post
import re
import pandas as pd 
import requests
from bs4 import BeautifulSoup
import numpy as np


def find_answer_nodes(soup):
    """
    Extract the answer html between the question and the next question

    Args:
        soup: The soup of the question

    Returns:
        The formatted html of the answer
    
    """
    position = soup
    answer_nodes = []
    while True:
        try:
            position = position.find_next_sibling()
            if 'start' in position.attrs:
                break

            answer_nodes.append(position.decode_contents())
        except AttributeError:
            break

    return ''.join(answer_nodes).strip()


def extract_qa(item):
    """
    Extract the question and answer by an answer soup

    Args: 
        item: The soup of the qa post 

    Returns:
        A dictionary of the question and answer pairs 
    """
    title_tmp = item.select_one('li').decode_contents()
    title = title_tmp.replace('<code>', '\'').replace('</code>', '\'')

    # if 'What is the difference between tensor rank and shape?' in title:
    #     print(111)

    # Get the answer content
    answer_inner_html = find_answer_nodes(item)  # Get original html

    # Replace the code blocks
    answer_inner_html = re.sub(r'<pre><code( class="[\w-]+")?>([\s\S]*?)</code></pre>', r'```\2```', answer_inner_html)
    answer_inner_html = answer_inner_html.replace('<code>', '\'').replace('</code>', '\'')

    # Get content
    answer_soup = BeautifulSoup(answer_inner_html, "html.parser")
    answer = answer_soup.text

    return {'title': title, 'answer': answer.strip()}


def get_qa_post(post_url):
    """
    Reads the main body of a forum post and returns the text of the post.
    
    Args:
        post_url: The URL of the forum post.
    
    Returns:
        The text of the forum post.
    """

    # Read the HTML of the forum post
    response = requests.get(post_url)
    soup = BeautifulSoup(response.text, "html.parser")

    items = soup.find_all('ol')

    # Get the valid question titles
    QAs = []
    for i, item in enumerate(items):
        # Only get question pairs
        if i == 0 or 'start' in item.attrs:
            QAs.append(extract_qa(item))

    return QAs



In [3]:
# # get the forum post for 4 
# url = "https://forums.fast.ai/t/fastbook-chapter-4-questionnaire-solutions-wiki/67253"
# qa_dict = get_qa_post(url)
# # print(qa_dict)

# pd.DataFrame(qa_dict).to_csv('fastai-1.csv', index=False)

In [6]:
# load the csv file "forum pages" into a df and then loop through the pages and scrape the questions and answers
forum_urls_df = pd.read_csv('fastai-solutions-forum-pages.csv') 
forum_urls_df.head()

Unnamed: 0,Chapter,URL
0,1,https://forums.fast.ai/t/fastbook-chapter-1-qu...
1,2,https://forums.fast.ai/t/fastbook-chapter-2-qu...
2,3,https://forums.fast.ai/t/fastbook-chapter-3-qu...
3,4,https://forums.fast.ai/t/fastbook-chapter-4-qu...
4,5,https://forums.fast.ai/t/fastbook-chapter-5-qu...


In [17]:

# loop through the pages and scrape the questions and answers
qa_dict = []
for i, row in forum_urls_df.iterrows():
    # print(i, row['URL'])

    # if it contains nan or is empty or is not a url, skip
    if not pd.isna(row['URL']) and row['URL'].startswith('https'):
        # print(row['URL'])
        qa_dict.extend(get_qa_post(row['URL']))

# save the questions and answers to a csv file
qadict = pd.DataFrame(qa_dict)
qadict.to_csv('fastai-2.csv', index=False)


0 https://forums.fast.ai/t/fastbook-chapter-1-questionnaire-solutions-wiki/65647
https://forums.fast.ai/t/fastbook-chapter-1-questionnaire-solutions-wiki/65647
1 https://forums.fast.ai/t/fastbook-chapter-2-questionnaire-solutions-wiki/66392
https://forums.fast.ai/t/fastbook-chapter-2-questionnaire-solutions-wiki/66392
2 https://forums.fast.ai/t/fastbook-chapter-3-questionnaire-solutions-wiki/68042
https://forums.fast.ai/t/fastbook-chapter-3-questionnaire-solutions-wiki/68042
3 https://forums.fast.ai/t/fastbook-chapter-4-questionnaire-solutions-wiki/67253
https://forums.fast.ai/t/fastbook-chapter-4-questionnaire-solutions-wiki/67253
4 https://forums.fast.ai/t/fastbook-chapter-5-questionnaire-solutions-wiki/69301
https://forums.fast.ai/t/fastbook-chapter-5-questionnaire-solutions-wiki/69301
5 https://forums.fast.ai/t/fastbook-chapter-6-questionnaire-solutions-wiki/69922
https://forums.fast.ai/t/fastbook-chapter-6-questionnaire-solutions-wiki/69922
6 nan
7 https://forums.fast.ai/t/fastboo



11 https://forums.fast.ai/t/fastbook-chapter-12-questionnaire-wiki/70516
https://forums.fast.ai/t/fastbook-chapter-12-questionnaire-wiki/70516
12 nan
13 nan
14 nan
15 nan
16 nan
17 nan
18 nan
19 nan


In [20]:
# print(qadict.head())
print(len(qadict))

297


In [22]:
# open the csv file, remove empty lines, and ensure every line has a question and answer, and ensure answers end with a '.' or ' 'ArithmeticError
qa_df = pd.read_csv('fastai-qa-cropped.csv')
qa_df = qa_df.dropna()
# if an answer does not end with a '.' or ' ' or '`', add a '.'
qa_df['answer'] = qa_df['answer'].apply(lambda x: x if x.endswith('.') or x.endswith(' ') or x.endswith('`') else x + '.')
# save the questions and answers to a csv file
qa_df.to_csv('fastai-qa-cleaned.csv', index=False)
qa_df.head()

Unnamed: 0,question,answer
0,"Based on the book of the same name, what are t...",- A set of processing units\n- A state of acti...
1,What were the two theoretical misunderstanding...,"In 1969, Marvin Minsky and Seymour Papert demo..."
2,What is a GPU?,GPU stands for Graphics Processing Unit (also ...
6,Why is it hard to use a traditional computer p...,"For us humans, it is easy to identify images i..."
7,What term do we normally use in deep learning ...,We instead use the term parameters. In deep le...


In [61]:
# create a new df with the questions and answers
gpt_df = pd.DataFrame()

# add new columns to the df
gpt_df['question'] = ''
gpt_df['answer'] = ''
gpt_df['question_id'] = 0

question_i = 0

# set the initial entry to empty values and question_id to 0
gpt_df.loc[question_i, 'question'] = ''
gpt_df.loc[question_i, 'answer'] = ''
gpt_df.loc[question_i, 'question_id'] = 0

with open('gpt-chat-log-cleaned.txt', 'r') as f:
    lines = f.readlines()

    for i, line in enumerate(lines):
        # if it starts with "Q: ", then it is a question
        if line.startswith("Q: "):
            # add a new entry to the df with the question 
            gpt_df = gpt_df.append({'question': line[3:].strip(), 'answer': '', 'question_id': question_i}, ignore_index=True)
            
        # otherwise, append it to the answer for the current question
            question_i += 1
            # print(question_i)
        else:
            # add the answer to the df at the index question_i 
            # print(question_i)
            # print(gpt_df.iloc[question_i]['answer'])
            gpt_df.at[question_i, 'answer'] += line.strip() + ' '


gpt_df.head()
print(len(gpt_df))

# save to csv 
# gpt_df.to_csv('gpt-qa.csv', index=False)

# drop the question_id column
gpt_df = gpt_df.drop(columns=['question_id'])
# remove rows with empty questions
gpt_df = gpt_df[gpt_df['question'] != '']
gpt_df = gpt_df[gpt_df['answer'] != '']
gpt_df = gpt_df.dropna()

# save to csv without the indices 
gpt_df.to_csv('gpt-qa-df.csv', index=False)


57


  gpt_df = gpt_df.append({'question': line[3:].strip(), 'answer': '', 'question_id': question_i}, ignore_index=True)
  gpt_df = gpt_df.append({'question': line[3:].strip(), 'answer': '', 'question_id': question_i}, ignore_index=True)
  gpt_df = gpt_df.append({'question': line[3:].strip(), 'answer': '', 'question_id': question_i}, ignore_index=True)
  gpt_df = gpt_df.append({'question': line[3:].strip(), 'answer': '', 'question_id': question_i}, ignore_index=True)
  gpt_df = gpt_df.append({'question': line[3:].strip(), 'answer': '', 'question_id': question_i}, ignore_index=True)
  gpt_df = gpt_df.append({'question': line[3:].strip(), 'answer': '', 'question_id': question_i}, ignore_index=True)
  gpt_df = gpt_df.append({'question': line[3:].strip(), 'answer': '', 'question_id': question_i}, ignore_index=True)
  gpt_df = gpt_df.append({'question': line[3:].strip(), 'answer': '', 'question_id': question_i}, ignore_index=True)
  gpt_df = gpt_df.append({'question': line[3:].strip(), 'answer'

# OpenAI Fine-tuning API 

ok now let's combine these two and submit to the openai finetuning api 
-  first i'll upload the GPT chat one 
-  then i'll upload the fastai one

## Now let's format it for finetuning API 

In [83]:
# load the csv file for more editing
gpt_loaded_df = pd.read_csv('gpt-qa-df.csv')
print(gpt_loaded_df.head())

# - Based on your file extension, your file is formatted as a CSV file
# - Your file contains 56 prompt-completion pairs. In general, we recommend having at least a few hundred examples. We've found that performance tends to linearly increase for every doubling of the number of examples
# - Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
# - All completions end with suffix ` `
#   WARNING: Some of your completions contain the suffix ` ` more than once. We suggest that you review your completions and add a unique ending
# - The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details

# add a space to the end of each prompt
# Each prompt should end with a fixed separator to inform the model when the prompt ends and the completion begins. A simple separator which generally works well is \n\n###\n\n. The separator should not appear elsewhere in any prompt.

gpt_loaded_df['prompt'] = gpt_loaded_df['prompt'].apply(lambda x: x + '\n\n###\n\n')

# start each completion with a space 
# Each completion should start with a whitespace due to our tokenization, which tokenizes most words with a preceding whitespace.
# Each completion should end with a fixed stop sequence to inform the model when the completion ends. A stop sequence could be \n, ###, or any other token that does not appear in any completion.

gpt_loaded_df['completion'] = gpt_loaded_df['completion'].apply(lambda x: ' ' + x + '\n<+++>\n')


# save csv 

# gpt_loaded_df.to_csv('gpt-qa-formatted.csv', index=False)

# split the df into two dfs, one with the first 80% of data and one with the last 20% of data
gpt_train_df, gpt_valid_df = np.split(gpt_loaded_df.sample(frac=1), [int(.8*len(gpt_loaded_df))])

# save the train and test dfs to csv files
gpt_train_df.to_csv('gpt-qa-train.csv', index=False)
gpt_valid_df.to_csv('gpt-qa-valid.csv', index=False)


                                              prompt  \
0  whats the difference between ; and && in terminal   
1                            how to auto jump in cli   
2  how to list all files even hidden files in ter...   
3         how to list files in human readable format   
4               how to list details of files with ls   

                                          completion  
0  In the terminal, `;` and `&&` are used to sepa...  
1  To enable auto jumping in the CLI, you can use...  
2  To list all files, including hidden files, in ...  
3  To list files in human readable format in the ...  
4  To list details about files in the terminal us...  


# How to format the API call to the fine tuned model 

### format the prompt from the user before sending it 
After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string 

`\n\n###\n\n` 

for the model to start generating completions, rather than continuing with the prompt. 

### param in API call for stop sequence 
Make sure to include 

`stop=[" \n<+++>\n"]` 

so that the generated texts ends at the expected place.

### Format the output CSV file in terminal into a JSONL: 

openai tools fine_tunes.prepare_data -f <LOCAL_FILE>


In [78]:
filename="gpt-qa-valid.csv"

In [91]:
# !y| openai tools fine_tunes.prepare_data -f {filename}

# Create the fine-tuned model 
- in CLI
- using the files we created earlier

In [92]:
### in terminal: 

openai api fine_tunes.create \
-t gpt-qa-train_prepared.jsonl \
-v gpt-qa-valid_prepared.jsonl \
-m "davinci" \
--suffix "gpt-ml-qa-pairs-A"

SyntaxError: invalid syntax (819037427.py, line 3)


We attach a result file to each job once it has been completed. This results file ID will be listed when you retrieve a fine-tune, and also when you look at the events on a fine-tune. You can download these files.

The _results.csv file contains a row for each training step, where a step refers to one forward and backward pass on a batch of data. 

openai api fine_tunes.results -i <YOUR_FINE_TUNE_JOB_ID>


### Terminal outputs 


(ml) SF-mbp:gpt-fine-tuning stephen$ openai api fine_tunes.create \
> -t gpt-qa-train_prepared.jsonl \
> -v gpt-qa-valid_prepared.jsonl
-m "davinci" \
--suffix "gpt-ml-qa-pairs-A"
Upload progress: 100%|███████████████████████| 50.8k/50.8k [00:00<00:00, 22.6Mit/s]
Uploaded file from gpt-qa-train_prepared.jsonl: file-5fYU80lXy5FQgfmfVU56k8nb
Upload progress: 100%|███████████████████████| 12.2k/12.2k [00:00<00:00, 8.80Mit/s]
Uploaded file from gpt-qa-valid_prepared.jsonl: file-99tOIqCaPbNwL90bI1K1yq23
Created fine-tune: ft-zHk8gGvZdaBNRVheXq5fZFqt
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2022-11-20 00:36:48] Created fine-tune: ft-zHk8gGvZdaBNRVheXq5fZFqt
[2022-11-20 00:36:52] Fine-tune costs $0.14
[2022-11-20 00:36:52] Fine-tune enqueued. Queue number: 0
[2022-11-20 00:36:54] Fine-tune started
[2022-11-20 00:37:54] Completed epoch 1/4
[2022-11-20 00:38:06] Completed epoch 2/4
[2022-11-20 00:38:18] Completed epoch 3/4
[2022-11-20 00:38:30] Completed epoch 4/4
[2022-11-20 00:38:46] Uploaded model: curie:ft-personal-2022-11-20-08-38-46
[2022-11-20 00:38:47] Uploaded result file: file-EZy5duulRUswzYdqj6LuW76a
[2022-11-20 00:38:47] Fine-tune succeeded

Job complete! Status: succeeded 🎉
Try out your fine-tuned model:

openai api completions.create -m curie:ft-personal-2022-11-20-08-38-46 -p <YOUR_PROMPT>
(ml) SF-mbp:gpt-fine-tuning stephen$ -m "davinci" \
> --suffix "gpt-ml-qa-pairs-A"
-bash: -m: command not found

### Trying again 

$ openai api fine_tunes.create \
> -t gpt-qa-train_prepared.jsonl \
> -v gpt-qa-valid_prepared.jsonl \
> -m "davinci" \
> --suffix "gpt-ml-qa-pairs-A"
Found potentially duplicated files with name 'gpt-qa-train_prepared.jsonl', purpose 'fine-tune' and size 50782 bytes
file-5fYU80lXy5FQgfmfVU56k8nb
Enter file ID to reuse an already uploaded file, or an empty string to upload this file anyway:
Upload progress: 100%|███████████████████████████████| 50.8k/50.8k [00:00<00:00, 21.8Mit/s]
Uploaded file from gpt-qa-train_prepared.jsonl: file-XCo6Sza9cVyfIWjPmoatpK3s
Found potentially duplicated files with name 'gpt-qa-valid_prepared.jsonl', purpose 'fine-tune' and size 12227 bytes
file-99tOIqCaPbNwL90bI1K1yq23
Enter file ID to reuse an already uploaded file, or an empty string to upload this file anyway:
Upload progress: 100%|███████████████████████████████| 12.2k/12.2k [00:00<00:00, 6.49Mit/s]
Uploaded file from gpt-qa-valid_prepared.jsonl: file-HbOaJkGRRu4hp4C8DQ4PtZ0t
Created fine-tune: ft-xNzKl8ORDdASWIALy5WoSLcY
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2022-11-20 00:45:39] Created fine-tune: ft-xNzKl8ORDdASWIALy5WoSLcY
[2022-11-20 00:45:46] Fine-tune costs $1.42
[2022-11-20 00:45:47] Fine-tune enqueued. Queue number: 0
[2022-11-20 00:45:48] Fine-tune started
[2022-11-20 00:47:30] Completed epoch 1/4
[2022-11-20 00:47:50] Completed epoch 2/4
[2022-11-20 00:48:10] Completed epoch 3/4
[2022-11-20 00:48:30] Completed epoch 4/4
[2022-11-20 00:49:22] Uploaded model: davinci:ft-sandbox:gpt-ml-qa-pairs-a-2022-11-20-08-49-22
[2022-11-20 00:49:23] Uploaded result file: file-rGIPU3ndPAZXVRSBT4xhjDKy
[2022-11-20 00:49:23] Fine-tune succeeded

Job complete! Status: succeeded 🎉
Try out your fine-tuned model:

openai api completions.create -m davinci:ft-sandbox:gpt-ml-qa-pairs-a-2022-11-20-08-49-22 -p <YOUR_PROMPT>
