In [14]:
from pathlib import Path
import pandas as pd
import openai
import re
import numpy as np
from transformers import GPT2TokenizerFast
from tqdm import tqdm
tqdm.pandas()

In [15]:
# Point the OpenAI client at a key file in the current user's home directory.
# Resolving the location with Path.home() avoids hard-coding one person's absolute
# path, so the notebook runs unchanged for anyone who keeps the key file there.
openai.api_key_path = str(Path.home() / '.openai-api-key.txt')

In [16]:
# Notion can export a page (and all of its subpages) as a folder of markdown (.md) files.
# Sort the paths so the page order -- and with it the row order of every frame built
# from these lists -- is the same on every run; rglob gives no ordering guarantee.
wiki_paths = sorted(Path('fine_tuning_wikis/hydrogen_domain').rglob('*.md'))
# read_text opens and closes each file itself, so no file handles are left open.
# NOTE(review): assumes the exported files are UTF-8 encoded -- confirm against the export.
wikis = [wiki_path.read_text(encoding='utf-8') for wiki_path in wiki_paths]

In [17]:

# Module-level GPT-2 tokenizer loaded from the pretrained "gpt2" files; the helpers
# below use it to measure text length in tokens.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    """Return how many tokens the module-level `tokenizer` produces for `text`."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def reduce_long(
    long_text: str, long_text_tokens: int = 0, max_len: int = 590
) -> str:
    """
    Reduce a long text to at most `max_len` tokens, cutting at a line break where possible.

    Args:
        long_text: The text to shorten.
        long_text_tokens: Pre-computed token count of `long_text`. Any falsy value
            (the default) means "unknown", in which case the count is computed here.
        max_len: Token budget for the returned text.

    Returns:
        `long_text` unchanged when it is within budget. Otherwise the leading whole
        lines (split on newlines) collected before the running token count exceeds
        `max_len`. When no non-blank whole line fits (for example a single very long
        line), the text is hard-truncated to its first `max_len` tokens rather than
        being reduced to an empty string.
    """
    if not long_text_tokens:
        long_text_tokens = count_tokens(long_text)
    if long_text_tokens <= max_len:
        return long_text

    lines = long_text.split('\n')
    ntokens = 0
    for i, line in enumerate(lines):
        # The extra 1 budgets for the newline that joins this line to the next one.
        ntokens += 1 + count_tokens(line)
        if ntokens > max_len:
            kept = "\n".join(lines[:i])
            if kept.strip():
                return kept
            # Nothing useful fits on a line boundary: fall back to a plain token cut.
            # Re-tokenizing the decoded text is expected to stay close to `max_len`.
            token_ids = tokenizer.encode(long_text)
            return tokenizer.decode(token_ids[:max(max_len, 0)])

    # The per-line counts never exceeded the budget, so there is nothing to cut.
    return long_text

In [33]:

def split_sections(wiki_text):
    """
    Split a markdown page into its heading lines and the text between them.

    A heading is any line that starts with `#`; the returned heading strings keep
    their trailing newline. A `#` in the middle of a line (e.g. "issue #12") is
    ordinary text. A final line starting with `#` but lacking a trailing newline
    is not recognised as a heading.
    NOTE(review): lines starting with `#` inside fenced code blocks are still
    treated as headings -- confirm the exported pages do not contain any.

    Args:
        wiki_text: Full markdown text of one page.

    Returns:
        A `(headings, contents)` tuple of lists. `contents[0]` is the text before
        the first heading and `contents[i + 1]` is the stripped text that follows
        `headings[i]`, so `len(contents) == len(headings) + 1` always holds.
    """
    # With a capturing group, re.split keeps the matched heading lines, so the
    # result alternates: text, heading, text, heading, ..., text. Splitting in one
    # pass avoids the sentinel + str.replace approach, which corrupted the content
    # whenever one heading was a substring of another ("# Foo" inside "## Foo").
    parts = re.split(r"^(#.*\n)", wiki_text, flags=re.MULTILINE)
    headings = parts[1::2]
    contents = [part.strip() for part in parts[0::2]]
    return headings, contents

def build_dataframe(headings, contents, page_name, max_tokens=1500):
    """
    Build one row per non-empty section of a page.

    Args:
        headings: Heading lines of the page, in document order.
        contents: Section texts; `contents[0]` is the text before the first heading
            and is ignored, `contents[i + 1]` belongs to `headings[i]`.
        page_name: Stored unchanged in the `page_name` column of every row.
        max_tokens: Token budget per context; longer contexts go through `reduce_long`.

    Returns:
        DataFrame with columns `heading`, `content`, `page_name`, `title` (the first
        heading of the page), `context` (heading + content, shortened if needed),
        `n_tokens_before` (token count before shortening) and `tokens` (token count
        of the final context). A page without headings yields an empty frame with
        the same columns instead of raising IndexError.
    """
    columns = ['heading', 'content', 'page_name', 'title', 'context', 'n_tokens_before', 'tokens']
    if len(headings) == 0:
        # No headings means no sections and no title to take from headings[0].
        empty = pd.DataFrame({column: pd.Series(dtype=object) for column in columns})
        return empty.astype({'n_tokens_before': 'int64', 'tokens': 'int64'})

    df = pd.DataFrame({'heading': headings, 'content': contents[1:], 'page_name': page_name, 'title': headings[0]})
    # Copy after filtering so the column assignments below write to an independent
    # frame rather than a slice (avoids SettingWithCopyWarning).
    df = df[df.content != ''].copy()
    df['context'] = df['heading'] + df['content']
    df['n_tokens_before'] = df['context'].apply(count_tokens)
    # Only shorten contexts that are over budget, and hand reduce_long the count we
    # already have so the full text is not tokenized a second time.
    df['context'] = [
        reduce_long(ctx, n_tok, max_len=max_tokens) if n_tok > max_tokens else ctx
        for ctx, n_tok in zip(df['context'], df['n_tokens_before'])
    ]
    df['tokens'] = df['context'].apply(count_tokens)
    return df

In [34]:
# Turn every exported page into its own section frame, then stack the frames.
dfs = []
for path, wiki in zip(wiki_paths, wikis):
    headings, contents = split_sections(wiki)
    dfs.append(build_dataframe(headings, contents, path.name))
df = pd.concat(dfs)

In [35]:
# Summarise the combined section frame (row count, columns, non-null counts, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 1 to 6
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   heading          56 non-null     object
 1   content          56 non-null     object
 2   page_name        56 non-null     object
 3   title            56 non-null     object
 4   context          56 non-null     object
 5   n_tokens_before  56 non-null     int64 
 6   tokens           56 non-null     int64 
dtypes: int64(2), object(5)
memory usage: 3.5+ KB


In [36]:
def get_questions(context, max_tokens=257):
    """
    Ask the completion model to write a numbered list of questions about `context`.

    The prompt ends with "1." so the model continues the list; the returned text
    therefore starts right after that "1." and the caller has to put the prefix back.

    Args:
        context: Text the questions should be based on.
        max_tokens: Upper bound on the completion length; longer lists are cut off.

    Returns:
        The completion text, or "" when the request fails. The failure is printed
        so that empty results can be traced back to their cause.
    """
    try:
        response = openai.Completion.create(
            engine="davinci-instruct-beta-v3",
            prompt=f"Write questions based on the text below\n\nText: {context}\n\nQuestions:\n1.",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n\n"]
        )
        return response['choices'][0]['text']
    except Exception as e:
        # Best effort: one failed request must not abort the whole run. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through,
        # so a long run can still be stopped by hand.
        print(e)
        return ""

In [37]:
# The prompt already ends with "1.", so the completion starts after it; put the
# prefix back so every stored question list begins with its own number.
df['questions'] = '1.' + df['context'].progress_apply(get_questions)
# Show the questions generated for the first section as a spot check.
print(df['questions'].iloc[0])

100%|██████████| 56/56 [00:48<00:00,  1.16it/s]

1. What is an asset in the context of hydrogen generation?
2. What is a delivery site?
3. What is a generation unit?
4. What is an order?
5. What is a schedule?
6. What is a tariff?
7. What is a balance of plant?





In [38]:
def get_answers(row, max_tokens=257):
    """
    Ask the completion model to answer a row's questions from the row's context.

    The prompt ends with "1." so the model continues the numbered list; the returned
    text therefore starts right after that "1." and the caller has to put the prefix
    back.

    Args:
        row: Object exposing `context` and `questions` attributes (e.g. a DataFrame
            row passed by `apply(..., axis=1)`).
        max_tokens: Upper bound on the completion length. The completion is cut off
            at this limit, so sections with many questions may need a larger value
            to avoid a truncated last answer.

    Returns:
        The completion text, or "" when the request fails (the error is printed).
    """
    try:
        response = openai.Completion.create(
            engine="davinci-instruct-beta-v3",
            prompt=f"Write answer based on the text below\n\nText: {row.context}\n\nQuestions:\n{row.questions}\n\nAnswers:\n1.",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response['choices'][0]['text']
    except Exception as e:
        # Best effort: report the failure and keep processing the remaining rows.
        print(e)
        return ""


# Answer every row's questions; as with the questions, the "1." that ends the
# prompt is put back in front of the completion.
df['answers'] = "1." + df.progress_apply(get_answers, axis=1)
# Drop rows with missing values and renumber the rows 0..n-1.
df = df.dropna().reset_index(drop=True)
# Show the answers generated for the first section as a spot check.
print(df['answers'].iloc[0])

100%|██████████| 56/56 [01:08<00:00,  1.23s/it]

1. An asset in the context of hydrogen generation is an item that is involved in the generation and delivery of hydrogen.
2. A delivery site is a location that we deliver hydrogen to. A customer who purchases hydrogen may have one or more delivery sites.
3. A generation unit is a co-located grouping of assets that generate hydrogen (electrolysers and compressors). Our first site will have one electrolyser connected to one compressor, but in future, we may run multiple electrolysers, and/or multiple compressors, in parallel within a generation unit.
4. An order is a request for a volume of hydrogen to be made available for use at a specified time and location. Orders must also include the pressure that is required by the vehicle being filled as this will impact the dispensable volume of a storage unit.
5. A schedule is a list of instructions for a window of time that must be followed in order to fulfil one or more hydrogen orders within the window in an optimal way.
6. A tariff is the p




In [39]:
# Make sure the output folder exists, then save the per-section frame
# (contexts together with their generated questions and answers).
output_dir = Path('results')
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / 'hydrogen_q_and_a.csv', index=False)

In [40]:
# Summarise the frame again now that the `questions` and `answers` columns exist.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   heading          56 non-null     object
 1   content          56 non-null     object
 2   page_name        56 non-null     object
 3   title            56 non-null     object
 4   context          56 non-null     object
 5   n_tokens_before  56 non-null     int64 
 6   tokens           56 non-null     int64 
 7   questions        56 non-null     object
 8   answers          56 non-null     object
dtypes: int64(2), object(7)
memory usage: 4.1+ KB


In [41]:
# Pair the i-th question line with the i-th answer line of every section.
# zip stops at the shorter list, so a question whose answer is missing (for
# example because the completion was cut off) is left out rather than mis-paired.
qa = [
    list(zip(questions.split('\n'), answers.split('\n')))
    for questions, answers in zip(df['questions'], df['answers'])
]
qa_flat = [pair for section_pairs in qa for pair in section_pairs]
# Passing `columns` to the constructor also works when there are no pairs at all;
# assigning `.columns` afterwards fails on the resulting zero-column frame.
qadf = pd.DataFrame(qa_flat, columns=['question', 'answer'])
qadf.to_csv('results/qadf.csv', index=False)