In [1]:
from pathlib import Path
import pandas as pd
import openai
import re
import numpy as np
from transformers import GPT2TokenizerFast

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Locate the OpenAI API key file relative to the current user's home directory
# instead of a machine-specific absolute path, so the notebook is portable.
# The key is expected at ~/.openai-api-key.txt.
openai.api_key_path = str(Path.home() / '.openai-api-key.txt')

In [3]:
# Notion can export a page (and all of its subpages) as a folder of Markdown (.md) files.
# Collect every exported file under `fine_tuning_wikis`, searching subfolders too.
wiki_paths = list(Path('fine_tuning_wikis').rglob('*.md'))
# read_text() opens, reads and closes each file (a bare open().read() leaves the
# handle to the garbage collector); the explicit encoding keeps the result
# independent of the platform's default locale.
wikis = [wiki_path.read_text(encoding='utf-8') for wiki_path in wiki_paths]

In [4]:

# Tokenizer instance shared by every token count in this notebook.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def count_tokens(text: str) -> int:
    """Return how many tokens `tokenizer` produces when encoding `text`."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def _fit_words(line: str, max_len: int) -> str:
    """Return the longest space-delimited prefix of `line` that fits in `max_len` tokens."""
    kept = []
    ntokens = 0
    for word in line.split(' '):
        # Count each word together with the space that separates it from the
        # previous one, so the running total never under-counts the prefix.
        ntokens += count_tokens(' ' + word)
        if ntokens > max_len:
            break
        kept.append(word)
    return ' '.join(kept)


def reduce_long(
    long_text: str, long_text_tokens: int = 0, max_len: int = 590
) -> str:
    """
    Shorten `long_text` to at most `max_len` tokens, cutting at a line break.

    Args:
        long_text: The text to shorten.
        long_text_tokens: Token count of `long_text` if already known. Any
            falsy value (0, False, None) means "not known" and the count is
            computed with `count_tokens`.
        max_len: Token budget for the returned text.

    Returns:
        `long_text` unchanged when it already fits. Otherwise the leading
        lines whose accumulated token count (plus one per line for the
        newline) stays within `max_len`. If even the first line is too long,
        that line is cut at a word boundary instead of returning an empty
        string.
    """
    if not long_text_tokens:
        long_text_tokens = count_tokens(long_text)
    if long_text_tokens <= max_len:
        return long_text

    lines = long_text.split('\n')
    ntokens = 0
    for i, line in enumerate(lines):
        # +1 accounts for the newline that joins this line to the previous ones.
        ntokens += 1 + count_tokens(line)
        if ntokens > max_len:
            if i == 0:
                # No complete line fits: keep as many leading words as possible
                # rather than discarding the whole text.
                return _fit_words(line, max_len)
            return "\n".join(lines[:i])

    # Per-line counts never exceeded the budget; keep the text as is.
    return long_text

In [22]:

def split_sections(wiki_text):
    """
    Split a Markdown document into its headings and the text between them.

    A heading is any line that starts with `#` and ends with a newline. The
    pattern is anchored to the start of a line so that a `#` in the middle of
    a line (for example a URL fragment) is not mistaken for a heading.

    Args:
        wiki_text: Full Markdown text of one page.

    Returns:
        A `(headings, contents)` tuple. `headings` holds each heading line
        including its trailing newline. `contents` holds the stripped text
        before the first heading followed by the stripped text after each
        heading, so `len(contents) == len(headings) + 1`.
    """
    heading_re = re.compile(r"^#.*\n", re.MULTILINE)

    headings = []
    contents = []
    cursor = 0
    # Slice by match position instead of replacing heading text with a
    # sentinel: replacing by value corrupts the text when one heading is a
    # substring of another, or when the sentinel itself occurs in the page.
    for match in heading_re.finditer(wiki_text):
        contents.append(wiki_text[cursor:match.start()].strip())
        headings.append(match.group())
        cursor = match.end()
    contents.append(wiki_text[cursor:].strip())

    assert len(headings) == len(contents) - 1
    return headings, contents

def build_dataframe(headings, contents, page_name, max_tokens=1500):
    """
    Build one row per non-empty section of a page.

    Args:
        headings: Heading lines as returned by `split_sections`.
        contents: Section texts as returned by `split_sections`; the first
            entry (text before the first heading) is ignored.
        page_name: Name stored in the `page_name` column of every row.
        max_tokens: Token budget for each row's `context`.

    Returns:
        A DataFrame with columns `heading`, `content`, `page_name`, `context`
        (heading + content, shortened with `reduce_long` when over budget),
        `n_tokens_before` and `n_tokens_after`. Rows keep the positional index
        of their section, so dropped empty sections leave gaps.
    """
    df = pd.DataFrame({'heading': headings, 'content': contents[1:], 'page_name': page_name})
    # .copy() so the column assignments below write to an independent frame
    # rather than to a filtered view of the original.
    df = df[df['content'] != ''].copy()
    df['context'] = df['heading'] + df['content']
    df['n_tokens_before'] = df['context'].apply(count_tokens)
    # Only over-budget rows are shortened, and the count already computed is
    # passed along so the text is not tokenized a second time.
    df['context'] = [
        reduce_long(ctx, long_text_tokens=n, max_len=max_tokens) if n > max_tokens else ctx
        for ctx, n in zip(df['context'], df['n_tokens_before'])
    ]
    df['n_tokens_after'] = df['context'].apply(count_tokens)
    return df

In [23]:
# One frame of sections per exported page, labelled with the page's file name,
# then stacked into a single frame.
dfs = [
    build_dataframe(*split_sections(wiki), path.name)
    for wiki, path in zip(wikis, wiki_paths)
]
df = pd.concat(dfs)

In [24]:
# Display the combined per-section frame built in the previous cell.
df

Unnamed: 0,heading,content,page_name,context,n_tokens_before,n_tokens_after
1,### Ahead of time\n,- Trigger a schedule of production in staging ...,Plug demo script fd56e061d83b4731bb4122fb369a1...,### Ahead of time\n- Trigger a schedule of pro...,30,30
2,### Context\n,- NOTE: Audience are project owners - would wa...,Plug demo script fd56e061d83b4731bb4122fb369a1...,### Context\n- NOTE: Audience are project owne...,110,110
3,### System Overview\n,- Slides: [https://docs.google.com/presentatio...,Plug demo script fd56e061d83b4731bb4122fb369a1...,### System Overview\n- Slides: [https://docs.g...,57,57
5,### Demo of UI\n,**What are they looking at?**\n\n- Our platfor...,Plug demo script fd56e061d83b4731bb4122fb369a1...,### Demo of UI\n**What are they looking at?**\...,675,675
0,# Motive\n,[Emails](Motive b049fe9c69884be880e7e4a6addb09...,Motive b049fe9c69884be880e7e4a6addb0948.md,# Motive\n[Emails](Motive b049fe9c69884be880e7...,227,227
1,### Epics\n,[OE Pricing Integration](https://www.notion.so...,Motive b049fe9c69884be880e7e4a6addb0948.md,### Epics\n[OE Pricing Integration](https://ww...,146,146
0,# Canford\n,![Screenshot 2023-01-19 at 11.50.07.png](Canfo...,Canford e5e6d72a02cc4eab916f0122c177b2ed.md,# Canford\n![Screenshot 2023-01-19 at 11.50.07...,1067,1067
1,## 26th Jan 2023 Questions\n,- In your email you asked whether our system w...,Canford e5e6d72a02cc4eab916f0122c177b2ed.md,## 26th Jan 2023 Questions\n- In your email yo...,1543,1496
3,### Questions before call:\n,- Ask whether they may want electrolyzer integ...,Canford e5e6d72a02cc4eab916f0122c177b2ed.md,### Questions before call:\n- Ask whether they...,41,41
4,### Transcript of call (basically verbatim):\n,"- Rob: Electrolyzer still on track, maybe slip...",Canford e5e6d72a02cc4eab916f0122c177b2ed.md,### Transcript of call (basically verbatim):\n...,2909,1459


In [25]:
def get_questions(context):
    """
    Ask the completion model for a numbered list of questions about `context`.

    The prompt ends with "1." so the model continues the list; the returned
    text therefore starts right after that "1." and the caller is expected to
    prepend it.

    Args:
        context: Text the questions should be based on.

    Returns:
        The generated text of the first completion choice, or "" if the
        request fails.
    """
    try:
        response = openai.Completion.create(
            engine="davinci-instruct-beta-v3",
            prompt=f"Write questions based on the text below\n\nText: {context}\n\nQuestions:\n1.",
            temperature=0,
            max_tokens=257,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n\n"]
        )
        return response['choices'][0]['text']
    except Exception as e:
        # Catch Exception rather than everything so that KeyboardInterrupt /
        # SystemExit still stop the notebook, and report the failure the same
        # way get_answers does instead of hiding it.
        print(e)
        return ""

In [26]:
# The prompt already ends with "1.", so put it back in front of each completion.
generated_questions = df['context'].apply(get_questions)
df['questions'] = '1.' + generated_questions
# Show the questions generated for the first section.
print(df['questions'].iloc[0])

1. What is the schedule of production for Mira?
2. What are the offtake, nor downtime events?
3. How will the schedule of production be triggered?


In [27]:
def get_answers(row):
    """
    Ask the completion model to answer the questions stored on `row`.

    `row` must expose `context` (the source text) and `questions` (the
    numbered question list). The prompt ends with "1." so the returned text
    starts right after that marker.

    Returns the text of the first completion choice, or "" after printing the
    error if anything in the request fails.
    """
    try:
        request = {
            "engine": "davinci-instruct-beta-v3",
            "prompt": f"Write answer based on the text below\n\nText: {row.context}\n\nQuestions:\n{row.questions}\n\nAnswers:\n1.",
            "temperature": 0,
            "max_tokens": 257,
            "top_p": 1,
            "frequency_penalty": 0,
            "presence_penalty": 0,
        }
        completion = openai.Completion.create(**request)
        first_choice = completion['choices'][0]
        return first_choice['text']
    except Exception as err:
        print(err)
        return ""


raw_answers = df.apply(get_answers, axis=1)
# The prompt already ends with "1.", so put it back in front of each completion.
df['answers'] = "1." + raw_answers
# A failed generation comes back as "" (stored as a bare "1." once prefixed),
# never as NaN, so dropna() alone cannot remove it. Flag those rows explicitly;
# positional boolean arrays are used because the index may contain duplicates.
failed = (raw_answers == "").to_numpy() | (df['questions'] == "1.").to_numpy()
df = df.loc[~failed].dropna().reset_index(drop=True)
# Show the answers generated for the first remaining section.
print(df['answers'].iloc[0])

1. The schedule of production for Mira is a series of steps that need to be completed in a specific order in order for the product to be made.
2. Offtake events are when the product is taken away from the production line for any reason. Downtime events are when the production line is stopped for any reason.
3. The schedule of production will be triggered by a schedule of production in staging for Mira. This means that the steps that need to be completed in order for the product to be made will be done in a specific order, and will be triggered by the schedule in staging.


In [29]:
# Make sure the output folder exists, then save the per-section Q&A frame.
results_dir = Path('results')
results_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(results_dir / 'hydrogen_q_and_a.csv', index=False)

In [30]:
# Print column dtypes and non-null counts for the Q&A frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   heading          21 non-null     object
 1   content          21 non-null     object
 2   page_name        21 non-null     object
 3   context          21 non-null     object
 4   n_tokens_before  21 non-null     int64 
 5   n_tokens_after   21 non-null     int64 
 6   questions        21 non-null     object
 7   answers          21 non-null     object
dtypes: int64(2), object(6)
memory usage: 1.4+ KB


In [48]:
def _nonblank_lines(text):
    """Return the lines of `text` that contain something other than whitespace."""
    return [line for line in text.split('\n') if line.strip()]


# Pair the i-th question line with the i-th answer line of every section.
# Blank lines are dropped first: an empty line inside either list would shift
# every following question against the wrong answer.
qa = [
    list(zip(_nonblank_lines(questions), _nonblank_lines(answers)))
    for questions, answers in zip(df['questions'], df['answers'])
]
qa_flat = [pair for section_pairs in qa for pair in section_pairs]
# Naming the columns in the constructor also works when there are no pairs,
# where assigning `.columns` on a column-less frame would raise.
qadf = pd.DataFrame(qa_flat, columns=['question', 'answer'])
qadf.to_csv('results/qadf.csv', index=False)