In [17]:
# 0.47, 0.6, 0.67, 0.89, 0.98

import openai
import tiktoken
openai.api_key = ""


history = []
token = 0
token_limit = 16384


def OpenaiHandshake(msg, system, model="gpt-3.5-turbo-16k-0613"):
    history = [{
        "role": "user",
        "content": msg
    }]
    response = openai.ChatCompletion.create(
            model=model,
            messages=system+history,
            temperature=1,
            presence_penalty=1
        )
    response_m = response.choices[0].message
        
    return response_m, response_m.content

def TokenCheck(content):
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    tokens = encoding.encode(content)
    token_count = len(tokens)
    return token_count

def TokenReflection(history):
    global token
    token_ = 0
    for item in history:
        msg = item["content"]
        token_ += TokenCheck(msg)
    token = token_
    print("[Token: {token}]".format(token=token))
    return token_

In [18]:
exampleData = """
The user want data for neural network testing purposes. You are supposed to create dataset in JSON format for me. The paragraphs should each be about 80 words. 

The following is an example of a dataset.

Query: evolution of moral standards.
Example dataset:
[
    {
        "id": "0",
        "relevance": "high",
        "queryTerm": true,
        "queryFreq": 5,
        "topic": "Philosophy",
        "style": "formal",
        "type": "introduction",
        "entities": [
            "Aristotle",
            "Plato",
            "Socrates",
            "virtue ethics"
        ],
        "correctness": true,
        "text": "The origins of moral philosophy in the Western tradition trace back to ancient Greek philosophers like Socrates, Plato and Aristotle. In his Nicomachean Ethics, Aristotle systematically formulated a theory of virtue ethics that described the path to human flourishing through cultivating virtues of character. "
    }
]
"""


def SystemPrompt(example):
    systemPrompt = """
    You are a data science specilized robot that only provides dataset in JSON format and nothing else (no conversations nor explainations other than the dataset in JSON format that has been requested by the user). Each dataset should be different to others. There should be 4 types of relevance: high, medium, low, and none.
    """
    systemPrompt = [
        {
            "role": "system",
            "content": systemPrompt
        }
    ]
    if example:
        systemPrompt.append( {
            "role": "system",
            "content": exampleData
        } )
    
    return systemPrompt

def Continue(start, batch, query):
    id1 = start
    id2 = id1 + batch
    msg = 'Starting id={id1}. Ending by id={id2}. Provide me {no} datasets in JSON list. Query is "{query}". 80 words for each paragraph.'
    msg = msg.format(
        id1=id1,
        id2=id2,
        no=batch,
        query=query
    )
    return msg


In [30]:
query = "neurobiology of altruism"
batch = 30
id = 0
target = 150

dataBank = []

while id < target:
    msg = Continue(id, batch, query)
    print(msg)
    response, data = OpenaiHandshake(
        msg,
        SystemPrompt(True),
        "gpt-3.5-turbo-16k-0613"
    )
    dataBank.append(data)
    print(data)
    print("\n\n")
    id += batch

print("Finish")

# 6m 18s

Starting id=0. Ending by id=30. Provide me 30 datasets in JSON list. Query is "neurobiology of altruism". 80 words for each paragraph.


In [27]:
import json
dataBank_JSON = []
brokenBlock = []
id = 0
for block in dataBank:
    try:
        block_ = json.loads(block)
        for item in block_:
            item_ = item
            item_["id"] = id
            dataBank_JSON.append(item_)
            id += 1
    except:
        brokenBlock.append(block)
        print("Found broken block")

In [28]:
high = []
medium = []
low = []
none = []
for item in dataBank_JSON:
    if item["relevance"] == "high":
        high.append(item["id"])
    elif item["relevance"] == "medium":
        medium.append(item["id"])
    elif item["relevance"] == "low":
        low.append(item["id"])
    elif item["relevance"] == "none":
        none.append(item["id"])

metaData = {
    "author": {
        "name": "Jason Li",
        "email": "pakaLFZ@gmail.com",
        "organizationEmail": "fl822@ic.ac.uk"
        },
    "overview": {
        "topic": "evolution of moral standards",
        "numParagraphs": len(dataBank_JSON),
        "example_ranking": {
            "ranking_types": [
                "high",
                "medium",
                "low",
                "none"
            ],
            "high": high,
            "medium": medium,
            "low": low,
            "none": none

        }
    },
    "query": query,
    "data": dataBank_JSON
}

datasetName = "dataset_{query}.json".format(query=query.replace(" ", "-"))
log = open(datasetName, "w", encoding="utf8")
log.write(json.dumps(metaData, indent=4))
log.flush()
print("databank stored")
