# Preprocessing
As a dataset, we use a set of essays written by students and their topics. The topics are used to generate text responses using the latest available text engine from OpenAI, text-davinci-003. Text-davinci is the latest addition to OpenAI's GPT-3 model: https://platform.openai.com/docs/models/gpt-3 .
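We generate essays at two temperature settings (0.2 and 0.9): a lower temperature means that ChatGPT will select words with a higher probability of occurrence, so its output varies less between runs. Each topic is sent with the prefixed prompt "Write me an essay on the topic". Student essays are read as UTF-8 with undecodable characters ignored, and the generated essays are reduced to ASCII so that no unprocessable characters remain.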

In [1]:
import openai
import config
import re
import time
import pandas as pd

# Path prefix taken from the local `config` module; it is prepended to the
# data locations read further down in this cell.
path = config.system_path

# OpenAI API key, also taken from the local `config` module.
# Keys can be obtained at https://openai.com/api/
openai.api_key = config.api_key

def parse_chatGPT (text):
    """Clean a generated essay so it only contains ASCII characters.

    Leading and trailing whitespace is stripped, every character that cannot
    be encoded as ASCII is dropped, and a single space is inserted after each
    newline so that the first word of a line is separated from the line break.
    """
    ascii_bytes = text.strip().encode("ascii", errors="ignore")
    ascii_text = ascii_bytes.decode()
    # Pad every line break with a trailing space.
    return ascii_text.replace("\n", "\n ")

def chatGPT(text, temp, max_attempts=10, retry_delay=100):
    """Generate an essay for a prompt with OpenAI's text-davinci-003 engine.

    The request is retried when it fails, waiting between attempts so that
    rate-limit ("too many requests") errors have time to clear.

    Args:
        text: The full prompt sent to the completion endpoint.
        temp: Sampling temperature passed to the API.
        max_attempts: How many times the request is tried before giving up.
        retry_delay: Seconds to wait after a failed attempt before retrying.

    Returns:
        The text of the first choice returned by OpenAI, cleaned by
        ``parse_chatGPT``.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        RuntimeError: If every attempt failed; the last API error is chained
            as the cause so the real reason is not lost.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = openai.Completion.create(
                # Specified to use the newest davinci model
                engine="text-davinci-003",
                prompt=text,
                temperature=temp,
                # Original author's note: max length is 4081
                max_tokens=4000,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
            )
        except Exception as e:
            # Broad on purpose: any API/network failure is treated as retryable.
            last_error = e
            current_time = time.strftime("%H:%M:%S", time.localtime())
            print("Request failed at time " + current_time + " due to: " + str(e))
            # No point in waiting once there is no further attempt to make.
            if attempt < max_attempts:
                time.sleep(retry_delay)
        else:
            print("Sucessfully generated essay with topic: " + text + " and temperature: " + str(temp))
            # Returning the first choice returned by OpenAI
            return parse_chatGPT(response.choices[0].text)

    # Previously this path crashed with an unbound `response` variable.
    raise RuntimeError(
        "OpenAI request failed after " + str(max_attempts) + " attempts"
    ) from last_error

def build_prompt(topic):
    """Return the essay-writing prompt for the given topic string."""
    prefix = "Write me an essay on the topic "
    prompt = prefix + topic
    return str(prompt)

def read_txt(file_name, path):
    """Read an essay file and return its text without the first two lines.

    The file at ``path + file_name`` is decoded as UTF-8 with undecodable
    bytes ignored. The first two lines are skipped and the remaining lines
    (each still carrying its line break) are joined with a single space.
    """
    full_path = path + file_name
    with open(full_path, encoding='utf-8', errors='ignore') as handle:
        # Iterating the handle yields the same lines as readlines().
        body_lines = list(handle)[2:]
    return ' '.join(body_lines)

def append_student_essay(datatable, path):
    """Add a "Student Essay" column holding the text of each row's essay file.

    Each value of the "File" column is passed to ``read_txt`` together with
    ``path``; ``datatable`` is modified in place.
    """
    def load_essay(file_name):
        return read_txt(file_name, path)

    essays = datatable["File"].apply(load_essay)
    datatable["Student Essay"] = essays

def append_chatgpt_essay(datatable):
    """Add one generated essay per topic for temperatures 0.2 and 0.9.

    For every value of the "Topic" column a prompt is built and sent to
    ``chatGPT``. The lower temperature column is filled completely before the
    higher one; ``datatable`` is modified in place.
    """
    runs = (
        ("ChatGPT Essay 0.2", 0.2),
        ("ChatGPT Essay 0.9", 0.9),
    )
    for column, temperature in runs:
        # The lambda is consumed immediately by apply(), so capturing the
        # loop variable here is safe.
        datatable[column] = datatable["Topic"].apply(
            lambda topic: chatGPT(build_prompt(topic), temperature)
        )

def append_length(datatable):
    """Add character-length columns for the topic and each essay column.

    A "<name> length" column is added for "Topic", "Student Essay" and both
    ChatGPT essay columns, followed by "ChatGPT Essay average length", the
    row-wise mean of the two ChatGPT lengths. ``datatable`` is modified in
    place.
    """
    length_columns = {
        "Topic": "Topic length",
        "Student Essay": "Student Essay length",
        "ChatGPT Essay 0.2": "ChatGPT Essay 0.2 length",
        "ChatGPT Essay 0.9": "ChatGPT Essay 0.9 length",
    }
    for source, target in length_columns.items():
        datatable[target] = datatable[source].apply(len)

    generated_lengths = datatable[["ChatGPT Essay 0.2 length", "ChatGPT Essay 0.9 length"]]
    datatable["ChatGPT Essay average length"] = generated_lengths.mean(axis=1)

# Load the topics table (semicolon-separated CSV) from the corpus folder.
# NOTE: the path literals use Windows-style backslash separators.
topics = pd.read_csv(path + "data\\ArgumentAnnotatedEssays-1.0\\ArgumentAnnotatedEssays-1.0\\topics.csv", sep=";")

# Enrich the table in place: read each student essay from the annotated
# essays folder, generate the ChatGPT essays (one API request per topic and
# temperature), then add the length columns, which need both essay sets.
append_student_essay(topics, path + "data\\ArgumentAnnotatedEssays-1.0\\ArgumentAnnotatedEssays-1.0\\brat-project\\brat-project\\")
append_chatgpt_essay(topics)
append_length(topics)

# Write the combined table to raw_data.csv in the current working directory,
# using the same ";" separator as the input.
topics.to_csv("raw_data.csv", encoding="UTF-8", sep=";")    


Sucessfully generated essay with topic: Write me an essay on the topic Should students be taught to compete or to cooperate? and temperature: 0.2
Sucessfully generated essay with topic: Write me an essay on the topic More people are migrating to other countries than ever before and temperature: 0.2
Sucessfully generated essay with topic: Write me an essay on the topic International tourism is now more common than ever before (for) and temperature: 0.2
Sucessfully generated essay with topic: Write me an essay on the topic International tourism is now more common than ever before (against) and temperature: 0.2
Sucessfully generated essay with topic: Write me an essay on the topic Living and studying overseas and temperature: 0.2
Sucessfully generated essay with topic: Write me an essay on the topic Why you should exercise and temperature: 0.2
Sucessfully generated essay with topic: Write me an essay on the topic Will newspapers become a thing of the past? and temperature: 0.2
Sucessfully