# Preprocessing
This notebook is responsible for obtaining and preprocessing the data. Therefore, please specify the path to this directory in the `path` variable of the first code cell.
In addition, gathering your own data requires an OpenAI API key, which can be created after signing up for free at openai.com.
This notebook creates a raw and a preprocessed data CSV file in this directory. First, we read in the student essays and store them in one column. Then we generate essays for the topics specified in the topics.csv file using the openai package. Finally, we replace all the modal tuples specified in the modals.csv table.

In [None]:
import openai
import config
import re
import time
import pandas as pd

# Base directory of the project; the input files read further down in this
# notebook are addressed relative to it (note the trailing path separator).
path = "C:\\Speicher\\Uni\\SocialMediaAnalysis\\ChatGBT\\"

In [4]:
# Please specify openai api key :https://openai.com/api/
# The key is taken from the `api_key` attribute of the loaded configuration
# object and handed to the openai package.
# NOTE(review): the output recorded for this cell is
# "AttributeError: module 'config' has no attribute 'Config'", so the `config`
# module that actually gets imported apparently does not provide `Config`.
# Confirm which `config` module is intended (an installed package vs. a local
# config.py that shadows it) before relying on this cell.
cfg = config.Config('C:\\Speicher\\Uni\\SocialMediaAnalysis\\nlp\\config.py')
openai.api_key = cfg.api_key

# Reduce the text to plain ASCII so that it contains no characters we cannot process
def parse_chatGPT(text):
    """Strip *text*, drop all non-ASCII characters and pad every line break.

    Surrounding whitespace is removed first. Characters that cannot be
    encoded as ASCII are silently discarded. Finally a single space is
    inserted directly after each newline, which separates the line break
    from the word that follows it.
    """
    ascii_bytes = text.strip().encode("ascii", errors="ignore")
    ascii_text = ascii_bytes.decode()
    return ascii_text.replace("\n", "\n ")

def chatGPT(text, temp, max_attempts=10, retry_delay=100):
    """Request a completion for *text* from OpenAI and return the cleaned essay.

    The request is retried when it raises, waiting ``retry_delay`` seconds
    between attempts to avoid "too many requests" errors from openai.

    Args:
        text: Prompt that is sent to the completion endpoint.
        temp: Sampling temperature passed through to the API.
        max_attempts: How many requests are made before giving up.
        retry_delay: Seconds to wait after a failed attempt before retrying.

    Returns:
        The text of the first choice returned by openai, passed through
        ``parse_chatGPT``.

    Raises:
        RuntimeError: If none of the attempts succeeded. The last error
            raised by the request is attached as the cause.
    """
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = openai.Completion.create(
                # davinci completion model selected by the original author
                engine="text-davinci-003",
                prompt=text,
                temperature=temp,
                # Completion budget (the original note states a maximum length of 4081)
                max_tokens=4000,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0
            )
        except Exception as e:
            last_error = e
            # Log right away so the timestamp reflects when the request failed
            current_time = time.strftime("%H:%M:%S", time.localtime())
            print("Request failed at time " + current_time + " due to: " + str(e))
            # Only wait when another attempt is going to follow
            if attempt < max_attempts:
                time.sleep(retry_delay)
        else:
            print("Sucessfully generated essay with topic: " + text + " and temperature: " + str(temp))
            # Returning the first choice returned by openai
            return parse_chatGPT(response.choices[0].text)
    # Every attempt failed: report that explicitly instead of tripping over an
    # unbound `response` variable.
    raise RuntimeError(
        "OpenAI request failed after " + str(max_attempts) + " attempts"
    ) from last_error

def build_prompt(topic):
    """Return the essay-writing instruction for the given *topic* string."""
    instruction = "Write me an essay on the topic "
    return instruction + topic

def read_txt(file_name, path):
    """Read a text file and return its content without the first two lines.

    The file location is ``path + file_name`` (plain string concatenation, so
    *path* has to end with a separator). The file is decoded as UTF-8 and
    undecodable bytes are ignored. The remaining lines keep their line
    endings and are joined with a single space.
    """
    location = path + file_name
    with open(location, encoding='utf-8', errors='ignore') as handle:
        # Skip the two leading lines (presumably title and blank line - TODO confirm)
        body_lines = list(handle)[2:]
    return ' '.join(body_lines)

def append_student_essay(datatable, path):
    """Add a "Student Essay" column holding the text of each row's essay file.

    The file name is taken from the "File" column and read from *path* with
    ``read_txt``. *datatable* is modified in place.
    """
    def load_essay(file_name):
        return read_txt(file_name, path)

    essay_texts = datatable["File"].apply(load_essay)
    datatable["Student Essay"] = essay_texts

def append_chatgpt_essay(datatable):
    """Add one generated essay column per temperature (0.2 and 0.9).

    For every entry of the "Topic" column a prompt is built and sent through
    ``chatGPT``. All essays for temperature 0.2 are generated first, then all
    essays for 0.9. *datatable* is modified in place.
    """
    for temperature in (0.2, 0.9):
        # Bind the current temperature as a default so the helper is self-contained
        def generate(topic, temperature=temperature):
            return chatGPT(build_prompt(topic), temperature)

        column_name = "ChatGPT Essay " + str(temperature)
        datatable[column_name] = datatable["Topic"].apply(generate)

def append_length(datatable):
    """Add character-count columns for the topic and all essay columns.

    For each text column a "<column> length" column is created. Afterwards
    the mean of the two generated-essay lengths is stored in
    "ChatGPT Essay average length". *datatable* is modified in place.
    """
    text_columns = ("Topic", "Student Essay", "ChatGPT Essay 0.2", "ChatGPT Essay 0.9")
    for column in text_columns:
        datatable[column + " length"] = datatable[column].apply(len)

    generated_lengths = datatable[["ChatGPT Essay 0.2 length", "ChatGPT Essay 0.9 length"]]
    datatable["ChatGPT Essay average length"] = generated_lengths.mean(axis=1)

# Specify topics file
topics = pd.read_csv(path + "data\\ArgumentAnnotatedEssays-1.0\\ArgumentAnnotatedEssays-1.0\\topics.csv", sep=";")

# Specify location of annotated essays
append_student_essay(topics, path + "data\\ArgumentAnnotatedEssays-1.0\\ArgumentAnnotatedEssays-1.0\\brat-project\\brat-project\\")
# Generate the ChatGPT essays for every topic and add the length statistics
append_chatgpt_essay(topics)
append_length(topics)

# Store the raw data inside the project directory. The preprocessing step reads
# it back from path + "raw_data.csv", so writing relative to the current working
# directory would break as soon as the notebook is started from another folder.
topics.to_csv(path + "raw_data.csv", encoding="UTF-8", sep=";")


AttributeError: module 'config' has no attribute 'Config'

In [5]:
# Load the raw essay table written by the data-gathering step
essays = pd.read_csv(path + "raw_data.csv", encoding="UTF-8", sep=";")
# The modal table has no header row, so its columns are addressed by position
modals = pd.read_csv(
    path + "data\\modals.csv",
    header=None,
    encoding="UTF-8",
    sep=",",
)

# Turn the underscores in the first column into spaces
# (presumably multi-word expressions are stored with underscores - TODO confirm)
modals[0] = modals[0].apply(lambda entry: ' '.join(entry.split('_')))

def apply_modal(text, modal_table=None):
    """Replace every modal expression found in *text* by its substitute.

    Args:
        text: The essay text. Values that are not strings (for example NaN
            for an empty CSV cell) are returned unchanged.
        modal_table: DataFrame whose first column holds the expression to
            look for and whose second column holds the replacement. When
            omitted, the module-level ``modals`` table is used.

    Returns:
        The text with all replacements applied. Rows are applied one after
        another in table order, so a later row also sees the result of an
        earlier replacement.
    """
    if modal_table is None:
        modal_table = modals
    if not isinstance(text, str):
        return text
    for modal in modal_table.itertuples():
        # modal[0] is the row index; modal[1] and modal[2] are the first two columns.
        # str.replace returns a new string, so the result has to be kept.
        text = text.replace(modal[1], modal[2])
    return text

# Run the modal replacement over the student essays and both generated essay columns
for essay_column in ("Student Essay", "ChatGPT Essay 0.2", "ChatGPT Essay 0.9"):
    essays[essay_column] = essays[essay_column].apply(apply_modal)

# Persist the preprocessed table
essays.to_csv("preprocessed_data.csv", sep=";", encoding="UTF-8")