In [1]:
"""
Here, we ask GPT-3.5 to clean our raw dataset for us. 
"""

'\nHere, we ask GPT-3.5 to clean our raw dataset for us. \n'

In [2]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv # pip install python-dotenv
load_dotenv()   # Set API KEY values from .env file


True

In [3]:
# Load raw dataset (sentences that have been extracted, with confidence labels).
# The rewrite loop further down reads the 'filenames', 'page_num', 'sent_num',
# 'text' and 'confidence_rating' columns of this frame.
sentences_with_labels_raw = pd.read_csv('data/text_processing/sentences_with_ratings_04_20.csv')


In [4]:
# Load all sentences, including unlabeled ones (to be used for context).
# get_context() looks rows up in this frame by 'filenames', 'page_num' and
# 'sent_num', and joins the neighbouring 'text' values.
all_sentences_raw = pd.read_csv('data/text_processing/all_sentences.csv')

In [5]:
# Get context for a labeled sentence.
def get_context(filename, pg_num, sent_num,
                n_sentences_before=5, n_sentences_after=2,
                sentences_df=None):
    """Return the text surrounding one sentence of a document, joined by spaces.

    The window holds up to `n_sentences_before` rows preceding the target row,
    the target row itself, and up to `n_sentences_after` rows following it,
    clipped at the first and last row of the document. The defaults look further
    back than forward because context is more likely to be before than after.

    Args:
        filename: Value of the 'filenames' column selecting the document.
        pg_num: Value of the 'page_num' column of the target sentence.
        sent_num: Value of the 'sent_num' column of the target sentence.
        n_sentences_before: Number of rows to include before the target.
        n_sentences_after: Number of rows to include after the target.
        sentences_df: DataFrame with 'filenames', 'page_num', 'sent_num' and
            'text' columns. Defaults to the notebook-level `all_sentences_raw`.
            Assumes each document's rows are stored in reading order -- TODO confirm.

    Returns:
        The joined context string, or None if the sentence is not found or any
        other error occurs (a message is printed in both cases).
    """
    try:
        if sentences_df is None:
            sentences_df = all_sentences_raw

        # Restrict to the document and renumber its rows 0..n-1 so that the
        # target's index label is also its position within the document.
        doc_sentences = sentences_df[sentences_df['filenames'] == filename].reset_index(drop=True)
        is_target = ((doc_sentences['page_num'] == pg_num) &
                     (doc_sentences['sent_num'] == sent_num))
        # Raises IndexError when nothing matches; first match wins otherwise.
        row_index = doc_sentences[is_target].index[0]

        start = max(0, row_index - n_sentences_before)
        # "+ 1" because the slice end is exclusive: without it the window held
        # only n_sentences_after - 1 rows after the target.
        stop = min(row_index + n_sentences_after + 1, len(doc_sentences))

        window = doc_sentences['text'].iloc[start:stop]
        # Skip missing text instead of letting str.join fail on a float NaN.
        return " ".join(window.dropna().astype(str))
    except IndexError:
        print("Entry not found.")
        return None
    except Exception as e:
        # Best effort: callers receive None rather than an exception.
        print("An error occurred:", e)
        return None


In [6]:
# System message sent with every request: tells the model to rewrite a sentence
# into one self-contained statement, or to answer "unrecoverable".
# Plain string literal -- the text contains no placeholders to interpolate.
SYSTEM_PROMPT = """You are DataCleanerGPT, an assistant who rewrites input texts such that they are complete sentences, containing enough information for a human to evaluate the factualness of the statement. This includes replacing references to outside information with the referred-to information, adding external context of what is being discussed or measured so that a human can evaluate the truth of the statement, and removing extraneous qualifiers such as "Therefore," or "In conclusion." It also means removing artifacts of web scraping such as irrelevant characters/symbols, sentence fragments, or numbers in strange places in the sentence. You should also remove indications of confidence/likelihood from the sentence, and avoid them in your revision. Preserve the original meaning of the sentence (within the context given) as much as possible. Your response can only be ONE SENTENCE long. 

You will be given a sentence and context within which that sentence was found. You can respond in one of two ways: 
1. If the sentence requires rewriting and it is possible to do so, respond with the rewritten sentence.
2. If the sentence requires rewriting but it is not possible to do so because there is not enough context provided, or the original sentence is not meaningful, respond with "unrecoverable"
"""

In [9]:
client = OpenAI()
MODEL = "gpt-3.5-turbo"
PROGRESS_EVERY = 10     # print a progress line every N rows
CHECKPOINT_EVERY = 100  # write partial results to disk every N completed rows

# The '.' in the model name is replaced by 'p' for the file name.
model_str = MODEL.replace('.', 'p')
output_path = f'data/text_processing/sentences_with_ratings_05_25_revised_{model_str}.csv'

results = []
try:
    # Iterate over each labeled sentence, construct the prompt, feed to GPT
    for index, row in sentences_with_labels_raw.iterrows():
        if index % PROGRESS_EVERY == 0:
            print(f"working on index: {index}")

        # Periodic checkpoint. Skipped while `results` is empty: saving on the
        # very first row would replace an existing output file with an empty one.
        if results and len(results) % CHECKPOINT_EVERY == 0:
            pd.DataFrame(results).to_csv(output_path, index=False)

        # Get the filename, page_number, sentence_number, and sentence from the current row
        filename = row['filenames']
        page_number = row['page_num']
        sentence_number = row['sent_num']
        sentence = row['text']
        conf_rating = row['confidence_rating']

        # Get the concatenated sentences as context for the current sentence.
        # NOTE(review): get_context returns None when the lookup fails, and the
        # prompt below then contains the literal text "None" as context.
        context = get_context(filename, page_number, sentence_number)

        # Same text as before, written with explicit newlines; the four spaces
        # before "Context:" and "Response:" are part of the prompt and are kept
        # so results stay comparable with earlier runs.
        user_prompt = f"Sentence: {sentence}\n    Context: {context}\n    Response: "

        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
        )
        rewrite = response.choices[0].message.content

        results.append({
            'filenames': filename,
            'page_num': page_number,
            'sent_num': sentence_number,
            'text': sentence,
            'gpt_revised_text': rewrite,
            'confidence_rating': conf_rating,
            'context': context
        })
finally:
    # Runs on normal completion and on an API error or interrupt, so rows
    # collected since the last checkpoint are written out before the error
    # propagates.
    results_df = pd.DataFrame(results)
    if results:
        results_df.to_csv(output_path, index=False)


working on index: 0
working on index: 10


In [None]:
# Manual re-save: writes whatever is currently held in `results` to the same
# output file the rewrite loop uses.
model_str = MODEL.replace('.', 'p')
results_df = pd.DataFrame(results)
results_df.to_csv(f'data/text_processing/sentences_with_ratings_05_25_revised_{model_str}.csv', index=False)
