In [2]:
import pandas as pd
import requests 
import numpy as np
import time
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

### Define Functions

In [3]:
##############################################################################################################

def clean_gpt_output(output_list):
    """Parse raw model replies into a two-column DataFrame.

    Each reply is expected to hold one or two numbers: a relevance code and,
    optionally, a sentiment score. The prompt used in this notebook asks for
    a comma between the two numbers, so replies are split on commas as well
    as on any whitespace (spaces, newlines, tabs).

    Parameters
    ----------
    output_list : iterable
        Raw replies, one per comment. Entries may be strings, numbers, or
        missing values (``None`` / ``NaN``).

    Returns
    -------
    pandas.DataFrame
        One row per reply with columns ``code`` and ``sentiment``; parsed
        values are kept as strings.

        * A lone ``0`` reply becomes ``('0', '0')``.
        * A missing or empty reply becomes ``(None, None)``.
        * A reply with a single non-zero token leaves ``sentiment`` as None.
        * Tokens beyond the second are ignored.
    """
    rows = []

    for item in output_list:
        # Missing replies must stay missing instead of turning into the
        # literal string 'nan'.
        if pd.api.types.is_scalar(item) and pd.isna(item):
            rows.append([None, None])
            continue

        # Treat commas like whitespace, then split on any run of whitespace
        # so "1,3", "1, 3", "1 3" and "1\n3" all yield ['1', '3'].
        tokens = str(item).replace(',', ' ').split()

        # A bare "0" means "not about safety": fill the sentiment with 0 too.
        if tokens == ['0']:
            tokens = ['0', '0']

        # Pad / truncate to exactly two fields so the frame always has the
        # expected shape, whatever the model returned.
        rows.append((tokens + [None, None])[:2])

    return pd.DataFrame(rows, columns=['code', 'sentiment'])

##############################################################################################################

## function to query a response from Chat GPT
def ask_chat(question, api_key, timeout=60):
    """Send one user message to the OpenAI chat completions endpoint.

    Parameters
    ----------
    question : str
        Full prompt, sent as a single ``user`` message.
    api_key : str
        Secret key placed in the ``Authorization: Bearer`` header.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error. Without a timeout a stalled connection would block forever.

    Returns
    -------
    tuple
        ``(status_code, answer)``. ``answer`` is the stripped text of the
        first choice when the status code is 200, otherwise ``None``.
    """
    # define headers
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    # temperature 0 keeps the model's answers as repeatable as possible
    payload = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": question}],
        "temperature": 0,
    }

    response = requests.post(
        'https://api.openai.com/v1/chat/completions',
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    # Error responses carry no 'choices' entry. Return the status code
    # without touching the body so the caller can react to it (for example
    # to a 429 rate limit) instead of getting a KeyError.
    if response.status_code != 200:
        return response.status_code, None

    response_json = response.json()
    return response.status_code, response_json['choices'][0]['message']['content'].strip()


### API Init

In [4]:
# Load the API key.
# Import the libraries used to read environment variables from a .env file.
import os
from dotenv import load_dotenv


# Load variables from the .env file into the process environment, then read
# the OpenAI key from the environment.
load_dotenv()
gpt_key = os.environ.get("GPT_SECRET_KEY")

# Fail fast: with no key, ask_chat would send "Bearer None" and every request
# in the long batch run below would be rejected.
if not gpt_key:
    raise RuntimeError(
        "GPT_SECRET_KEY is not set; define it in the environment or in a .env file."
    )

### Open/Split Dataframes

In [5]:
# Read the Reddit comment file; the batch loop further down uses its "body" column.
comments_path = "./data/reddit/CODE_ME.csv"
comments_df = pd.read_csv(comments_path)

In [6]:
# Rows per batch; each batch is processed and saved as its own CSV file.
BATCH_SIZE = 2000

# Copy every slice: a column is added to each batch later on, and assigning
# to a bare slice of comments_df is the pattern pandas flags with
# SettingWithCopyWarning.
split_dfs = [
    comments_df.iloc[start:start + BATCH_SIZE].copy()
    for start in range(0, len(comments_df), BATCH_SIZE)
]

In [7]:
# Sanity check: number of batches produced by the split.
len(split_dfs)

3

In [None]:
# Prompt prefix sent before every comment; it never changes, so build it once.
# NOTE: the backslash continuations keep the leading spaces of each following
# line inside the string. The indentation and wording below (including the
# typo) are left exactly as they were so the text sent to the model is
# unchanged.
base_question = "Is this text about neighborhood safety in DC? \
            Answer only with a number: 1 if about safety in DC, 0 if not. \
            If the answer is 1, how safe do you think the author feels? \
            Answer with only a number on a scale between 1 (very unsafe) and 10 (very safe) \
            If you are answering both quetsions, please split the numbers with only a comma. \
            Here is the text: "

# Create the target folder up front so a missing directory cannot make the
# save fail after a whole batch has already been queried.
output_dir = "./data/reddit/rebeccas_last_gpt_batch"
os.makedirs(output_dir, exist_ok=True)

for i, df in enumerate(split_dfs):

    print(f"Dataframe #{i}")

    rate_limit = False
    answer = []

    for text in tqdm(df["body"]):

        try:
            status_code, gpt_answer = ask_chat(base_question + str(text), gpt_key)
        except Exception as exc:
            # Best effort: one failed request must not abort the batch, so
            # record a missing value and move on. Catching Exception (not a
            # bare except) lets Ctrl-C / kernel interrupt still stop the run.
            tqdm.write(f"Request failed ({type(exc).__name__}: {exc}); recording NaN.")
            answer.append(np.nan)
            continue

        if status_code == 429:
            print("Rate limit hit!")
            rate_limit = True
            break

        answer.append(gpt_answer)

        # Pause between successful requests to stay under the API rate limit.
        time.sleep(1)

    # A rate-limited batch is incomplete: stop without saving it, and do not
    # start the remaining batches.
    if rate_limit:
        break

    df["gpt_output"] = answer

    df.to_csv(output_dir + "/coded_" + str(i) + ".csv", index=False)
        

Dataframe #0


100%|██████████| 2000/2000 [49:10<00:00,  1.48s/it] 


Dataframe #1


100%|██████████| 2000/2000 [53:41<00:00,  1.61s/it]  


Dataframe #2


100%|██████████| 1960/1960 [36:42<00:00,  1.12s/it]  
