In [144]:
import pandas as pd
import requests 
import numpy as np
import time
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

### Define Functions

In [118]:
##############################################################################################################

def clean_gpt_output(output_list):
    """Parse raw GPT answers into a two-column DataFrame.

    Each answer is expected to contain one or two integers: a relevance code
    (1 = about safety in DC, 0 = not) optionally followed by a sentiment
    score. The numbers may be separated by a comma, spaces, a newline, or any
    mix of these, so both ``'1,4'`` and ``'1 \\n4'`` are understood.

    Parameters
    ----------
    output_list : iterable
        Raw answers. Every item is passed through ``str`` first, so ints and
        NaN placeholders are accepted.

    Returns
    -------
    pandas.DataFrame
        Columns ``code`` and ``sentiment`` holding the numbers as strings.
        A lone ``'0'`` answer becomes ``('0', '0')``. Fields that are missing
        or not numeric are ``None``.
    """
    rows = []
    for item in output_list:
        # Treat commas like whitespace, then split on any run of whitespace;
        # this keeps '1\n4' as two numbers instead of gluing them into '14'.
        tokens = [tok for tok in str(item).replace(',', ' ').split() if tok.isdigit()]

        # "Not about safety" answers carry no sentiment; fill it with '0'.
        if tokens == ['0']:
            tokens = ['0', '0']

        # Pad/truncate to exactly two fields so the frame is always 2 columns wide.
        code, sentiment = (tokens + [None, None])[:2]
        rows.append([code, sentiment])

    return pd.DataFrame(rows, columns=['code', 'sentiment'])

##############################################################################################################

## function to query a response from Chat GPT
def ask_chat(question, api_key, timeout=60):
    """Send a single-turn prompt to the OpenAI chat completions endpoint.

    Parameters
    ----------
    question : str
        Text sent as the only user message.
    api_key : str
        Secret key placed in the ``Authorization: Bearer`` header.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 60),
        so one stalled connection cannot hang a long batch run.

    Returns
    -------
    tuple
        ``(status_code, answer)``. ``answer`` is the stripped message content
        of the first choice, or ``None`` when the server replied with 429
        (rate limited) and there is no completion to read.

    Raises
    ------
    requests.HTTPError
        For any other 4xx/5xx response.
    requests.RequestException
        For connection problems or when ``timeout`` expires.
    """
    # define headers
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    payload = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": question}],
        "temperature": 0,
    }

    response = requests.post(
        'https://api.openai.com/v1/chat/completions',
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    # A rate-limited reply has no 'choices' key; hand the status back untouched
    # so the caller can react to it instead of tripping over a KeyError.
    if response.status_code == 429:
        return response.status_code, None

    # Surface every other HTTP failure as a descriptive exception.
    response.raise_for_status()

    content = response.json()['choices'][0]['message']['content']
    return response.status_code, content.strip()


### API Init

In [10]:
# Fetch the OpenAI secret from the environment rather than hard-coding it.
import os
from dotenv import load_dotenv

# Pull in variables from a .env file (python-dotenv's default lookup).
load_dotenv()

# gpt_key is None when GPT_SECRET_KEY is not defined.
gpt_key = os.getenv("GPT_SECRET_KEY")

### Open/Split Dataframes

In [141]:
# The first 5001 lines of the file are skipped and the column names are given
# explicitly (presumably the header plus 5000 rows handled in an earlier run;
# confirm before changing skiprows).
comments_df = pd.read_csv(
    "./data/reddit/comments_parsed.csv",
    skiprows=5001,
    names=[
        "comment_id",
        "submission_id",
        "author",
        "body",
        "score",
        "year",
        "month",
        "subreddit",
        "type",
    ],
)

In [142]:
# Cut comments_df into consecutive slices of at most 5000 rows each.
split_dfs = [
    comments_df.iloc[start:start + 5000]
    for start in range(0, len(comments_df), 5000)
]

In [143]:
# Show how many chunks the split produced.
len(split_dfs)

6

In [None]:
# The instruction part of the prompt is the same for every comment, so it is
# built once here instead of on every row. The leading spaces on the
# continuation lines are part of the string and are kept unchanged.
base_question = "Is this text about neighborhood safety in DC? \
            Answer only with a number: 1 if about safety in DC, 0 if not. \
            If the answer is 1, how safe do you think the author feels? \
            Answer with only a number on a scale between 1 (very unsafe) and 10 (very safe) \
            If you are answering both quetsions, please split the numbers with only a comma. \
                Here is the text: "

# Score each 5000-row chunk and write one CSV per chunk.
for i, df in enumerate(split_dfs):

    print("Dataframe #" + str(i))

    rate_limit = False
    answer = []

    for j in tqdm(range(len(df))):

        try:
            text = df["body"].iloc[j]
            full_question = base_question + str(text)

            status_code, gpt_answer = ask_chat(full_question, gpt_key)

            if status_code == 429:
                print("Rate limit hit!")
                rate_limit = True
                break

            answer.append(gpt_answer)

            # Throttle to roughly one request per second.
            time.sleep(1)

        except Exception as exc:
            # Catch Exception rather than everything, so Ctrl-C (KeyboardInterrupt)
            # still stops the run. Keep a NaN placeholder so `answer` stays
            # aligned with the rows of `df`, and report why the row failed.
            tqdm.write("Row " + str(j) + " failed: " + repr(exc))
            answer.append(np.nan)

    # A rate-limited chunk is incomplete: stop without writing it.
    if rate_limit:
        break

    # Build a new frame instead of assigning a column onto an iloc slice, and
    # store it back so split_dfs[i] carries the answers as before.
    split_dfs[i] = df.assign(gpt_output=answer)

    # File name encodes the chunk's row range in the source CSV; the first
    # 5000 rows were skipped at load time, hence the (i+1) offset.
    split_dfs[i].to_csv("./data/reddit/gpt_output_" + str(5000*(i+1)) + "_" + str(5000*(i+1) + 4999) + ".csv", index=False)
    print("saved!")
        

Dataframe #0


  8%|▊         | 397/5000 [09:46<1:50:40,  1.44s/it]

### Test code (scratch checks for `clean_gpt_output`)

In [17]:
# Small throw-away table used to try out attaching parsed GPT columns.
data = {
    "Name": ["Alice", "Bob", "Charlie", "Diana"],
    "Age": [25, 30, 35, 28],
    "City": ["New York", "Los Angeles", "Chicago", "Houston"],
    "Salary": [70000, 80000, 75000, 72000],
    "Department": ["HR", "Finance", "IT", "Marketing"],
    "Joining Date": ["2020-01-15", "2019-06-10", "2021-03-20", "2020-11-05"],
}

# One column per dict key, in insertion order.
test_df = pd.DataFrame.from_dict(data)

In [18]:
test_df

Unnamed: 0,Name,Age,City,Salary,Department,Joining Date
0,Alice,25,New York,70000,HR,2020-01-15
1,Bob,30,Los Angeles,80000,Finance,2019-06-10
2,Charlie,35,Chicago,75000,IT,2021-03-20
3,Diana,28,Houston,72000,Marketing,2020-11-05


In [19]:
# Sample raw answers: a bare '0', two "code \n sentiment" strings, and an int 0.
output_list = ['0', '1 \n4', '1 \n7', 0]

In [21]:
# Run the parser on the sample answers and display the result.
# NOTE(review): the output recorded under this cell is a list of lists, while
# clean_gpt_output as defined in this file returns a DataFrame. The recorded
# output looks stale; re-run to refresh it.
cleaned_output = clean_gpt_output(output_list)
cleaned_output

[['0', '0'], ['1', '4'], ['1', '7'], ['0', '0']]

In [22]:
# Wrap the parsed answers in a frame with columns 'code' and 'sentiment'.
# NOTE(review): clean_gpt_output as defined in this file already returns a
# DataFrame with these two columns, so this wrapper may be redundant.
test_df_2 = pd.DataFrame(cleaned_output, columns=['code', 'sentiment'])

In [23]:
# Add the parsed 'code' and 'sentiment' columns to test_df (modifies test_df in place).
test_df[["code","sentiment"]] = test_df_2

In [24]:
test_df

Unnamed: 0,Name,Age,City,Salary,Department,Joining Date,code,sentiment
0,Alice,25,New York,70000,HR,2020-01-15,0,0
1,Bob,30,Los Angeles,80000,Finance,2019-06-10,1,4
2,Charlie,35,Chicago,75000,IT,2021-03-20,1,7
3,Diana,28,Houston,72000,Marketing,2020-11-05,0,0


In [None]:
# Same sample answers again, used to walk through the cleaning steps one at a time.
output_list = ['0', '1 \n4', '1 \n7', 0]

In [38]:
# Step 1: stringify every raw answer and drop embedded newlines.
cleaned_list = list(map(lambda raw: str(raw).replace('\n', ''), output_list))

In [39]:
cleaned_list

['0', '1 4', '1 7', '0']

In [41]:
# Step 2: expand a lone '0' to '0 0' so every entry has two fields.
cleaned_list = ['0 0' if entry == '0' else entry for entry in cleaned_list]

In [42]:
cleaned_list

['0 0', '1 4', '1 7', '0 0']

In [26]:
cleaned_list

['0 0', '1 4', '1 7', '0 0']

In [31]:
# Step 3: replace each "code sentiment" string with its [code, sentiment] parts.
# NOTE: this rewrites cleaned_list in place, so the cell is not re-runnable;
# on a second run the entries are lists, which have no .split method.
for i,entry in enumerate(cleaned_list):
    cleaned_list[i] = entry.split(' ')

In [32]:
cleaned_list

[['0', '0'], ['1', '4'], ['1', '7'], ['0', '0']]

In [35]:
# Step 4: turn the split pairs into a two-column frame.
# NOTE: this reuses the name test_df, which earlier in this file refers to the
# sample employee table.
test_df = pd.DataFrame(cleaned_list, columns=['code', 'sentiment'])

In [36]:
test_df

Unnamed: 0,code,sentiment
0,0,0
1,1,4
2,1,7
3,0,0
