In [30]:
import pandas as pd
import requests 
import numpy as np
import time
from tdqm import tdqm

ModuleNotFoundError: No module named 'tdqm'

### Define Functions

In [35]:
##############################################################################################################

def clean_gpt_output(output_list):

    stripped_list = [str(item).replace('\n', '') for item in output_list]
    filled_list = [entry if entry != '0' else '0 0' for entry in stripped_list]
    cleaned_list = [entry.split(' ') for entry in filled_list]

    df = pd.DataFrame(cleaned_list, columns=['code', 'sentiment'])

    return df

##############################################################################################################

## function to query a response from Chat GPT
def ask_chat(question, api_key):
    # define headers
    headers = {
        "Authorization": f"Bearer {gpt_key}",
        "Content-Type": "application/json",
    }

    info = {
     "model": "gpt-4o-mini",
     "messages": [{"role": "user", "content": question}],
     "temperature": 0
    }
    ##Get the response
    response = requests.post('https://api.openai.com/v1/chat/completions', headers = headers, json = info)
    ##Convert to json
    response_json = response.json()
    return response_json['choices'][0]['message']['content'].strip()

##############################################################################################################

def split_dfs(df, n):
    split_dfs = [df.iloc[i:i+n] for i in range(0, len(df), n)]
    return split_dfs

### API Init

In [None]:
# load api key
# load library to get environmental files
import os
from dotenv import load_dotenv


# load keys from  environmental var
load_dotenv() # .env file in cwd
gpt_key = os.environ.get("gpt_key") 

### Open/Split Dataframes

In [45]:
comments_df = pd.read_csv("./data/reddit/comments_parsed.csv", skiprows=5000, nrows=100, names=["x","y","comment_id","submission_id","author","body","score","year","month","subreddit","type"])

In [38]:
split_dfs = split_dfs(comments_df,10)

In [None]:
for i, df in enumerate(split_dfs):
    
    ##loop through the coded data to test gpt's accuracy with prompt
    answer = []

    for j in tqdm(range(len(df))):
        try:
            base_question = "Is this text about neighborhood safety in DC? \
            Answer only with a number: 1 if about safety in DC, 0 if not. \
            If the answer is 1, how safe do you think the author feels? \
            Answer with only a number on a scale between 1 (very unsafe) and 10 (very safe) \
                Here is the text: "
            text = df.loc[j, "body"]
            full_question = base_question + str(text)
            answer.append(ask_chat(full_question, gpt_key))
            time.sleep(1)
        except:
            answer.append(np.nan)   

    df["gpt_output"] = answer

    file_name = "gpt_output_" + str(i) + ".csv"

    df.to_csv("./data/reddit/" + file_name)

### test code 

In [17]:
data = {
    "Name": ["Alice", "Bob", "Charlie", "Diana"],
    "Age": [25, 30, 35, 28],
    "City": ["New York", "Los Angeles", "Chicago", "Houston"],
    "Salary": [70000, 80000, 75000, 72000],
    "Department": ["HR", "Finance", "IT", "Marketing"],
    "Joining Date": ["2020-01-15", "2019-06-10", "2021-03-20", "2020-11-05"]
}

test_df = pd.DataFrame(data)

In [18]:
test_df

Unnamed: 0,Name,Age,City,Salary,Department,Joining Date
0,Alice,25,New York,70000,HR,2020-01-15
1,Bob,30,Los Angeles,80000,Finance,2019-06-10
2,Charlie,35,Chicago,75000,IT,2021-03-20
3,Diana,28,Houston,72000,Marketing,2020-11-05


In [19]:
output_list = ['0', '1 \n4', '1 \n7', 0]

In [21]:
cleaned_output = clean_gpt_output(output_list)
cleaned_output

[['0', '0'], ['1', '4'], ['1', '7'], ['0', '0']]

In [22]:
test_df_2 = pd.DataFrame(cleaned_output, columns=['code', 'sentiment'])

In [23]:
test_df[["code","sentiment"]] = test_df_2

In [24]:
test_df

Unnamed: 0,Name,Age,City,Salary,Department,Joining Date,code,sentiment
0,Alice,25,New York,70000,HR,2020-01-15,0,0
1,Bob,30,Los Angeles,80000,Finance,2019-06-10,1,4
2,Charlie,35,Chicago,75000,IT,2021-03-20,1,7
3,Diana,28,Houston,72000,Marketing,2020-11-05,0,0


In [None]:
output_list = ['0', '1 \n4', '1 \n7', 0]

In [38]:
cleaned_list = [str(item).replace('\n', '') for item in output_list]

In [39]:
cleaned_list

['0', '1 4', '1 7', '0']

In [41]:
cleaned_list = [entry if entry != '0' else '0 0' for entry in cleaned_list]

In [42]:
cleaned_list

['0 0', '1 4', '1 7', '0 0']

In [26]:
cleaned_list

['0 0', '1 4', '1 7', '0 0']

In [31]:
for i,entry in enumerate(cleaned_list):
    cleaned_list[i] = entry.split(' ')

In [32]:
cleaned_list

[['0', '0'], ['1', '4'], ['1', '7'], ['0', '0']]

In [35]:
test_df = pd.DataFrame(cleaned_list, columns=['code', 'sentiment'])

In [36]:
test_df

Unnamed: 0,code,sentiment
0,0,0
1,1,4
2,1,7
3,0,0
