In [3]:
import pandas as pd
import requests 
import numpy as np
import time
from tqdm import tqdm

### Define Functions

In [5]:
##############################################################################################################

def clean_gpt_output(output_list):

    stripped_list = [str(item).replace('\n', '') for item in output_list]
    filled_list = [entry if entry != '0' else '0 0' for entry in stripped_list]
    cleaned_list = [entry.split(' ') for entry in filled_list]

    df = pd.DataFrame(cleaned_list, columns=['code', 'sentiment'])

    return df

##############################################################################################################

## function to query a response from Chat GPT
def ask_chat(question, api_key):
    # define headers
    headers = {
        "Authorization": f"Bearer {gpt_key}",
        "Content-Type": "application/json",
    }

    info = {
     "model": "gpt-4o-mini",
     "messages": [{"role": "user", "content": question}],
     "temperature": 0
    }
    ##Get the response
    response = requests.post('https://api.openai.com/v1/chat/completions', headers = headers, json = info)
    ##Convert to json
    response_json = response.json()
    return response_json['choices'][0]['message']['content'].strip()

##############################################################################################################

def split_dfs(df, n):
    split_dfs = [df.iloc[i:i+n] for i in range(0, len(df), n)]
    return split_dfs

### API Init

In [7]:
# load api key
# load library to get environmental files
import os
from dotenv import load_dotenv


# load keys from  environmental var
load_dotenv() # .env file in cwd
gpt_key = os.environ.get("gpt_key") 

### Open/Split Dataframes

In [9]:
comments_df = pd.read_csv("./data/reddit/comments_parsed.csv", skiprows=5000, nrows=100, names=["x","y","comment_id","submission_id","author","body","score","year","month","subreddit","type"])

In [11]:
split_dfs = split_dfs(comments_df,10)

In [35]:
for i, df in enumerate(split_dfs):

    print(i)
    
    ##loop through the coded data to test gpt's accuracy with prompt
    answer = []

    for j in tqdm(range(len(df))):
        try:
            base_question = "Is this text about neighborhood safety in DC? \
            Answer only with a number: 1 if about safety in DC, 0 if not. \
            If the answer is 1, how safe do you think the author feels? \
            Answer with only a number on a scale between 1 (very unsafe) and 10 (very safe) \
                Here is the text: "
            text = df.loc[j, "body"]
            print(text)
            full_question = base_question + str(text)
            answer.append(ask_chat(full_question, gpt_key))
            time.sleep(1)
        except Exception as e:
            print(f"Error on index {j} of dataframe {i}: {e}")
            answer.append(np.nan)   

    df["gpt_output"] = answer

    file_name = "gpt_output_" + str(i) + ".csv"

    df.to_csv("./data/reddit/" + file_name)

0


  0%|                                                    | 0/10 [00:00<?, ?it/s]

The real reason is that the companies that are being sued for market manipulation in DC, SURPRISINGLY, operate in Arlington also. Shocker, right?


https://oag.dc.gov/release/attorney-general-schwalb-sues-realpage-residential 


These wretched oligopolies will remove the middle class from the American dream, convincing us to blame our neighbor for it, while they're ripping us all off. It's insidious.


 10%|████▍                                       | 1/10 [00:01<00:14,  1.58s/it]

Yes, Arlington’s crime rate is lower than DC's, even though crime and related problems have become more prevalent in post-COVID Arlington.

Even though I work in DC, I chose to live in Arlington because the housing is comparatively cheaper and more modern than what’s often available in my price range in DC.

Also, better amenities like parking and more things to choose from (ex. More grocery stores available in DC are things like Safeway and Whole Foods), while within a five mile radius of where I live in Arlington I have many more choices. Even within walking distance.

Though, I would say NIMBYISM is a bigger problem in Arlington vs. DC. I think VA taxes from what I've heard are also more than the District.


 20%|████████▊                                   | 2/10 [00:02<00:11,  1.43s/it]

10-15 years ago, Clarendon/Courthouse was wildly cheaper.


 30%|█████████████▏                              | 3/10 [00:04<00:09,  1.39s/it]

Depends which direction. You can go an hour south to where I live and it's still going to be very expensive (condos start at 600, SF are 1M+, renting a 2BR is going yo be over 2k).


 40%|█████████████████▌                          | 4/10 [00:05<00:08,  1.41s/it]

I always enjoyed people telling me I lived in FARlington when my dc commute was half theirs


 50%|██████████████████████                      | 5/10 [00:07<00:06,  1.38s/it]

They said navy yard not EOTR fwiw, but he’s a lot of people do commute into the suburbs not into dc proper.


 60%|██████████████████████████▍                 | 6/10 [00:08<00:05,  1.39s/it]

I wonder where all the tax money goes every year because DC definitely doesn't have better infrastructures/services/resources than Arlington. And the MPD does literally nothing to enforce law.


 70%|██████████████████████████████▊             | 7/10 [00:09<00:04,  1.40s/it]

Possible to live car-free? Do you mean there are restaurants, bars, cafes, movie theaters, markets on walking distance or it is just close to a metro stop which can get you to DC? (genuine question, never been to Arlington, living in DC)


 80%|███████████████████████████████████▏        | 8/10 [00:11<00:02,  1.42s/it]

Lmao


 90%|███████████████████████████████████████▌    | 9/10 [00:12<00:01,  1.43s/it]

Arlington is DC's more successful younger brother who makes DC look bad by comparison.


100%|███████████████████████████████████████████| 10/10 [00:14<00:00,  1.41s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["gpt_output"] = answer


1


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 14789.51it/s]


Error on index 0 of dataframe 1: 0
Error on index 1 of dataframe 1: 1
Error on index 2 of dataframe 1: 2
Error on index 3 of dataframe 1: 3
Error on index 4 of dataframe 1: 4
Error on index 5 of dataframe 1: 5
Error on index 6 of dataframe 1: 6
Error on index 7 of dataframe 1: 7
Error on index 8 of dataframe 1: 8
Error on index 9 of dataframe 1: 9
2


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 19535.65it/s]


Error on index 0 of dataframe 2: 0
Error on index 1 of dataframe 2: 1
Error on index 2 of dataframe 2: 2
Error on index 3 of dataframe 2: 3
Error on index 4 of dataframe 2: 4
Error on index 5 of dataframe 2: 5
Error on index 6 of dataframe 2: 6
Error on index 7 of dataframe 2: 7
Error on index 8 of dataframe 2: 8
Error on index 9 of dataframe 2: 9
3


100%|█████████████████████████████████████████| 10/10 [00:00<00:00, 9850.41it/s]


Error on index 0 of dataframe 3: 0
Error on index 1 of dataframe 3: 1
Error on index 2 of dataframe 3: 2
Error on index 3 of dataframe 3: 3
Error on index 4 of dataframe 3: 4
Error on index 5 of dataframe 3: 5
Error on index 6 of dataframe 3: 6
Error on index 7 of dataframe 3: 7
Error on index 8 of dataframe 3: 8
Error on index 9 of dataframe 3: 9
4


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 35454.81it/s]


Error on index 0 of dataframe 4: 0
Error on index 1 of dataframe 4: 1
Error on index 2 of dataframe 4: 2
Error on index 3 of dataframe 4: 3
Error on index 4 of dataframe 4: 4
Error on index 5 of dataframe 4: 5
Error on index 6 of dataframe 4: 6
Error on index 7 of dataframe 4: 7
Error on index 8 of dataframe 4: 8
Error on index 9 of dataframe 4: 9
5


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 40485.56it/s]


Error on index 0 of dataframe 5: 0
Error on index 1 of dataframe 5: 1
Error on index 2 of dataframe 5: 2
Error on index 3 of dataframe 5: 3
Error on index 4 of dataframe 5: 4
Error on index 5 of dataframe 5: 5
Error on index 6 of dataframe 5: 6
Error on index 7 of dataframe 5: 7
Error on index 8 of dataframe 5: 8
Error on index 9 of dataframe 5: 9
6


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 38800.22it/s]


Error on index 0 of dataframe 6: 0
Error on index 1 of dataframe 6: 1
Error on index 2 of dataframe 6: 2
Error on index 3 of dataframe 6: 3
Error on index 4 of dataframe 6: 4
Error on index 5 of dataframe 6: 5
Error on index 6 of dataframe 6: 6
Error on index 7 of dataframe 6: 7
Error on index 8 of dataframe 6: 8
Error on index 9 of dataframe 6: 9
7


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 18893.26it/s]


Error on index 0 of dataframe 7: 0
Error on index 1 of dataframe 7: 1
Error on index 2 of dataframe 7: 2
Error on index 3 of dataframe 7: 3
Error on index 4 of dataframe 7: 4
Error on index 5 of dataframe 7: 5
Error on index 6 of dataframe 7: 6
Error on index 7 of dataframe 7: 7
Error on index 8 of dataframe 7: 8
Error on index 9 of dataframe 7: 9
8


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 34606.47it/s]


Error on index 0 of dataframe 8: 0
Error on index 1 of dataframe 8: 1
Error on index 2 of dataframe 8: 2
Error on index 3 of dataframe 8: 3
Error on index 4 of dataframe 8: 4
Error on index 5 of dataframe 8: 5
Error on index 6 of dataframe 8: 6
Error on index 7 of dataframe 8: 7
Error on index 8 of dataframe 8: 8
Error on index 9 of dataframe 8: 9
9


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 49056.19it/s]

Error on index 0 of dataframe 9: 0
Error on index 1 of dataframe 9: 1
Error on index 2 of dataframe 9: 2
Error on index 3 of dataframe 9: 3
Error on index 4 of dataframe 9: 4
Error on index 5 of dataframe 9: 5
Error on index 6 of dataframe 9: 6
Error on index 7 of dataframe 9: 7
Error on index 8 of dataframe 9: 8
Error on index 9 of dataframe 9: 9





In [17]:
answer

[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]

### test code 

In [17]:
data = {
    "Name": ["Alice", "Bob", "Charlie", "Diana"],
    "Age": [25, 30, 35, 28],
    "City": ["New York", "Los Angeles", "Chicago", "Houston"],
    "Salary": [70000, 80000, 75000, 72000],
    "Department": ["HR", "Finance", "IT", "Marketing"],
    "Joining Date": ["2020-01-15", "2019-06-10", "2021-03-20", "2020-11-05"]
}

test_df = pd.DataFrame(data)

In [18]:
test_df

Unnamed: 0,Name,Age,City,Salary,Department,Joining Date
0,Alice,25,New York,70000,HR,2020-01-15
1,Bob,30,Los Angeles,80000,Finance,2019-06-10
2,Charlie,35,Chicago,75000,IT,2021-03-20
3,Diana,28,Houston,72000,Marketing,2020-11-05


In [19]:
output_list = ['0', '1 \n4', '1 \n7', 0]

In [21]:
cleaned_output = clean_gpt_output(output_list)
cleaned_output

[['0', '0'], ['1', '4'], ['1', '7'], ['0', '0']]

In [22]:
test_df_2 = pd.DataFrame(cleaned_output, columns=['code', 'sentiment'])

In [23]:
test_df[["code","sentiment"]] = test_df_2

In [24]:
test_df

Unnamed: 0,Name,Age,City,Salary,Department,Joining Date,code,sentiment
0,Alice,25,New York,70000,HR,2020-01-15,0,0
1,Bob,30,Los Angeles,80000,Finance,2019-06-10,1,4
2,Charlie,35,Chicago,75000,IT,2021-03-20,1,7
3,Diana,28,Houston,72000,Marketing,2020-11-05,0,0


In [None]:
output_list = ['0', '1 \n4', '1 \n7', 0]

In [38]:
cleaned_list = [str(item).replace('\n', '') for item in output_list]

In [39]:
cleaned_list

['0', '1 4', '1 7', '0']

In [41]:
cleaned_list = [entry if entry != '0' else '0 0' for entry in cleaned_list]

In [42]:
cleaned_list

['0 0', '1 4', '1 7', '0 0']

In [26]:
cleaned_list

['0 0', '1 4', '1 7', '0 0']

In [31]:
for i,entry in enumerate(cleaned_list):
    cleaned_list[i] = entry.split(' ')

In [32]:
cleaned_list

[['0', '0'], ['1', '4'], ['1', '7'], ['0', '0']]

In [35]:
test_df = pd.DataFrame(cleaned_list, columns=['code', 'sentiment'])

In [36]:
test_df

Unnamed: 0,code,sentiment
0,0,0
1,1,4
2,1,7
3,0,0
