### Test ChatGPT accuracy on classifying Reddit data

In [3]:
# load api key
# load library to get environmental files
import os
from dotenv import load_dotenv


# load keys from environment variables
load_dotenv() # .env file in cwd
gpt_key = os.environ.get("GPT_SECRET_KEY")

# Fail fast when the key is absent: otherwise gpt_key is None, every request is
# sent with the header "Bearer None", and the failure only shows up much later.
if not gpt_key:
    raise RuntimeError(
        "GPT_SECRET_KEY is not set; add it to the .env file or the environment"
    )

In [3]:
##import packages
import requests 
import pandas as pd
import numpy as np

In [5]:
## function to query a response from Chat GPT
def ask_chat(question, api_key, timeout=60):
    '''
    Send one user message to the OpenAI chat-completions endpoint and return the reply text.

    input:
        question: prompt text, sent as a single "user" message
        api_key: OpenAI API key, sent as the Bearer token of the request
        timeout: seconds to wait for the HTTP request before giving up (default 60)
    output: content of the first choice in the response, as a string with
        leading/trailing whitespace stripped
    raises:
        requests.exceptions.HTTPError if the API answers with an error status,
        requests.exceptions.Timeout if no answer arrives within `timeout`,
        KeyError / IndexError if the response JSON has no choices
    '''
    headers = {
        # use the key passed by the caller, not a notebook-level global
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    payload = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": question}],
        "temperature": 0,
    }

    ##Get the response; without a timeout a stalled connection would block forever
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    ##Surface HTTP errors (bad key, rate limit, ...) instead of a confusing KeyError below
    response.raise_for_status()

    ##Convert to json and pull out the text of the first choice
    response_json = response.json()
    return response_json["choices"][0]["message"]["content"].strip()
    

In [11]:
##Read the hand-coded comment sample into a dataframe and preview its first rows.
##The `safety_code` column is what gpt's answers are compared against further down.
test_data = pd.read_csv(
    filepath_or_buffer="../../data/reddit/gpt_test_data/coded_comment_test_set_12_6.csv"
)
test_data.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,comment_id,submission_id,author,body,score,year,month,safety_code
0,0,0,isl1dig,y57cbw,eeek0711,Takoma Park MD,3,2022,10,0
1,1,1,i9cnmct,utr8aw,dans_cafe,"Generally speaking, robberies and muggings are...",1,2022,5,1
2,2,2,i9csov5,utr8aw,Ok_Priority_1534,Downtown/Central DC.\n\nAs I'm looking at diff...,1,2022,5,1
3,3,3,k7632z8,17ed6rg,Musictravels23,Hey there!! I have a fuIIy furnished sunny spa...,1,2023,10,0
4,4,4,io5bcjq,xciciq,cheesyuser,Thats a difficult ask. The 2000 range may be d...,2,2022,9,0


In [15]:
import time

##The prompt prefix is the same for every row, so build it once outside the loop.
##The runs of spaces inside the pieces reproduce the original backslash-continued
##literal exactly, so the text sent to the model is unchanged.
base_question = (
    "Is this text about neighborhood safety in DC? "
    "        Answer only with a number: 1 if about safety in DC, 0 if not. "
    "            Here is the text: "
)

##loop through the coded data to test gpt's accuracy with prompt
answer = []
for j, text in enumerate(test_data["body"]):
    print(j) ##keep track of which row gpt is on
    try:
        full_question = base_question + str(text)
        answer.append(ask_chat(full_question, gpt_key))
    except Exception as err:
        ##Keep one entry per row so `answer` stays aligned with test_data, but
        ##report why the call failed. `except Exception` (not a bare except)
        ##lets Ctrl-C / kernel interrupt actually stop the loop.
        print(f"row {j} failed: {err!r}")
        answer.append(np.nan)
    time.sleep(1) ##rate limit; also pause after a failure so errors are not retried back-to-back
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129


In [25]:
##add a column to the test data that is the responses from gpt as a number
##errors="coerce": a reply that is not a bare number (e.g. the model adds words)
##becomes NaN instead of raising, matching how failed requests are recorded
test_data['gpt_score'] = pd.to_numeric(answer, errors="coerce")

In [27]:
##preview the first five rows, now including the new gpt_score column
test_data.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,comment_id,submission_id,author,body,score,year,month,safety_code,gpt_score
0,0,0,isl1dig,y57cbw,eeek0711,Takoma Park MD,3,2022,10,0,0
1,1,1,i9cnmct,utr8aw,dans_cafe,"Generally speaking, robberies and muggings are...",1,2022,5,1,1
2,2,2,i9csov5,utr8aw,Ok_Priority_1534,Downtown/Central DC.\n\nAs I'm looking at diff...,1,2022,5,1,1
3,3,3,k7632z8,17ed6rg,Musictravels23,Hey there!! I have a fuIIy furnished sunny spa...,1,2023,10,0,0
4,4,4,io5bcjq,xciciq,cheesyuser,Thats a difficult ask. The 2000 range may be d...,2,2022,9,0,0


In [29]:
##keep only rows that have both a hand-coded label and a usable gpt answer
##(restricted to these two columns so a missing value in an unrelated column
## does not silently remove a scored row from the accuracy check)
test_data_acc = test_data.dropna(subset=['safety_code', 'gpt_score']).copy()

In [31]:
##import package to test accuracy
from sklearn.metrics import accuracy_score
##score gpt against the hand-coded labels: share of rows where gpt_score matches safety_code
accuracy = accuracy_score(
    y_true=test_data_acc['safety_code'],
    y_pred=test_data_acc['gpt_score'],
)
##display the accuracy score
print(accuracy)

0.825


In [45]:
##Export the accuracy test dataframe to csv
##index=False: do not write the row index as an extra column; a written index comes
##back as an "Unnamed: 0"-style column the next time the file is read with pd.read_csv
##NOTE(review): the input was read from "../../data/reddit/..." but this writes under
##"./data/reddit/" - confirm the intended output directory
test_data_acc.to_csv("./data/reddit/test_data_accuracy.csv", index=False)