In [110]:
import pandas as pd

## This notebook was used to clean the combined chat gpt output 

### Import Data

In [121]:
# This file has ALL the comments that have been run through chat gpt
all_comments = pd.read_csv("../data/reddit/gpt_output_coded/ALL_CODED_COMMENTS.csv")

### Get Only Relevant Safety Comments
As determined by OpenAI

In [112]:
# Drop all data with gpt code 0: not safety related
safety_comments = all_comments.drop(all_comments[all_comments["gpt_output"] == '0'].index)
# How many comments do we have left?
safety_comments.shape

(5369, 10)

### Split safety code with sentiment code

In [113]:
# See all the different encodings - some use newlines, some commas
safety_comments["gpt_output"].unique()

array(['1  \n8', '1  \n4', '1  \n7', '1  \n6', '1  \n3', '1  \n5', '1, 5',
       '1, 7', '1, 8', '1, 3', '1  \n2', '1, 2', '1, 6', '1, 4', '1, 1',
       '1, 9', '1  \n9', '1  \n10', '1  \n1', '1,2', '1,3', '1,4', '1,6',
       '1, 10', '1,5', '1,7', '1,8'], dtype=object)

In [114]:
# replace all the newlines with commas so everything is in the same format
safety_comments["gpt_output"] = [str(item).replace('  \n', ' ') for item in safety_comments["gpt_output"]]
# replace all the commas with spaces
safety_comments["gpt_output"] = [str(item).replace(',', ' ') for item in safety_comments["gpt_output"]]

safety_comments["gpt_output"].unique() # notice some have a space and some dont 

array(['1 8', '1 4', '1 7', '1 6', '1 3', '1 5', '1  5', '1  7', '1  8',
       '1  3', '1 2', '1  2', '1  6', '1  4', '1  1', '1  9', '1 9',
       '1 10', '1 1', '1  10'], dtype=object)

In [115]:
# split the column on commas
split_scores = [entry.split(' ') for entry in safety_comments["gpt_output"]]
split_scores[10:20]

[['1', '8'],
 ['1', '5'],
 ['1', '7'],
 ['1', '8'],
 ['1', '5'],
 ['1', '8'],
 ['1', '', '5'],
 ['1', '', '7'],
 ['1', '4'],
 ['1', '4']]

### Grab and Append Safety Sentiment Scores to Safety Comments

In [116]:
# Grab only the last item of the split lists
sentiments = [int(lst[-1]) for lst in split_scores]

In [117]:
# Create new column in dataframe
safety_comments["sentiment_score"] = sentiments

In [118]:
# Drop gpt output column
safety_comments = safety_comments.drop("gpt_output",axis=1)

## The Final DataFrame!

In [119]:
safety_comments.head()

Unnamed: 0,comment_id,submission_id,author,body,score,year,month,subreddit,type,sentiment_score
5,ita0jq1,y57cbw,samthehaggis,Agreed to Cleveland Park and Woodley Park. The...,1,2022,10,0,1,8
10,i9cnmct,utr8aw,dans_cafe,"Generally speaking, robberies and muggings are...",1,2022,5,0,1,4
11,i9csov5,utr8aw,Ok_Priority_1534,Downtown/Central DC.\n\nAs I'm looking at diff...,1,2022,5,0,1,7
12,i9czvva,utr8aw,dans_cafe,"of course there are. And probably, it's over-...",1,2022,5,0,1,7
13,cqarv2e,32cqd6,AUBlazin,Haha well you have decided to live in the sket...,1,2015,4,0,1,6


In [122]:
# save 
safety_comments.to_csv("../data/reddit/cleaned_sentiment_data.csv", index=False)