In [2]:
import pandas as pd

## This notebook was used to clean the combined chat gpt output 

### Import Data

In [3]:
# This file has ALL the comments that have been run through chat gpt
all_comments = pd.read_csv("../../data/reddit/ALL_CODED_COMMENTS.csv")

In [5]:
all_comments.shape

(29261, 10)

### Get Only Relevant Safety Comments
As determined by OpenAI

In [6]:
# Drop all data with gpt code 0: not safety related
safety_comments = all_comments.drop(all_comments[all_comments["gpt_output"] == '0'].index)
# How many comments do we have left?
safety_comments.shape

(6445, 10)

### Split safety code with sentiment code

In [7]:
# See all the different encodings - some use newlines, some commas
safety_comments["gpt_output"].unique()

array(['1, 3', '1, 7', '1, 4', '1, 2', '1, 5', '1,2', '1,3', '1, 6',
       '1,6', '1, 1', '1, 8', '1,4', '1,5', '1, 9', '1,8', '1,7',
       '1  \n8', '1  \n4', '1  \n7', '1  \n6', '1  \n3', '1  \n5',
       '1  \n2', '1  \n9', '1  \n10', '1  \n1', '1, 10'], dtype=object)

In [8]:
# replace all the newlines with commas so everything is in the same format
safety_comments["gpt_output"] = [str(item).replace('  \n', ' ') for item in safety_comments["gpt_output"]]
# replace all the commas with spaces
safety_comments["gpt_output"] = [str(item).replace(',', ' ') for item in safety_comments["gpt_output"]]

safety_comments["gpt_output"].unique() # notice some have a space and some dont 

array(['1  3', '1  7', '1  4', '1  2', '1  5', '1 2', '1 3', '1  6',
       '1 6', '1  1', '1  8', '1 4', '1 5', '1  9', '1 8', '1 7', '1 9',
       '1 10', '1 1', '1  10'], dtype=object)

In [9]:
# split the column on commas
split_scores = [entry.split(' ') for entry in safety_comments["gpt_output"]]
split_scores[10:20]

[['1', '', '2'],
 ['1', '', '4'],
 ['1', '', '4'],
 ['1', '', '3'],
 ['1', '', '3'],
 ['1', '', '4'],
 ['1', '', '5'],
 ['1', '2'],
 ['1', '3'],
 ['1', '', '4']]

### Grab and Append Safety Sentiment Scores to Safety Comments

In [10]:
# Grab only the last item of the split lists
sentiments = [int(lst[-1]) for lst in split_scores]

In [11]:
# Create new column in dataframe
safety_comments["sentiment_score"] = sentiments

In [12]:
# Drop gpt output column
safety_comments = safety_comments.drop("gpt_output",axis=1)

## The Final DataFrame!

In [13]:
safety_comments.head()

Unnamed: 0,comment_id,submission_id,author,body,score,year,month,subreddit,type,sentiment_score
21,jadntuu,11e8jyj,tehruben,I used to live at 77H and I both loved and hat...,69,2023,2,2,1,3
28,jadkohn,11e8jyj,Zwicker101,Thats how I felt. Yeah all the people loiterin...,19,2023,2,2,1,7
40,jagbljj,11e8jyj,UnderwhelmingComment,The city bent over backwards to attract wal ma...,5,2023,3,2,1,4
52,jadbu2c,11e8jyj,mr_grission,I live in the building and I've been there nea...,11,2023,2,2,1,7
67,jad6inl,11e8jyj,Zwicker101,Honestly? Probably because of crime,12,2023,2,2,1,2


In [14]:
safety_comments

Unnamed: 0,comment_id,submission_id,author,body,score,year,month,subreddit,type,sentiment_score
21,jadntuu,11e8jyj,tehruben,I used to live at 77H and I both loved and hat...,69,2023,2,2,1,3
28,jadkohn,11e8jyj,Zwicker101,Thats how I felt. Yeah all the people loiterin...,19,2023,2,2,1,7
40,jagbljj,11e8jyj,UnderwhelmingComment,The city bent over backwards to attract wal ma...,5,2023,3,2,1,4
52,jadbu2c,11e8jyj,mr_grission,I live in the building and I've been there nea...,11,2023,2,2,1,7
67,jad6inl,11e8jyj,Zwicker101,Honestly? Probably because of crime,12,2023,2,2,1,2
...,...,...,...,...,...,...,...,...,...,...
29254,eyh4z5x,cwurzh,jbrofford,Did the OP even call 911? Neighbors need to wa...,3,2019,8,2,1,4
29255,eyjy72n,cwurzh,zetuslapetus311,DC needs a more moderate Mayor. Who cares abou...,2,2019,8,2,1,3
29256,eyhlyi5,cwurzh,millennial_bot,"The problem isn't entirely mpd, the courts are...",1,2019,8,2,1,2
29259,eyfn4hy,cwurzh,,"What would you like MPD to do about the PCP, e...",-24,2019,8,2,1,3


In [24]:
# save 
safety_comments.to_csv("../../data/reddit/cleaned_sentiment_data.csv", index=False)