### Code to extract neighborhood from crime data

In [2]:
import pandas as pd
import numpy as np

In [71]:
## open first set of csv coded comments
coded_comments_df = pd.read_csv("./data/reddit/gpt_output_1.csv")
coded_comments_df.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,comment_id,submission_id,author,body,score,year,month,subreddit,type,gpt_score
0,0,0,0,isl1exn,y57cbw,eeek0711,Or Cleveland Park in DC,4,2022,10,0,1,0
1,1,1,1,isl1dig,y57cbw,eeek0711,Takoma Park MD,3,2022,10,0,1,0
2,2,2,2,islh5rt,y57cbw,nonmimeticform,Baltimore,3,2022,10,0,1,0
3,3,3,3,isuelis,y57cbw,dcgirlsmallworld,If you are looking for a quiet neighborhood wi...,3,2022,10,0,1,0
4,4,4,4,isl1grp,y57cbw,eeek0711,I lived in the DeLano Apartments in Woodley Pa...,2,2022,10,0,1,0


In [81]:
## clean dataset
def clean_gpt_output(output_list):

    stripped_list = [str(item).replace('\n', '') for item in output_list]
    filled_list = [entry if entry != '0' else '0 0' for entry in stripped_list]
    cleaned_list = [entry.split()[:2] if len(entry.split()) >= 2 else ['0', '0'] for entry in filled_list]

    df = pd.DataFrame(cleaned_list, columns=['code', 'sentiment'])

    return df

In [83]:
##get gpt_score to list to be cleaned
comments_to_clean_list = coded_comments_df['gpt_score'].tolist()
comments_to_clean_list[5]

'1  \n8'

In [85]:
##clean the coded comments using clean_gpt_output
cleaned_comments_df = clean_gpt_output(comments_to_clean_list)

In [87]:
##show the first few rows of cleaned df
cleaned_comments_df.head()

Unnamed: 0,code,sentiment
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [93]:
##add cleaned df to coded df
cleaned_coded_df = pd.concat([coded_comments_df, cleaned_comments_df], axis = 1)

In [95]:
##show the first 15 entries
cleaned_coded_df.head(15)

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,comment_id,submission_id,author,body,score,year,month,subreddit,type,gpt_score,code,sentiment
0,0,0,0,isl1exn,y57cbw,eeek0711,Or Cleveland Park in DC,4,2022,10,0,1,0,0,0
1,1,1,1,isl1dig,y57cbw,eeek0711,Takoma Park MD,3,2022,10,0,1,0,0,0
2,2,2,2,islh5rt,y57cbw,nonmimeticform,Baltimore,3,2022,10,0,1,0,0,0
3,3,3,3,isuelis,y57cbw,dcgirlsmallworld,If you are looking for a quiet neighborhood wi...,3,2022,10,0,1,0,0,0
4,4,4,4,isl1grp,y57cbw,eeek0711,I lived in the DeLano Apartments in Woodley Pa...,2,2022,10,0,1,0,0,0
5,5,5,5,ita0jq1,y57cbw,samthehaggis,Agreed to Cleveland Park and Woodley Park. The...,1,2022,10,0,1,1 \n8,1,8
6,6,6,6,itxtwh8,y57cbw,Adept-Pension-1312,Mt Pleasant,1,2022,10,0,1,0,0,0
7,7,7,8,iwbywh5,y57cbw,danimaymoj315,"Probably Capitol Hill, Cleveland Park, and Log...",1,2022,11,0,1,0,0,0
8,8,8,9,iszr3e7,y57cbw,zazaza05,"Thank you, very helpful because I was consider...",1,2022,10,0,1,0,0,0
9,9,9,10,i9bctxl,utr8aw,,Too much Fox news,8,2022,5,0,1,0,0,0


In [101]:
##show only needed columns in df
coded_selected_columns = cleaned_coded_df[['comment_id', 'submission_id', 'author', 'body', 'code', 'sentiment']]
coded_selected_columns.head()

Unnamed: 0,comment_id,submission_id,author,body,code,sentiment
0,isl1exn,y57cbw,eeek0711,Or Cleveland Park in DC,0,0
1,isl1dig,y57cbw,eeek0711,Takoma Park MD,0,0
2,islh5rt,y57cbw,nonmimeticform,Baltimore,0,0
3,isuelis,y57cbw,dcgirlsmallworld,If you are looking for a quiet neighborhood wi...,0,0
4,isl1grp,y57cbw,eeek0711,I lived in the DeLano Apartments in Woodley Pa...,0,0


In [109]:
neighborhoods = [
    'Adams Morgan', 'American University Park', 'Anacostia', 'Barnaby Woods', 
    'Capitol Hill', 'Columbia Heights', 'Cleveland Park', 'Dupont', 
    'Foggy Bottom', 'Friendship Heights', 'Georgetown', 'Glover Park', 
    'H Street', 'Logan Circle', 'Mount Pleasant', 'Navy Yard', 
    'NoMa', 'Petworth', 'Shaw', 'Southwest Waterfront', 'Takoma', 
    'Tenleytown', 'The Palisades', 'U Street', 'West End', 'Woodley Park']

In [135]:
##function to apply to the body column
def get_neighborhoods(text):
    matches = [neighborhood for neighborhood in neighborhoods if neighborhood in text]
    return ', '.join(matches) if matches else None

In [137]:
coded_selected_columns['neighborhoods_mentioned'] = coded_selected_columns['body'].apply(get_neighborhoods)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coded_selected_columns['neighborhoods_mentioned'] = coded_selected_columns['body'].apply(get_neighborhoods)


In [141]:
coded_selected_columns.head(15)

Unnamed: 0,comment_id,submission_id,author,body,code,sentiment,neighborhoods_mentioned
0,isl1exn,y57cbw,eeek0711,Or Cleveland Park in DC,0,0,Cleveland Park
1,isl1dig,y57cbw,eeek0711,Takoma Park MD,0,0,Takoma
2,islh5rt,y57cbw,nonmimeticform,Baltimore,0,0,
3,isuelis,y57cbw,dcgirlsmallworld,If you are looking for a quiet neighborhood wi...,0,0,"Capitol Hill, Navy Yard, Shaw"
4,isl1grp,y57cbw,eeek0711,I lived in the DeLano Apartments in Woodley Pa...,0,0,Woodley Park
5,ita0jq1,y57cbw,samthehaggis,Agreed to Cleveland Park and Woodley Park. The...,1,8,"Cleveland Park, Woodley Park"
6,itxtwh8,y57cbw,Adept-Pension-1312,Mt Pleasant,0,0,
7,iwbywh5,y57cbw,danimaymoj315,"Probably Capitol Hill, Cleveland Park, and Log...",0,0,"Capitol Hill, Cleveland Park, Logan Circle"
8,iszr3e7,y57cbw,zazaza05,"Thank you, very helpful because I was consider...",0,0,
9,i9bctxl,utr8aw,,Too much Fox news,0,0,
