### Code to extract neighborhood from crime data

In [105]:
import pandas as pd
import numpy as np
import string 

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [106]:
import warnings
warnings.filterwarnings('ignore')

### Open Data

In [107]:
## open first set of csv coded comments
coded_comments_df = pd.read_csv("../../data/reddit/cleaned_sentiment_data.csv")
coded_comments_df.head()

Unnamed: 0,comment_id,submission_id,author,body,score,year,month,subreddit,type,sentiment_score
0,jadntuu,11e8jyj,tehruben,I used to live at 77H and I both loved and hat...,69,2023,2,2,1,3
1,jadkohn,11e8jyj,Zwicker101,Thats how I felt. Yeah all the people loiterin...,19,2023,2,2,1,7
2,jagbljj,11e8jyj,UnderwhelmingComment,The city bent over backwards to attract wal ma...,5,2023,3,2,1,4
3,jadbu2c,11e8jyj,mr_grission,I live in the building and I've been there nea...,11,2023,2,2,1,7
4,jad6inl,11e8jyj,Zwicker101,Honestly? Probably because of crime,12,2023,2,2,1,2


In [108]:
##show only needed columns in df
coded_selected_columns = coded_comments_df[['comment_id', 'submission_id', 'author', 'body', 'sentiment_score']]
coded_selected_columns.head()

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score
0,jadntuu,11e8jyj,tehruben,I used to live at 77H and I both loved and hat...,3
1,jadkohn,11e8jyj,Zwicker101,Thats how I felt. Yeah all the people loiterin...,7
2,jagbljj,11e8jyj,UnderwhelmingComment,The city bent over backwards to attract wal ma...,4
3,jadbu2c,11e8jyj,mr_grission,I live in the building and I've been there nea...,7
4,jad6inl,11e8jyj,Zwicker101,Honestly? Probably because of crime,2


### Clean body text 

In [109]:
coded_selected_columns["body"] = [x.lower() for x in coded_selected_columns["body"]] # make lowercase for matching

In [110]:
# remove punctuation
# coded_selected_columns["body"] = [x.translate(str.maketrans('', '', string.punctuation)) for x in coded_selected_columns["body"]]

In [111]:
coded_selected_columns["body"] = [x.replace('\n',' ') for x in coded_selected_columns["body"]]

In [112]:
coded_selected_columns.head()

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score
0,jadntuu,11e8jyj,tehruben,i used to live at 77h and i both loved and hat...,3
1,jadkohn,11e8jyj,Zwicker101,thats how i felt. yeah all the people loiterin...,7
2,jagbljj,11e8jyj,UnderwhelmingComment,the city bent over backwards to attract wal ma...,4
3,jadbu2c,11e8jyj,mr_grission,i live in the building and i've been there nea...,7
4,jad6inl,11e8jyj,Zwicker101,honestly? probably because of crime,2


In [113]:
# all identifying neighborhoods/region words
neighborhoods = [
    'Adams Morgan', 'American University Park', 'Anacostia', 'Barnaby Woods', 
    'Capitol Hill', 'Columbia Heights', 'Cleveland Park', 'Dupont', 
    'Foggy Bottom', 'Friendship Heights', 'Georgetown', 'Glover Park', 
    'H Street', 'Logan Circle', 'Mount Pleasant', 'Navy Yard', 
    'NoMa', 'Petworth', 'Shaw', 'Southwest Waterfront', 'Takoma', 
    'Tenleytown', 'The Palisades', 'U Street', 'West End', 'Woodley Park']

abbreviations = ['EOTR','Cheights', ' CH ', 'Admo', 'Downtown', 'central', 
                 ' SW ', ' SE ', ' NE ', 'north east', 'south east', 'north west']

# combine and make lowercase for matching to tokens
search_terms = neighborhoods + abbreviations
search_terms = [x.lower() for x in search_terms]

In [114]:
##function to apply to the body column
def get_neighborhoods(text):
    matches = [term for term in search_terms if term in text]
    return ', '.join(matches) if matches else None

coded_selected_columns['neighborhoods_mentioned'] = coded_selected_columns['body'].apply(get_neighborhoods)

In [115]:
coded_selected_columns

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score,neighborhoods_mentioned
0,jadntuu,11e8jyj,tehruben,i used to live at 77h and i both loved and hat...,3,
1,jadkohn,11e8jyj,Zwicker101,thats how i felt. yeah all the people loiterin...,7,
2,jagbljj,11e8jyj,UnderwhelmingComment,the city bent over backwards to attract wal ma...,4,
3,jadbu2c,11e8jyj,mr_grission,i live in the building and i've been there nea...,7,
4,jad6inl,11e8jyj,Zwicker101,honestly? probably because of crime,2,
...,...,...,...,...,...,...
6440,eyh4z5x,cwurzh,jbrofford,did the op even call 911? neighbors need to wa...,4,
6441,eyjy72n,cwurzh,zetuslapetus311,dc needs a more moderate mayor. who cares abou...,3,
6442,eyhlyi5,cwurzh,millennial_bot,"the problem isn't entirely mpd, the courts are...",2,
6443,eyfn4hy,cwurzh,,"what would you like mpd to do about the pcp, e...",3,


In [116]:
yes_neighborhoods = coded_selected_columns[coded_selected_columns["neighborhoods_mentioned"].apply(lambda x: x is not None)]
yes_neighborhoods.head()

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score,neighborhoods_mentioned
7,jadkvdl,11e8jyj,digitall565,yep. i've seen more go down at stores in sw an...,4,"navy yard, sw"
21,gzixvn7,nljeif,frenchvanilafantasy,my office is located in noma and we received a...,3,noma
23,gzixppj,nljeif,,"i mean, noma has been like this for a while. a...",2,noma
24,gzjrh9w,nljeif,lonliestnumber,i recently moved to noma this year so i wouldn...,3,noma
28,gzjfj9w,nljeif,DJCWick,"been in noma for 3 years, and it's definitely ...",3,noma


In [117]:
no_neighborhoods = coded_selected_columns[[isinstance(x, type(None)) for x in coded_selected_columns["neighborhoods_mentioned"]]]
no_neighborhoods.head()

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score,neighborhoods_mentioned
0,jadntuu,11e8jyj,tehruben,i used to live at 77h and i both loved and hat...,3,
1,jadkohn,11e8jyj,Zwicker101,thats how i felt. yeah all the people loiterin...,7,
2,jagbljj,11e8jyj,UnderwhelmingComment,the city bent over backwards to attract wal ma...,4,
3,jadbu2c,11e8jyj,mr_grission,i live in the building and i've been there nea...,7,
4,jad6inl,11e8jyj,Zwicker101,honestly? probably because of crime,2,


In [118]:
no_neighborhoods.shape

(5005, 6)

### Match None Comments with Threads

In [119]:
threads_all = pd.read_csv("../../data/reddit/original_scraped_data/threads_ALL_12_6.csv")
threads_all.head()

Unnamed: 0.1,Unnamed: 0,submission_id,title,text,year,month,subreddit,type
0,0,y57cbw,"Best quiet, safe but lively neighborhood in DC",I am in my late 30s and haven't lived in DC si...,2022,10,0,0
1,2,32cqd6,Recommendations on safe neighborhoods for a yo...,I'm moving to DC from the South in a few weeks...,2015,4,0,0
2,4,1h3ldb4,1 Month of Free Rent in 770 SQFT. High Rise Ap...,Apartment Takeover Opportunity: Market House i...,2024,11,0,0
3,5,1gs6shr,Spacious studio in Woodley park,Spacious studio available for lease takeover s...,2024,11,0,0
4,6,1fbfdzi,ISO 12 month sublet starting 11-22-24,Hi there I recently relocated for work ( in Ro...,2024,9,0,0


In [120]:
merge_threads = no_neighborhoods.merge(threads_all[["submission_id","text","title"]], on="submission_id", how="left")
merge_threads.head()

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score,neighborhoods_mentioned,text,title
0,jadntuu,11e8jyj,tehruben,i used to live at 77h and i both loved and hat...,3,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st
1,jadkohn,11e8jyj,Zwicker101,thats how i felt. yeah all the people loiterin...,7,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st
2,jagbljj,11e8jyj,UnderwhelmingComment,the city bent over backwards to attract wal ma...,4,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st
3,jadbu2c,11e8jyj,mr_grission,i live in the building and i've been there nea...,7,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st
4,jad6inl,11e8jyj,Zwicker101,honestly? probably because of crime,2,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st


### Clean threads & match with neighborhoods

In [121]:
merge_threads["title_text"] = merge_threads["title"] + " " + merge_threads["text"] # combine title and submission body

In [122]:
merge_threads["title_text"] = [str(x).lower() for x in merge_threads["title_text"]] # make lowercase for matching

In [123]:
merge_threads["title_text"] = [x.replace('\n',' ') for x in merge_threads["title_text"]] # remove newline characters

In [124]:
merge_threads.head()

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score,neighborhoods_mentioned,text,title,title_text
0,jadntuu,11e8jyj,tehruben,i used to live at 77h and i both loved and hat...,3,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...
1,jadkohn,11e8jyj,Zwicker101,thats how i felt. yeah all the people loiterin...,7,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...
2,jagbljj,11e8jyj,UnderwhelmingComment,the city bent over backwards to attract wal ma...,4,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...
3,jadbu2c,11e8jyj,mr_grission,i live in the building and i've been there nea...,7,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...
4,jad6inl,11e8jyj,Zwicker101,honestly? probably because of crime,2,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...


In [125]:
merge_threads['neighborhoods_mentioned_submission'] = merge_threads['title_text'].apply(get_neighborhoods)

In [126]:
merge_threads

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score,neighborhoods_mentioned,text,title,title_text,neighborhoods_mentioned_submission
0,jadntuu,11e8jyj,tehruben,i used to live at 77h and i both loved and hat...,3,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...,h street
1,jadkohn,11e8jyj,Zwicker101,thats how i felt. yeah all the people loiterin...,7,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...,h street
2,jagbljj,11e8jyj,UnderwhelmingComment,the city bent over backwards to attract wal ma...,4,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...,h street
3,jadbu2c,11e8jyj,mr_grission,i live in the building and i've been there nea...,7,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...,h street
4,jad6inl,11e8jyj,Zwicker101,honestly? probably because of crime,2,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...,h street
...,...,...,...,...,...,...,...,...,...,...
5000,eyh4z5x,cwurzh,jbrofford,did the op even call 911? neighbors need to wa...,4,,I witnessed the stabbing on Irving Street last...,I witnessed the stabbing on Irving Street,i witnessed the stabbing on irving street i wi...,
5001,eyjy72n,cwurzh,zetuslapetus311,dc needs a more moderate mayor. who cares abou...,3,,I witnessed the stabbing on Irving Street last...,I witnessed the stabbing on Irving Street,i witnessed the stabbing on irving street i wi...,
5002,eyhlyi5,cwurzh,millennial_bot,"the problem isn't entirely mpd, the courts are...",2,,I witnessed the stabbing on Irving Street last...,I witnessed the stabbing on Irving Street,i witnessed the stabbing on irving street i wi...,
5003,eyfn4hy,cwurzh,,"what would you like mpd to do about the pcp, e...",3,,I witnessed the stabbing on Irving Street last...,I witnessed the stabbing on Irving Street,i witnessed the stabbing on irving street i wi...,


In [127]:
merge_threads_neighborhoods = merge_threads[merge_threads["neighborhoods_mentioned_submission"].apply(lambda x: x is not None)]
merge_threads_neighborhoods.head()

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score,neighborhoods_mentioned,text,title,title_text,neighborhoods_mentioned_submission
0,jadntuu,11e8jyj,tehruben,i used to live at 77h and i both loved and hat...,3,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...,h street
1,jadkohn,11e8jyj,Zwicker101,thats how i felt. yeah all the people loiterin...,7,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...,h street
2,jagbljj,11e8jyj,UnderwhelmingComment,the city bent over backwards to attract wal ma...,4,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...,h street
3,jadbu2c,11e8jyj,mr_grission,i live in the building and i've been there nea...,7,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...,h street
4,jad6inl,11e8jyj,Zwicker101,honestly? probably because of crime,2,,New sign indicates that Walmart on H Street is...,Walmart on H Street Closing on March 31st,walmart on h street closing on march 31st new ...,h street


### Join thread neighborhood data with comment neighborhood data 

In [128]:
yes_neighborhoods

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score,neighborhoods_mentioned
7,jadkvdl,11e8jyj,digitall565,yep. i've seen more go down at stores in sw an...,4,"navy yard, sw"
21,gzixvn7,nljeif,frenchvanilafantasy,my office is located in noma and we received a...,3,noma
23,gzixppj,nljeif,,"i mean, noma has been like this for a while. a...",2,noma
24,gzjrh9w,nljeif,lonliestnumber,i recently moved to noma this year so i wouldn...,3,noma
28,gzjfj9w,nljeif,DJCWick,"been in noma for 3 years, and it's definitely ...",3,noma
...,...,...,...,...,...,...
6416,js5fn86,150nxj6,ThatDudeFromPlaces,pre 2019 my buddies and i would all walk home ...,3,"woodley park, admo"
6424,c5tvhmv,ya0e5,DCBacon,"so the ""scary"" places that you never go to wil...",6,"anacostia, georgetown"
6425,c5tua0p,ya0e5,rasputin777,there don't seem to be any from the actually b...,10,anacostia
6427,c5tx9tg,ya0e5,JohnnyMcDoodle,safer in anacostia? i'm calling bullshit. crim...,2,anacostia


In [129]:
all_neighborhood_data = pd.concat([yes_neighborhoods,merge_threads_neighborhoods])

In [130]:
all_neighborhood_data.columns

Index(['comment_id', 'submission_id', 'author', 'body', 'sentiment_score',
       'neighborhoods_mentioned', 'text', 'title', 'title_text',
       'neighborhoods_mentioned_submission'],
      dtype='object')

In [131]:
all_neighborhood_data = all_neighborhood_data.rename(columns={"body":"comment_text","title_text":"thread_text"})

In [132]:
all_neighborhood_data = all_neighborhood_data.drop(["text","title"],axis=1)

In [133]:
all_neighborhood_data   

Unnamed: 0,comment_id,submission_id,author,comment_text,sentiment_score,neighborhoods_mentioned,thread_text,neighborhoods_mentioned_submission
7,jadkvdl,11e8jyj,digitall565,yep. i've seen more go down at stores in sw an...,4,"navy yard, sw",,
21,gzixvn7,nljeif,frenchvanilafantasy,my office is located in noma and we received a...,3,noma,,
23,gzixppj,nljeif,,"i mean, noma has been like this for a while. a...",2,noma,,
24,gzjrh9w,nljeif,lonliestnumber,i recently moved to noma this year so i wouldn...,3,noma,,
28,gzjfj9w,nljeif,DJCWick,"been in noma for 3 years, and it's definitely ...",3,noma,,
...,...,...,...,...,...,...,...,...
4982,js5ar6m,150nxj6,iidesune,i'd say dc is definitely carrying more than it...,3,,multiple robberies in admo (popville) be safe ...,admo
4983,js57acg,150nxj6,Freezerburn,that implies that i would break the law and ca...,3,,multiple robberies in admo (popville) be safe ...,admo
4984,js6f38b,150nxj6,Deep_Stick8786,until you wear a canada goose in winter. but y...,4,,multiple robberies in admo (popville) be safe ...,admo
4985,js69z1j,150nxj6,,for years the draconian gun laws favored crimi...,2,,multiple robberies in admo (popville) be safe ...,admo


In [134]:
all_neighborhood_data["neighborhoods_mentioned"] = all_neighborhood_data["neighborhoods_mentioned"].fillna(all_neighborhood_data["neighborhoods_mentioned_submission"])

In [137]:
all_neighborhood_data.drop(["author","neighborhoods_mentioned_submission"],axis=1)

Unnamed: 0,comment_id,submission_id,comment_text,sentiment_score,neighborhoods_mentioned,thread_text
7,jadkvdl,11e8jyj,yep. i've seen more go down at stores in sw an...,4,"navy yard, sw",
21,gzixvn7,nljeif,my office is located in noma and we received a...,3,noma,
23,gzixppj,nljeif,"i mean, noma has been like this for a while. a...",2,noma,
24,gzjrh9w,nljeif,i recently moved to noma this year so i wouldn...,3,noma,
28,gzjfj9w,nljeif,"been in noma for 3 years, and it's definitely ...",3,noma,
...,...,...,...,...,...,...
4982,js5ar6m,150nxj6,i'd say dc is definitely carrying more than it...,3,admo,multiple robberies in admo (popville) be safe ...
4983,js57acg,150nxj6,that implies that i would break the law and ca...,3,admo,multiple robberies in admo (popville) be safe ...
4984,js6f38b,150nxj6,until you wear a canada goose in winter. but y...,4,admo,multiple robberies in admo (popville) be safe ...
4985,js69z1j,150nxj6,for years the draconian gun laws favored crimi...,2,admo,multiple robberies in admo (popville) be safe ...


In [138]:
all_neighborhood_data = all_neighborhood_data[['comment_id', 'submission_id', 'comment_text', 'thread_text', 'neighborhoods_mentioned', 'sentiment_score']]

In [140]:
all_neighborhood_data.to_csv("../../data/reddit/gpt_sentiment_with_neighborhoods.csv",index=False)