### Code to extract neighborhood from crime data

In [2]:
import pandas as pd
import numpy as np
import string 

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [3]:
import warnings
warnings.filterwarnings('ignore')

### Open Data

In [4]:
## open first set of csv coded comments
coded_comments_df = pd.read_csv("../data/reddit/cleaned_sentiment_data.csv")
coded_comments_df.head()

Unnamed: 0,comment_id,submission_id,author,body,score,year,month,subreddit,type,sentiment_score
0,ita0jq1,y57cbw,samthehaggis,Agreed to Cleveland Park and Woodley Park. The...,1,2022,10,0,1,8
1,i9cnmct,utr8aw,dans_cafe,"Generally speaking, robberies and muggings are...",1,2022,5,0,1,4
2,i9csov5,utr8aw,Ok_Priority_1534,Downtown/Central DC.\n\nAs I'm looking at diff...,1,2022,5,0,1,7
3,i9czvva,utr8aw,dans_cafe,"of course there are. And probably, it's over-...",1,2022,5,0,1,7
4,cqarv2e,32cqd6,AUBlazin,Haha well you have decided to live in the sket...,1,2015,4,0,1,6


In [5]:
##show only needed columns in df
coded_selected_columns = coded_comments_df[['comment_id', 'submission_id', 'author', 'body', 'sentiment_score']]
coded_selected_columns.head()

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score
0,ita0jq1,y57cbw,samthehaggis,Agreed to Cleveland Park and Woodley Park. The...,8
1,i9cnmct,utr8aw,dans_cafe,"Generally speaking, robberies and muggings are...",4
2,i9csov5,utr8aw,Ok_Priority_1534,Downtown/Central DC.\n\nAs I'm looking at diff...,7
3,i9czvva,utr8aw,dans_cafe,"of course there are. And probably, it's over-...",7
4,cqarv2e,32cqd6,AUBlazin,Haha well you have decided to live in the sket...,6


### Clean body text 

In [6]:
coded_selected_columns["body"] = [x.lower() for x in coded_selected_columns["body"]] # make lowercase for matching

In [7]:
# remove punctuation
# coded_selected_columns["body"] = [x.translate(str.maketrans('', '', string.punctuation)) for x in coded_selected_columns["body"]]

In [8]:
coded_selected_columns["body"] = [x.replace('\n',' ') for x in coded_selected_columns["body"]]

In [9]:
coded_selected_columns.head()

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score
0,ita0jq1,y57cbw,samthehaggis,agreed to cleveland park and woodley park. the...,8
1,i9cnmct,utr8aw,dans_cafe,"generally speaking, robberies and muggings are...",4
2,i9csov5,utr8aw,Ok_Priority_1534,downtown/central dc. as i'm looking at differ...,7
3,i9czvva,utr8aw,dans_cafe,"of course there are. and probably, it's over-...",7
4,cqarv2e,32cqd6,AUBlazin,haha well you have decided to live in the sket...,6


In [10]:
# all identifying neighborhoods/region words
neighborhoods = [
    'Adams Morgan', 'American University Park', 'Anacostia', 'Barnaby Woods', 
    'Capitol Hill', 'Columbia Heights', 'Cleveland Park', 'Dupont', 
    'Foggy Bottom', 'Friendship Heights', 'Georgetown', 'Glover Park', 
    'H Street', 'Logan Circle', 'Mount Pleasant', 'Navy Yard', 
    'NoMa', 'Petworth', 'Shaw', 'Southwest Waterfront', 'Takoma', 
    'Tenleytown', 'The Palisades', 'U Street', 'West End', 'Woodley Park']

abbreviations = ['EOTR','Cheights', ' CH ', 'Admo', 'Downtown', 'central', 
                 'SW', 'SE', 'NE', 'north east', 'south east', 'north west']

# combine and make lowercase for matching to tokens
search_terms = neighborhoods + abbreviations
search_terms = [x.lower() for x in search_terms]

In [None]:
##function to apply to the body column
def get_neighborhoods(text):
    matches = [term for term in search_terms if term in text]
    return ', '.join(matches) if matches else None

coded_selected_columns['neighborhoods_mentioned'] = coded_selected_columns['body'].apply(get_neighborhoods)

In [13]:
coded_selected_columns.head(15)

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score,neighborhoods_mentioned
0,ita0jq1,y57cbw,samthehaggis,agreed to cleveland park and woodley park. the...,8,"cleveland park, woodley park"
1,i9cnmct,utr8aw,dans_cafe,"generally speaking, robberies and muggings are...",4,"h street, takoma, ne"
2,i9csov5,utr8aw,Ok_Priority_1534,downtown/central dc. as i'm looking at differ...,7,"downtown, central, se, ne"
3,i9czvva,utr8aw,dans_cafe,"of course there are. and probably, it's over-...",7,"sw, se, ne"
4,cqarv2e,32cqd6,AUBlazin,haha well you have decided to live in the sket...,6,"capitol hill, noma, se, ne"
5,cg6zt92,20t3z6,,unfortunately mpd is notorious for underreport...,3,"se, ne"
6,cg7mdys,20t3z6,AUBlazin,i completely understand and believe what you a...,5,
7,k65n8fq,17ed6rg,Brinzy,"i am not a super nature-y person myself, so ym...",7,"sw, se, ne"
8,k65oqmg,17ed6rg,Snowbold,i had this same issue while finding a place. i...,7,
9,k6rdhmy,17ed6rg,mm10102,woodley park and cleveland park have one beds ...,7,"cleveland park, woodley park, se, ne"


### Look at none comments

In [30]:
coded_selected_columns[[isinstance(x, type(None)) for x in coded_selected_columns["neighborhoods_mentioned"]]]

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score,neighborhoods_mentioned
6,cg7mdys,20t3z6,AUBlazin,i completely understand and believe what you a...,5,
8,k65oqmg,17ed6rg,Snowbold,i had this same issue while finding a place. i...,7,
40,lzn9uwz,1h2wunn,Kind_Mixture1649,you couldn’t pay me to hang out on u st.,3,
43,lzsvxnw,1h2wunn,Adorable-Golf-2113,the reason is the parents aren't raising their...,4,
50,m01a4b0,1h2wunn,Beginning-Gur4706,the police could stop it. it would be violent ...,3,
...,...,...,...,...,...,...
5334,ie9skxh,vmknzh,Failninjaninja,ending soft on crime policies would be a terri...,3,
5341,jkwfgsj,13mjnq2,EC_dwtn,3d has more carjackings than 7d so far this ye...,4,
5356,eyh30ml,cwurzh,geauxing4broke,i had a similar experience my first week in dc...,3,
5358,cbd8oyx,1jbs1a,Baloncesto,"yeah, that area is okay. you'll be right acros...",6,


In [31]:
### We can try to match them with the thread body - might not be necessary though