### Code to extract the neighborhoods mentioned in the crime data

In [5]:
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [6]:
import warnings
warnings.filterwarnings('ignore')

### Open Data

In [7]:
## load the first set of coded comments from csv and preview the top rows
coded_comments_path = "../../data/reddit/cleaned_sentiment_data.csv"
coded_comments_df = pd.read_csv(coded_comments_path)
coded_comments_df.head()

Unnamed: 0,comment_id,submission_id,author,body,score,year,month,subreddit,type,sentiment_score
0,jadntuu,11e8jyj,tehruben,I used to live at 77H and I both loved and hat...,69,2023,2,2,1,3
1,jadkohn,11e8jyj,Zwicker101,Thats how I felt. Yeah all the people loiterin...,19,2023,2,2,1,7
2,jagbljj,11e8jyj,UnderwhelmingComment,The city bent over backwards to attract wal ma...,5,2023,3,2,1,4
3,jadbu2c,11e8jyj,mr_grission,I live in the building and I've been there nea...,11,2023,2,2,1,7
4,jad6inl,11e8jyj,Zwicker101,Honestly? Probably because of crime,12,2023,2,2,1,2


In [8]:
## keep only the columns needed for neighborhood matching
# .copy() makes this an independent frame, so the column assignments in the
# cells below modify it directly instead of writing to a slice of
# coded_comments_df (which pandas flags with SettingWithCopyWarning).
needed_columns = ['comment_id', 'submission_id', 'author', 'body', 'sentiment_score']
coded_selected_columns = coded_comments_df[needed_columns].copy()
coded_selected_columns.head()

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score
0,jadntuu,11e8jyj,tehruben,I used to live at 77H and I both loved and hat...,3
1,jadkohn,11e8jyj,Zwicker101,Thats how I felt. Yeah all the people loiterin...,7
2,jagbljj,11e8jyj,UnderwhelmingComment,The city bent over backwards to attract wal ma...,4
3,jadbu2c,11e8jyj,mr_grission,I live in the building and I've been there nea...,7
4,jad6inl,11e8jyj,Zwicker101,Honestly? Probably because of crime,2


### Clean body text 

In [9]:
# make lowercase for matching; the vectorized .str accessor leaves missing
# (NaN) bodies as NaN instead of raising AttributeError on a float
coded_selected_columns["body"] = coded_selected_columns["body"].str.lower()

In [7]:
# remove punctuation (disabled: the line below is commented out, so punctuation is kept in the body text)
# coded_selected_columns["body"] = [x.translate(str.maketrans('', '', string.punctuation)) for x in coded_selected_columns["body"]]

In [11]:
# turn line breaks into spaces; literal (non-regex) replacement through the
# .str accessor, which skips missing (NaN) bodies instead of crashing on them
coded_selected_columns["body"] = coded_selected_columns["body"].str.replace('\n', ' ', regex=False)

In [12]:
# preview the first rows after cleaning the body text
coded_selected_columns.head(n=5)

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score
0,jadntuu,11e8jyj,tehruben,i used to live at 77h and i both loved and hat...,3
1,jadkohn,11e8jyj,Zwicker101,thats how i felt. yeah all the people loiterin...,7
2,jagbljj,11e8jyj,UnderwhelmingComment,the city bent over backwards to attract wal ma...,4
3,jadbu2c,11e8jyj,mr_grission,i live in the building and i've been there nea...,7
4,jad6inl,11e8jyj,Zwicker101,honestly? probably because of crime,2


In [16]:
# neighborhood / region names to look for in the comment bodies
neighborhoods = [
    'Adams Morgan',
    'American University Park',
    'Anacostia',
    'Barnaby Woods',
    'Capitol Hill',
    'Columbia Heights',
    'Cleveland Park',
    'Dupont',
    'Foggy Bottom',
    'Friendship Heights',
    'Georgetown',
    'Glover Park',
    'H Street',
    'Logan Circle',
    'Mount Pleasant',
    'Navy Yard',
    'NoMa',
    'Petworth',
    'Shaw',
    'Southwest Waterfront',
    'Takoma',
    'Tenleytown',
    'The Palisades',
    'U Street',
    'West End',
    'Woodley Park',
]

# informal names and quadrant codes; the short codes (' CH ', ' SW ', ...)
# carry surrounding spaces, presumably so a plain substring search does not
# hit them inside longer words
abbreviations = [
    'EOTR', 'Cheights', ' CH ', 'Admo', 'Downtown', 'central',
    ' SW ', ' SE ', ' NE ',
    'north east', 'south east', 'north west',
]

# single lowercase list (neighborhoods first, then abbreviations) for matching
search_terms = [term.lower() for term in neighborhoods + abbreviations]

In [17]:
##function to apply to the body column
def get_neighborhoods(text, terms=None):
    """Return the search terms mentioned in ``text`` as one comma-separated string.

    Each term is matched case-insensitively as a whole word or phrase: the
    characters immediately before and after the match must not be word
    characters. This keeps 'shaw' from matching inside 'shawshank', and lets
    short codes such as 'sw' match at the start or end of the text or next to
    punctuation. Whitespace padding around a term (e.g. ' sw ') is stripped
    before matching, so it does not leak into the result.

    Parameters
    ----------
    text : str
        Comment body to search. Any non-string value (e.g. NaN for a missing
        body) yields None.
    terms : iterable of str, optional
        Terms to look for. Defaults to the notebook-level ``search_terms``.

    Returns
    -------
    str or None
        Matched terms (lowercased, stripped, de-duplicated, in ``terms``
        order) joined by ', ', or None when nothing matches.
    """
    if not isinstance(text, str):
        return None
    if terms is None:
        terms = search_terms

    haystack = text.lower()
    found = []
    for term in terms:
        needle = term.strip().lower()
        if not needle or needle in found:
            continue
        # lookarounds instead of \b so terms that start or end with a
        # non-word character would still be handled consistently
        if re.search(r'(?<!\w)' + re.escape(needle) + r'(?!\w)', haystack):
            found.append(needle)
    return ', '.join(found) if found else None

# tag every comment with the search terms found in its body
comment_bodies = coded_selected_columns['body']
coded_selected_columns['neighborhoods_mentioned'] = comment_bodies.apply(get_neighborhoods)

In [18]:
# preview the first 15 rows with the new neighborhoods_mentioned column
coded_selected_columns.head(n=15)

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score,neighborhoods_mentioned
0,jadntuu,11e8jyj,tehruben,i used to live at 77h and i both loved and hat...,3,
1,jadkohn,11e8jyj,Zwicker101,thats how i felt. yeah all the people loiterin...,7,
2,jagbljj,11e8jyj,UnderwhelmingComment,the city bent over backwards to attract wal ma...,4,
3,jadbu2c,11e8jyj,mr_grission,i live in the building and i've been there nea...,7,
4,jad6inl,11e8jyj,Zwicker101,honestly? probably because of crime,2,
5,jage8el,11e8jyj,UnderwhelmingComment,dc didn’t hold up its end of the bargain to ke...,3,
6,jae14cj,11e8jyj,internSam,oh boy you must be new here. it’s because mo...,2,
7,jadkvdl,11e8jyj,digitall565,yep. i've seen more go down at stores in sw an...,4,"navy yard, sw"
8,jagcwvq,11e8jyj,UnderwhelmingComment,this is not some big mystery or corporate cons...,4,
9,jagesvp,11e8jyj,NorseTikiBar,"explain, in detail, how you think that ""keepin...",4,


### Look at comments with no neighborhood match (None)

In [19]:
# rows whose neighborhoods_mentioned value is None, i.e. no search term matched
no_match_mask = [value is None for value in coded_selected_columns["neighborhoods_mentioned"]]
coded_selected_columns[no_match_mask]

Unnamed: 0,comment_id,submission_id,author,body,sentiment_score,neighborhoods_mentioned
0,jadntuu,11e8jyj,tehruben,i used to live at 77h and i both loved and hat...,3,
1,jadkohn,11e8jyj,Zwicker101,thats how i felt. yeah all the people loiterin...,7,
2,jagbljj,11e8jyj,UnderwhelmingComment,the city bent over backwards to attract wal ma...,4,
3,jadbu2c,11e8jyj,mr_grission,i live in the building and i've been there nea...,7,
4,jad6inl,11e8jyj,Zwicker101,honestly? probably because of crime,2,
...,...,...,...,...,...,...
6440,eyh4z5x,cwurzh,jbrofford,did the op even call 911? neighbors need to wa...,4,
6441,eyjy72n,cwurzh,zetuslapetus311,dc needs a more moderate mayor. who cares abou...,3,
6442,eyhlyi5,cwurzh,millennial_bot,"the problem isn't entirely mpd, the courts are...",2,
6443,eyfn4hy,cwurzh,,"what would you like mpd to do about the pcp, e...",3,


In [31]:
### We can try to match them with the thread body - might not be necessary though