In [1]:
# File: crime_words.ipynb -- Record # of Crime Words Per Review
# Author: Shomik Jain
# Date: 2/02/2020

In [20]:
import pandas as pd
import pandas as pd
import numpy as np
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import itertools
from collections import Counter

In [2]:
# Source table: tab-separated, double-quoted fields, backslash as escape character.
file = 'nyc_zipcode_all.tsv'
data = pd.read_csv(
    file,
    sep='\t',
    quotechar='"',
    escapechar='\\',
)

In [6]:
# Placeholder columns; both are overwritten once the per-review counts exist.
# The share column starts out as a float so that storing fractions in it later
# does not require changing the column's dtype.
data['crime_words'] = 0
data['crime_words_perc'] = 0.0

In [9]:
# Preview the first five rows of the table.
data.head(n=5)

Unnamed: 0,year,zipcode,reviews,agg_count,rating,bedrooms,beds,person_capacity,price,reviews_count,...,crime_score,crime_class,gini_class,race_class,age_class,edu_class,streetscore_class,reviews_bow,crime_words_old,crime_words_perc_old
0,2014,1001,We stayed for two nights in mid-August. We we...,12,3.667,3.0,3.0,6.0,250.0,25.0,...,,0,0,0,0,0,0,stay night august littl scar come stair apart ...,2,0.001474
1,2015,1001,I didn't meet Shelly but the apartment was spo...,12,3.75,3.0,3.0,6.0,250.0,25.0,...,,0,0,0,0,0,0,meet shelli apart spotless nice decor bedroom ...,1,0.000737
2,2015,1003,"The apartment was perfect. Very neat, great lo...",4,5.0,1.0,1.0,2.0,275.0,4.0,...,,0,0,0,0,0,0,apart perfect neat great locat ideal want visi...,2,0.001474
3,2016,1121,This is a great apartment. Tom and Jenny made ...,1,5.0,1.0,1.0,4.0,110.0,0.0,...,,0,0,0,0,0,0,great apart jenni feel welcom accommod definit...,0,0.0
4,2017,10000,The host canceled this reservation 3 days befo...,9,3.889,1.0,1.0,2.0,200.0,3.0,...,,0,0,0,0,0,0,host cancel reserv day arriv autom post stay s...,3,0.002211


In [27]:
# Keep only the rows that have a bag-of-words string to count over. The
# explicit copy makes `data` an independent frame, so column assignments made
# on it afterwards do not act on a slice of the unfiltered table
# (avoids pandas' SettingWithCopyWarning).
data = data.loc[data['reviews_bow'].notna()].copy()

# Crime Words

In [13]:
# Crime-related vocabulary, grouped by initial letter.
# Order and casing of the entries are kept exactly as in the original list.
crime_words = [
    # A
    "Abuse", "Accomplice", "Accuse", "Activists", "Against", "Aggravated",
    "assault", "Alarm", "Alert", "Allegation", "Ammunition", "APB", "Armed",
    "Arraignment", "Arrest", "Arsenal", "Arson", "Assailant", "Assault",
    "Attack", "Autopsy",
    # B
    "Bail", "Battery", "Beat", "Blackmail", "Blood", "Bomb", "Brawl",
    "Breach", "Break", "Bribe", "Brutal", "Bully", "Burglary", "Bystander",
    # C
    "Capture", "Caution", "Coercion", "Collusion", "Combat", "Complain",
    "Conspiracy", "Convict", "Cops", "Coroner", "Corrupt", "Counterfeit",
    "CIA", "Crime", "Criminal", "Criminology", "Cuffs", "Custody",
    # D
    "Damage", "Danger", "Dangerous", "Dead", "Death", "Defense", "Deputy",
    "Detain", "Detective", "Disorderly", "Dispatch", "DNA", "Drugs",
    # E
    "Emergency", "Evasive", "Eviction", "Evil", "Explosives", "Extradition",
    # F
    "Fatality", "FBI", "Felony", "Fight", "Fingerprint", "Firebombing",
    "Flee", "Forensics", "Forgery", "Fraud",
    # G
    "Gory", "Guard", "Gun",
    # H
    "Handcuffs", "Harassment", "Homeless", "Harm", "Heinous", "Hijack",
    "Holster", "Homicide", "Hostage",
    # I
    "Illegal", "Immoral", "Immunity", "Impeach", "Imprison", "Incarceration",
    "Incriminating", "Indictment", "Injury", "Inmate", "Intruder", "Invasive",
    "Investigation",
    # J
    "Jail", "Juvenile",
    # K
    "Kidnapping", "Kill", "Killer",
    # L
    "Larceny", "Legal", "Lynch",
    # M
    "Mace", "Malice", "Malpractice", "Manacled", "Manslaughter",
    "Misdemeanor", "Murder", "Murderer",
    # (numeric)
    "911",
    # O
    "Offender", "Offense", "Officer",
    # P
    "Patrol", "Perjury", "Perpetrator", "Plea", "Police", "Prison",
    "Probation", "Prosecute", "Prosecutor", "Prostitution",
    # R
    "Radar", "Rape", "Riot", "Robbery", "Rogue",
    # S
    "Safe", "Sanction", "Sergeant", "Shackles", "Sheriff", "Shooting",
    "Smuggling", "Spying", "Subpoena", "Summons", "Surveillance", "Suspect",
    "Suspicious",
    # T
    "Terrorism", "Theft", "Threatening", "Torture", "Trauma",
    # U
    "Unauthorized", "Unlawful",
    # V
    "Vagrancy", "Vandalism", "Victim", "Violation", "Violence",
    # W
    "Warning", "Weapon",
]

In [14]:
stemmer = SnowballStemmer("english")
# Built once and reused; previously a new lemmatizer was constructed on every call.
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) and the result is
    then passed through the English Snowball stemmer.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    """Tokenize `text` and return the stemmed form of every token that is kept.

    Tokens come from gensim's ``simple_preprocess``. A token is kept only when
    it is longer than three characters and is not in gensim's STOPWORDS set;
    each kept token is passed through ``lemmatize_stemming``.
    """
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in gensim.parsing.preprocessing.STOPWORDS
    ]

In [15]:
# Replace each crime word, in place, with the list of stems preprocess()
# yields for it. Entries that are no longer strings were already converted by
# an earlier run of this cell and are left untouched, so re-running the cell
# is a no-op instead of an error.
crime_words[:] = [
    preprocess(entry) if isinstance(entry, str) else entry
    for entry in crime_words
]

In [18]:
# Flatten the per-word stem lists into a single list. Different entries can
# reduce to the same stem (the source list has both 'assault' and 'Assault',
# for example); dict.fromkeys drops the repeats while keeping first-seen
# order, so a matching review token is not counted once per duplicate.
crime_words_clean = list(dict.fromkeys(itertools.chain.from_iterable(crime_words)))

In [30]:
# For every review, record how many of its tokens are crime-word stems and
# what share of all its tokens that is.
# A set gives constant-time lookups and guarantees each stem is considered
# once, even if crime_words_clean lists it more than once.
crime_stems = set(crime_words_clean)

cw_counts = []
cw_perc_counts = []
for bow_text in data['reviews_bow']:
    bow = bow_text.split()
    total = sum(1 for token in bow if token in crime_stems)

    cw_counts.append(total)
    # A whitespace-only bag of words has no tokens; report 0.0 rather than
    # dividing by zero.
    cw_perc_counts.append(total / len(bow) if bow else 0.0)

In [32]:
# Store the per-review crime-word counts in the `crime_words` column.
data = data.assign(crime_words=cw_counts)

In [33]:
# Store the per-review crime-word share in the `crime_words_perc` column.
# The column is replaced as a whole rather than written into in place: the
# placeholder column holds integers, and in-place writes of fractions into an
# integer column rely on a silent dtype upcast that recent pandas deprecates.
data = data.assign(crime_words_perc=cw_perc_counts)

In [37]:
# Preview the first five rows after the crime-word columns were filled in.
data.head(n=5)

Unnamed: 0,year,zipcode,reviews,agg_count,rating,bedrooms,beds,person_capacity,price,reviews_count,...,streetscore_std,age_25_34,crime_score,crime_class,gini_class,race_class,age_class,edu_class,streetscore_class,reviews_bow
0,2014,1001,We stayed for two nights in mid-August. We we...,12,3.667,3.0,3.0,6.0,250.0,25.0,...,,,,0,0,0,0,0,0,stay night august littl scar come stair apart ...
1,2015,1001,I didn't meet Shelly but the apartment was spo...,12,3.75,3.0,3.0,6.0,250.0,25.0,...,,,,0,0,0,0,0,0,meet shelli apart spotless nice decor bedroom ...
2,2015,1003,"The apartment was perfect. Very neat, great lo...",4,5.0,1.0,1.0,2.0,275.0,4.0,...,,,,0,0,0,0,0,0,apart perfect neat great locat ideal want visi...
3,2016,1121,This is a great apartment. Tom and Jenny made ...,1,5.0,1.0,1.0,4.0,110.0,0.0,...,,,,0,0,0,0,0,0,great apart jenni feel welcom accommod definit...
4,2017,10000,The host canceled this reservation 3 days befo...,9,3.889,1.0,1.0,2.0,200.0,3.0,...,,,,0,0,0,0,0,0,host cancel reserv day arriv autom post stay s...


In [38]:
# Write the table back as TSV, using the same quoting/escaping it was read with.
# NOTE(review): `file` is the path the data was loaded from, so this overwrites
# the input, including the removal of rows without `reviews_bow`. Confirm that
# this is intended before re-running.
tsv_options = dict(sep='\t', quotechar='"', escapechar='\\')
data.to_csv(file, index=False, **tsv_options)