In [1]:
import warnings
warnings.filterwarnings("ignore") 

In [2]:
import os
import re
import sys
import json
import numpy as np
import pandas as pd
from wordcloud import WordCloud

import spacy
from spacy import displacy
from spacy.matcher import Matcher

from nltk.corpus import stopwords

import seaborn as sns
from matplotlib import pyplot as plt

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_colwidth', -1)

SPACE_REGEX = re.compile(r"\s+")
STOP_WORDS = set(stopwords.words('english'))
SEED = 2022
PUNCTUATION_REGEX= re.compile(r"""[?.,\/\\><:;'"\()!%$*|^\~`+#]""")

In [108]:
HOME_DIR = r'C:\ProjectX'

WORKSPACE_ROOT = os.path.join(HOME_DIR, 'workspace')
PATH_TO_PROJECT_X_REPO = os.path.join(WORKSPACE_ROOT, 'project_x')
PATH_TO_DATA_ROOT_DIR = os.path.join(WORKSPACE_ROOT, "data")
PATH_TO_SCRAPED_NEWS = os.path.join(PATH_TO_DATA_ROOT_DIR, "data_providers\gdelt\scraped_news")
PATH_TO_PATTERNS = os.path.join(PATH_TO_PROJECT_X_REPO, r"notebooks\nazar_notebooks\labeling_patterns")

In [103]:
# Add path to library to sys path
generic_utils_lib_dir = os.path.join(PATH_TO_PROJECT_X_REPO, 'common')

sys.path.extend([generic_utils_lib_dir])

#from generic_utils import (downcast_datatypes, timing, create_output_dir, parallelize)

from crime_mapper_utils import crimemapper, map_event_types_from_es_to_en
from level_0_filter_utils import base_filter, base_filter_for_dataframe, create_prodigy_patterns

In [5]:
news_df = pd.read_csv(os.path.join(PATH_TO_SCRAPED_NEWS, 'usa_news_text_body_2022-07-01_2022-10-01.csv'))

In [6]:
news_df.head()

Unnamed: 0,source_name,url,paragraphs,paragraphs_nwords,dateadded,eventrootcode,eventbasecode,eventcode,actiongeo_fullname,actiongeo_countrycode_iso3,v2themes_names,body,cleaned_body
0,www.cbsnews.com,https://www.cbsnews.com/news/law-enforcement-c...,"By Jennifer De Pinto, Fred Backus, Anthony Sal...",781.0,2022-07-01 00:00:00,1.0,12.0,12.0,"Uvalde, Texas, United States",USA,['BAN' 'CRISISLEX_C07_SAFETY' 'CRISISLEX_CRISI...,"By Jennifer De Pinto, Fred Backus, Anthony Sal...","By Jennifer De Pinto, Fred Backus, Anthony Sal..."
1,www.850wftl.com,https://www.850wftl.com/four-dead-three-injure...,"(ENCINAL, Texas) — Four people were killed and...",107.0,2022-07-01 00:00:00,19.0,190.0,190.0,"San Antonio, Texas, United States",USA,['BORDER' 'CRISISLEX_C07_SAFETY' 'CRISISLEX_CR...,"(ENCINAL, Texas) -- Four people were killed an...","(ENCINAL, Texas) -- Four people were killed an..."
2,www.potomaclocal.com,https://www.potomaclocal.com/2022/06/30/manass...,"On Wednesday, June 29, at 1:09 a.m, officers r...",430.0,2022-07-01 00:00:00,4.0,43.0,43.0,"Willacoochee, Georgia, United States",USA,['ARREST' 'CRISISLEX_C03_WELLBEING_HEALTH' 'CR...,"On Wednesday, June 29, at 1:09 a.m, officers r...","On Wednesday, June 29, at 1:09 a.m, officers r..."
3,www.wfae.org,https://www.wfae.org/2022-06-30/hells-angels-f...,"Sonny Barger, the leather-clad figurehead of t...",650.0,2022-07-01 00:00:00,8.0,84.0,84.0,"Altamont, California, United States",USA,['ARMEDCONFLICT' 'ARREST' 'BAN' 'CRISISLEX_C07...,"Sonny Barger, the leather-clad figurehead of t...","Sonny Barger, the leather-clad figurehead of t..."
4,news.yahoo.com,https://news.yahoo.com/exclusive-sophia-roe-ta...,If she’s not cheffing it up in her Apartment M...,1313.0,2022-07-01 00:00:00,19.0,190.0,190.0,United States,USA,['ARMEDCONFLICT' 'CRISISLEX_CRISISLEXREC' 'CRI...,If she's not cheffing it up in her Apartment M...,If she's not cheffing it up in her Apartment M...


In [7]:
mask = news_df['cleaned_body'].isnull()

In [8]:
news_df.shape

(49894, 13)

In [9]:
news_df['source_name'].value_counts()

www.msn.com                    1554
www.foxnews.com                1038
news.yahoo.com                  668
www.dailymail.co.uk             665
www.cbsnews.com                 643
                               ... 
postandparcel.info                1
www.gainesvilleregister.com       1
www.themiddlemarket.com           1
www.boxingnewsonline.net          1
www.q101.com                      1
Name: source_name, Length: 4650, dtype: int64

In [10]:
news_df['paragraphs_nwords'].describe()

count    49894.000000
mean       571.349240
std        368.972172
min         60.000000
25%        280.000000
50%        485.000000
75%        788.000000
max       1702.000000
Name: paragraphs_nwords, dtype: float64

In [11]:
news_df['paragraphs_nwords'].value_counts(bins=10, normalize=True)

(224.2, 388.4]      0.215557
(388.4, 552.6]      0.177697
(58.357, 224.2]     0.173969
(552.6, 716.8]      0.135507
(716.8, 881.0]      0.099270
(881.0, 1045.2]     0.072874
(1045.2, 1209.4]    0.049385
(1209.4, 1373.6]    0.034794
(1373.6, 1537.8]    0.024151
(1537.8, 1702.0]    0.016796
Name: paragraphs_nwords, dtype: float64

In [12]:
col, cut_bin = pd.qcut(news_df['paragraphs_nwords'], q = 10, retbins = True)
col.value_counts().sort_index()

(59.999, 163.0]     5031
(163.0, 244.0]      5030
(244.0, 319.0]      4914
(319.0, 398.0]      5046
(398.0, 485.0]      4963
(485.0, 587.0]      4957
(587.0, 713.0]      5004
(713.0, 878.0]      4991
(878.0, 1120.0]     4977
(1120.0, 1702.0]    4981
Name: paragraphs_nwords, dtype: int64

# Defining keywords

In [13]:
import re
from re import Pattern
from retrie.retrie import Checklist

def construct_distinct_word_regex(word_array) -> Pattern:
    #word_array = [w.lower() for w in word_array]
    word_regex_trie = Checklist(word_array, match_substrings=False, re_flags=re.IGNORECASE)
    return word_regex_trie.compiled

## Harassment

In [14]:
harassment_key_words = [
    'harrassment', 'harassment', 'harrassing', 'harassing', 'harassed', 'harassing communications',
    'harassmen', 'telephone harassment', 
    'threats', 'threat', 'criminal threats', 'verbal threats', 'threat weapon'
    'threatening', 'threatening phone calls', 'communication threats',
    'threatened to kill', 'threatening to kill', 'threatening to shoot', 'threatened to shoot'
    'abused', 'bully', 'bullied', 
    'intimidation', 'intimidating', 'intimidate', 'intimidated', 'intimidation with a dangerous weapon',
    'intimidation premise', 
    'obscenity', 'obsenity exposing',
    'defamation', 'tracking', 'stalking',
    'peeping tom',
    'insult', 'insulting', 'insulted',
    'unlawful exposure',
    'obscene', 'obscene phone calls', 'obscene phone call',
    'armed disturb', 'hurled racist slurs', 'racist slurs', 'racist slur', 'coercion',
    'cursing', 'cursed',
    'menacing', 'racial slurs', 'hate crime',
    'sending threatening text and video messages', 'threatening text',
    'cyberstalking', 'ethnic intimidation', 'racially motivated attack',
    'racist attack',
    
]

In [15]:
len(harassment_key_words) - len(harassment_key_words)%3

54

In [16]:
def pritty_print(key_words):
    s = sorted(set(key_words))
    end = len(s) - len(s)%3
    for i in range(0, end, 3):
        print(f"'{s[i]}', '{s[i+1]}', '{s[i+2]}',")
    print(", ".join([f"'{w}'" for w in s[end:]]))

In [17]:
pritty_print(harassment_key_words)

'armed disturb', 'bullied', 'bully',
'coercion', 'communication threats', 'criminal threats',
'cursed', 'cursing', 'cyberstalking',
'defamation', 'ethnic intimidation', 'harassed',
'harassing', 'harassing communications', 'harassmen',
'harassment', 'harrassing', 'harrassment',
'hate crime', 'hurled racist slurs', 'insult',
'insulted', 'insulting', 'intimidate',
'intimidated', 'intimidating', 'intimidation',
'intimidation premise', 'intimidation with a dangerous weapon', 'menacing',
'obscene', 'obscene phone call', 'obscene phone calls',
'obscenity', 'obsenity exposing', 'peeping tom',
'racial slurs', 'racially motivated attack', 'racist attack',
'racist slur', 'racist slurs', 'sending threatening text and video messages',
'stalking', 'telephone harassment', 'threat',
'threat weaponthreatening', 'threatened to kill', 'threatened to shootabused',
'threatening phone calls', 'threatening text', 'threatening to kill',
'threatening to shoot', 'threats', 'tracking',
'unlawful exposure', 'verb

In [18]:
HARASSMENT_KEY_WORDS_REGEX = construct_distinct_word_regex(harassment_key_words)

s = """
The head of the Royal Navy has ordered an investigation into allegations of bullying and sexual harassment against 
women in the Submarine Service. Several whistleblowers hurled racist slurs peeping tom who served in the fleet told the Daily Mail they faced 
mistreatment from all ranks. Adm Sir Ben Key, the First Sea Lord, called the claims "abhorrent", adding 
"sexual harassment has no place in the Royal Navy and will not be tolerated". "Anyone who is found culpable will 
be held accountable," he said. The allegations, revealed in detail by the Mail, include male crew members 
compiling a list setting out the order in which women would be assaulted in the event of a catastrophic event. unlawful exposure
"""

set(HARASSMENT_KEY_WORDS_REGEX.findall(s))

{'harassment', 'hurled racist slurs', 'peeping tom', 'unlawful exposure'}

## Theft

In [19]:
theft_key_words =[
    'theft', 'theft from person', 'theft from motor vehicle',
    'theft report', 'theft investigation',
    'burglary', 'burglaries', 'burglar', 'burglary alarm',
    'alarm burglary',
    'steal', 'stole', 'stealing',
    'shoplifting', 'shop-lifting', 'shoplift', 'shoplifted',
    'larnecy shoplifting',
    'larceny', 'larnecy from motor vehicle'
    'pocket-picking', 'pickpocketing', 'pickpocket',
    'obtaining a service without payment',
    'obtained a service without payment',
    'making off from a hotel', 'made off from a hotel',
    'restaurant without payment',
    'bar without payment'
]

In [20]:
pritty_print(theft_key_words)

'alarm burglary', 'bar without payment', 'burglar',
'burglaries', 'burglary', 'burglary alarm',
'larceny', 'larnecy from motor vehiclepocket-picking', 'larnecy shoplifting',
'made off from a hotel', 'making off from a hotel', 'obtained a service without payment',
'obtaining a service without payment', 'pickpocket', 'pickpocketing',
'restaurant without payment', 'shop-lifting', 'shoplift',
'shoplifted', 'shoplifting', 'steal',
'stealing', 'stole', 'theft',
'theft from motor vehicle', 'theft from person', 'theft investigation',
'theft report'


In [21]:
THEFT_KEY_WORDS_REGEX = construct_distinct_word_regex(theft_key_words)

s = """
A 33-year-old man was arrested on Saturday morning on the Limassol – Nicosia motorway, to facilitate police investigations into cases of house burglary 
and theft. In a written statement police stated that at around 8.20am on Saturday, traffic police, who were conducting a traffic control on the 
Limassol-Nicosia motorway close to Mari, stopped a driver for inspection having noticed that his car did not have a registration plate at the front. 
industrial espionage 
"""

set(THEFT_KEY_WORDS_REGEX.findall(s))

{'burglary', 'theft'}

## Robbery

In [22]:
robbery_key_words = [
    'snatching', 'purse-snatching', 'snatch', 'snatched',
    'robbery', 'robbers', 'robberies', 'robbing', 
    'robber', 'robbed', 'armed robbery', 'violent robbery',
    'robbery premise', 'robery armed', 'robery unarmed',
    'holdup robbery', 'robbery commercial',
    'housebreaking',
    'break and enter', 'break-and-enter', 'breaking entering force'
    'mugging', 'mugged',
    'home invasion', 'ransacking', 'ransacked',
]

In [23]:
pritty_print(robbery_key_words)

'armed robbery', 'break and enter', 'break-and-enter',
'breaking entering forcemugging', 'holdup robbery', 'home invasion',
'housebreaking', 'mugged', 'purse-snatching',
'ransacked', 'ransacking', 'robbed',
'robber', 'robberies', 'robbers',
'robbery', 'robbery commercial', 'robbery premise',
'robbing', 'robery armed', 'robery unarmed',
'snatch', 'snatched', 'snatching',
'violent robbery'


In [24]:
ROBBERY_KEY_WORDS_REGEX = construct_distinct_word_regex(robbery_key_words)

s = """
A teenager who was out on two previous gun arrests was busted again in Queens on Wednesday for an armed robbery, according to police sources.
Alleged gang member Jeffrey Mendoza mugged, 18, was arrested after he and two friends pistol whipped a person with a 9 mm handgun and stole the victim’s cellphone 
and wallet, sources said. 
"""

set(ROBBERY_KEY_WORDS_REGEX.findall(s))

{'armed robbery', 'mugged'}

## Auto theft

In [25]:
auto_theft_key_words = [
    'carjacking', 'carjackings', 'carjack', 
    'carjacked', 'armed carjacking',
    'hijacking', 'hijackings', 'hijacked', 'hijack',
    'vehicle larceny', 'vehicle theft', 'vehicle grand', 'vehicle crime',
    'vehicle break-in', 'vehicle breackin'
    'stolen vehicle', 'stolen motor vehicle', 'vehicle burglary',
    'auto burglary', 'burglary auto', 
    'gta', 'car jack', 'car theft', 'car was stolen', 'carjacker',
    'carjackers', 'carjack', 'carjacked',
    'steal a car', 'stole a car', 'steal car', 'stole car',  'stolen car',
    'stole his vehicle', 'stolen van', 'stolen pickup truck',
    'stole a vehicle', 'stole another vehicle', 'stole a Chevy truck', 
    'stole another motor vehicle', 'stolen vehicle',
    'stole her car', 'stole his car', 'stole their car', 'stole a truck',
    'theft of a motor vehicle', 'grand theft', 'grand theft auto',
    'commandeered the vehicle', 'motor vehicle theft', 'theft vehicle',
    'theft motor vehicle parts', 
    'vehicle was stolen', 'vehicle was burglarized', 'van robbers',
    'bicycle theft', 'grand larceny bicycle', 'bicycle larnecy', 
    'stolen bicycle', 'auto theft', 'car-theft', 'grand larceny',
    
]

In [26]:
pritty_print(auto_theft_key_words)

'armed carjacking', 'auto burglary', 'auto theft',
'bicycle larnecy', 'bicycle theft', 'burglary auto',
'car jack', 'car theft', 'car was stolen',
'car-theft', 'carjack', 'carjacked',
'carjacker', 'carjackers', 'carjacking',
'carjackings', 'commandeered the vehicle', 'grand larceny',
'grand larceny bicycle', 'grand theft', 'grand theft auto',
'gta', 'hijack', 'hijacked',
'hijacking', 'hijackings', 'motor vehicle theft',
'steal a car', 'steal car', 'stole a Chevy truck',
'stole a car', 'stole a truck', 'stole a vehicle',
'stole another motor vehicle', 'stole another vehicle', 'stole car',
'stole her car', 'stole his car', 'stole his vehicle',
'stole their car', 'stolen bicycle', 'stolen car',
'stolen motor vehicle', 'stolen pickup truck', 'stolen van',
'stolen vehicle', 'theft motor vehicle parts', 'theft of a motor vehicle',
'theft vehicle', 'van robbers', 'vehicle breackinstolen vehicle',
'vehicle break-in', 'vehicle burglary', 'vehicle crime',
'vehicle grand', 'vehicle larceny', 'veh

In [27]:
AUTO_THEFT_KEY_WORDS_REGEX = construct_distinct_word_regex(auto_theft_key_words)

s = """
Ottawa police say three men from Montreal steal a car are facing charges in connection with an alleged car theft in Kanata South.
Officers were called to Gowrie Street and Barra Avenue Tuesday afternoon after someone reported a man carjacked disconnecting a trailer from a vehicle and thought 
it was suspicious, police said in a news release. A similar vehicle was stopped some time later on Highway 417. Police said it was reported stolen. 
"""

set(AUTO_THEFT_KEY_WORDS_REGEX.findall(s))

{'car theft', 'carjacked', 'steal a car'}

## Extortion

In [28]:
extortion_key_words = [
    'extortion', 'extortion threats', 'extortion blackmail premise',
    'extortio', 'extortion investigation', 'extortion blackmail',
    'blackmail',
]

In [29]:
pritty_print(extortion_key_words)

'blackmail', 'extortio', 'extortion',
'extortion blackmail', 'extortion blackmail premise', 'extortion investigation',
'extortion threats'


In [30]:
EXTORTION_KEY_WORDS_REGEX = construct_distinct_word_regex(extortion_key_words)

s = """
The Ludhiana police on Monday arrested a man and his accomplice for allegedly making an extortion call to his ex-employer while posing as a member of the Lawrence Bishnoi gang.
Police said that the man made a threat call to his ex-employer and demanded Rs 30 lakh from him.
The arrested duo was identified as Jatinder Kumar alias Rohit of Salem Tabri, Ludhiana and Harmel Singh of Jalandhar.
"""

set(EXTORTION_KEY_WORDS_REGEX.findall(s))

{'extortion'}

## Kidnapping

In [31]:
kidnapping_key_words = [
    'kidnapping', 'kidnapping abduction premise', 'kidnap', 'kidnapped',
    'kidnapping neighbourhood', 'kidnapping abduction', 'abduction',
    'unlawful restraint', 'unlawful imprisonment', 'bound with duct tape',
    'abduct a newborn baby', 'abduct', 'abduction', 'held hostage',
]

In [32]:
pritty_print(kidnapping_key_words)

'abduct', 'abduct a newborn baby', 'abduction',
'bound with duct tape', 'held hostage', 'kidnap',
'kidnapped', 'kidnapping', 'kidnapping abduction',
'kidnapping abduction premise', 'kidnapping neighbourhood', 'unlawful imprisonment',
'unlawful restraint'


In [33]:
KIDNAPING_KEY_WORD_REGEX = construct_distinct_word_regex(kidnapping_key_words)

s = '''
A German aid worker was freed after being kidnapped in Niger in 2018, German media reported Saturday.
Jorg Lange was first handed over by his captors in northern Mali to Moroccan mediators on Thursday. He was then taken to the German Embassy in the Malian capital of Bamako.
He was kidnapped April 11, 2018, in western Niger, on the border with Mali, by armed men riding a motorcycle.
'''

set(KIDNAPING_KEY_WORD_REGEX.findall(s))

{'kidnapped'}

## Sex offences

In [34]:
sex_offences_key_words = [
    'sex offences', 'sexual battery', 'sex offender',
    'sexual assault', 'sex abuse', 'sex crimes abuse',
    'sexual battery',
    'sex crimes', 'sex crime', 'sexual offender', 'sexual abuse',
    'forcible rape', 'restraining order violation',
    'rape', 'rape force', 'lewd conduct icident number', 
    'forcible fondling', 'sexoff', 'forcible sodomy premise',
    'soliciting', 'sexually assaulted', 'sexually assaulting',
    'molested', 'molesting', 'masturbating in public',
    'lewdness incident', 
]

In [35]:
pritty_print(sex_offences_key_words)

'forcible fondling', 'forcible rape', 'forcible sodomy premise',
'lewd conduct icident number', 'lewdness incident', 'masturbating in public',
'molested', 'molesting', 'rape',
'rape force', 'restraining order violation', 'sex abuse',
'sex crime', 'sex crimes', 'sex crimes abuse',
'sex offences', 'sex offender', 'sexoff',
'sexual abuse', 'sexual assault', 'sexual battery',
'sexual offender', 'sexually assaulted', 'sexually assaulting',
'soliciting'


In [36]:
SEX_OFFENCES_KEY_WORD_REGEX = construct_distinct_word_regex(sex_offences_key_words)

s = '''
Officers investigated 106 reports that included sexual abuse 
and similar crimes against children
'''

set(SEX_OFFENCES_KEY_WORD_REGEX.findall(s))

{'sexual abuse'}

## Vandalism

In [37]:
vandalism_key_words = [
    'vandalism', 'property damage', 'criminal damage',
    'destruction property', 'malicious mischief', 'destruct property',
    'arson', 'graffiti', 'vandalized'
]

In [38]:
pritty_print(vandalism_key_words)

'arson', 'criminal damage', 'destruct property',
'destruction property', 'graffiti', 'malicious mischief',
'property damage', 'vandalism', 'vandalized',



In [39]:
VANDALISM_KEY_WORDS_REGEX = construct_distinct_word_regex(vandalism_key_words)

s = '''
A B.C. man is facing charges, including one count of arson, after 
allegedly placing a jerry can inside a retail store and lighting it 
on fire. Police in the small community of Kaslo say the incident 
happened on Dec. 5, when a man was found to be causing a disturbance.
According to the RCMP, the man was yelling and swearing inside the 
business that morning, with staff advising police they were afraid 
of him.
'''

set(VANDALISM_KEY_WORDS_REGEX.findall(s))

{'arson'}

## Trafficking of Illegal Goods

In [40]:
trafficking_illegal_goods = [
    'buy narcotics', 'sell narcotics', 'import narcotics', 
    'possess narcotics', 'drug dealing', 'drugs dealing',
    'drug violations', 'drugs violations', 'drugs dealer', 'drug dealer',
    'drugs dealers', 'drug dealers',
    'narcotics manufacture', 'drug problem', 'drugs problem',
    'drug deal', 
]

In [41]:
pritty_print(trafficking_illegal_goods)

'buy narcotics', 'drug deal', 'drug dealer',
'drug dealers', 'drug dealing', 'drug problem',
'drug violations', 'drugs dealer', 'drugs dealers',
'drugs dealing', 'drugs problem', 'drugs violations',
'import narcotics', 'narcotics manufacture', 'possess narcotics',
'sell narcotics'


In [42]:
TRAFFICKING_ILLEGAL_GOODS_REGEX = construct_distinct_word_regex(trafficking_illegal_goods)

s = '''
More and more students are becoming drug dealers. This was stated 
by Deputy Chief Prosecutor and Director of the National Investigation 
Service Borislav Sarafov at a joint briefing with Interior 
Minister Ivan Demerdzhiev and Chief Secretary of the Ministry of 
Internal Affairs Petar Todorov.
'''

set(TRAFFICKING_ILLEGAL_GOODS_REGEX.findall(s))

{'drug dealers'}

## Fraud

In [43]:
fraud_key_words = [
    'fraud', 'credit card fraud', 'fraud incident', 'fraud calls',
    'fraud credit card', 'fare evasion', 'false use anothers identity',
    'forgery', 'delayed forgery', 'false pretenses', 'false swindle', 
    'identity theft', 'deceptive practice', 'financial identity theft',
    'counterfeting', 'credit card debit abuse', 'credit card abuse',
    'embezzlement', 'impersonated'
]

In [44]:
pritty_print(fraud_key_words)

'counterfeting', 'credit card abuse', 'credit card debit abuse',
'credit card fraud', 'deceptive practice', 'delayed forgery',
'embezzlement', 'false pretenses', 'false swindle',
'false use anothers identity', 'fare evasion', 'financial identity theft',
'forgery', 'fraud', 'fraud calls',
'fraud credit card', 'fraud incident', 'identity theft',
'impersonated'


In [45]:
FRAUD_KEY_WORDS_REGEX = construct_distinct_word_regex(fraud_key_words)

s = '''
Jacksonville Police looking for credit card fraud suspect
Jacksonville FL — On Monday, Jacksonville Police released photos of 
someone accused of using stolen credit cards in the Avondale area. 
Police say it’s investigating an auto burglary that happened on 
Pinegrove Avenue and during the incident, several items were stolen 
including the victim’s credit and debit cards. Police say the suspect 
has a Celtic Cross with a heart tattoo on his upper left wrist.
'''

set(FRAUD_KEY_WORDS_REGEX.findall(s))

{'credit card fraud'}

## Organised crime

In [46]:
organised_crime_key_words = [
    'criminal conspiracy', 'conspiracy commit crime', 'gang related',
    'engaging organized criminal activity', 'organized criminal activity',
    'street gang', 'operating gambling', 'promoting gambling', 'assisting gambling',
    'gang activity', 'attemp conspiracy penalties', 'gang', 'money laundering',
    'weapon law violations',
]

In [47]:
pritty_print(organised_crime_key_words)

'assisting gambling', 'attemp conspiracy penalties', 'conspiracy commit crime',
'criminal conspiracy', 'engaging organized criminal activity', 'gang',
'gang activity', 'gang related', 'money laundering',
'operating gambling', 'organized criminal activity', 'promoting gambling',
'street gang', 'weapon law violations'


In [48]:
ORGANISED_CRIME_KEY_WORDS_REGEX = construct_distinct_word_regex(organised_crime_key_words)

s = """
House must act on whistleblower bill to protect ‘insiders’ who 
report Russian money laundering
"""

set(ORGANISED_CRIME_KEY_WORDS_REGEX.findall(s))

{'money laundering'}

## Terrorist threats

In [49]:
terrorist_threats_key_words = [
    'terroristic threat', 'terrorist threat',
    'terroristic threats', 'terrorist threats',
    'terroristic threatening', 'terrorist threats',
    'terrorizing', 'terroristic threat zone', 'bomb threat',
]

In [50]:
pritty_print(terrorist_threats_key_words)

'bomb threat', 'terrorist threat', 'terrorist threats',
'terroristic threat', 'terroristic threat zone', 'terroristic threatening',
'terroristic threats', 'terrorizing'


In [51]:
TERRORIST_THREATS_KEY_WORDS_REGEX = construct_distinct_word_regex(terrorist_threats_key_words)

s = '''
FRAMINGHAM – On Friday morning, December 9, Framingham High was under a 
shelter-in-place order for about an hour, while Police searched for a 
man who made a threat against the high school.

Friday night, Framingham Police arrested a Framingham man and charged 
him with “making terroristic threats,” said Framingham Police 
spokesperson Lt. Rachel Mickens.

Peter J. Wilson, 46, of 19 Cherry Street in Framingham was arrested 
at 6;23 p.m. on December 9. The police log said he was charged with 
false “bomb threat.”
'''

set(TERRORIST_THREATS_KEY_WORDS_REGEX.findall(s))

{'bomb threat', 'terroristic threats'}

## Disturbance

In [52]:
disturbance_key_words = [
    'disturbance', 'domestic disturbance', 'disturbing peace',
    'civil disturbance', 'disturbance business', 'public disturbence',
    'disturbance neighbour', 'disturb', 'distubance family',
    'noisy party', 'noise disturbance', 'disorderly conduct', 
    'public order', 'disorderly person', 'noise complaint', 
    'loud noise disturbance', 
]

In [53]:
pritty_print(disturbance_key_words)

'civil disturbance', 'disorderly conduct', 'disorderly person',
'distubance family', 'disturb', 'disturbance',
'disturbance business', 'disturbance neighbour', 'disturbing peace',
'domestic disturbance', 'loud noise disturbance', 'noise complaint',
'noise disturbance', 'noisy party', 'public disturbence',
'public order'


In [54]:
DISTURBANCE_KEY_WORDS_REGEX = construct_distinct_word_regex(disturbance_key_words)

s = """
A man was found hiding in a garden in Salford after police were 
called to a 'disturbance'. Police were called to a house on 
Limefield Road at around 10.30pm last night (Monday). Officers and 
the Tactical Dog Unit responded and arrested the suspect.
He was found hiding in a garden of a house, police said. 
The suspect - a man aged in his 50s - was arrested on suspicion of 
burglary and resisting arrest.
"""

set(DISTURBANCE_KEY_WORDS_REGEX.findall(s))

{'disturbance'}

## Suspicious Activity

In [55]:
suspicious_activity_key_words = [
    'suspicious person', 'suspicious activity', 'suspicious vehicle',
    'suspicious situation', 'suspicious circumstances',
    'suspicious priority', 'suspicious perso', 'suspicious incident',
    'suspicious event', 'suspicious subject'
]

In [56]:
pritty_print(suspicious_activity_key_words)

'suspicious activity', 'suspicious circumstances', 'suspicious event',
'suspicious incident', 'suspicious perso', 'suspicious person',
'suspicious priority', 'suspicious situation', 'suspicious subject',
'suspicious vehicle'


In [57]:
SUSPICIOUS_ACTIVITY_KEY_WORDS_REGEX = construct_distinct_word_regex(suspicious_activity_key_words)

s = '''
La Crosse man arrested for 6th OWI, obstruction while possessing fentanyl, 
meth after ‘suspicious activity
'''

set(SUSPICIOUS_ACTIVITY_KEY_WORDS_REGEX.findall(s))

{'suspicious activity'}

## Domestic offenses

In [58]:
domestic_offences_key_words = [
    'domestic dispute', 'domestic violence', 'domestic related',
    'domestic battery', 'domestic assault', 'domestic progress',
    'domestic verbal', 'domestic trouble', 'domestic incident',
    'domestic related',
    'family trouble', 'family dispute',
    'family fight', 'simple assault'
]

In [59]:
pritty_print(domestic_offences_key_words)

'domestic assault', 'domestic battery', 'domestic dispute',
'domestic incident', 'domestic progress', 'domestic related',
'domestic trouble', 'domestic verbal', 'domestic violence',
'family dispute', 'family fight', 'family trouble',
'simple assault'


In [60]:
DOMESTIC_OFFENCES_KEY_WORDS_REGEX = construct_distinct_word_regex(domestic_offences_key_words)

s = '''
‘Holidays always increase the stress’: Portland metro sees rise in 
domestic violence cases
'''

set(DOMESTIC_OFFENCES_KEY_WORDS_REGEX.findall(s))

{'domestic violence'}

## Drug & Alcohol violations

In [61]:
drug_alcohol_violations_key_words = [
    'drugs', 'narcotics', 'drug equipment violations',
    'drug violations', 'drug violation', 'drugs violations', 
    'alcohol violations', 'alcohol violation',
    'narcotics offense', 'disturbance firecrackers',
    'drug paraphernalia', 'narcotics offence', 'narcotic',
    'possess controlled substance', 'drugs narcotics', 'narcotics place',
    'drug case', 'drugs case', 'drug overdose', 'drugs overdose',
    'drug offence', 'drug offences', 'drugs offence', 'drugs offences',
    'possesion marijuana', 'paraphernalia use'
]

In [62]:
pritty_print(drug_alcohol_violations_key_words)

'alcohol violation', 'alcohol violations', 'disturbance firecrackers',
'drug case', 'drug equipment violations', 'drug offence',
'drug offences', 'drug overdose', 'drug paraphernalia',
'drug violation', 'drug violations', 'drugs',
'drugs case', 'drugs narcotics', 'drugs offence',
'drugs offences', 'drugs overdose', 'drugs violations',
'narcotic', 'narcotics', 'narcotics offence',
'narcotics offense', 'narcotics place', 'paraphernalia use',
'possesion marijuana', 'possess controlled substance'


In [63]:
DRUG_ALCOHOL_KEY_WORDS_REGEX = construct_distinct_word_regex(drug_alcohol_violations_key_words)

s = '''Crown and defence far apart on sentencing for drug offences'''

set(DRUG_ALCOHOL_KEY_WORDS_REGEX.findall(s))

{'drug offences'}

## Traffic violations

In [64]:
traffic_violations_key_words = [
    'hit run', 'accident hit run', 'hit-and-run', 'hit and run', 
    'hit-and-run crash', 'hit run crash', 'hit and run crash',
    'intoxicated driver',
    'hit run property', 'impaired driver', 'driving intoxicated',
    'hit run collision', 'dui', 'dui alcohol', 'traffic offense',
    'driving influence premise', 'driving influence', 'impaired driving',
    'drunk driving', 'drunk drive', 'driving impaired', 'drunk driver',
    'motor vehicle violation', 'parking violation', 'reckless driver', 'reckless driving',
    'drag racing', 'traffic-moving violations', 'motor vehicle crash accident',
    'driving violation', 'drunk driver', 'ran a red light', 'ran red light',
    'run a red light', 'run red light', 'street race', 'street races',
    'driving under the influence of alcohol', 
    
]

In [65]:
pritty_print(traffic_violations_key_words)

'accident hit run', 'drag racing', 'driving impaired',
'driving influence', 'driving influence premise', 'driving intoxicated',
'driving under the influence of alcohol', 'driving violation', 'drunk drive',
'drunk driver', 'drunk driving', 'dui',
'dui alcohol', 'hit and run', 'hit and run crash',
'hit run', 'hit run collision', 'hit run crash',
'hit run property', 'hit-and-run', 'hit-and-run crash',
'impaired driver', 'impaired driving', 'intoxicated driver',
'motor vehicle crash accident', 'motor vehicle violation', 'parking violation',
'ran a red light', 'ran red light', 'reckless driver',
'reckless driving', 'run a red light', 'run red light',
'street race', 'street races', 'traffic offense',
'traffic-moving violations'


In [66]:
TRAFFIC_VIOLATIONS_KEY_WORDS_REGEX = construct_distinct_word_regex(traffic_violations_key_words)

s = '''
An Ontario driver is facing impaired driving charges after a 
concerned citizen phoned 911 to report a suspected drunk driver.
In a video posted by York Regional Police, police officers can be 
seen driving towards a vehicle along Highway 407 as a caller details 
that customer had left his establishment “very, very drunk.”
'''

set(TRAFFIC_VIOLATIONS_KEY_WORDS_REGEX.findall(s))

{'drunk driver', 'impaired driving'}

## Trespassing

In [67]:
trespassing_key_words = [
    'trespassing', 'trespass', 'trespasser', 
    'alarm intrusion incident', 'intrusion'
]

In [68]:
TRESPASSING_KEY_WORDS_REGEX = construct_distinct_word_regex(trespassing_key_words)

s = '''
• 8:56 a.m. Longmont police cited a man for trespassing in the 
1300 block of Dry Creek Drive.
• 10:54 a.m. Police took a report of a theft in the 600 block of 
Martin Street. There is no suspect information.
• 5:45 p.m. Officers investigated a vehicle theft in the 1800 block 
of Lefthand Circle.
'''

set(TRESPASSING_KEY_WORDS_REGEX.findall(s))

{'trespassing'}

## Weapon Violations

In [69]:
weapon_violations_key_words = [
    'shots fired', 'possession weapons', 'armed person', 'weapons',
    'sound gunshots', 'shots heard', 'weapon', 'discharging firearm',
    'shooting', 'weapon law violations', 'shotspotter', 'shots fire',
    'brandishing weapon', 'illegally possessed assault rifle',
    'illegally possessed rifle', 'illegally possessed gun',
    "firearm's serial number was obliterated", 'serial number was obliterated',
    
]

In [70]:
pritty_print(weapon_violations_key_words)

'armed person', 'brandishing weapon', 'discharging firearm',
'firearm's serial number was obliterated', 'illegally possessed assault rifle', 'illegally possessed gun',
'illegally possessed rifle', 'possession weapons', 'serial number was obliterated',
'shooting', 'shots fire', 'shots fired',
'shots heard', 'shotspotter', 'sound gunshots',
'weapon', 'weapon law violations', 'weapons',



In [71]:
WEAPON_VIOLATIONS_KEY_WORDS_REGEX = construct_distinct_word_regex(weapon_violations_key_words)

s = '''
Police seek to identify a suspect after shots were fired in Oshawa, 
Ont., over the weekend. Durham Regional Police said on Saturday at 
around 9:30 p.m., officers received a report of an armed person in 
the Nassau Street and King Street West area.
Officers said an “incident” occurred between a male suspect and a 
victim.
'''

set(WEAPON_VIOLATIONS_KEY_WORDS_REGEX.findall(s))

{'armed person'}

## Assault

In [72]:
assault_key_words = [
    'assault', 'assaulting',
    'simple assault', 'common assault', 'aggravated assault',
    'assaulted', 'serious injury', 'serious bodily injury', 
    'injury to', 'injured', 'knifing',
    'stabbing', 'stab', 'stabbed', 'stab wound',
    'attempt to murder', 'attempted murder', 
    'attempts or threats to murder', 'attempted homicide',
    'hurted', 'bodily harm', 'grievous bodily harm', 
    'simply bodily harm',
    'knife wounds', 'brutally beat', 'violently attacked',
    'struck', 'beaten', 'beaten to unconsciousness', 'hit and kick',
    'beating the man', 'beating a man', 'beating man', 'beating men',
]

In [73]:
pritty_print(assault_key_words)

'aggravated assault', 'assault', 'assaulted',
'assaulting', 'attempt to murder', 'attempted homicide',
'attempted murder', 'attempts or threats to murder', 'beaten',
'beaten to unconsciousness', 'beating a man', 'beating man',
'beating men', 'beating the man', 'bodily harm',
'brutally beat', 'common assault', 'grievous bodily harm',
'hit and kick', 'hurted', 'injured',
'injury to', 'knife wounds', 'knifing',
'serious bodily injury', 'serious injury', 'simple assault',
'simply bodily harm', 'stab', 'stab wound',
'stabbed', 'stabbing', 'struck',
'violently attacked'


In [74]:
ASSAULT_KEY_WORDS_REGEX = construct_distinct_word_regex(assault_key_words)

s = """
Police from Serious and Organised Crime Branch have arrested four people following an investigation into a serious assault at Henley Beach earlier this month.
About 10.45pm on Friday 7 October, a number of staff and security guards were assaulted at a hotel on Seaview Road.
Following an investigation, Detectives from Serious and Organised Crime Branch subsequently arrested three men and a woman over the incident. 
"""

set(ASSAULT_KEY_WORDS_REGEX.findall(s))

{'assault', 'assaulted'}

## Homicide

In [75]:
homicide_key_words = [
    'murder', 'assassination', 'homicide', 'lynching', 'manslaughter',
    'genocide', 'kill', 'killed', 'death investigation',
    'fatally shot', 'died after shots fired', 'died after',
    'shot', 'shooting', 'shots', 'deadly shooting'
    'slaying', 'massacre',
    'died of a single gunshot wound', 'died of multiple gunshot wounds',
    'deceased victims', 'were found dead', 'was found dead'
    'investigated as homicides', 'person is dead',
    'fatal head injury', 'declared dead',
    'shooting death', 'was shot to death', 'were shot to death', 
    'did not survive the shooting', 'stabbed to death',
    'dismembered', 'concealment of a body',
    'reported dead by gunshot wound',
    'fatal stabbings', 'fatal stabbing', 'fatally stabed',
    'mutilating a corpse', 'shot dead', 'suffocated',
    'was beheaded', 'beheaded', 
]

In [76]:
pritty_print(homicide_key_words)

'assassination', 'beheaded', 'concealment of a body',
'deadly shootingslaying', 'death investigation', 'deceased victims',
'declared dead', 'did not survive the shooting', 'died after',
'died after shots fired', 'died of a single gunshot wound', 'died of multiple gunshot wounds',
'dismembered', 'fatal head injury', 'fatal stabbing',
'fatal stabbings', 'fatally shot', 'fatally stabed',
'genocide', 'homicide', 'kill',
'killed', 'lynching', 'manslaughter',
'massacre', 'murder', 'mutilating a corpse',
'person is dead', 'reported dead by gunshot wound', 'shooting',
'shooting death', 'shot', 'shot dead',
'shots', 'stabbed to death', 'suffocated',
'was beheaded', 'was found deadinvestigated as homicides', 'was shot to death',
'were found dead', 'were shot to death'


In [77]:
HOMICIDE_KEY_WORDS_REGEX = construct_distinct_word_regex(homicide_key_words)

s = """
Seattle police say they have arrested a 42-year-old man for the double murder of a man and woman in Georgetown.
He was booked into the King County jail for investigation of homicide.
Police posted in their blotter, a 911 caller reported two down subjects in an apartment in the 6100 block of 4th Avenue South. 
"""

set(HOMICIDE_KEY_WORDS_REGEX.findall(s))

{'homicide', 'murder'}

# Key words for non-relevant
## Trials

In [78]:
trials_key_words = [
    'prosecutor', 'prosecutors', 
    'judge', 'jury', 'judges', 'juries', 'Court jury', 
    'Court jury found',
    'trial', 'trials', 'mistrial', 
    'sentenced', 'verdict', 
    'probation', 'probations',
    'allegation', 'allegations',
    'lawsuit', 'lawsuits',
    'settle', 'settlement', 
    'sued', 'convicted',
    'testified',
    'paroled',
    'U.S. Supreme Court decision', 'District Court', 'Supreme Court',
    'U.S. District Court', 'US Department of Justice',
    'Supreme Court of the United States',
    'court-imposed curfew', 'court imposed curfew',
    'juvenile petitions', 'juvenile petition', 'petition filed for voluntary',
    'judge has denied', 'judges have denied',
    'pleaded guilty before U.S. District Court',
    'trial began in', 'filed an ethics complaint against', 'ethics complaint',
    'introduced a resolution', 'jury ruled in favour of', 'announced indictments',
    'judge\'s sentence', 'judge prohibited', 'acting on a request from prosecutors',
    'has been extradited', 'federal judge', 'closely supervised probation', 'special grand jury',
    'Supreme Court', 'has been found guilty', 'have been found guilty',
    'next hearing is set for', 'sentenced to life in prison',
    'execution of a death', 'life sentence', 'opportunity for parole',
    'suspended sentence for', 'deferred sentence for', 'was dismissed against',
    'has been denied parole', 'denied parole', 'was extradited back',
    'were extradicted back', 'was extradicted', 'were extradicted',
    'extradicted', 'schedule to be killed', 'scheduled to be killed',
    'death by lethal injection', 'scheduled to be put to death by',
    'the court entered not guilty pleas', 'not guilty pleas',
    'guilty pleas', 'pleading guilty', 'pleaded guilty', 'Lawmakers in Congress',
    'Presidential Records Act', 'definition of justified homicide',
    'were justified', 'was justified', 'awaits extradition to',
    'with conditions of release', 'no unsupervised contact with a child under 18',
    'could be sentenced up to 10 years', 'sentenced up to 10 years',
    'pleaded not guilty', 'was found guilty', 'were found guilty',
    'found guilty',
]

TRIAL_KEY_WORDS_REGEX = construct_distinct_word_regex(trials_key_words)

s = """
A 25-year-old man has been sentenced to a year in prison for straw purchasing guns in suburban Chicago by Supreme Court of the United States. 
Ismael Sene in 2019 and 2020 bought a 
total of seven handguns from licensed firearms dealers in   and Merrionette Park. During one of the purchases, officials say Sene falsely 
certified that he was the actual buyer. Sene purchased at least one of the guns for a person whom he had reason to believe was a convicted felon, 
officials said. Convicted felons under federal law may not purchase or possess a firearm. Sene in each purchase also falsely claimed he was not
an unlawful drug user when he regularly used cannabis — a violation of federal law. In June, Sene pleaded guilty to making false statements in 
connection with purchasing a firearm. Last Friday, a judge sentenced Sene to a year and a day in federal prison. Advertisement "Straw purchasers,
like the defendant, perpetuate the cycle of violence that is terrorizing this city," Assistant U.S. Attorney James P. Durkin argued in the 
government’s sentencing memorandum. "The tools that drive that mayhem are firearms in the hands of dangerous people who are often legally 
prohibited from purchasing and possessing them.
"""
print(set(TRIAL_KEY_WORDS_REGEX.findall(s)))

s = """
Two men have been charged in connection to a robbery and shooting trials that killed an armored car driver and critically wounded another in Chicago's
Chatham neighborhood Monday morning. Two women and a man were robbed in separate incidents minutes apart Wednesday night in the   neighborhood. 
The women, 27 and 28, were walking on the sidewalk at 9:21 p.m. in the 800 block of West Wolfram Street when two gunmen got out of a red Mazda SUV 
and demanded their belongings, police said. The gunmen took their cellphones and purses, police said. Neither woman was hurt during the incident, 
police said. Less than a half hour later, a 46-year-old man was approached by gunmen as he was entering his building in the 500 block of West 
Stratford Place, police said. They took his cellphones, wallet and a yellow backpack, police said. The man suffered a minor injury to his jaw but 
refused treatment at the scene, police said. The gunmen fled in a red Mazda SUV. Police have not said if they believe the two robberies are connected.
Advertisement Area Three detectives are investigating.
"""

print(set(TRIAL_KEY_WORDS_REGEX.findall(s)))

{'pleaded guilty', 'Convicted', 'judge', 'Supreme Court of the United States', 'convicted', 'sentenced'}
{'trials'}


In [79]:
pritty_print(trials_key_words)

'Court jury', 'Court jury found', 'District Court',
'Lawmakers in Congress', 'Presidential Records Act', 'Supreme Court',
'Supreme Court of the United States', 'U.S. District Court', 'U.S. Supreme Court decision',
'US Department of Justice', 'acting on a request from prosecutors', 'allegation',
'allegations', 'announced indictments', 'awaits extradition to',
'closely supervised probation', 'convicted', 'could be sentenced up to 10 years',
'court imposed curfew', 'court-imposed curfew', 'death by lethal injection',
'deferred sentence for', 'definition of justified homicide', 'denied parole',
'ethics complaint', 'execution of a death', 'extradicted',
'federal judge', 'filed an ethics complaint against', 'found guilty',
'guilty pleas', 'has been denied parole', 'has been extradited',
'has been found guilty', 'have been found guilty', 'introduced a resolution',
'judge', 'judge has denied', 'judge prohibited',
'judge's sentence', 'judges', 'judges have denied',
'juries', 'jury', 'jury ruled

## Car crashes

In [80]:
car_accidents_key_words = [
    'traffic crash', 'car accident', 'car crashed', 'truck crashed', 'crash', 'crashed'
]

CAR_ACCIDENT_KEY_WORDS_REGEX = construct_distinct_word_regex(car_accidents_key_words)

s = """Car crashed A Massachusetts State Police cruiser was involved in a multi-car crash on Route 128 in Needham late Sunday night. At least three vehicles 
were involved in the crash, officials said. It's not clear if anyone was injured in the crash, which remains under investigation Crash.
"""
print(CAR_ACCIDENT_KEY_WORDS_REGEX.findall(s))

['Car crashed', 'crash', 'crash', 'crash', 'Crash']


## Statistics

In [81]:
statistics_key_words = [
    'annual', 'quarterly', 'record number', 'statewide',
    'theft prevention unit', 'based on provisional data', 'provisional data',
    'arrests each year', 'annual death rates', 'homicide unit',
    'spikes in shootings', 'spikes in', 'Police Department homicide',
    'police robbery/homicide detectives', 'Motor vehicle theft is up',
    'overall theft is up', 'Anti-Defamation League', 'lower crime index rate', 'crime index rate',
    'lower property crime index', 'property crime index', '10-year average', 'violent index crimes',
    'number of crimes per 1,000 people', 'number of crimes per 1000 people', 'number of crimes per',
    'over 2 million kids aged 12 to 17 admitted to using drugs',
    'one-in-10', 'one-in-10 female', 'alarming rate of carjackings',
    'citing a rise in gun related crimes', 'crime trends',
    'crime rate per', 'violent crime rose', 'According to the Institute for',
    'in speed-related crashes', 'reported cases',
    'reported in the same period this year', 'reported have decreased',
    
]

STATS_KEY_WORDS_REGEX = construct_distinct_word_regex(statistics_key_words)

s = """
The number of homicides in the United States continued to rise in the first three quarters of 2021, but at a slower pace, 
one year after   that followed  , according to the latest quarterly report published Monday by the Council on Criminal Justice.
A study of homicides in 22 cities during the first nine months of this year showed the number of murders was 4% greater than 
the same period in 2020, with 126 more homicides between January and September, the report says. In the first three quarters of
2020, the number of homicides in the same 22 cities rose by 36% over the same time frame in 2019, according to the report. According
to a  , the number of homicides during the first half of 2021 increased by 16% compared to the same period last year. The number of
homicides in 2020 compared to 2019 rose by 25%, according to an FBI preliminary report, the largest jump since the FBI started releasing
annual homicide figures in the 1960s. The spike in violent crime came as the   swept across the country, millions of people protested 
racial injustice and police brutality following Floyd’s death last year, and the   under the weight of the pandemic. The homicide rate
remained elevated through the summer before decreasing in the fall and winter and then increasing again in the spring and summer of 
this year, the report says
"""

print(set(STATS_KEY_WORDS_REGEX.findall(s)))

{'annual', 'quarterly'}


In [82]:
pritty_print(statistics_key_words)

'10-year average', 'According to the Institute for', 'Anti-Defamation League',
'Motor vehicle theft is up', 'Police Department homicide', 'alarming rate of carjackings',
'annual', 'annual death rates', 'arrests each year',
'based on provisional data', 'citing a rise in gun related crimes', 'crime index rate',
'crime rate per', 'crime trends', 'homicide unit',
'in speed-related crashes', 'lower crime index rate', 'lower property crime index',
'number of crimes per', 'number of crimes per 1,000 people', 'number of crimes per 1000 people',
'one-in-10', 'one-in-10 female', 'over 2 million kids aged 12 to 17 admitted to using drugs',
'overall theft is up', 'police robbery/homicide detectives', 'property crime index',
'provisional data', 'quarterly', 'record number',
'reported cases', 'reported have decreased', 'reported in the same period this year',
'spikes in', 'spikes in shootings', 'statewide',
'theft prevention unit', 'violent crime rose', 'violent index crimes',



## Gun policy

In [83]:
gun_policy_key_words = [
    'gun policy', 'gun measures', 'gun policy measures',
    'gun restrictions', 'gun-related restrictions',
    'gun control advocates', 'assault weapons ban', 'assault weapon ban',
    'gun related restrictions', 'gun owners', 'requirements for carrying a handgun',
    'gun control laws', 'gun control', 'concealed carry permit', 'concealed carry of a firearm'
    'gun free zones', 'gun-free zones', 'tracking gun sales',
    'signed a new gun bill', 'gun bill', 'million guns sold',
    'gun sales went through the roof', 'anti-gun zealots',
    'anti gun zealots', 'gun ownership', 'relationship between state gun ownership rates',
    'gun haters', 'gun laws', 'semi-automatic weapon ban', 'anti-gun groups',
    'stockpiling weapons', 'suspicious weapons sales',
    'tracking large-scale firearms purchases', 
    'FOID card applications', 'regarding FOID card', 'Uvalde shooting',
    'Uvalde highlights', 'law for concealed carry of a firearm',
    'restrict guns', 'Uvalde slaughter', 
]

GUN_POLICY_KEY_WORDS_REGEX = construct_distinct_word_regex(gun_policy_key_words)

s = """
By Jennifer De Pinto, Fred Backus, Anthony Salvanto, Kabir Khanna, \/ CBS News A month after the school shooting 
in , Texans are overwhelmingly critical of law enforcement's response to the shooting, and a majority feel 
it's important to investigate their response. Most Texans are concerned about another mass shooting. 
Texans rate Gov. Abbott's response to Uvalde more negatively than positively. Nearly half of Texans report 
that the Uvalde shooting has spurred them to support some gun restrictions, and there is support in Texas for 
some measures. In backing many potential gun measures, Texas looks much like the nation as a whole. We see 
bipartisan backing for measures like universal background checks and making the minimum age for buying an AR-15 
at least 20 years old. But there are more partisan differences on policies focused on the guns themselves. 
Most Republicans oppose an AR-15 ban in Texas, and more than half oppose a red-flag law, in which a court 
can order the temporary removal of a gun from a person deemed to be a potential danger. More than half also 
disapprove of Abbott's overall job performance, but Abbott still leads Beto O'Rourke by eight points among likely 
voters in the race for governor. Senator John Cornyn is getting mixed marks from his own party on representing 
Texas' interests as it relates to guns, and this is dragging down Cornyn's overall job approval rating, which is j
ust 35% among Texans overall. Almost nine in 10 Texans feel Uvalde law enforcement could have done more to stop the 
shooter. Criticism is widespread across demographic and political groups. Abbott gets negative marks from younger 
Texans, women and Black and Latino people for his handling of the Uvalde shooting. Majorities of both Democrats and 
independents think he's done a bad job, but most in his own party rate his response positively. About eight in 10 
Texans are concerned about more mass shootings in Texas like the one in Uvalde, including almost half who are 
\"very concerned.\" Women express more concern than men do. Latino and Black people in Texas are more likely than 
White people to be very concerned about a mass shooting. For about half of Texans, the shooting at Robb elementary 
has made them more likely to support some gun restrictions. Democrats, whose party has long backed restrictions on 
guns, are particularly likely to say this, but they are joined by three in 10 Republicans who also say that the 
shooting has made them more inclined to favor some measures. Texans broadly support background checks and having a 
minimum age of at least 21. There is majority backing for a \"red flag\" law in Texas and a ban on the AR-15, 
but more division among Texans on these measures. Texans' views on these measures are in line with those of 
"""

print(set(GUN_POLICY_KEY_WORDS_REGEX.findall(s)))

{'Uvalde shooting', 'gun measures', 'gun restrictions'}


In [84]:
pritty_print(gun_policy_key_words)

'FOID card applications', 'Uvalde highlights', 'Uvalde shooting',
'Uvalde slaughter', 'anti gun zealots', 'anti-gun groups',
'anti-gun zealots', 'assault weapon ban', 'assault weapons ban',
'concealed carry of a firearmgun free zones', 'concealed carry permit', 'gun bill',
'gun control', 'gun control advocates', 'gun control laws',
'gun haters', 'gun laws', 'gun measures',
'gun owners', 'gun ownership', 'gun policy',
'gun policy measures', 'gun related restrictions', 'gun restrictions',
'gun sales went through the roof', 'gun-free zones', 'gun-related restrictions',
'law for concealed carry of a firearm', 'million guns sold', 'regarding FOID card',
'relationship between state gun ownership rates', 'requirements for carrying a handgun', 'restrict guns',
'semi-automatic weapon ban', 'signed a new gun bill', 'stockpiling weapons',
'suspicious weapons sales', 'tracking gun sales', 'tracking large-scale firearms purchases',



## Abortion-related news

In [85]:
abortion_key_words = [
    'pro-abortion', 'exceptions for instances of rape',
    'state records provided an abortion', 'provided an abortion',
    'abortion disclosure forms', 'abortion disclosure form',
    "state's new abortion law", 'abortion law', 'abortion advocates',
    'pro-abortion group', 'abortion ban', 'pro-life laws',
    'pro-abortion propaganda', 'pro-life activist', 'abortion decision',
    'outlaws abortion', 'abortion rights', 'right to an abortion',
    'pro-life volunteer', 'law on abortion', 'bill banning abortion',
    'right to abortion'
]

ABORTION_KEY_WORDS_REGEX = construct_distinct_word_regex(abortion_key_words)

s = '''pro-abortion forces broke the bank to convince voters abortion extremism is normal. They failed.'''

set(ABORTION_KEY_WORDS_REGEX.findall(s))

{'pro-abortion'}

In [86]:
pritty_print(abortion_key_words)

'abortion advocates', 'abortion ban', 'abortion decision',
'abortion disclosure form', 'abortion disclosure forms', 'abortion law',
'abortion rights', 'bill banning abortion', 'exceptions for instances of rape',
'law on abortion', 'outlaws abortion', 'pro-abortion',
'pro-abortion group', 'pro-abortion propaganda', 'pro-life activist',
'pro-life laws', 'pro-life volunteer', 'provided an abortion',
'right to abortion', 'right to an abortion', 'state records provided an abortion',
'state's new abortion law'


## Politics related

In [87]:
politics_key_words = [
    'Biden administration', 'President Joe Biden', "this year's election",
    'Democrat opponent', 'steps to stabilise economy', 'political weapon',
    'stronger penalties', 'stronger penalty', 'signed House Bill',
    'constitution was adopted', 'constitution specifies', 'constitutional convention',
    'signed the bill into law', 'Amendment', 'voting laws', 'voting rights restored',
    'municipal elections', 'gross domestic product', 'GDP', 'Republican Party',
    'legislature', 'threats to public health', 'law aimed at',
    'law aimed at cracking down on violent protests', 
    'signed a bill into law', 'Donald Trump', 'legislation',
    'legislation would limit', 'property crimes bill', 'sponsored in the state Senate',
    'Democratic Senator', 'US senators are asking President',
    'Republican Senate candidate', 'President Donald Trump',
    'Donald Trump', 'White Lives Matter', 'anti-mandate group',
    'conspiracy theorists', 'House Bill', 'House Speaker', 
    'against protesters', 'several dozen demonstrators', 'protest in support of',
    'hunger strike', 'anti-transgender healthcare protesters',
    'anti-transgender protesters', 'Senator', 'Senate',
    'mob invasion of the U.S. Capitol', 'U.S. Capitol', 'State Capitol',
    'White House', 'President of the United States',
    
]

POLITICS_KEY_WORDS_REGEX = construct_distinct_word_regex(politics_key_words)

s = 'Biden administration warns of potential influx of migrants immediately after Title 42 ends'

set(POLITICS_KEY_WORDS_REGEX.findall(s))

{'Biden administration'}

In [88]:
pritty_print(politics_key_words)

'Amendment', 'Biden administration', 'Democrat opponent',
'Democratic Senator', 'Donald Trump', 'GDP',
'House Bill', 'House Speaker', 'President Donald Trump',
'President Joe Biden', 'President of the United States', 'Republican Party',
'Republican Senate candidate', 'Senate', 'Senator',
'State Capitol', 'U.S. Capitol', 'US senators are asking President',
'White House', 'White Lives Matter', 'against protesters',
'anti-mandate group', 'anti-transgender healthcare protesters', 'anti-transgender protesters',
'conspiracy theorists', 'constitution specifies', 'constitution was adopted',
'constitutional convention', 'gross domestic product', 'hunger strike',
'law aimed at', 'law aimed at cracking down on violent protests', 'legislation',
'legislation would limit', 'legislature', 'mob invasion of the U.S. Capitol',
'municipal elections', 'political weapon', 'property crimes bill',
'protest in support of', 'several dozen demonstrators', 'signed House Bill',
'signed a bill into law', 'signed t

## Blaze-related news (not arson)

In [89]:
blaze_key_words = [
    'brush fire off', 'brush fire', 'wildland fire', 'wildfire',
    'fire broke out', 'destructive wildfire', 'blaze', 'firefighters battled the fire',
    'firefighters stop the spread', 'Firefighters', 'mopping up a burn area',
    
]

BLAZE_KEY_WORDS_REGEX = construct_distinct_word_regex(blaze_key_words)

s = '''
A brush fire in Deerfield was likely sparked by what a homeowner thought was extinguished coals, the Fire Department said Saturday. 
Crews responded to Upper Road for a fire around noon on Sunday. A neighbor initially made the call after seeing smoke and flames 
in the side yard of a nearby house, firefighters said.
'''

set(BLAZE_KEY_WORDS_REGEX.findall(s))

{'brush fire', 'firefighters'}

In [90]:
pritty_print(blaze_key_words)

'Firefighters', 'blaze', 'brush fire',
'brush fire off', 'destructive wildfire', 'fire broke out',
'firefighters battled the fire', 'firefighters stop the spread', 'mopping up a burn area',
'wildfire', 'wildland fire'


## Film-related news

In [91]:
film_key_words = [
    'actor was shooting', 'tops the Netflix chart', 'tops chart',
    'tops the chart', 'star in the film', 'starred in the film',
    'starred in film', 'star in film', 'Jeffrey Dahmer',
    "Netflix's", 'Netflix', 'starring', 'star as', 'Focused completely on real events',
    'trailers for', 'based on real events', 'science-fiction', 
    'horror genre', 'horror movies',
    'drama', 'first premiered in', 'scene had to be shot with',
    'streaming service', 'docuseries', 'Film Festival', 'anime series', 
    'DC Comics', 'rehearsal', 'live shot', 'shooting for her Hollywood debut film',
    'debut film', 'shooting for film', 'stole the show at the premiere',
    'stole the show', 'at the premiere', 'film revolves around',
    'hit the full interview above', 'album of songs', 'Vogue article',
    'showbiz gossip', 'speaking on a podcast', 
]

FILM_KEY_WORDS_REGEX = construct_distinct_word_regex(film_key_words)

s = 'Watch Margot Robbie Star As Barbie in first trailer for Greta Gerwig adaptation'

set(FILM_KEY_WORDS_REGEX.findall(s))

{'Star As'}

In [92]:
pritty_print(film_key_words)

'DC Comics', 'Film Festival', 'Focused completely on real events',
'Jeffrey Dahmer', 'Netflix', 'Netflix's',
'Vogue article', 'actor was shooting', 'album of songs',
'anime series', 'at the premiere', 'based on real events',
'debut film', 'docuseries', 'drama',
'film revolves around', 'first premiered in', 'hit the full interview above',
'horror genre', 'horror movies', 'live shot',
'rehearsal', 'scene had to be shot with', 'science-fiction',
'shooting for film', 'shooting for her Hollywood debut film', 'showbiz gossip',
'speaking on a podcast', 'star as', 'star in film',
'star in the film', 'starred in film', 'starred in the film',
'starring', 'stole the show', 'stole the show at the premiere',
'streaming service', 'tops chart', 'tops the Netflix chart',
'tops the chart', 'trailers for'


## Weather related news

In [93]:
weather_key_words = [
    'tracking a chance of storms', 'tracking closures',
    'Mostly clear and cool with lows falling into the lower',
    'Mostly clear and cool', 'Mostly sunny with increasing clouds by late afternoon',
    'Mostly sunny', 'increasing clouds', 'weekend starts off warm',
    'tracking low pressure to our east', 'storm tracking',
    'tornado warnings expired', 'tornado warnings', 'tornado warning',
    'storm survey', 'severe winds', 'severe weather', 
    'possibility of a tornado', 'hazardous air quality',
]

WEATHER_KEY_WORDS_REGEX = construct_distinct_word_regex(weather_key_words)

s = """
The National Weather Service in Jackson has issued a Tornado warning until 2:45 p.m. Wednesday:

Northern Forrest County in southeastern Mississippi
Northeastern Lamar County in southeastern Mississippi
Jones County in southeastern Mississippi
“At 1:43 p.m. CST, a severe thunderstorm capable of producing a tornado was located over Hattiesburg, moving northeast at 45 mph,” the NWS said.
"""

print(set(WEATHER_KEY_WORDS_REGEX.findall(s)))



In [94]:
pritty_print(weather_key_words)

'Mostly clear and cool', 'Mostly clear and cool with lows falling into the lower', 'Mostly sunny',
'Mostly sunny with increasing clouds by late afternoon', 'hazardous air quality', 'increasing clouds',
'possibility of a tornado', 'severe weather', 'severe winds',
'tracking closures', 'tracking low pressure to our east', 'weekend starts off warm',



## COVID-19 related news

In [95]:
covid_key_words = [
    'COVID-19 booster shots', 'COVID-19 shots', 'COVID-19 struck',
    'COVID-19 cases',
    'Pfizer shots', 'Pfizer shot', 'shots from Moderna and Pfizer',
    'tracking Covid-19', 'shots for those under five',
    'shots for under five', 'offering shots to children',
    'eligible for the shots', 'vaccine is safe', 'COVID-19 vaccines for children',
    'Covid vaccines', 'Anti-vaccination', 
    'booster shot', 'booster shots', 'COVID-19 booster',
    'primary shots plus a booster', 'primary shots',
    'flu shot', 'flu shots', 
]

COVID_KEY_WORDS_REGEX = construct_distinct_word_regex(covid_key_words)

s = """
Omicron BF.7: Is mixing Covid vaccines more effective against the new subvariant? What experts say
"""

print(set(COVID_KEY_WORDS_REGEX.findall(s)))

{'Covid vaccines'}


In [96]:
pritty_print(covid_key_words)

'Anti-vaccination', 'COVID-19 booster', 'COVID-19 booster shots',
'COVID-19 cases', 'COVID-19 shots', 'COVID-19 struck',
'COVID-19 vaccines for children', 'Covid vaccines', 'Pfizer shot',
'Pfizer shots', 'booster shot', 'booster shots',
'eligible for the shots', 'flu shot', 'flu shots',
'offering shots to children', 'primary shots', 'primary shots plus a booster',
'shots for those under five', 'shots for under five', 'shots from Moderna and Pfizer',
'tracking Covid-19', 'vaccine is safe'


## Other non-relevant news

In [97]:
other_key_words = [
    'injured in a shark attack',
    'radio tracking collar',
    'coins struck', 'struck a panic', 'struck panic', '9/11 videos',
    '9/11 attacks', 
    'army closed areas', 'honors a fallen officer', 'honors officer',
    'irrigation efforts', 'turned to drugs', 'steel intrusion plates',
    'intrusion plates', 'attacked by a large shark', 'flag known patterns of suspicious activity',
    'month investigation of the case', 'non-emergency lockdown',
    'victims and heroes of the 9/11 terrorist attacks', '9/11 attacks', 
    'cut pharmacy robberies', 'time delay safes', 'Coalition Against Sexual Assault',
    'guide for combating bullying', 'twin towers in New York City',
    'September 11th, 2001', 'go up against in the upper bracket final',
    'upper bracket final', 'carrying his squad to victory',
    'winning a round', 'hit match point', 'PETA supporters', 
    'Today we solemnly remember the lives', 'Ground Zero',
    'mourned the victims', 'fight for a livable wage', 'safer working conditions',
    'right to form a union', 'active shooter training session',
    'All fireworks are illegal', 'fireworks use', 'Department of Transportation',
    'project was compiled into a book titled', 'amoeba', 'gored by a bison',
    'offering their condolences', 'Rest in peace', 'in our thoughts and prayers',
    'UFC', 'years since the attack', 'Flowers and a message left in memory of the victims',
    'in memory of the victims', 'realistic combat scenarios for pilots',
    'advanced aerial tactics', 'air combat training',
    'On this day in history', 'September 11, 2001', 'prevent fake reports of emergencies',
    
]

OTHER_KEY_WORDS_REGEX = construct_distinct_word_regex(other_key_words)

s = """
Omicron Boosters: 9 Questions Answered About the Updated COVID-19 Shots injured in a shark attack
"""

print(set(OTHER_KEY_WORDS_REGEX.findall(s)))

{'injured in a shark attack'}


In [98]:
pritty_print(other_key_words)

'9/11 attacks', '9/11 videos', 'All fireworks are illegal',
'Coalition Against Sexual Assault', 'Department of Transportation', 'Flowers and a message left in memory of the victims',
'Ground Zero', 'On this day in history', 'PETA supporters',
'Rest in peace', 'September 11, 2001', 'September 11th, 2001',
'Today we solemnly remember the lives', 'UFC', 'active shooter training session',
'advanced aerial tactics', 'air combat training', 'amoeba',
'army closed areas', 'attacked by a large shark', 'carrying his squad to victory',
'coins struck', 'cut pharmacy robberies', 'fight for a livable wage',
'fireworks use', 'flag known patterns of suspicious activity', 'go up against in the upper bracket final',
'gored by a bison', 'guide for combating bullying', 'hit match point',
'honors a fallen officer', 'honors officer', 'in memory of the victims',
'in our thoughts and prayers', 'injured in a shark attack', 'intrusion plates',
'irrigation efforts', 'month investigation of the case', 'mourned th

In [99]:
prelabeled_df = pd.DataFrame(news_df[['cleaned_body', 'paragraphs_nwords']].dropna())
prelabeled_df.head()

Unnamed: 0,cleaned_body,paragraphs_nwords
0,"By Jennifer De Pinto, Fred Backus, Anthony Sal...",781.0
1,"(ENCINAL, Texas) -- Four people were killed an...",107.0
2,"On Wednesday, June 29, at 1:09 a.m, officers r...",430.0
3,"Sonny Barger, the leather-clad figurehead of t...",650.0
4,If she's not cheffing it up in her Apartment M...,1313.0


In [100]:
prelabeled_df.shape

(49309, 2)

In [101]:
q

NameError: name 'q' is not defined

# Regex prelabeling

In [None]:
relevant_cols = [
    'is_harassment', 'is_theft',
    'is_robbery', 'is_auto_theft',
    'is_assault', 'is_exortion',
    'is_kidnapping', 'is_sex_offences',
    'is_vandalism', 'is_trafficking_illegalgoods',
    'is_fraud', 'is_organised_crime',
    'is_homicide', 'is_terrorist_threats',
    'is_diturbance', 'is_suspicious_activity',
    'is_domestic_offences', 'is_drugalcohol_violations',
    'is_traffic_violations', 'is_trespassing',
    'is_weapon_violations'
]

relevant_regexs = [
    HARASSMENT_KEY_WORDS_REGEX, THEFT_KEY_WORDS_REGEX,
    ROBBERY_KEY_WORDS_REGEX, AUTO_THEFT_KEY_WORDS_REGEX,
    ASSAULT_KEY_WORDS_REGEX, EXTORTION_KEY_WORDS_REGEX,
    KIDNAPING_KEY_WORD_REGEX, SEX_OFFENCES_KEY_WORD_REGEX,
    VANDALISM_KEY_WORDS_REGEX, TRAFFICKING_ILLEGAL_GOODS_REGEX,
    FRAUD_KEY_WORDS_REGEX, ORGANISED_CRIME_KEY_WORDS_REGEX,
    HOMICIDE_KEY_WORDS_REGEX, TERRORIST_THREATS_KEY_WORDS_REGEX,
    DISTURBANCE_KEY_WORDS_REGEX, SUSPICIOUS_ACTIVITY_KEY_WORDS_REGEX,
    DOMESTIC_OFFENCES_KEY_WORDS_REGEX, DRUG_ALCOHOL_KEY_WORDS_REGEX,
    TRAFFIC_VIOLATIONS_KEY_WORDS_REGEX, TRESPASSING_KEY_WORDS_REGEX,
    WEAPON_VIOLATIONS_KEY_WORDS_REGEX
]

for col, regex in zip(relevant_cols, relevant_regexs):
    
    col_name = (col.split('_')[1] if len(col.split('_')) == 2 else '_'.join(col.split('_')[1:])) + '_keywords_body'
    prelabeled_df[col_name] = prelabeled_df['cleaned_body'].str.lower().apply(lambda x: re.sub(PUNCTUATION_REGEX, " ", x) if isinstance(x, str) else x)
    prelabeled_df[col_name] = prelabeled_df[col_name].apply(regex.findall)
    prelabeled_df[col_name] = prelabeled_df[col_name].apply(lambda x: [x.strip() for x in sorted(set(x))])
    
    prelabeled_df[col] = 0
    prelabeled_df.loc[prelabeled_df[col_name].astype(str) != '[]', col] = 1
    #prelabeled_df[col] = prelabeled_df[col].astype(int)
    
    print(prelabeled_df[col].value_counts().head())
    print(prelabeled_df[col_name].astype(str).value_counts().head(5))
    print('\n'+'-'*30+'\n')

In [None]:
non_relevant_cols = [
    'is_trial',
    'is_car_accident',
    'is_stats_news',
    'is_gun_policy_news',
    'is_abortion_news',
    'is_politics_news',
    'is_blaze_news',
    'is_film_news',
    'is_weather_news',
    'is_covid_news',
    'is_other',
]

non_relevant_regexs = [
    TRIAL_KEY_WORDS_REGEX,
    CAR_ACCIDENT_KEY_WORDS_REGEX,
    STATS_KEY_WORDS_REGEX,
    GUN_POLICY_KEY_WORDS_REGEX,
    ABORTION_KEY_WORDS_REGEX,
    POLITICS_KEY_WORDS_REGEX,
    BLAZE_KEY_WORDS_REGEX,
    FILM_KEY_WORDS_REGEX,
    WEATHER_KEY_WORDS_REGEX,
    COVID_KEY_WORDS_REGEX,
    OTHER_KEY_WORDS_REGEX
]

for col, regex in zip(non_relevant_cols, non_relevant_regexs):
    
    col_name = (col.split('_')[1] if len(col.split('_')) == 2 else '_'.join(col.split('_')[1:])) + '_keywords_body'
    prelabeled_df[col_name] = prelabeled_df['cleaned_body'].str.lower().apply(lambda x: re.sub(PUNCTUATION_REGEX, " ", x) if isinstance(x, str) else x)
    prelabeled_df[col_name] = prelabeled_df[col_name].apply(regex.findall)
    prelabeled_df[col_name] = prelabeled_df[col_name].apply(lambda x: [x.strip() for x in sorted(set(x))])
    
    prelabeled_df[col] = 0
    prelabeled_df.loc[prelabeled_df[col_name].astype(str) != '[]', col] = 1
    prelabeled_df[col] = prelabeled_df[col].astype(int)
    
    print(prelabeled_df[col].value_counts().head())
    print(prelabeled_df[col_name].astype(str).value_counts().head(5))
    print('\n'+'-'*30+'\n')

In [None]:
prelabeled_df[non_relevant_cols].apply(any, axis=1).astype(int).value_counts()

In [None]:
prelabeled_df['relevant_pred'] = prelabeled_df[relevant_cols].apply(any, axis=1).astype(int)
prelabeled_df['not-relevant_pred'] = prelabeled_df[non_relevant_cols].apply(any, axis=1).astype(int)

#mask = (prelabeled_df['relevant_pred'] == 1) & (prelabeled_df['not-relevant_pred'] == 1)
#mask.sum()

grouped = prelabeled_df.groupby(by=['relevant_pred', 'not-relevant_pred'], as_index=False)
#grouped_count = grouped_count[['relevant_pred', 'not-relevant_pred', 'cleaned_body']].rename(columns={'cleaned_body': 'count'})
grouped_count = grouped.agg({'cleaned_body': 'count', 'paragraphs_nwords': ['mean', 'median']})
#grouped_count[('cleaned_body', 'count_normalised')] = np.round(100 * grouped_count['cleaned_body'] / grouped_count['cleaned_body'].sum(), 4)
#grouped_count.columns = grouped_count.columns.sort_values()
grouped_count

In [None]:
def categorize(x):
    if np.array_equal(x[['relevant_pred', 'not-relevant_pred']], [1, 1]):
        return (1, 1)
    if np.array_equal(x[['relevant_pred', 'not-relevant_pred']], [1, 0]):
        return (1, 0)
    if np.array_equal(x[['relevant_pred', 'not-relevant_pred']], [0, 1]):
        return (0, 1)
    return (0, 0)

prelabeled_df['category'] = prelabeled_df.apply(categorize, axis=1)

In [None]:
sns.kdeplot(data=prelabeled_df, x='paragraphs_nwords', hue='category')

## Sample news for labeling

In [None]:
q

In [None]:
# this cell is for sampling news grouped by having and not having relevent/not-relevant key words

grouped_df = prelabeled_df.groupby(by=['relevant_pred', 'not-relevant_pred'], as_index=False)
groups = [(0, 1), ]

sample_df = pd.Series()

for group in groups:
    s = grouped_df.get_group(group)#.sample(n=100, random_state=SEED)['cleaned_body']
    s['nword_bin'] = pd.qcut(s['paragraphs_nwords'], q = 10)
    s = s.groupby(by='nword_bin', group_keys=False).apply(lambda x: x.sample(n=5, random_state=SEED))['cleaned_body']
    sample_df = pd.concat([sample_df, s])
    
sample_df = pd.DataFrame(sample_df, columns=['text'])
sample_df = sample_df.sort_values(by='text', key=lambda x: x.apply(len))

print(sample_df.head())
print(sample_df.shape)

In [None]:
# this cell is for sampling news for labeling with specific key words
q

cols_to_sample = [
    'is_fraud', 'is_exortion', 'is_trafficking_illegalgoods',
    'is_terrorist_threats', 'is_organised_crime', 'is_trespassing',
    'is_suspicious_activity', 'is_vandalism',
    'is_drugalcohol_violations', 'is_traffic_violations',
    'is_auto_theft',
    ######
    'is_film_news', 'is_politics_news', 'is_abortion_news',
    'is_gun_policy_news', 'is_stats_news', 'is_other', 'is_blaze_news'
]

sample_df = pd.Series()

for col in cols_to_sample:
    s = prelabeled_df[prelabeled_df[col] == 1].sample(n=5, random_state=SEED+1)['cleaned_body']
    sample_df = pd.concat([sample_df, s])
    
sample_df = pd.DataFrame(sample_df, columns=['text'])
sample_df = sample_df.sort_values(by='text', key=lambda x: x.apply(len))
sample_df.head()

In [None]:
# this cell is for sampling from all columns equal number of samples
q

sample_df = pd.Series()

for col in relevant_cols + non_relevant_cols:
    s = prelabeled_df[prelabeled_df[col] == 1].sample(n=10, random_state=SEED)['cleaned_body']
    sample_df = pd.concat([sample_df, s])
    
sample_df = pd.DataFrame(sample_df, columns=['text'])
sample_df = sample_df.sort_values(by='text', key=lambda x: x.apply(len))
sample_df.head()

In [None]:
suffix = '_not_relevant_key_words'

fpath = os.path.join(PATH_TO_DATA_ROOT_DIR, fr'data_providers\gdelt\scraped_news\level_0_filter_data{suffix}.jsonl')
sample_df.to_json(fpath, orient='records', lines=True)

# Create patterns

In [102]:
nlp = spacy.blank('en')
doc = nlp("9/11 attack shop-lifting Nazar's sister")

for t in doc:
    print(t, t.is_alpha, t.lower_)

9/11 False 9/11
attack True attack
shop True shop
- False -
lifting True lifting
Nazar True nazar
's False 's
sister True sister


In [116]:
test_patterns = create_prodigy_patterns(theft_key_words+other_key_words+weapon_violations_key_words, label="RELEVANT")
with open(os.path.join(PATH_TO_PATTERNS, 'test-patterns.jsonl'), 'w') as file:
    file.write(test_patterns)

In [117]:
nlp = spacy.blank('en')
new_ruler = nlp.add_pipe("entity_ruler").from_disk(os.path.join(PATH_TO_PATTERNS, 'test-patterns.jsonl'))

example = 'Shop-lifting goes up during 9/11 attacks and lot of firearm\'s serial number was obliterated'

doc = nlp(example)
displacy.render(doc, style="ent")

In [None]:
relevant_key_words = [
    harassment_key_words,
    theft_key_words,
    robbery_key_words,
    auto_theft_key_words,
    assault_key_words,
    extortion_key_words,
    kidnapping_key_words,
    sex_offences_key_words,
    vandalism_key_words,
    trafficking_illegal_goods,
    fraud_key_words,
    organised_crime_key_words,
    homicide_key_words,
    terrorist_threats_key_words,
    disturbance_key_words,
    suspicious_activity_key_words,
    domestic_offences_key_words,
    drug_alcohol_violations_key_words,
    traffic_violations_key_words,
    trespassing_key_words,
    weapon_violations_key_words
]

not_relevant_key_words = [
    trials_key_words,
    car_accidents_key_words,
    statistics_key_words,
    gun_policy_key_words,
    abortion_key_words,
    politics_key_words,
    blaze_key_words,
    film_key_words,
    weather_key_words,
    covid_key_words,
    other_key_words
]

PATH_TO_PATTERNS = os.path.join(PATH_TO_PROJECT_X_REPO, r"notebooks\nazar_notebooks\labeling_patterns")

In [None]:
level_0_pattern = ''

for key_words in relevant_key_words:
    for key in key_words:
        if len(key.split()) > 1:
            pattern = []
            for word in key.split():
                pattern.append({'lower' : word.lower()})
            level_0_pattern += json.dumps({'label': 'RELEVANT', 'pattern': pattern}) + '\n'
        else:
            level_0_pattern += json.dumps({'label': 'RELEVANT', 'pattern': [{'lower': key.lower()}]}) + '\n'
            
for key_words in not_relevant_key_words:
    for key in key_words:
        if len(key.split()) > 1:
            pattern = []
            for word in key.split():
                pattern.append({'lower' : word.lower()})
            level_0_pattern += json.dumps({'label': 'NOT-RELEVANT', 'pattern': pattern}) + '\n'
        else:
            level_0_pattern += json.dumps({'label': 'NOT-RELEVANT', 'pattern': [{'lower': key.lower()}]}) + '\n'
            
#level_0_pattern = level_0_pattern.replace("'", '"')
#level_0_pattern = re.sub('(^\')|(\'$)', '\"', level_0_pattern)
print(level_0_pattern)

In [None]:
with open(os.path.join(PATH_TO_PATTERNS, 'relevant-patterns.jsonl'), 'w') as file:
    file.write(level_0_pattern)

In [None]:
print(os.path.join(PATH_TO_PATTERNS, 'relevant-patterns.jsonl'))

# Labeled stats

In [None]:
import seaborn as sns
from sklearn import metrics
    
def evaluate_results(actual, predicted, title='Classifier', threshold=0.5):
    
    if predicted.shape[1] == 2:
        predicted = np.argmax(predicted, axis=1)
    else:
        if not isinstance(predicted, int):
            predicted = np.where(predicted > threshold, 1, 0)
    
    actual = actual.reshape(1, -1).T
    predicted = predicted.reshape(1, -1).T
    
    # Classification report
    print(metrics.classification_report(actual, predicted))
    
    # Confusion matrix
    cf = metrics.confusion_matrix(actual, predicted, normalize=None)
    cf_norm = metrics.confusion_matrix(actual, predicted, normalize='true')
    
    df_cm = pd.DataFrame(cf)
    df_cm_norm = pd.DataFrame(cf_norm)
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,6))
    sns.heatmap(df_cm, annot=True, ax = ax1, annot_kws={"size": 14}, fmt='g')
    ax1.set_title('Unnormalized confusion matrix')
    ax1.set_xlabel('PREDICTED VALUES')
    ax1.set_ylabel('ACTUAL VALUES')

    sns.heatmap(df_cm_norm, annot=True, ax = ax2, annot_kws={"size": 14})
    ax2.set_title('Normalized confusion matrix')
    ax2.set_xlabel('PREDICTED VALUES')
    ax2.set_ylabel('ACTUAL VALUES')

    fig.suptitle(title, fontsize=16)
    plt.show()

In [None]:
fpath = os.path.join(PATH_TO_SCRAPED_NEWS, 'level_0_filter_labeled.jsonl')
df = pd.read_json(fpath, lines=True, encoding='utf-16')
df.head()
#type(df.loc[0, 'accept'])

In [None]:
df['answer'].value_counts()

In [None]:
df[df['answer']=='accept']['accept'].astype(str).value_counts(normalize=True)

### Merge previously labeled news into 6 categories which are relevant

In [None]:
usa_news_path = r'C:\ProjectX\workspace\data\data_providers\gdelt\scraped_news\usa-news-annotated.jsonl'
usa_news_df = pd.read_json(usa_news_path, lines=True, encoding='utf-16')
usa_news_df.head()

In [None]:
accept_mask = (usa_news_df['answer'] == 'accept') & (usa_news_df['accept'].astype(str) != '[]')
#cols = ['text', 'accept']
usa_news_out = usa_news_df[accept_mask]
usa_news_out['accept'] = [['RELEVANT']] * usa_news_out.shape[0]
usa_news_out.head()

In [None]:
usa_news_out.shape

In [None]:
fpath = os.path.join(PATH_TO_DATA_ROOT_DIR, fr'data_providers\gdelt\scraped_news\usa_news_merge_level_0_filter.jsonl')
usa_news_out.to_json(fpath, orient='records', lines=True)

In [None]:
accept_mask = (usa_news_df['answer'] == 'accept') & (usa_news_df['accept'].astype(str) != '[]')
cols = ['text', 'accept']
usa_news_df_prep = usa_news_df.loc[accept_mask, cols]

usa_news_df_prep = usa_news_df_prep.drop_duplicates(subset='text')
usa_news_df_prep['accept'] = usa_news_df_prep['accept'].apply(set).apply(list)
usa_news_df_prep['accept_int'] = (usa_news_df_prep['accept'].astype(str) != '[]').astype(int)
usa_news_df_prep.head()

In [None]:
df_prep = df[df['answer'] == 'accept'][['text', 'accept']]
df_prep['accept_int'] = (df_prep['accept'].astype(str) == "['RELEVANT']").astype(int)
df_prep.head()

In [None]:
t = pd.concat([usa_news_df_prep, df_prep])
t.duplicated(subset='text').sum()

In [None]:
labeled_df = pd.concat([usa_news_df_prep, df_prep])
labeled_df = labeled_df.drop_duplicates(subset='text', keep='last').reset_index(drop=True)
labeled_df['length'] = labeled_df['text'].apply(len)
labeled_df.shape

### Merged stats

In [None]:
labeled_df['accept_int'].value_counts(normalize=True)

In [None]:
sns.kdeplot(data=labeled_df, x='length', hue='accept_int', )

In [None]:
labeled_df.groupby(by='accept_int')['length'].agg(['mean', 'median'])

In [None]:
relevant_cols = [
    'is_harassment', 'is_theft',
    'is_robbery', 'is_auto_theft',
    'is_assault', 'is_exortion',
    'is_kidnapping', 'is_sex_offences',
    'is_vandalism', 'is_trafficking_illegalgoods',
    'is_fraud', 'is_organised_crime',
    'is_homicide', 'is_terrorist_threats',
    'is_diturbance', 'is_suspicious_activity',
    'is_domestic_offences', 'is_drugalcohol_violations',
    'is_traffic_violations', 'is_trespassing',
    'is_weapon_violations'
]

relevant_regexs = [
    HARASSMENT_KEY_WORDS_REGEX, THEFT_KEY_WORDS_REGEX,
    ROBBERY_KEY_WORDS_REGEX, AUTO_THEFT_KEY_WORDS_REGEX,
    ASSAULT_KEY_WORDS_REGEX, EXTORTION_KEY_WORDS_REGEX,
    KIDNAPING_KEY_WORD_REGEX, SEX_OFFENCES_KEY_WORD_REGEX,
    VANDALISM_KEY_WORDS_REGEX, TRAFFICKING_ILLEGAL_GOODS_REGEX,
    FRAUD_KEY_WORDS_REGEX, ORGANISED_CRIME_KEY_WORDS_REGEX,
    HOMICIDE_KEY_WORDS_REGEX, TERRORIST_THREATS_KEY_WORDS_REGEX,
    DISTURBANCE_KEY_WORDS_REGEX, SUSPICIOUS_ACTIVITY_KEY_WORDS_REGEX,
    DOMESTIC_OFFENCES_KEY_WORDS_REGEX, DRUG_ALCOHOL_KEY_WORDS_REGEX,
    TRAFFIC_VIOLATIONS_KEY_WORDS_REGEX, TRESPASSING_KEY_WORDS_REGEX,
    WEAPON_VIOLATIONS_KEY_WORDS_REGEX
]

non_relevant_cols = [
    'is_trial',
    'is_car_accident',
    'is_stats_news',
    'is_gun_policy_news',
    'is_abortion_news',
    'is_politics_news',
    'is_blaze_news',
    'is_film_news',
    'is_weather_news',
    'is_covid_news',
    'is_other',
]

non_relevant_regexs = [
    TRIAL_KEY_WORDS_REGEX,
    CAR_ACCIDENT_KEY_WORDS_REGEX,
    STATS_KEY_WORDS_REGEX,
    GUN_POLICY_KEY_WORDS_REGEX,
    ABORTION_KEY_WORDS_REGEX,
    POLITICS_KEY_WORDS_REGEX,
    BLAZE_KEY_WORDS_REGEX,
    FILM_KEY_WORDS_REGEX,
    WEATHER_KEY_WORDS_REGEX,
    COVID_KEY_WORDS_REGEX,
    OTHER_KEY_WORDS_REGEX
]

In [None]:
for col, regex in zip(relevant_cols+non_relevant_cols, relevant_regexs+non_relevant_regexs):
    
    col_name = (col.split('_')[1] if len(col.split('_')) == 2 else '_'.join(col.split('_')[1:])) + '_keywords_body'
    labeled_df[col_name] = labeled_df['text'].str.lower().apply(lambda x: re.sub(PUNCTUATION_REGEX, " ", x) if isinstance(x, str) else x)
    labeled_df[col_name] = labeled_df[col_name].apply(regex.findall)
    labeled_df[col_name] = labeled_df[col_name].apply(lambda x: [x.strip() for x in sorted(set(x))])
    
    labeled_df[col] = 0
    labeled_df.loc[labeled_df[col_name].astype(str) != '[]', col] = 1
    labeled_df[col] = labeled_df[col].astype(int)
    
    print(labeled_df[col].value_counts().head())
    print(labeled_df[col_name].astype(str).value_counts().head(5))
    print('\n'+'-'*30+'\n')

In [None]:
l = []

for col in relevant_cols:
    s = (labeled_df[col] == 1).sum()
    l.append([col, s])
    
relevent_cols_stats = pd.DataFrame(data=l, columns=['col_name', 'count']).sort_values(by='count')
relevent_cols_stats[relevent_cols_stats['count'] < 10]['col_name'].values

In [None]:
l = []

for col in non_relevant_cols:
    s = (labeled_df[col] == 1).sum()
    l.append([col, s])
    
not_relevent_cols_stats = pd.DataFrame(data=l, columns=['col_name', 'count']).sort_values(by='count')
not_relevent_cols_stats[not_relevent_cols_stats['count'] < 10]['col_name'].values

# Level-0 filter

## Base filter by key words
* If there is only relevant key words than its relevant, otherwise not relevant

In [None]:
labeled_df['relevant_pred'] = labeled_df[relevant_cols].apply(any, axis=1).astype(int)
labeled_df['not-relevant_pred'] = labeled_df[non_relevant_cols].apply(any, axis=1).astype(int)
labeled_df['base-filter'] = np.all(labeled_df[['relevant_pred', 'not-relevant_pred']].values == [1, 0], axis=1)
labeled_df['base-filter'] = labeled_df['base-filter'].astype(int)
labeled_df[['text', 'accept_int', 'base-filter', 'relevant_pred', 'not-relevant_pred']].head(10)

In [None]:
labeled_df['base-filter'] = labeled_df['text'].apply(base_filter)
labeled_df['base-filter'] = (labeled_df['base-filter'] == 'RELEVANT').astype(int)
labeled_df.head()

In [None]:
y_true = labeled_df['accept_int'].values.reshape(-1, 1)
y_pred = labeled_df['base-filter'].values.reshape(-1, 1)

evaluate_results(y_true, y_pred, title='Base-filter')

In [None]:
(labeled_df['accept_int'] == labeled_df['base-filter']).astype(int).mean()

## Analyze false negative and false positives

In [None]:
import spacy
from spacy import displacy

PATH_TO_PATTERNS = os.path.join(PATH_TO_PROJECT_X_REPO, r"notebooks\nazar_notebooks\labeling_patterns\relevant-patterns.jsonl")

nlp = spacy.blank('en')
new_ruler = nlp.add_pipe("entity_ruler").from_disk(PATH_TO_PATTERNS)

false_neg = labeled_df[(labeled_df['accept_int'] == 1) & (labeled_df['base-filter'] == 0)]
print(f"There are {false_neg.shape[0]} false negatives\n")

for idx in false_neg.index:
    #print(false_neg.loc[idx, 'text'])
    #print('-'*150)
    doc = nlp(false_neg.loc[idx, 'text'])
    displacy.render(doc, style="ent")
    print('-'*150)

In [None]:
false_pos = labeled_df[(labeled_df['accept_int'] == 0) & (labeled_df['base-filter'] == 1)]
print(f"There are {false_pos.shape[0]} false positives\n")

for idx in false_pos.index:
    #print(false_neg.loc[idx, 'text'])
    #print('-'*150)
    doc = nlp(false_pos.loc[idx, 'text'])
    displacy.render(doc, style="ent")
    print('-'*150)

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from level_0_filter_utils import (base_filter, base_filter_with_key_words)

In [None]:
base_filter(labeled_df.loc[0, 'text'])

In [None]:
labeled_df.loc[:10, 'text'].apply(base_filter)

In [None]:
res = pd.DataFrame(labeled_df['text'], columns=['text'])
res['base-filter'] = res['text'].apply(base_filter)
res['base-filter'] = (res['base-filter'] == 'RELEVANT').astype(int)
res.head()

In [None]:
labeled_df[['text', 'base-filter']].head()

In [None]:
(labeled_df['text'] == res['text']).value_counts()

In [None]:
(labeled_df['base-filter'] == res['base-filter']).value_counts()

In [None]:
example = labeled_df.loc[29, 'text']

rel = []
not_rel = []

for r in relevant_regexs:
    t = r.findall(example)
    if t:
        rel.append(t)
        
for r in non_relevant_regexs:
    t = r.findall(example)
    if t:
        not_rel.append(t)
        
print(rel)
print(not_rel)

In [None]:
base_filter_with_key_words(labeled_df.loc[29, 'text'])

In [None]:
labeled_df.loc[29, :]

In [None]:
base_filter(labeled_df.loc[29, 'text'])

In [None]:
mask = (labeled_df['base-filter'] != res['base-filter'])

labeled_df.loc[mask, ['text']]