In [1]:
import warnings
warnings.filterwarnings("ignore") 

In [2]:
import os
import re
import sys
import json
import numpy as np
import pandas as pd
from wordcloud import WordCloud

import spacy
from spacy import displacy
from spacy.matcher import Matcher

from nltk.corpus import stopwords

import seaborn as sns
from matplotlib import pyplot as plt

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_colwidth', -1)

SPACE_REGEX = re.compile(r"\s+")
STOP_WORDS = set(stopwords.words('english'))
SEED = 2022

In [3]:
HOME_DIR = r'C:\ProjectX'

WORKSPACE_ROOT = os.path.join(HOME_DIR, 'workspace')
PATH_TO_PROJECT_X_REPO = os.path.join(WORKSPACE_ROOT, 'project_x')
PATH_TO_DATA_ROOT_DIR = os.path.join(WORKSPACE_ROOT, "data")
PATH_TO_SCRAPED_NEWS = os.path.join(PATH_TO_DATA_ROOT_DIR, "data_providers\gdelt\scraped_news")

In [4]:
# Add path to library to sys path
generic_utils_lib_dir = os.path.join(PATH_TO_PROJECT_X_REPO, 'common')

sys.path.extend([generic_utils_lib_dir])

#from generic_utils import (downcast_datatypes, timing, create_output_dir, parallelize)

from crime_mapper_utils import crimemapper, map_event_types_from_es_to_en

In [5]:
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100, figure_size=(12.0,8.0),
                   title: str = None, title_size: int = 24):

    wordcloud = WordCloud(background_color='black',
                          stopwords=STOP_WORDS,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          random_state=SEED,
                          width=1200,
                          height=600,
                          mask=mask)

    wordcloud.generate(str(text))

    plt.figure(figsize=figure_size)

    plt.imshow(wordcloud)
    plt.title(title, fontdict={'size': title_size, 'color': 'black',
                              'verticalalignment': 'bottom'})
    plt.axis('off')
    plt.tight_layout()

In [7]:
news_df.head()

Unnamed: 0,source_name,url,paragraphs,paragraphs_nwords,dateadded,eventrootcode,eventbasecode,eventcode,actiongeo_fullname,actiongeo_countrycode_iso3,v2themes_names,body,cleaned_body
0,www.cbsnews.com,https://www.cbsnews.com/news/law-enforcement-c...,"By Jennifer De Pinto, Fred Backus, Anthony Sal...",781.0,2022-07-01 00:00:00,1.0,12.0,12.0,"Uvalde, Texas, United States",USA,['BAN' 'CRISISLEX_C07_SAFETY' 'CRISISLEX_CRISI...,"By Jennifer De Pinto, Fred Backus, Anthony Sal...","By Jennifer De Pinto, Fred Backus, Anthony Sal..."
1,www.850wftl.com,https://www.850wftl.com/four-dead-three-injure...,"(ENCINAL, Texas) — Four people were killed and...",107.0,2022-07-01 00:00:00,19.0,190.0,190.0,"San Antonio, Texas, United States",USA,['BORDER' 'CRISISLEX_C07_SAFETY' 'CRISISLEX_CR...,"(ENCINAL, Texas) -- Four people were killed an...","(ENCINAL, Texas) -- Four people were killed an..."
2,www.potomaclocal.com,https://www.potomaclocal.com/2022/06/30/manass...,"On Wednesday, June 29, at 1:09 a.m, officers r...",430.0,2022-07-01 00:00:00,4.0,43.0,43.0,"Willacoochee, Georgia, United States",USA,['ARREST' 'CRISISLEX_C03_WELLBEING_HEALTH' 'CR...,"On Wednesday, June 29, at 1:09 a.m, officers r...","On Wednesday, June 29, at 1:09 a.m, officers r..."
3,www.wfae.org,https://www.wfae.org/2022-06-30/hells-angels-f...,"Sonny Barger, the leather-clad figurehead of t...",650.0,2022-07-01 00:00:00,8.0,84.0,84.0,"Altamont, California, United States",USA,['ARMEDCONFLICT' 'ARREST' 'BAN' 'CRISISLEX_C07...,"Sonny Barger, the leather-clad figurehead of t...","Sonny Barger, the leather-clad figurehead of t..."
4,news.yahoo.com,https://news.yahoo.com/exclusive-sophia-roe-ta...,If she’s not cheffing it up in her Apartment M...,1313.0,2022-07-01 00:00:00,19.0,190.0,190.0,United States,USA,['ARMEDCONFLICT' 'CRISISLEX_CRISISLEXREC' 'CRI...,If she's not cheffing it up in her Apartment M...,If she's not cheffing it up in her Apartment M...


In [8]:
news_df['cleaned_body'].isnull().sum()

585

In [9]:
news_df.shape

(49894, 13)

In [10]:
news_df = news_df.dropna(subset=['cleaned_body'])
news_df.shape

(49309, 13)

In [11]:
news_df['paragraphs_nwords'].describe()

count    49309.000000
mean       574.425865
std        369.156367
min         60.000000
25%        283.000000
50%        489.000000
75%        791.000000
max       1702.000000
Name: paragraphs_nwords, dtype: float64

### Key words in 6 categories

### 1) Harassment

In [35]:
harassment_key_words = [
    'abused',
    'obscenity',
    'defamation',
    'tracking',
    'harassment',
    'harassing',
    'harass',
    'harassed',
    'stalking',
    'bully',
    'bullied',
    'intimidation',
    'peeping tom',
    'insult',
    'intent to outrage her modesty',
    'unlawful exposure',
    'accosting',
    'intimidating',
    'intimidate',
    'intimidated',
    'obscene',
    'obscene phone calls',
    'Obscene phone call',
    'armed disturb',
    'hurled racist slurs',
    'coercion',
    'cursing',
    'threatened to kill',
    'threatening to kill',
    'groping',
    'groped',
    'intimidation with a dangerous weapon',
    'threatening to shoot'
]

In [36]:
HARASSMENT_KEY_WORDS = "|".join(fr'\b{str.lower(harass_key_word)}\b' for harass_key_word in set(harassment_key_words))
HARASSMENT_KEY_WORDS_REGEX = re.compile(r'%s' % HARASSMENT_KEY_WORDS, flags=re.I)

s = """
The head of the Royal Navy has ordered an investigation into allegations of bullying and sexual harassment against 
women in the Submarine Service. Several whistleblowers who served in the fleet told the Daily Mail they faced 
mistreatment from all ranks. Adm Sir Ben Key, the First Sea Lord, called the claims "abhorrent", adding 
"sexual harassment has no place in the Royal Navy and will not be tolerated". "Anyone who is found culpable will 
be held accountable," he said. The allegations, revealed in detail by the Mail, include male crew members 
compiling a list setting out the order in which women would be assaulted in the event of a catastrophic event. unlawful exposure
"""

set(HARASSMENT_KEY_WORDS_REGEX.findall(s))

{'harassment', 'unlawful exposure'}

### 2) Theft

In [15]:
theft_key_words =[
    'theft',
    'industrial espionage',
    'burglary',
    'burglaries',
    'burglar',
    'steal',
    'stole',
    'shoplifting',
    'larceny',
    'pocket-picking',
    'obtaining a service without payment',
    'obtained a service without payment',
    'false pretenses',
    'swindle',
    'confidence game',
    'making off from a hotel',
    'made off from a hotel',
    'restaurant without payment',
    'bar without payment',
    'pickpocketing',
    'pickpocket',
    'shop-lifting',
    'shoplift',
    'shoplifted'
]

In [17]:
THEFT_KEY_WORDS = "|".join(fr'\b{str.lower(theft_key_word)}\b' for theft_key_word in set(theft_key_words))
THEFT_KEY_WORDS_REGEX = re.compile(r'%s' % THEFT_KEY_WORDS, flags=re.I)

s = """
A 33-year-old man was arrested on Saturday morning on the Limassol – Nicosia motorway, to facilitate police investigations into cases of house burglary 
and theft. In a written statement police stated that at around 8.20am on Saturday, traffic police, who were conducting a traffic control on the 
Limassol-Nicosia motorway close to Mari, stopped a driver for inspection having noticed that his car did not have a registration plate at the front. 
industrial espionage 
"""

set(THEFT_KEY_WORDS_REGEX.findall(s))

{'burglary', 'industrial espionage', 'theft'}

### 3) Robbery

In [18]:
robbery_key_words = [
    'snatching',
    'purse-snatching',
    'robbery',
    'robbers', 
    'robberies',
    'robbing',
    'robber', 
    'robbed',
    'housebreaking',
    'break and enter',
    'violent robbery',
    'snatch',
    'snatched',
    'mugging',
    'armed robbery'
]

In [19]:
#[{'lemma': 'gun'}, {'not_in': ['store']}
# concealed firearmsROBBERY

In [20]:
ROBBERY_KEY_WORDS = "|".join(fr'\b{str.lower(robbery_key_word)}\b' for robbery_key_word in set(robbery_key_words))
ROBBERY_KEY_WORDS_REGEX = re.compile(r'%s' % ROBBERY_KEY_WORDS, flags=re.I)

s = """
A teenager who was out on two previous gun arrests was busted again in Queens on Wednesday for an armed robbery, according to police sources.
Alleged gang member Jeffrey Mendoza, 18, was arrested after he and two friends pistol whipped a person with a 9 mm handgun and stole the victim’s cellphone 
and wallet, sources said. 
"""

set(ROBBERY_KEY_WORDS_REGEX.findall(s))

{'armed robbery'}

### Auto Theft

In [21]:
auto_theft_key_words = [
    'carjacking',
    'carjackings',
    'hijacking',
    'hijackings',
    'hijacked',
    'hijack',
    'conveyance',
    'vehicle larceny',
    'vehicle theft',
    'vehicle grand',
    'gta',
    'car jack',
    'car theft',
    'car was stolen',
    'carjacker',
    'carjackers',
    'carjack',
    'carjacked',
    'steal a car',
    'stole a car',
    'steal car',
    'stole car',
    'stole a vehicle',
    'stole another vehicle',
    'stole a Chevy truck',
    'stole another motor vehicle',
    'armed carjacking',
    'stole her car',
    'stole his car',
    'stole their car',
    'theft of a motor vehicle',
    'stole a truck',
    'grand theft',
    'grand theft auto',
    'stole his vehicle',
    'commandeered the vehicle',
    'motor vehicle theft',
    'vehicle was stolen',
    'Vehicle was burglarized',
    'stolen van',
    'van robbers',
    'stolen pickup truck',
]

In [22]:
AUTO_THEFT_KEY_WORDS = "|".join(fr'\b{str.lower(auto_theft_key_word)}\b' for auto_theft_key_word in set(auto_theft_key_words))
AUTO_THEFT_KEY_WORDS_REGEX = re.compile(r'%s' % AUTO_THEFT_KEY_WORDS, flags=re.I)

s = """
Ottawa police say three men from Montreal are facing charges in connection with an alleged car theft in Kanata South.
Officers were called to Gowrie Street and Barra Avenue Tuesday afternoon after someone reported a man disconnecting a trailer from a vehicle and thought 
it was suspicious, police said in a news release. A similar vehicle was stopped some time later on Highway 417. Police said it was reported stolen. 
"""

set(AUTO_THEFT_KEY_WORDS_REGEX.findall(s))

{'car theft'}

### Assault

In [1]:
assault_key_words = [
    'assault',
    'aggravated assault',
    'assaulted',
    'weapons offense',
    'hit-and-run',
    'hit and run',
    'strangulation',
    'obstruct breath',
    'serious injury',
    'injury to',
    'injured',
    'knifing',
    'stabbing',
    'stab',
    'poison',
    'attempt to murder',
    'attempted murder',
    'hurt',
    'hurted',
    'wounding',
    'wound',
    'wounded',
    'attempts or threats to murder',
    'weapons and explosives offenses',
    'aggressive panhandling',
    'affray',
    'shooting',
    'attempted homicide',
    'bodily harm',
    'grievous bodily harm',
    'simply bodily harm',
    'female genital mutilation',
    'endangering the life or health',
    'endangering the welfare',
    'participation brawl',
    'participation attack',
    'representations of acts of violence',
    'against humanity',
    'endangering public safety with weapons',
    'crimes against persons',
    'an offense against a person',
    'offensive weapon',
    'armed dispute',
    'abduction',
    'violently attacked',
    'knife wounds',
    'gunshot wounds',
    'struck him multiple times',
    'stab wound',
    'brutally beat',
    'was bound with duct tape',
    'were bound with duct tape',
    'strangled'
]

In [24]:
ASSAULT_KEY_WORDS = "|".join(fr'\b{str.lower(assault_key_word)}\b' for assault_key_word in set(assault_key_words))
ASSAULT_KEY_WORDS_REGEX = re.compile(r'%s' % ASSAULT_KEY_WORDS, flags=re.I)

s = """
Police from Serious and Organised Crime Branch have arrested four people following an investigation into a serious assault at Henley Beach earlier this month.
About 10.45pm on Friday 7 October, a number of staff and security guards were assaulted at a hotel on Seaview Road.
Following an investigation, Detectives from Serious and Organised Crime Branch subsequently arrested three men and a woman over the incident. 
"""

set(ASSAULT_KEY_WORDS_REGEX.findall(s))

{'assault', 'assaulted'}

### Homicide

In [25]:
homicide_key_words = [
    'murder',
    'assassination',
    'homicide',
    'lynching',
    'manslaughter',
    'genocide',
    'kill',
    'killed',
    'dowry death',
    'death investigation',
    'fatally shot',
    'died after shots fired',
    'died after',
    'shot', 
    'shooting', 
    'shots', 
    'opened fire', 
    'gunfire',
    'stabbed',
    'slaying',
    'massacre',
    'died of a single gunshot wound', 
    'died of multiple gunshot wounds',
    'deceased victims',
    'were found dead',
    'investigated as homicides',
    'person is dead',
    'fatal head injury',
    'declared dead',
    'shooting death',
    'was shot to death',
    'were shot to death',
    'did not survive the shooting',
    'stabbed to death',
    'deadly shooting',
    'dismembered',
    'concealment of a body',
    'reported dead by gunshot wound',
    'fatal stabbings',
]

In [26]:
HOMICIDE_KEY_WORDS = "|".join(fr'\b{str.lower(homicide_key_word)}\b' for homicide_key_word in set(homicide_key_words))
HOMICIDE_KEY_WORDS_REGEX = re.compile(r'%s' % HOMICIDE_KEY_WORDS, flags=re.I)

s = """
Seattle police say they have arrested a 42-year-old man for the double murder of a man and woman in Georgetown.
He was booked into the King County jail for investigation of homicide.
Police posted in their blotter, a 911 caller reported two down subjects in an apartment in the 6100 block of 4th Avenue South. 
"""

set(HOMICIDE_KEY_WORDS_REGEX.findall(s))

{'homicide', 'murder'}

In [28]:
crime_words_key = harassment_key_words + theft_key_words + robbery_key_words \
                + auto_theft_key_words + assault_key_words + homicide_key_words

CRIME_KEY_WORDS = "|".join(fr'\b{str.lower(crime_key_word)}\b' for crime_key_word in set(crime_words_key))
CRIME_KEY_WORDS_REGEX = re.compile(r'%s' % CRIME_KEY_WORDS, flags=re.I)

s = """
Homicide Filed under: The group was standing outside about 9 p.m. in the 800 block of East 79th Street when someone opened fire before fleeing the scene. 
Five people wounded in a shooting Friday night in Chatham on the South Side. The group was standing outside about 9 p.m. in the 800 block of
East 79th Street when someone opened fire before fleeing the scene, Chicago police said.  A male and female, whose ages weren’t known, were shot 
in the arm and transported to the University of Chicago Medical Center, police said. The male was in critical condition and the  female was stabilized.
A 50-year-old woman was shot in the arm while a man, also 50, was struck in the ankle, police said. Another man, 43, was shot in the shoulder. 
They were all stabilized and taken to the same hospital, according to police.  Area Two detectives are investigating.  
"""

set(CRIME_KEY_WORDS_REGEX.findall(s))

{'Homicide', 'opened fire', 'shooting', 'shot', 'wounded'}

### Detect news with trials key words

In [29]:
trials_key_words = [
    'prosecutor', 'prosecutors', 
    'judge', 'jury', 
    'trial', 'mistrial', 
    'sentenced', 'verdict', 
    'was charged', 'were charged', 'have been charged', 'facing charges',
    'probation', 'probations',
    'allegation', 'allegations',
    'lawsuit', 'lawsuits',
    'settle', 'settlement', 
    'sued', 'convicted',
    'letter',
    'testified',
    'paroled',
    'U.S. Supreme Court decision',
    'Supreme Court of the United States',
]

TRIAL_KEY_WORDS = "|".join(fr'\b{str.lower(trials_key_word)}\b' for trials_key_word in set(trials_key_words))
TRIAL_KEY_WORDS_REGEX = re.compile(r'%s' % TRIAL_KEY_WORDS, flags=re.I)

s = """
A 25-year-old man has been sentenced to a year in prison for straw purchasing guns in suburban Chicago by Supreme Court of the United States. 
Ismael Sene in 2019 and 2020 bought a 
total of seven handguns from licensed firearms dealers in   and Merrionette Park. During one of the purchases, officials say Sene falsely 
certified that he was the actual buyer. Sene purchased at least one of the guns for a person whom he had reason to believe was a convicted felon, 
officials said. Convicted felons under federal law may not purchase or possess a firearm. Sene in each purchase also falsely claimed he was not
an unlawful drug user when he regularly used cannabis — a violation of federal law. In June, Sene pleaded guilty to making false statements in 
connection with purchasing a firearm. Last Friday, a judge sentenced Sene to a year and a day in federal prison. Advertisement "Straw purchasers,
like the defendant, perpetuate the cycle of violence that is terrorizing this city," Assistant U.S. Attorney James P. Durkin argued in the 
government’s sentencing memorandum. "The tools that drive that mayhem are firearms in the hands of dangerous people who are often legally 
prohibited from purchasing and possessing them.
"""
print(set(TRIAL_KEY_WORDS_REGEX.findall(s)))

s = """
Two men have been charged in connection to a robbery and shooting that killed an armored car driver and critically wounded another in Chicago's
Chatham neighborhood Monday morning. Two women and a man were robbed in separate incidents minutes apart Wednesday night in the   neighborhood. 
The women, 27 and 28, were walking on the sidewalk at 9:21 p.m. in the 800 block of West Wolfram Street when two gunmen got out of a red Mazda SUV 
and demanded their belongings, police said. The gunmen took their cellphones and purses, police said. Neither woman was hurt during the incident, 
police said. Less than a half hour later, a 46-year-old man was approached by gunmen as he was entering his building in the 500 block of West 
Stratford Place, police said. They took his cellphones, wallet and a yellow backpack, police said. The man suffered a minor injury to his jaw but 
refused treatment at the scene, police said. The gunmen fled in a red Mazda SUV. Police have not said if they believe the two robberies are connected.
Advertisement Area Three detectives are investigating.
"""

print(set(TRIAL_KEY_WORDS_REGEX.findall(s)))

{'Supreme Court of the United States', 'sentenced', 'judge', 'Convicted', 'convicted'}
{'have been charged'}


### Detect news with crash keywords

In [30]:
car_accidents_key_words = [
    'traffic crash', 'car accident', 'car crashed', 'truck crashed', 'crash'
]

CAR_ACCIDENT_KEY_WORDS = "|".join(fr'\b{str.lower(car_accidents_key_word)}\b'
                                    for car_accidents_key_word in set(car_accidents_key_words))

CAR_ACCIDENT_KEY_WORDS_REGEX = re.compile(r'%s' % CAR_ACCIDENT_KEY_WORDS, flags=re.I)

s = """Car crashed A Massachusetts State Police cruiser was involved in a multi-car crash on Route 128 in Needham late Sunday night. At least three vehicles 
were involved in the crash, officials said. It's not clear if anyone was injured in the crash, which remains under investigation Crash.
"""
print(CAR_ACCIDENT_KEY_WORDS_REGEX.findall(s))

['Car crashed', 'crash', 'crash', 'crash', 'Crash']


### Detect news about statistics

In [31]:
statistics_key_words = [
    'annual', 'quarterly', 'record number', 'this year', 'statewide'
]

STATS_KEY_WORDS = "|".join(fr'\b{str.lower(statistics_key_word)}\b'
                             for statistics_key_word in set(statistics_key_words))

STATS_KEY_WORDS_REGEX = re.compile(r'%s' % STATS_KEY_WORDS, flags=re.I)

s = """
The number of homicides in the United States continued to rise in the first three quarters of 2021, but at a slower pace, 
one year after   that followed  , according to the latest quarterly report published Monday by the Council on Criminal Justice.
A study of homicides in 22 cities during the first nine months of this year showed the number of murders was 4% greater than 
the same period in 2020, with 126 more homicides between January and September, the report says. In the first three quarters of
2020, the number of homicides in the same 22 cities rose by 36% over the same time frame in 2019, according to the report. According
to a  , the number of homicides during the first half of 2021 increased by 16% compared to the same period last year. The number of
homicides in 2020 compared to 2019 rose by 25%, according to an FBI preliminary report, the largest jump since the FBI started releasing
annual homicide figures in the 1960s. The spike in violent crime came as the   swept across the country, millions of people protested 
racial injustice and police brutality following Floyd’s death last year, and the   under the weight of the pandemic. The homicide rate
remained elevated through the summer before decreasing in the fall and winter and then increasing again in the spring and summer of 
this year, the report says
"""

print(set(STATS_KEY_WORDS_REGEX.findall(s)))

{'annual', 'this year', 'quarterly'}


### Detect news about gun policy

In [32]:
gun_policy_key_words = [
    'gun policy', 'gun measures', 'gun policy measures',
    'gun restrictions', 'gun-related restrictions', 
    'gun related restrictions', 'gun owners', 'requirements for carrying a handgun',
    'gun control laws', 'gun control', 'concealed carry permit', 'concealed carry of a firearm'
    'gun free zones', 'gun-free zones'
]

GUN_POLICY_KEY_WORDS = "|".join(fr'\b{str.lower(gun_policy_key_word)}\b'
                             for gun_policy_key_word in set(gun_policy_key_words))

GUN_POLICY_KEY_WORDS_REGEX = re.compile(r'%s' % GUN_POLICY_KEY_WORDS, flags=re.I)

s = """
By Jennifer De Pinto, Fred Backus, Anthony Salvanto, Kabir Khanna, \/ CBS News A month after the school shooting 
in , Texans are overwhelmingly critical of law enforcement's response to the shooting, and a majority feel 
it's important to investigate their response. Most Texans are concerned about another mass shooting. 
Texans rate Gov. Abbott's response to Uvalde more negatively than positively. Nearly half of Texans report 
that the Uvalde shooting has spurred them to support some gun restrictions, and there is support in Texas for 
some measures. In backing many potential gun measures, Texas looks much like the nation as a whole. We see 
bipartisan backing for measures like universal background checks and making the minimum age for buying an AR-15 
at least 20 years old. But there are more partisan differences on policies focused on the guns themselves. 
Most Republicans oppose an AR-15 ban in Texas, and more than half oppose a red-flag law, in which a court 
can order the temporary removal of a gun from a person deemed to be a potential danger. More than half also 
disapprove of Abbott's overall job performance, but Abbott still leads Beto O'Rourke by eight points among likely 
voters in the race for governor. Senator John Cornyn is getting mixed marks from his own party on representing 
Texas' interests as it relates to guns, and this is dragging down Cornyn's overall job approval rating, which is j
ust 35% among Texans overall. Almost nine in 10 Texans feel Uvalde law enforcement could have done more to stop the 
shooter. Criticism is widespread across demographic and political groups. Abbott gets negative marks from younger 
Texans, women and Black and Latino people for his handling of the Uvalde shooting. Majorities of both Democrats and 
independents think he's done a bad job, but most in his own party rate his response positively. About eight in 10 
Texans are concerned about more mass shootings in Texas like the one in Uvalde, including almost half who are 
\"very concerned.\" Women express more concern than men do. Latino and Black people in Texas are more likely than 
White people to be very concerned about a mass shooting. For about half of Texans, the shooting at Robb elementary 
has made them more likely to support some gun restrictions. Democrats, whose party has long backed restrictions on 
guns, are particularly likely to say this, but they are joined by three in 10 Republicans who also say that the 
shooting has made them more inclined to favor some measures. Texans broadly support background checks and having a 
minimum age of at least 21. There is majority backing for a \"red flag\" law in Texas and a ban on the AR-15, 
but more division among Texans on these measures. Texans' views on these measures are in line with those of 
Americans overall. Texans' opinions on gun policy measures are connected to what they think would be most 
effective in trying to prevent mass shootings. Republicans tend to focus on measures not related to guns: more 
religion and faith in people's lives, better mental health services and more armed security in public places. 
Democrats look for more gun-related restrictions -- including background checks and red-flag laws, along with more 
mental health services. Republicans who think Cornyn has done a bad job on gun policy overwhelmingly disapprove of 
his overall job performance. Cornyn's job approval among Republicans is 20 points lower than it is for Ted Cruz. 
Most of the interviewing for the poll was conducted as the legislation was being considered and before it was passed. 
More Texas Republicans said they oppose the legislation that was being considered than favor it. The Texas public 
overall is more inclined to favor it. Gun owners in Texas, like those in the country overall, are more likely to 
identify as Republican than Democrat, and that's reflected in some of their views on gun policies, which are similar 
to Republicans generally. But what does being a gun owner represent? Far and away the top answer is \"protection\", 
followed by \"responsibility\", \" freedom\" and \"self-reliance.\" These sentiments are similar across demographics 
of gun owners, including both men and women. Gov. Abbott gets mixed results when Texans look at the job he's doing. 
Most Republicans like the job he's doing, most Democrats do not, while independents are split down the middle. 
Still, in a state that leans more red than the rest of the country, he is viewed as doing a better job as governor 
than Joe Biden is as president. Despite his low approval marks, early polling shows Greg Abbott with an eight-point 
lead over Democratic challenger Beto O'Rourke, if the election for governor of Texas were held today.
"""

print(set(GUN_POLICY_KEY_WORDS_REGEX.findall(s)))

{'gun measures', 'gun policy', 'gun owners', 'gun restrictions', 'gun-related restrictions', 'Gun owners'}


### Other non-crime key words (COVID-19, firefighters, etc)

In [37]:
no_crime_key_words = [
    'injured in a shark attack',
    'COVID-19 booster shots',
    'COVID-19 shots',
    'Pfizer shots', 'shots from Moderna and Pfizer',
    'blaze', 'firefighters battled the fire',
    'firefighters stop the spread',
    'radio tracking collar',
    'Tracking Covid-19',
    'tracking a chance of storms'
]

NO_CRIME_KEY_WORDS = "|".join(fr'\b{str.lower(no_crime_key_word)}\b'
                             for no_crime_key_word in set(no_crime_key_words))

NO_CRIME_KEY_WORDS_REGEX = re.compile(r'%s' % NO_CRIME_KEY_WORDS, flags=re.I)

s = """
Omicron Boosters: 9 Questions Answered About the Updated COVID-19 Shots injured in a shark attack
"""

print(set(NO_CRIME_KEY_WORDS_REGEX.findall(s)))

{'COVID-19 Shots', 'injured in a shark attack'}


### Labeling

In [38]:
news_df['cleaned_body'].isnull().sum()

0

In [39]:
PUNCTUATION_REGEX= re.compile(r"""[?.,\/\\><:;'"\()!%$*|^\~`+#]""")

In [40]:
cols = [
    'is_harassment',
    'is_theft',
    'is_robbery',
    'is_auto_theft',
    'is_assault',
    'is_homicide'
]

regexs = [
    HARASSMENT_KEY_WORDS_REGEX,
    THEFT_KEY_WORDS_REGEX,
    ROBBERY_KEY_WORDS_REGEX,
    AUTO_THEFT_KEY_WORDS_REGEX,
    ASSAULT_KEY_WORDS_REGEX,
    HOMICIDE_KEY_WORDS_REGEX
]

for col, regex in zip(cols, regexs):
    
    col_name = (col.split('_')[1] if len(col.split('_')) == 2 else '_'.join(col.split('_')[1:])) + '_keywords_body'
    news_df[col_name] = news_df['cleaned_body'].str.lower().apply(lambda x: re.sub(PUNCTUATION_REGEX, " ", x) if isinstance(x, str) else x)
    news_df[col_name] = news_df[col_name].apply(regex.findall)
    news_df[col_name] = news_df[col_name].apply(lambda x: [x.strip() for x in sorted(set(x))])
    
    news_df[col] = 0
    news_df.loc[news_df[col_name].astype(str) != '[]', col] = 1
    news_df[col] = news_df[col].astype(int)
    
    print(news_df[col].value_counts().head())
    print(news_df[col_name].astype(str).value_counts().head(5))
    print('\n'+'-'*30+'\n')

0    45241
1     4068
Name: is_harassment, dtype: int64
[]                45241
['tracking']        614
['abused']          512
['harassment']      426
['defamation']      328
Name: harassment_keywords_body, dtype: int64

------------------------------

0    46726
1     2583
Name: is_theft, dtype: int64
[]              46726
['theft']         640
['burglary']      414
['stole']         361
['steal']         355
Name: theft_keywords_body, dtype: int64

------------------------------

0    47266
1     2043
Name: is_robbery, dtype: int64
[]                   47266
['robbery']            844
['robbed']             198
['armed robbery']      165
['robberies']          125
Name: robbery_keywords_body, dtype: int64

------------------------------

0    48624
1      685
Name: is_auto_theft, dtype: int64
[]                 48624
['carjacking']       118
['hijacked']          92
['grand theft']       64
['carjackings']       43
Name: auto_theft_keywords_body, dtype: int64

----------------------

In [41]:
cols = [
    'is_trial',
    'is_car_accident',
    'is_stats_news',
    'is_gun_policy_news',
    'is_non_crime',
]

regexs = [
    TRIAL_KEY_WORDS_REGEX,
    CAR_ACCIDENT_KEY_WORDS_REGEX,
    STATS_KEY_WORDS_REGEX,
    GUN_POLICY_KEY_WORDS_REGEX,
    NO_CRIME_KEY_WORDS_REGEX
]

for col, regex in zip(cols, regexs):
    
    col_name = (col.split('_')[1] if len(col.split('_')) == 2 else '_'.join(col.split('_')[1:])) + '_keywords_body'
    news_df[col_name] = news_df['cleaned_body'].str.lower().apply(lambda x: re.sub(PUNCTUATION_REGEX, " ", x) if isinstance(x, str) else x)
    news_df[col_name] = news_df[col_name].apply(regex.findall)
    news_df[col_name] = news_df[col_name].apply(lambda x: [x.strip() for x in sorted(set(x))])
    
    news_df[col] = 0
    news_df.loc[news_df[col_name].astype(str) != '[]', col] = 1
    news_df[col] = news_df[col].astype(int)
    
    print(news_df[col].value_counts().head())
    print(news_df[col_name].astype(str).value_counts().head(5))
    print('\n'+'-'*30+'\n')

0    31264
1    18045
Name: is_trial, dtype: int64
[]                 31264
['judge']           1315
['letter']           990
['was charged']      806
['trial']            610
Name: trial_keywords_body, dtype: int64

------------------------------

0    46583
1     2726
Name: is_car_accident, dtype: int64
[]                           46583
['crash']                     2462
['car accident']                92
['car accident', 'crash']       43
['car crashed', 'crash']        43
Name: car_accident_keywords_body, dtype: int64

------------------------------

0    41649
1     7660
Name: is_stats_news, dtype: int64
[]                         41649
['this year']               4701
['annual']                  1239
['statewide']                780
['annual', 'this year']      428
Name: stats_news_keywords_body, dtype: int64

------------------------------

0    48279
1     1030
Name: is_gun_policy_news, dtype: int64
[]                               48279
['gun control']                    519


In [42]:
# set is_criminal = 0 if is_trial/is_car_accident/is_stats_news = 1
crime_cols = [
    'is_harassment',
    'is_theft',
    'is_robbery',
    'is_auto_theft',
    'is_assault',
    'is_homicide'
]

news_df.loc[news_df['is_trial'] == 1, crime_cols] = 0
news_df.loc[news_df['is_car_accident'] == 1, crime_cols] = 0
news_df.loc[news_df['is_stats_news'] == 1, crime_cols] = 0
news_df.loc[news_df['is_gun_policy_news'] == 1, crime_cols] = 0
news_df.loc[news_df['is_non_crime'] == 1, crime_cols] = 0

news_df['is_criminal'] = news_df[cols].apply(any, axis=1)

print(news_df['is_criminal'].value_counts(normalize=True), end='\n\n')

for col in crime_cols:
    print(news_df[col].value_counts())
    print('-'*30)

True     0.524914
False    0.475086
Name: is_criminal, dtype: float64

0    48127
1     1182
Name: is_harassment, dtype: int64
------------------------------
0    48571
1      738
Name: is_theft, dtype: int64
------------------------------
0    48620
1      689
Name: is_robbery, dtype: int64
------------------------------
0    49083
1      226
Name: is_auto_theft, dtype: int64
------------------------------
0    39690
1     9619
Name: is_assault, dtype: int64
------------------------------
0    37483
1    11826
Name: is_homicide, dtype: int64
------------------------------


In [36]:
#plot_wordcloud(news_df[news_df['is_criminal'] == 0]["cleaned_body"], 
#               title="Word Cloud of non-criminal news",
#               max_words=200)

In [37]:
#plot_wordcloud(news_df[news_df['is_harassment'] == 1]["cleaned_body"], 
#               title="Word Cloud of harassment news",
#               max_words=200)

In [38]:
#plot_wordcloud(news_df[news_df['is_theft'] == 1]["cleaned_body"], 
#               title="Word Cloud of theft news",
#               max_words=200)

In [39]:
#plot_wordcloud(news_df[news_df['is_robbery'] == 1]["cleaned_body"], 
#               title="Word Cloud of robbery news",
#               max_words=200)

In [40]:
#plot_wordcloud(news_df[news_df['is_auto_theft'] == 1]["cleaned_body"], 
#               title="Word Cloud of auto theft news",
#               max_words=200)

In [41]:
#plot_wordcloud(news_df[news_df['is_assault'] == 1]["cleaned_body"], 
#               title="Word Cloud of assault news",
#               max_words=200)

In [42]:
#plot_wordcloud(news_df[news_df['is_homicide'] == 1]["cleaned_body"], 
#               title="Word Cloud of homicide news",
#               max_words=200)

## Create patterns (PhraseMatcher) for prodigy

In [43]:
PATH_TO_PATTERNS = os.path.join(PATH_TO_PROJECT_X_REPO, r"notebooks\nazar_notebooks\labeling_patterns")

In [44]:
harassment_pattern = ''

for key in harassment_key_words:
    if len(key.split()) > 1:
        pattern = []
        for word in key.split():
            pattern.append({'lower' : word})
        #theft_pattern += json.dumps({'label': 'THEFT', 'pattern': pattern}) + '\n'
        harassment_pattern += str({'label': 'HARASSMENT', 'pattern': pattern}) + '\n'
    else:
        #theft_pattern += json.dumps({'label': 'THEFT', 'pattern': [{'lower': key}]}) + '\n'
        harassment_pattern += str({'label': 'HARASSMENT', 'pattern': [{'lower': key}]}) + '\n'
        
print(harassment_pattern)

{'label': 'HARASSMENT', 'pattern': [{'lower': 'abused'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'obscenity'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'defamation'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'tracking'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'harassment'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'harassing'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'harass'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'harassed'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'stalking'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'bully'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'bullied'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'intimidation'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'peeping'}, {'lower': 'tom'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'insult'}]}
{'label': 'HARASSMENT', 'pattern': [{'lower': 'intent'}, {'lower': 'to'}, {'lower': 'outrage'}, {'lower': 'her'}, {'lower': 'modesty'}]}
{'labe

In [45]:
theft_pattern = ''

for key in theft_key_words:
    if len(key.split()) > 1:
        pattern = []
        for word in key.split():
            pattern.append({'lower' : word})
        #theft_pattern += json.dumps({'label': 'THEFT', 'pattern': pattern}) + '\n'
        theft_pattern += str({'label': 'THEFT', 'pattern': pattern}) + '\n'
    else:
        #theft_pattern += json.dumps({'label': 'THEFT', 'pattern': [{'lower': key}]}) + '\n'
        theft_pattern += str({'label': 'THEFT', 'pattern': [{'lower': key}]}) + '\n'
        
print(theft_pattern)
print(len(theft_pattern))

{'label': 'THEFT', 'pattern': [{'lower': 'theft'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'industrial'}, {'lower': 'espionage'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'burglary'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'burglaries'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'burglar'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'steal'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'stole'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'shoplifting'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'larceny'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'pocket-picking'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'obtaining'}, {'lower': 'a'}, {'lower': 'service'}, {'lower': 'without'}, {'lower': 'payment'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'obtained'}, {'lower': 'a'}, {'lower': 'service'}, {'lower': 'without'}, {'lower': 'payment'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'false'}, {'lower': 'pretenses'}]}
{'label': 'THEFT', 'pattern': [{'lower': 'swindle'}]}
{'label': 'THEFT', 

In [46]:
robbery_pattern = ''

for key in robbery_key_words:
    if len(key.split()) > 1:
        pattern = []
        for word in key.split():
            pattern.append({'lower' : word})
        robbery_pattern += str({'label': 'ROBBERY', 'pattern': pattern}) + '\n'
    else:
        robbery_pattern += str({'label': 'ROBBERY', 'pattern': [{'lower': key}]}) + '\n'
        
print(robbery_pattern)

{'label': 'ROBBERY', 'pattern': [{'lower': 'snatching'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'purse-snatching'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'robbery'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'robbers'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'robberies'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'robbing'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'robber'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'robbed'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'housebreaking'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'break'}, {'lower': 'and'}, {'lower': 'enter'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'violent'}, {'lower': 'robbery'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'snatch'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'snatched'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'mugging'}]}
{'label': 'ROBBERY', 'pattern': [{'lower': 'armed'}, {'lower': 'robbery'}]}



In [47]:
auto_theft_pattern = ''

for key in auto_theft_key_words:
    if len(key.split()) > 1:
        pattern = []
        for word in key.split():
            pattern.append({'lower' : word})
        auto_theft_pattern += str({'label': 'AUTO_THEFT', 'pattern': pattern}) + '\n'
    else:
        auto_theft_pattern += str({'label': 'AUTO_THEFT', 'pattern': [{'lower': key}]}) + '\n'
        
print(auto_theft_pattern)

{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'carjacking'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'carjackings'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'hijacking'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'hijackings'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'hijacked'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'hijack'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'conveyance'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'vehicle'}, {'lower': 'larceny'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'vehicle'}, {'lower': 'theft'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'vehicle'}, {'lower': 'grand'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'gta'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'car'}, {'lower': 'jack'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'car'}, {'lower': 'theft'}]}
{'label': 'AUTO_THEFT', 'pattern': [{'lower': 'car'}, {'lower': 'was'}, {'lower': 'stolen'}]}
{'label': 'AUTO_THEFT', 'pattern': 

In [48]:
assault_pattern = ''

for key in assault_key_words:
    if len(key.split()) > 1:
        pattern = []
        for word in key.split():
            pattern.append({'lower' : word})
        assault_pattern += str({'label': 'ASSAULT', 'pattern': pattern}) + '\n'
    else:
        assault_pattern += str({'label': 'ASSAULT', 'pattern': [{'lower': key}]}) + '\n'
        
print(assault_pattern)

{'label': 'ASSAULT', 'pattern': [{'lower': 'assault'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'aggravated'}, {'lower': 'assault'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'assaulted'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'weapons'}, {'lower': 'offense'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'hit-and-run'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'hit'}, {'lower': 'and'}, {'lower': 'run'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'strangulation'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'obstruct'}, {'lower': 'breath'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'serious'}, {'lower': 'injury'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'injury'}, {'lower': 'to'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'injured'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'knifing'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'stabbing'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'stab'}]}
{'label': 'ASSAULT', 'pattern': [{'lower': 'poison'}]}
{'label': 'AS

In [49]:
homicide_pattern = ''

for key in homicide_key_words:
    if len(key.split()) > 1:
        pattern = []
        for word in key.split():
            pattern.append({"lower" : word})
        homicide_pattern += str({"label": "HOMICIDE", "pattern": pattern}) + '\n'
    else:
        homicide_pattern += str({"label": "HOMICIDE", "pattern": [{"lower": key}]}) + '\n'
        
print(homicide_pattern)

{'label': 'HOMICIDE', 'pattern': [{'lower': 'murder'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'assassination'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'homicide'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'lynching'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'manslaughter'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'genocide'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'kill'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'killed'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'dowry'}, {'lower': 'death'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'death'}, {'lower': 'investigation'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'fatally'}, {'lower': 'shot'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'died'}, {'lower': 'after'}, {'lower': 'shots'}, {'lower': 'fired'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'died'}, {'lower': 'after'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'shot'}]}
{'label': 'HOMICIDE', 'pattern': [{'lower': 'shooting'}]}
{

In [50]:
final_pattern = ''.join([harassment_pattern, theft_pattern, robbery_pattern, auto_theft_pattern, assault_pattern, homicide_pattern])

with open(os.path.join(PATH_TO_PATTERNS, 'base-patterns.jsonl'), 'w') as file:
    file.write(final_pattern.replace("\'", '\"'))

### Upload news body in jsonl where is_criminal == 1 

In [51]:
type(news_df['harassment_keywords_body'][0])

list

In [52]:
news_df['harassment_keywords_body'].apply(len).sort_values()

0        0
32542    0
32543    0
32544    0
32545    0
        ..
44953    5
39356    5
45464    5
17111    6
17327    6
Name: harassment_keywords_body, Length: 49309, dtype: int64

In [53]:
news_df[news_df['is_harassment'] == 1].sort_values('harassment_keywords_body', key = lambda x: x.apply(len), ascending=False).head()

Unnamed: 0,source_name,url,paragraphs,paragraphs_nwords,dateadded,eventrootcode,eventbasecode,eventcode,actiongeo_fullname,actiongeo_countrycode_iso3,v2themes_names,body,cleaned_body,harassment_keywords_body,is_harassment,theft_keywords_body,is_theft,robbery_keywords_body,is_robbery,auto_theft_keywords_body,is_auto_theft,assault_keywords_body,is_assault,homicide_keywords_body,is_homicide,trial_keywords_body,is_trial,car_accident_keywords_body,is_car_accident,stats_news_keywords_body,is_stats_news,gun_policy_news_keywords_body,is_gun_policy_news,non_crime_keywords_body,is_non_crime,is_criminal
17327,thecolu.mn,http://thecolu.mn/11203/top-5-falsehoods-the-r...,Minnesota is poised to pass one of the nation’...,1558.0,2022-07-30 11:45:00,2.0,23.0,23.0,"Little Falls, Minnesota, United States",USA,['AFFECT' 'BAN' 'BULLYING' 'CRISISLEX_CRISISLE...,Minnesota is poised to pass one of the nation'...,Minnesota is poised to pass one of the nation'...,"[bullied, harassing, harassment, intimidating,...",1,[],0,[],0,[],0,[],0,[kill],1,[],0,[],0,[],0,[],0,[],0,False
34777,jewishlink.news,https://jewishlink.news/features/53295-what-cn...,There were some good points in CNN’s recent sp...,1321.0,2022-09-01 16:30:00,16.0,163.0,163.0,"Jersey City, New Jersey, United States",USA,['ARMEDCONFLICT' 'CRISISLEX_C07_SAFETY' 'CRISI...,There were some good points in CNN's recent sp...,There were some good points in CNN's recent sp...,"[defamation, harass, harassing, harassment, in...",1,[],0,[],0,[],0,"[assault, shooting]",1,"[murder, shooting]",1,[],0,[],0,[],0,[],0,[],0,False
38313,www.candgnews.com,https://www.candgnews.com/news/antidefamation-...,SOUTHFIELD — People will soon come together to...,570.0,2022-09-08 18:30:00,14.0,141.0,141.0,"Detroit, Michigan, United States",USA,['BULLYING' 'CRISISLEX_CRISISLEXREC' 'DEMOCRAC...,SOUTHFIELD -- People will soon come together t...,SOUTHFIELD -- People will soon come together t...,"[bullied, defamation, harassed, tracking]",1,[],0,[],0,[],0,[],0,[],0,[],0,[],0,[],0,[],0,[],0,False
4878,www.huffpost.com,https://www.huffpost.com/entry/brett-kavanaugh...,Supreme Court Justice ducked out the back door...,732.0,2022-07-09 02:45:00,14.0,141.0,141.0,"Washington, District of Columbia, United States",USA,['CONSTITUTIONAL' 'CRISISLEX_C03_WELLBEING_HEA...,Supreme Court Justice ducked out the back door...,Supreme Court Justice ducked out the back door...,"[harass, harassed, intimidate, intimidation]",1,[],0,[],0,[],0,[],0,[],0,[],0,[],0,[],0,[],0,[],0,False
32248,www.grandrapidsmn.com,https://www.grandrapidsmn.com/opinion/commenta...,Believing and breaking down the barriers of wh...,706.0,2022-08-27 13:30:00,18.0,180.0,180.0,United States,USA,['ACT_HARMTHREATEN' 'CRISISLEX_CRISISLEXREC' '...,Believing and breaking down the barriers of wh...,Believing and breaking down the barriers of wh...,"[groping, harass, harassment, obscene]",1,[],0,[],0,[],0,[assault],1,[],0,[],0,[],0,[],0,[],0,[],0,False


In [54]:
q

NameError: name 'q' is not defined

In [None]:
crime_cols

In [None]:
fpath = os.path.join(PATH_TO_DATA_ROOT_DIR, r'data_providers\gdelt\scraped_news')
news_body_to_write = news_df.sample(frac=1)

for col in crime_cols:
    category = (col.split('_')[1] if len(col.split('_')) == 2 else '_'.join(col.split('_')[1:]))
    fname = 'usa_news_text_body_2022-07-01_2022-10-01_' + category + '.jsonl'
    sort_column = category + "_keywords_body"
    
    mask = news_body_to_write[col] == 1
    news_body_jsonl = pd.DataFrame(news_body_to_write[mask].reset_index(drop=True).sort_values(by='paragraphs_nwords')['cleaned_body'])
    #news_body_jsonl = pd.DataFrame(news_body_jsonl.sort_values(by=sort_column, key=lambda x: x.apply(len), ascending=False)['cleaned_body'])
    news_body_jsonl = news_body_jsonl.rename(columns={'cleaned_body' : 'text'})
    news_body_jsonl.to_json(os.path.join(fpath, fname), orient='records', lines=True)
    news_body_to_write = news_body_to_write[~mask]
    
    print(f'Number of news in {category}: {news_body_jsonl.shape[0]}')

In [None]:
news_body_to_write = news_df.sample(frac=1)

category = 'harassment'
fname = 'usa_news_text_body_2022-07-01_2022-10-01_' + category + '.jsonl'
sort_column = category + "_keywords_body"
    
mask = news_body_to_write['is_harassment'] == 1
news_body_jsonl = news_body_to_write[mask].reset_index(drop=True)
news_body_jsonl = pd.DataFrame(news_body_jsonl.sort_values(by=sort_column, key=lambda x: x.apply(len), ascending=False)['cleaned_body'])
news_body_jsonl = news_body_jsonl.rename(columns={'cleaned_body' : 'text'})

In [None]:
news_body_jsonl

In [None]:
mask = news_df['is_criminal'] == 1

news_body_jsonl = pd.DataFrame(news_df.loc[mask, 'cleaned_body'].reset_index(drop=True))
news_body_jsonl = news_body_jsonl.rename(columns={'cleaned_body' : 'text'})

print(news_body_jsonl.shape)
news_body_jsonl.head()

In [None]:
q

In [None]:
fpath_jsonl = os.path.join(PATH_TO_DATA_ROOT_DIR, r'data_providers\gdelt\scraped_news\usa_news_text_body_2022-07-01_2022-10-01.jsonl')

news_body_jsonl.to_json(fpath_jsonl, orient='records', lines=True)

In [None]:
from spacy.lang.en import English

nlp = English()
doc = nlp('Hello Nazar Protsiv, congrats You')

for toc in doc:
    print(doc.lower)

In [None]:
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)

# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "unlawful"}, {'LOWER': 'exposure'}]
matcher.add("HARASS", [pattern])

#nlp = spacy.load("en_core_web_lg")
#matcher = PhraseMatcher(nlp.vocab)
#terms = ['unlawful exposure']
# Only run nlp.make_doc to speed things up
#patterns = [nlp.make_doc(text) for text in terms]
#matcher.add("HARASSMENT", patterns)

doc = nlp("A man was accused on unlawful exposure "
          "A man was accused on Unlawful Exposure ")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

In [None]:
harassment_patterns