# Using NLP to Classify News Articles about Intimate Partner Violence

### Capstone project by Sean Justice for the Fall 2018 cohort at the New York City Data Science Academy
Further details available on [this blog post](https://nycdatascience.com/blog/student-works/using-nlp-to-classify-articles-about-intimate-partner-violence/)


In [None]:
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import spacy
from spacy import displacy
import json
import requests

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

## Use dataset generated from parsing the [pdf on Arizona intimate partner violence](https://everytownresearch.org/documents/2015/09/census-domestic-violence-gun-homicides-arizona.pdf)
- PDF contains information on 105 instances of IPV in Arizona from 2009 to 2013
    - Research performed by [Everytown for Gun Safety](https://everytownresearch.org/)
- Each entry contains the date of the incident, the city where it occurred, and a paragraph summarizing the incident
- Also contained are some labels that classify the type of incident
     - Was this a shooter suicide incident?
     - Was there a history of domestic violence?
     - Were there any prior convictions in the case?
     - Had an order of protection been granted by the courts?
     - Did the order require firearms be turned in to law enforcement?
     - Did the federal government prohibit the attacker from possessing firearms?
- Just two of the subclassifications will be used for creating a model using news articles about the incidents
    - Shooter suicide
    - History of domestic violence


In [None]:
# Load the incident table that was parsed from the pdf
az_df = pd.read_csv(
    './data/census-domestic-violence-arizona.csv',
    skiprows=[1],     # skip the line directly after the header line
    na_filter=False,  # keep empty cells as empty strings instead of NaN
)

In [None]:
# Show the loaded incident table in the notebook output
az_df

### Perform named entity extraction to get the names of all those involved in each instance of IPV
- These names will be used for finding news articles about each instance
- Using both nltk and spacy for named entity extraction since I found that on their own they missed some names

In [None]:
# NLTK based named entity extraction
def extract_entities(text):
    """Return the unique PERSON names that NLTK finds in *text*.

    The text is split into sentences; each sentence is tokenized, tagged with
    parts of speech and chunked into named entities. Only chunks labeled
    PERSON are kept, in order of first appearance.
    """
    people = []
    for sentence in nltk.sent_tokenize(text):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged_tokens):
            # Skip anything that is not a labeled PERSON chunk
            if not hasattr(node, 'label') or node.label() != 'PERSON':
                continue
            full_name = ' '.join([leaf[0] for leaf in node.leaves()])
            if full_name not in people:
                people.append(full_name)
    return people

az_df['ne_names'] = az_df.Text.apply(extract_entities)

In [None]:
# Load the small english model for spacy to perform named entity detection
nlp = spacy.load('en_core_web_sm')
def parse_nlp_spacy(text):
    """Return the text of every entity that spaCy labels PERSON in *text*."""
    people = []
    for entity in nlp(text).ents:
        if entity.label_ == 'PERSON':
            people.append(entity.text)
    return people

In [None]:
# Run the spaCy based extraction on the summary paragraph of every incident
az_df['spacy_names'] = az_df['Text'].apply(parse_nlp_spacy)

In [None]:
# Merge the nltk and spacy name lists of each incident into one comma-separated string.
# The union is sorted because the iteration order of a set of strings can differ
# between interpreter runs, which would otherwise change the search query built from
# this column from one run to the next.
az_df['names'] = az_df.apply(lambda x: ','.join(sorted(set(x.ne_names) | set(x.spacy_names))), axis=1)

In [None]:
# Keep the incident summary, where and when it happened, the two response labels and the names
ipv_columns = ['Text', 'Location', 'Date', 'shooter_suicide', 'dv_history', 'names']
az_ipv_data = az_df.loc[:, ipv_columns].copy()
az_ipv_data.head()

In [None]:
# Intermediate writing of data to csv
# az_ipv_data.to_csv('./az_ipv_data.csv', index=False)

## Use bing search to find news articles about each incident using the names
- Uses the Bing Web Search API (v7), part of Microsoft's Azure Cognitive Services
- Each search returns ten results
- Search uses two keys that are read from the ./data/bing_keys.json config file
- Tutorial on how to setup Azure search can be found here: https://docs.microsoft.com/en-us/azure/cognitive-services/bing-web-search/quick-start


In [None]:
# Read the two subscription keys from the local config file
with open('./data/bing_keys.json') as f:
    data = json.load(f)
bing_key1, bing_key2 = data['bing_key1'], data['bing_key2']

# Search endpoint and the browser user agent string used for the requests
bing_search_url = 'https://api.cognitive.microsoft.com/bing/v7.0/search'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'

In [None]:
# Substrings that disqualify a search result: a url containing any of them is dropped.
# These are websites (and file types such as pdf/xlsx) that appear in the results
# but do not give meaningful results.
website_ban = ['spokeo', 'facebook', 'twitter', 'ancestry', 'youtube', 'tripadvisor', 'gofundme', 'mylife', 'amazon',
               'peoplesmart', 'beenverified', 'myheritage', 'obituary', 'obituaries', 'whitepages', 'findagrave',
               'peoplefinder', 'peekyou', 'courtlistener', 'elder-law', 'linkedin', 'imdb', 'wikipedia', 'celebrity',
               'mugshots', 'walsh', 'madehoops', 'trakt.tv', 'tvmaze', 'brianshealinghearts', 'medicine.yale', 'freerepublic',
               'nocera.blogs', 'webuygoldandsilver', 'aminoapps', 'contempglass', 'earnthenecklace', 'change.org', 'pdf', 'xlsx']

# The search query is the proper names of each individual in the incident
# Each query returns a json document whose web page results carry the urls
def bing_search(text):
    """Search Bing for the names in *text* and return the usable result urls.

    Parameters
    ----------
    text : str
        Comma-separated names for one incident. Single-word entries (no
        space) are dropped before searching.

    Returns
    -------
    list of str
        The url of every web page result that contains none of the
        substrings in ``website_ban``. Empty when nothing usable is found
        or when no multi-word name is left to search for.

    Raises
    ------
    requests.HTTPError
        If the search service answers with an error status.
    """
    # Only multi-word names are kept for the query
    text = ','.join([x for x in text.split(',') if x.count(' ') > 0])
    if not text:
        # Nothing to search for; do not send an empty query to the service
        print('No results for search term: {}'.format(text))
        return []

    # 'User-Agent' is the standard header name; 'UserAgent' would be sent as an unrelated header
    headers = {"Ocp-Apim-Subscription-Key": bing_key1, 'User-Agent': user_agent, 'Pragma': 'no-cache'}
    params = {"q": text, "textDecorations": True, "textFormat": "HTML"}
    response = requests.get(bing_search_url, headers=headers, params=params)
    response.raise_for_status()

    # The 'webPages' entry is missing when a query has no web page results
    results = response.json().get('webPages', {}).get('value', [])
    url_list = []
    for inst in results:
        url = inst.get('url')
        # Skip results from the list of banned websites
        if url and not any(x in url for x in website_ban):
            url_list.append(url)
    if not url_list:
        print('No results for search term: {}'.format(text))
    return url_list

In [None]:
# One web search per incident; the result is a list of urls for that incident
az_ipv_data['url_list'] = az_ipv_data['names'].apply(bing_search)

In [None]:
# Remove rows that don't have any urls associated with them.
# The mask is True for rows with at least one url; it must not be negated,
# otherwise only the rows WITHOUT urls would be kept.
az_ipv_data = az_ipv_data.loc[az_ipv_data.url_list.apply(lambda x: len(x) > 0)].copy()

In [None]:
# Reload the incident table from a csv file, using its first column as the index.
# NOTE(review): this replaces the az_ipv_data built in the cells above, so the cells
# below work with whatever columns this file holds — presumably a saved copy of the
# search results; confirm how the file was written and how url_list is stored in it.
az_ipv_data = pd.read_csv('./data/az_ipv_data_with_urls2.csv', index_col=0)
az_ipv_data.head()

In [None]:
def _as_url_list(value):
    """Return *value* as a list of url strings.

    A url list that went through a csv file comes back as a single string.
    Joining a string with ' ' would put a space between every character, so
    strings are converted back to a list first: either parsed as a Python
    list literal or, failing that, split on whitespace. Missing values give
    an empty list.
    """
    import ast

    if value is None or isinstance(value, float):
        return []
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return value.split()
        return list(parsed) if isinstance(parsed, (list, tuple)) else value.split()
    return list(value)

# Create a dataframe where each url has a row
url_lists = az_ipv_data.url_list.apply(_as_url_list)
az_ipv_url_df = pd.DataFrame(url_lists.apply(' '.join).str.split(expand=True).stack())
# Drop the position-within-list level so that every url keeps the index of its incident
az_ipv_url_df.index = az_ipv_url_df.index.droplevel(level=1)
az_ipv_url_df.columns = ['url']

In [None]:
# Attach every url to the data of its incident by matching on the shared index
merged_df = az_ipv_data.merge(az_ipv_url_df, left_index=True, right_index=True)

In [None]:
# Keep the incident fields plus the single url of each row.
# NOTE(review): 'City' and 'Year' are selected here although the in-memory selection
# earlier in the notebook used 'Location' — presumably these columns come from the
# reloaded csv file; confirm against that file.
az_ipv_merged_url_df = merged_df[['Text','City', 'Date', 'shooter_suicide', 'dv_history', 'Year', 'names', 'url']]
az_ipv_merged_url_df.head()

## Gather contents for each url related to the IPV incident
- First check that the url is valid otherwise skip it
- Grab the contents of the website using BeautifulSoup


In [None]:
# Check that the url is valid
def check_request_ok(url, timeout=30):
    """Return True when *url* can be fetched with an OK status.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that one
        unresponsive site cannot stall the whole run.

    Returns
    -------
    bool
        True for an OK response; False for any other status or when the
        request itself fails.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except (requests.exceptions.RequestException, ValueError):
        # Only request failures and malformed urls are treated as "invalid";
        # a keyboard interrupt or a programming error is no longer swallowed.
        print('Connection refused to {}'.format(url))
        return False
    return r.status_code == requests.codes.ok

In [None]:
# Boolean mask: True for every url that answered with an OK status
request_check = az_ipv_merged_url_df['url'].apply(check_request_ok)

In [None]:
# Keep only the rows whose url passed the request check
az_ipv_merged_url_df = az_ipv_merged_url_df[request_check].copy()

In [None]:
# Parse the text contents of each site. Skip some entries based on experience with some websites
def get_url_text(url, timeout=30):
    """Download *url* and return the blocks of article text found in it.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    list of str
        The text of every ``<p>``/``<text>`` tag with more than ten words.
        When there is none, or the first one mentions 'compilation', the
        non-empty text of the ``div`` elements whose class contains
        'entry-content' is returned instead. Empty when the page cannot be
        downloaded.
    """
    try:
        req = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # A page that fails now simply yields no text instead of aborting the whole run
        print('Connection refused to {}'.format(url))
        return []

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    bs = BeautifulSoup(req.text, 'html.parser')
    paragraphs = [tag.text for tag in bs.find_all(['p', 'text']) if len(tag.text.split()) > 10]
    if paragraphs and 'compilation' not in paragraphs[0]:
        return paragraphs

    # Fall back to the entry-content containers
    entry_text = [inst.get_text() for inst in bs.select('div[class*="entry-content"]')]
    return [x for x in entry_text if x]

In [None]:
# Download every remaining url and store the extracted text blocks in a new column
az_ipv_merged_url_df['url_text'] = az_ipv_merged_url_df['url'].apply(get_url_text)
# Keep only the urls for which at least one text block was extracted
has_text = az_ipv_merged_url_df['url_text'].apply(len) > 0
az_ipv_text_df = az_ipv_merged_url_df.loc[has_text].copy()

## Continue filtering the entries to only the ones that appear to be related to the incident
- Use keywords to see if the entry is valid
- Also make sure the names of the individuals appear in the article
- This reduces the number of articles down to the ones that are related to IPV
- Also clean up the text for each entry

In [None]:
keywords_list = ['kill', 'murder', 'arizona', 'victim', 'gun', 'homicide', 'arrest', 'manslaughter', 'police', 'argument']
def check_for_keywords(row):
    """Return True when the article mentions both a keyword and a name.

    Parameters
    ----------
    row : object with ``url_text`` and ``names`` attributes
        ``url_text`` is an iterable of text blocks; ``names`` is a string of
        names separated by commas and/or spaces.

    Returns
    -------
    bool
        True as soon as the blocks read so far contain at least one entry of
        ``keywords_list`` and at least one name token (both compared in
        lower case, as substrings); False otherwise. The two matches may
        come from different blocks.
    """
    # Empty tokens are dropped: '' is a substring of every text and would
    # make any article look like it mentions a name.
    name_tokens = {x.lower() for x in re.split(',| ', row.names) if x}
    keyword_match = False
    name_match = False
    for cur_block in row.url_text:
        cur_block_lower = cur_block.lower()
        keyword_match = keyword_match or any(x in cur_block_lower for x in keywords_list)
        name_match = name_match or any(x in cur_block_lower for x in name_tokens)
        if keyword_match and name_match:
            return True
    return False

In [None]:
# Keep the articles that mention both an incident keyword and one of the incident's names
keyword_check = az_ipv_text_df.apply(check_for_keywords, axis=1)
az_ipv_news_df = az_ipv_text_df[keyword_check].copy()

In [None]:
# Every article is a list of text blocks; glue the blocks together into a single string
az_ipv_news_df['url_text'] = az_ipv_news_df['url_text'].apply(' '.join)

In [None]:
# Clean up the text since there are some tags in the strings
# The patterns are compiled once; raw strings keep the backslashes intact for the
# regex engine (a plain '\s' is an invalid string escape).
_MARKUP_REGEX = re.compile(r'\n|\xa0|\t|<[a-z"= ]+?>')
_SPACE_REGEX = re.compile(r'\s\s+')
def clean_text_data(text):
    """Return *text* with line breaks, tabs and simple tags turned into spaces.

    Newlines, non-breaking spaces, tabs and tags made only of lower-case
    letters, quotes, '=' and spaces are each replaced by a space; every run
    of two or more whitespace characters is then collapsed to one space.
    """
    return _SPACE_REGEX.sub(' ', _MARKUP_REGEX.sub(' ', text))

# Work on a copy of the news table whose article text has been cleaned
cleaned_url_text = az_ipv_news_df.url_text.apply(clean_text_data)
az_ipv_cleaned_text = az_ipv_news_df.assign(url_text=cleaned_url_text)

# Remove this cell

## Building a classification model using text about each IPV incident
- Built two models. One using the shooter suicide category as the response, 
- Another on the history of domestic violence as the response
- First, since there are multiple articles for some incidents, decide which one to use based on the number of words and the number of IPV keywords that appear in the text

In [None]:
# Encode the responses as 0 or 1
def _encode_yes_no(column):
    """Return an integer column: 1 where the value is 'Yes', 0 otherwise.

    Surrounding whitespace is ignored for both labels, so 'Yes ' is encoded
    the same way in either column. The result has an integer dtype.
    """
    return (column.astype(str).str.strip() == 'Yes').astype(int)

# Plain column assignment replaces the column, so the integer dtype is kept
az_ipv_cleaned_text['shooter_suicide'] = _encode_yes_no(az_ipv_cleaned_text.shooter_suicide)
az_ipv_cleaned_text['dv_history'] = _encode_yes_no(az_ipv_cleaned_text.dv_history)

In [None]:
# Load the en_core_web_sm model once for this step instead of once per article
url_ner_model = spacy.load('en_core_web_sm')

def extract_named_entities(text):
    """Return the distinct single-word PERSON entities of *text* in lower case.

    An entity is kept when spaCy labels it PERSON, it contains no space, it
    starts with a capital letter A-Z and it is longer than three characters.
    The order of the returned list is not defined.
    """
    doc = url_ner_model(text)
    single_names = {X.text for X in doc.ents
                    if X.label_ == 'PERSON' and X.text.count(' ') == 0 and re.match('[A-Z]', X.text)}
    return [x.lower() for x in single_names if len(x) > 3]

az_ipv_cleaned_text['url_names'] = az_ipv_cleaned_text.url_text.apply(extract_named_entities)

In [None]:
keywords = ['kill', 'murder', 'arizona', 'victim', 'gun', 'homicide', 'arrest', 'manslaughter', 'police', 'argument']
def get_num_keywords(row):
    """Return how many distinct keywords and name tokens occur in the article.

    Parameters
    ----------
    row : object with ``url_text`` and ``names`` attributes
        ``url_text`` is the article text as one string; ``names`` is a
        string of names separated by commas and/or spaces.

    Returns
    -------
    int
        Number of distinct entries of ``keywords`` plus name tokens that are
        substrings of the lower-cased article text.
    """
    # The keywords and name tokens are lower case, so the text has to be
    # lower-cased too; otherwise 'Police' or 'John' would never be counted.
    text_lower = row.url_text.lower()
    # Empty tokens are dropped because '' is a substring of every text
    name_tokens = {x.lower() for x in re.split(',| ', row.names) if x}
    cur_keywords = set(keywords) | name_tokens
    return sum(1 for x in cur_keywords if x in text_lower)

# Two scores per article: its length in characters and its keyword count
az_ipv_cleaned_text['url_text_length'] = az_ipv_cleaned_text.url_text.apply(len)
az_ipv_cleaned_text['keyword_count'] = az_ipv_cleaned_text.apply(get_num_keywords, axis=1)
# Best score per index value (rows sharing an index value are grouped together)
per_index = az_ipv_cleaned_text.reset_index().groupby('index')
groupby_index = per_index.agg({'url_text_length':'max', 'keyword_count':'max'})
groupby_index = groupby_index.rename(columns={'url_text_length':'max_text_length', 'keyword_count':'max_keyword_count'})
# Put the maxima next to every article with the same index value
az_merged_df = az_ipv_cleaned_text.merge(groupby_index, left_index=True, right_index=True)

In [None]:
# Flatten the per-article name lists into one list (also works when there are no articles)
name_list = [name for names in az_ipv_cleaned_text.url_names for name in names]
# Some last names happen to be common words, so those words are taken out of the
# name list and therefore are not treated as names to remove from the text later
names_to_remove = ['manslaughter', 'ar-15/m16', 'bias', 'bigamy', 'blanks', 'broken', 'case', 'child', 'colt', 'drive', 
                   'fill', 'girl', 'husband', 'jesus', 'long', 'neighbors', 'organization', 'rich', 'scene', 'sheriff',
                   'sirens', 'smart', 'xanax', 'young', 'younger', 'burger', 'burritos', 'glass']
name_list = [x for x in name_list if x not in names_to_remove]

wanted_columns = ['Text', 'City', 'Date', 'shooter_suicide', 'dv_history', 'prior_convict', 'order_of_protect',
                  'require_turn_in_firearm', 'fed_prohib', 'Year', 'url', 'url_text', 'names', 'url_text_length',
                  'keyword_count', 'max_text_length', 'max_keyword_count']
# Only select the columns that are actually present: several of the label columns
# are not carried through the earlier column selections, and asking for a missing
# column raises a KeyError.
missing_columns = [col for col in wanted_columns if col not in az_merged_df.columns]
if missing_columns:
    print('Columns not available in az_merged_df: {}'.format(missing_columns))
az_reduced_data = az_merged_df[[col for col in wanted_columns if col not in missing_columns]]

In [None]:
# Mask of the articles with the highest keyword count of their incident.
# NOTE(review): this mask is computed but not applied; only the text length decides below.
find_max_mask = az_reduced_data.keyword_count == az_reduced_data.max_keyword_count
# Keep the articles whose length equals the longest one found for the same incident
# (a direct column comparison instead of a row-by-row apply)
longest_mask = az_reduced_data.url_text_length == az_reduced_data.max_text_length
az_reduced_data = az_reduced_data.loc[longest_mask].copy()
az_reduced_data.drop_duplicates(inplace=True)
# Combine the text from the pdf with the text from the news articles to increase the words count associated with each article
#az_reduced_data['combined_text'] = az_reduced_data.url_text + az_reduced_data.Text

## NLP analysis of text from news articles
- Updating the stop word list based on some words that were appearing in the model, but did not give meaningful results
- Also removing the list of names so that the model doesn't overfit the training data 
- Perform grid search cross validation to find the best settings for the models.
- Use a pipeline that first builds a TF-IDF vectorizer and then feeds its output to a stochastic gradient descent classifier with hinge loss (i.e. a linear SVM)

### Domestic violence model

In [None]:
# Hand-picked extra stop words: words that were appearing in the model but did not
# give meaningful results
manual_stopwords = ['dallas', 'anthony', 'turquoise', 'manuel', 'phoenix', 'tpd', 'phoenix', 'pima', 'mohave',
                    'thompson', 'jr', 'ramapo', 'tucson', 'rios', 'county', 'trevino', 'mccormick', 'deckard', 'tia', 
                    'vicki', 'caldwell', 'luberda', 'scottsdale', 'havasu', 'azcentral', 'la', 'mccaskill', 
                    'wayne', 'douglas', 'jay', 'adtech_pagealias', 'chicago', 'rosarito', 'margaret', 'css', 
                    'ford', 'de', 'los', 'seattle', 'carolina', 'martos', 'morales', 'san', 'joe', 'torres', 
                    'se', 'perez', 'doris', 'west', 'gilbert', 'oro', 'ismael', 'bodine', 'martos', 'ulan', 'los',
                    'flagstaff', 'torres', 'comments', 'var', 'readers', 'reading', 'nyland', 'las', 'unlimited', 
                    'philip', 'joshua', 'bali', 'lizzie', 'wakeham', 'sanders', 'dagle', 'tina', 'vondran', 'dewitteâ', 
                    'mesa', 'east', 'city', 'theresa', 'beaver', 'gallegos', 'scharge', 'pascual', 'ned', 'california']
# Full stop word set: nltk's english stop words, the names found in the articles
# and the manual list above
new_stopwords = set(stopwords.words('english') + name_list + manual_stopwords)

# The words of the label itself are removed as well for the domestic violence model
dv_stopwords = ['domestic', 'violence'] + list(new_stopwords)
dv_tfidf = TfidfVectorizer(stop_words=dv_stopwords, sublinear_tf=True)
dv_classifier = SGDClassifier(loss='hinge', n_jobs=2, random_state=42, class_weight='balanced')
# TF-IDF features followed by the SGD classifier
dv_pipeline = Pipeline([('tfidf', dv_tfidf), ('clf', dv_classifier)])

In [None]:
# Hyperparameters to vary during grid search
my_grid_dv = {
    'tfidf__ngram_range': ((1,2), (1,3)),
    'tfidf__min_df': (2, 5),
    'tfidf__max_df': (0.9, 0.8),
    'tfidf__use_idf': (True, False),
    'tfidf__max_features': (10000, 25000, 50000),
    'clf__alpha': (1e-3, 1e-5, 1e-7),
    'clf__max_iter': (10, 50, 80),
}
# 5-fold cross validation over every combination of the settings above
grid_cv_dv = GridSearchCV(estimator=dv_pipeline, param_grid=my_grid_dv, cv=5, n_jobs=5, verbose=3)

In [None]:
# Divide the domestic violence data into a training set and a 20% test set,
# stratified on the label, then run the grid search on the training set
dv_labels = az_reduced_data.dv_history
X_train_dv, X_test_dv, Y_train_dv, Y_test_dv = train_test_split(
    az_reduced_data.url_text, dv_labels, test_size=0.2, random_state=42, stratify=dv_labels)
grid_cv_dv.fit(X_train_dv, Y_train_dv)

In [None]:
# Best hyperparameters found during grid search and their cross-validation score
print(grid_cv_dv.best_params_)
print(grid_cv_dv.best_score_)
# Score of the best model on the held-out test set, followed by its confusion matrix
best_dv_model = grid_cv_dv.best_estimator_
Y_test_predict_dv = best_dv_model.predict(X_test_dv)
print(best_dv_model.score(X_test_dv, Y_test_dv))
confusion_matrix(y_true=Y_test_dv, y_pred=Y_test_predict_dv)

In [None]:
# Refit the pipeline with the best grid search settings so that its fitted
# vectorizer and classifier steps can be inspected
dv_pipeline.set_params(**grid_cv_dv.best_params_)
dv_pipeline.fit(X_train_dv, Y_train_dv)
print(dv_pipeline.score(X_test_dv, Y_test_dv))
best_sdg_dv = dv_pipeline.named_steps.clf
best_tfidf_dv = dv_pipeline.named_steps.tfidf
X_train_tfidf_dv = best_tfidf_dv.transform(X_train_dv)
# get_feature_names() no longer exists in recent scikit-learn releases;
# use its replacement when the installed version provides it
if hasattr(best_tfidf_dv, 'get_feature_names_out'):
    feature_names_dv = best_tfidf_dv.get_feature_names_out()
else:
    feature_names_dv = best_tfidf_dv.get_feature_names()
# Pair every term with its weight in the first row of the coefficient matrix
coef_dict_dv = dict(zip(feature_names_dv, best_sdg_dv.coef_[0].tolist()))
# Largest absolute weight first
sorted_coef_list_dv = sorted(coef_dict_dv.items(), key=lambda kv: abs(kv[1]), reverse=True)

In [None]:
# Feature importance based on the terms from the TF-IDF vectorizer:
# (term, coefficient) pairs, largest absolute coefficient first
sorted_coef_list_dv

### Shooter suicide model

In [None]:
# Training set and 20% test set for the shooter suicide label, stratified on the label
ss_labels = az_reduced_data.shooter_suicide
X_train_ss, X_test_ss, Y_train_ss, Y_test_ss = train_test_split(
    az_reduced_data.url_text, ss_labels, test_size=0.2, random_state=42, stratify=ss_labels)

# The word of the label itself is removed as well for this model
ss_stopwords = list(new_stopwords) + ['suicide']
ss_tfidf = TfidfVectorizer(stop_words=ss_stopwords, sublinear_tf=True)
ss_classifier = SGDClassifier(loss='hinge', n_jobs=2, random_state=42, class_weight='balanced')
ss_pipeline = Pipeline([('tfidf', ss_tfidf), ('clf', ss_classifier)])

# Hyperparameters to vary during grid search
my_grid_ss = {
    'tfidf__ngram_range': ((1,2), (1,3)),
    'tfidf__min_df': (2, 5),
    'tfidf__max_df': (0.9, 0.8),
    'tfidf__use_idf': (True, False),
    'tfidf__max_features': (10000, 25000, 50000),
    'clf__alpha': (1e-3, 1e-5, 1e-7),
    'clf__max_iter': (10, 50, 80),
}
grid_cv_ss = GridSearchCV(estimator=ss_pipeline, param_grid=my_grid_ss, cv=5, n_jobs=5, verbose=3)
grid_cv_ss.fit(X_train_ss, Y_train_ss)

In [None]:
# Best hyperparameters found during grid search and their cross-validation score
print(grid_cv_ss.best_params_)
print(grid_cv_ss.best_score_)
# Score of the best model on the held-out test set, followed by its confusion matrix
best_ss_model = grid_cv_ss.best_estimator_
Y_test_predict_ss = best_ss_model.predict(X_test_ss)
print(best_ss_model.score(X_test_ss, Y_test_ss))
confusion_matrix(y_true=Y_test_ss, y_pred=Y_test_predict_ss)

In [None]:
# Get the feature importance
# Refit the pipeline with the best grid search settings so that its fitted
# vectorizer and classifier steps can be inspected
ss_pipeline.set_params(**grid_cv_ss.best_params_)
ss_pipeline.fit(X_train_ss, Y_train_ss)
print(ss_pipeline.score(X_test_ss, Y_test_ss))
best_sdg_ss = ss_pipeline.named_steps.clf
best_tfidf_ss = ss_pipeline.named_steps.tfidf
X_train_tfidf_ss = best_tfidf_ss.transform(X_train_ss)
# get_feature_names() no longer exists in recent scikit-learn releases;
# use its replacement when the installed version provides it
if hasattr(best_tfidf_ss, 'get_feature_names_out'):
    feature_names_ss = best_tfidf_ss.get_feature_names_out()
else:
    feature_names_ss = best_tfidf_ss.get_feature_names()
# Pair every term with its weight in the first row of the coefficient matrix
coef_dict_ss = dict(zip(feature_names_ss, best_sdg_ss.coef_[0].tolist()))
# Largest absolute weight first
sorted_coef_list_ss = sorted(coef_dict_ss.items(), key=lambda kv: abs(kv[1]), reverse=True)

In [None]:
# (term, coefficient) pairs of the shooter suicide model, largest absolute coefficient first
sorted_coef_list_ss