In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string
from datetime import datetime
import operator
import string
import os

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

pd.set_option('display.max_columns', 500)
pd.options.mode.chained_assignment = None

In [2]:
# Load in the data
path = os.getcwd()
path = path + '\\NLP_Data\\'

df = pd.read_csv(path + 'accident_and_injury.csv')

In [3]:
# Replace empty data cells with Nan
df = df.replace('', np.nan)
df = df.replace(' ', np.nan)

In [4]:
# change backslashe to whitespace because sentences will be split into words based on whitespaces
df['Abstract Text'] = df['Abstract Text'].map(lambda x: x.replace('\\', ' '))

In [5]:
# Extract the time of each event based by locating it in the text, this is done by using regex
pattern = '([0-9]{1,2}:[0-9]{2,2}\s(?:a\.m\.|p\.m\.))'
df['event_time'] = df['Abstract Text'].map(lambda x: re.findall(pattern, x))

In [6]:
for i in range(len(df)):
    # If multiple times are found, the first one is taken to be the event time
    if (len(df['event_time'].iloc[i]) > 0) and (type(df['event_time'].iloc[i]) == list):
        df['event_time'].iloc[i] = str(df['event_time'].iloc[i][0])
    # If no times are found, then fill the cell with NaN
    elif len(df['event_time'].iloc[i]) == 0:
        df['event_time'].iloc[i] = np.nan

In [7]:
# Change the string format to convert it over to datetime
df['event_time'] = df['event_time'].map(lambda x: x.replace('.', '').replace(' ', '') if type(x) != float else x)
df['event_time'] = df['event_time'].map(lambda x: datetime.strptime(x, '%I:%M%p').time() if type(x) != float else x)

In [8]:
def abstract_process(text):
    # split into words
    words = word_tokenize(text)

    # Convert all words to lower case
    words = [w.lower() for w in words]

    # Remove punctuation from each word
    temp = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(temp) for w in words]

    # Remove non-alphabetic words
    words = [word for word in words if word.isalpha()]

    # Filter out so called stop words with a built in function
    # Example words: the, is, at, which and on
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]

    # Stemming all words
    # Stemming is the process of reducing words to their word stem, base or root form
    # Example: cook, cooking and cooked are all stemmed to the word cook
    porter = PorterStemmer()
    stemmed_words = [porter.stem(word) for word in words]
    
    return stemmed_words

In [9]:
# Create a seperate function for keywords that takes in a single string and returns stemmed words 
def keyword_process(text):
    # Split the string into words on | as seperator
    words = re.split(',| ', text)

    # Convert all words to lower case
    words = [w.lower() for w in words]

    # Remove punctuation from each word
    temp = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(temp) for w in words]

    # Remove non-alphabetic words
    words = [word for word in words if word.isalpha()]

    # Filter out so called stop words with a built in function
    # Example words: the, is, at, which and on
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]

    # Stemming all words
    # Stemming is the process of reducing words to their word stem, base or root form
    # Example: cook, cooking and cooked are all stemmed to the word cook
    porter = PorterStemmer()
    stemmed_keywords = [porter.stem(word) for word in words]
    
    return stemmed_keywords

In [10]:
# Define a function that cleans a string of words that provide no information on how an accident or injury occured
# This function takes care of words that provide no context at all
def string_clean(string):
    # Define a hard coded list of noisy words
    noisewords = ['January','February','March','April','May','June','July', 
                  'August', 'September', 'October', 'November', 'December',
                  'january','february','march','april','may','june','july', 
                  'august', 'september', 'october', 'november', 'december', 
                  'a.m.', 'p.m.']
    
    # Split the text into words
    words = string.split()
    
    # Remove all noisewords and recombine into a string
    cleaned_words  = [word for word in words if word.lower() not in noisewords]
    clean_string = ' '.join(cleaned_words)
    
    return clean_string

In [11]:
# Define a function that cleans a list of words that provide no information on how an accident or injury occured
# This function takes care of words that might provide some informatino but not in this case
def list_clean(word_list):
    # Define a hard coded list of noisy words
    noise_words = ['employe', 'work', 'approxim', 'cowork', 'right', 'left', 'worker', 'kill', 'sustain', 'hospit', 'oper', 
                   'suffer', 'die', 'injuri', 'one', 'two', 'three', 'finger', 'head', 'feet', 'amput', 'hand', 'use', 
                   'back', 'remov', 'cut', 'caus', 'foot']
    
    # Remove all noisewords from the list
    clean_list = [x for x in word_list if x not in noise_words]
    
    return clean_list

In [12]:
def get_ratio(dataframe, clean, event_desc_weight):
    ratio_list = list()
    overall_list = list()

    for i in range(len(dataframe)):
        if clean == True:
            # Clean the abstract text of words that provide no context
            abstract_text = string_clean(dataframe['Abstract Text'].iloc[i])
        else:
            abstract_text = dataframe['Abstract Text'].iloc[i]

        # Get the stemmed words from abstract text, event description and the event keywords
        stemmed_text = abstract_process(abstract_text)
        stemmed_desc = abstract_process(dataframe['Event Description'].iloc[i])
        stemmed_keywords = keyword_process(dataframe['Event Keywords'].iloc[i])

        # Get the total number of keywords in the text
        number_of_keywords = len(set(stemmed_keywords))

        if clean == True:
            # Clean the stemmed lists of words extracted from the abstract text and event description
            # These words provide no context in this specific scenario
            stemmed_text = list_clean(stemmed_text)
            stemmed_desc = list_clean(stemmed_desc)

        # Get an overall list of words that are common in all texts or lists
        overall_list = overall_list + stemmed_text + stemmed_desc

        # If any keywords were logged
        if number_of_keywords > 0:

            # Use built in NLP function to count the frequency of stemmed words in abstract text and the event description
            freq_text = nltk.FreqDist(stemmed_text)
            freq_desc = nltk.FreqDist(stemmed_desc)

            # Increase the weight on event description words because these are often more precise when it comes to the cause
            for key in stemmed_desc:    
                freq_desc[key] *=  event_desc_weight

            # Combine the frequency distributions
            freq_combined = { k: freq_text.get(k, 0) + freq_desc.get(k, 0) for k in set(freq_text) | set(freq_desc) }

            # Get a list of words with the highest frequency
            # The number of words is limited based on the number of keywords, this is done for accuracy comparison
            highest_combined = dict(sorted(freq_combined.items(), 
                                           key = operator.itemgetter(1), reverse = True)[:int(number_of_keywords)])

            # Find how many words in the highest frequency table is in the keyword list
            intersec = set(highest_combined).intersection(stemmed_keywords)

            # Calculate the ratio to use for accuracy predictions
            ratio = len(intersec) / number_of_keywords

            ratio_list.append(ratio)
            
    return (np.mean(ratio_list), overall_list)

In [13]:
org_ratio, org_overall_list = get_ratio(df, False, 1)
print('Accuracy based on original method is  {} %'.format(org_ratio)) 

clean_ratio, clean_overall_list = get_ratio(df, True, 2)
print('Accuracy with noise words removed and increased weight on description  {} %'.format(clean_ratio)) 

Accuracy based on original method is  0.2639325579958668 %
Accuracy with noise words removed and increased weight on description  0.3659481481277102 %


This last part was used to create the application mockup for health and safety overview.

In [14]:
main_freq = nltk.FreqDist(clean_overall_list)
dict(sorted(main_freq.items(), key=operator.itemgetter(1), reverse=True)[:10])

{'fall': 2200,
 'fell': 1853,
 'struck': 1811,
 'truck': 1619,
 'fractur': 1273,
 'machin': 1107,
 'ladder': 1040,
 'crush': 952,
 'roof': 909,
 'caught': 809}