In [291]:
import math
import numpy as np
import nltk
nltk.download('punkt')
import os
import pandas as pd
import random
import re
import stanza

from stanza.utils.datasets.ner.utils import write_dataset
from transform_weight_date import number_to_words, date_to_formats

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [292]:
# Read in the synthetic data set (the path is relative to the notebook's working directory)
df = pd.read_csv(
    filepath_or_buffer='../../data/data/generated/data_230124-172021.csv',
)

In [293]:
# Use re to replace any instances of "####kg" with "#### kg" where #### is any continuous
# sequence of digits and the unit is one of: kilogram(s), pound(s), kg(s), lb(s)
def separate_weight_unit(row):
    """Insert a space between a number and a weight unit glued directly to it.

    Both singular and plural spellings are handled ("5pound" -> "5 pound",
    "5pounds" -> "5 pounds") so that the unit ends up as its own token.

    Args:
        row: The text to normalise (a single string).

    Returns:
        The text with every "<digits><unit>" occurrence rewritten as
        "<digits> <unit>". Text that already has the space is unchanged.
    """
    # The optional trailing "s" is greedy, so plural forms are consumed whole.
    return re.sub(r'([0-9]+)(kilograms?|pounds?|kgs?|lbs?)', r"\1 \2", row)

# Strip out every space character (e.g. "Take 3" -> "Take3")
def remove_spaces(text):
    """Return `text` with all " " characters removed; other whitespace is left alone."""
    pieces = text.split(" ")
    return "".join(pieces)

# Swap the en dash (Unicode code point 8211 / U+2013) for the plain ASCII hyphen '-'
def character_norm(text):
    """Return `text` with every en dash replaced by '-'; all other characters are kept."""
    en_dash_to_hyphen = {8211: "-"}
    return text.translate(en_dash_to_hyphen)

# The word tokenizer emits ',' as its own token, so put a space in front of every comma
def add_comma_token(text):
    """Return `text` with each "," rewritten as " ," (a space is always inserted)."""
    segments = text.split(",")
    return " ,".join(segments)

# Apply the same comma-splitting as add_comma_token to every entry of a list of date strings
def add_date_var_comma_token(list):
    """Return a new list holding add_comma_token(entry) for each entry, in the same order.

    Note: the parameter is named `list`, which hides the builtin inside this function.
    """
    return [add_comma_token(date_variant) for date_variant in list]

# Collect the leading token of each date variation, to allow for faster downstream lookups
def get_first_token_set(list):
    """Return the set of first whitespace-separated tokens of the given strings.

    Every entry must contain at least one token; an empty or whitespace-only
    entry raises IndexError.
    """
    return {variant.split()[0] for variant in list}

In [294]:
# Cast columns to the dtypes used by the steps below
string_cols = ["item1", "item2", "location"]
df[string_cols] = df[string_cols].astype(str)
df['weight2'] = df['weight2'].astype('Int64')

# Normalize text columns so they line up with the tokenizer output
df['text'] = df['text'].apply(separate_weight_unit)
text_cols = ['text', 'organization', "item1", "item2", "location", "date"]
for col in text_cols:
    df[col] = df[col].apply(character_norm)
df['text'] = df['text'].apply(str.strip)

# Tokenize text
df['text_split'] = df['text'].apply(nltk.word_tokenize)

# Space-free variants of orgs and locations, then comma-split the string columns
df['org_no_space'] = df['organization'].apply(remove_spaces)
df['loc_no_space'] = df['location'].apply(remove_spaces)
for col in string_cols:
    df[col] = df[col].apply(add_comma_token)

# Spelled-out weights; a missing weight2 is stored as the string form of the null value
df['weight1_text'] = df['weight1'].apply(lambda weight: number_to_words(weight)[1])
df['weight2_text'] = df['weight2'].apply(
    lambda weight: str(weight) if pd.isnull(weight) else number_to_words(weight)[1]
)

# All date spellings, comma-split like the tokenizer, plus the set of their first tokens
df['date_vars'] = df['date'].apply(date_to_formats)
df['date_vars'] = df['date_vars'].apply(add_date_var_comma_token)
df['date_vars_first_token'] = df['date_vars'].apply(get_first_token_set)

# Make string columns lowercase for downstream comparisons
lowercase_cols = string_cols + ['organization', 'org_no_space', 'loc_no_space', 'weight1_text', 'weight2_text']
for col in lowercase_cols:
    df[col] = df[col].apply(str.lower)

df.head()

Unnamed: 0,location,organization,type,date,unit,weight1,item1,prompt,text,weight2,item2,text_split,org_no_space,loc_no_space,date_vars,weight1_text,weight2_text,date_vars_first_token
0,adams rocks,take 3,instagram caption,2017-04-07,kilograms,200,spread tubs,Generate an instagram caption for a beach clea...,It was inspiring to witness so many people com...,,,"[It, was, inspiring, to, witness, so, many, pe...",take3,adamsrocks,"[2017-04-07, April 7 , 2017, april 7 , 2017, A...",two hundred,<na>,"{April, 04/07/2017, 04/07/17, 7th, 7, 2017-04-..."
1,grahams beach,global alliance against marine pollution,instagram caption,2018-09-04,pounds,100,trash,Generate an instagram caption for a beach clea...,We just made a huge difference at Grahams Beac...,,,"[We, just, made, a, huge, difference, at, Grah...",globalallianceagainstmarinepollution,grahamsbeach,"[2018-09-04, September 4 , 2018, september 4 ,...",one hundred,<na>,"{2018-09-04, 09/04/2018, Sep, 4th, sep, 4, Sep..."
2,norfolk island,plastic pollution coalition australia,press release,2021-05-25,kilograms,386,glass bottles,Generate a press release for a beach cleanup w...,Plastic Pollution Coalition Australia (PPCA) i...,,,"[Plastic, Pollution, Coalition, Australia, (, ...",plasticpollutioncoalitionaustralia,norfolkisland,"[2021-05-25, May 25 , 2021, may 25 , 2021, May...",three hundred and eighty-six,<na>,"{05-25-2021, 25th, 25, 2021-05-25, may, May, 0..."
3,playa grande de saboga,take 3,press release,2016-12-30,kgs,332,tupperwares,Generate a press release for a beach cleanup w...,Take 3 Celebrates a Successful Beach Cleanup i...,,,"[Take, 3, Celebrates, a, Successful, Beach, Cl...",take3,playagrandedesaboga,"[2016-12-30, December 30 , 2016, december 30 ,...",three hundred and thirty-two,<na>,"{30, 12/30/16, 12/30/2016, 12-30-2016, decembe..."
4,playa de chachalacas,rameau project,press release,2022-04-15,lbs,168,plastic,Generate a press release for a beach cleanup w...,FOR IMMEDIATE RELEASE\n\nThe Rameau Project Ce...,,,"[FOR, IMMEDIATE, RELEASE, The, Rameau, Project...",rameauproject,playadechachalacas,"[2022-04-15, April 15 , 2022, april 15 , 2022,...",one hundred and sixty-eight,<na>,"{April, 04/15/2022, 04-15-2022, 04/15/22, 15, ..."


In [295]:
# Convert data into list of words with associated 'B - entity', 'I - entity' or 'O'

units = {"kilograms", "kilogram", "kgs", "kg", "lb", "lbs", "pounds", "pound"}
filler_words = {"and", "the", "a", "an", ","}


def _is_missing(value):
    """Return True when `value` is None, NaN or pd.NA (the field was not provided)."""
    return value is None or bool(pd.isna(value))


def _phrase_starts_at(lowered_words, start, phrase_tokens):
    """Return True when the non-empty `phrase_tokens` occurs in `lowered_words` at `start`."""
    size = len(phrase_tokens)
    # A slice that runs past the end is shorter than the phrase and so compares unequal.
    return size > 0 and lowered_words[start:start + size] == phrase_tokens


def assign_entity_types(row):
    """Tag every token of one row with a BIO entity label.

    Tokens are checked, in this order of priority, against the row's location,
    organization, weight units, weights (digits, then spelled out), trash items
    and date variations. Anything else is tagged "O".

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'text_split' (list of tokens), 'location', 'loc_no_space',
            'organization', 'org_no_space', 'weight1', 'weight1_text',
            'weight2', 'weight2_text', 'item1', 'item2', 'date_vars' and
            'date_vars_first_token'. 'weight2', 'weight2_text' and 'item2'
            may be None / NaN / pd.NA when absent.

    Returns:
        A list of (token, tag) tuples, one per token of row['text_split'], with
        tags drawn from B-/I- LOC, ORG, UNT, WEI, ITM, DAT and "O".

    Notes:
        Locations and items not supplied in the row are not detected.
        Unit, digit-weight, item and date-first-token checks are case-sensitive;
        multi-token phrase checks compare lowercased tokens.
    """
    words = row['text_split']
    lowered = [word.lower() for word in words]

    # Row-level lookups do not change per token, so build them once up front.
    location_tokens = row['location'].split()
    organization_tokens = row['organization'].lower().split()
    weight1_tokens = row['weight1_text'].split()

    weight_numbers = {str(row['weight1'])}
    weight2_tokens = []
    if not _is_missing(row['weight2']):
        weight_numbers.add(str(row['weight2']))
        if not _is_missing(row['weight2_text']):
            weight2_tokens = row['weight2_text'].split()

    item_tokens = set(row['item1'].split())
    if not _is_missing(row['item2']):
        item_tokens.update(row['item2'].split())
    item_tokens -= filler_words

    date_phrases = [date_var.lower().split() for date_var in row['date_vars']]

    new_tags = []
    prev_item_tag = False
    idx = 0
    while idx < len(words):
        word = words[idx]
        word_lower = lowered[idx]
        label, span = None, 1

        # Location: full multi-token name, or the space-free variant as a single token
        if _phrase_starts_at(lowered, idx, location_tokens):
            label, span = "LOC", len(location_tokens)
        elif word_lower == row['loc_no_space']:
            label = "LOC"

        # Organization: full multi-token name, or the space-free variant as a single token
        elif _phrase_starts_at(lowered, idx, organization_tokens):
            label, span = "ORG", len(organization_tokens)
        elif word_lower == row['org_no_space']:
            label = "ORG"

        # Weight unit
        elif word in units:
            label = "UNT"

        # Weight written in digits, then spelled out (first weight, then second)
        elif word in weight_numbers:
            label = "WEI"
        elif _phrase_starts_at(lowered, idx, weight1_tokens):
            label, span = "WEI", len(weight1_tokens)
        elif _phrase_starts_at(lowered, idx, weight2_tokens):
            # The span must come from the second weight's own token count.
            label, span = "WEI", len(weight2_tokens)

        # Items: any single item word counts; consecutive item words form one entity
        elif word in item_tokens:
            new_tags.append("I-ITM" if prev_item_tag else "B-ITM")
            prev_item_tag = True
            idx += 1
            continue

        # Dates: cheap first-token filter, then a full match against each variation
        elif word in row['date_vars_first_token']:
            for date_tokens in date_phrases:
                if _phrase_starts_at(lowered, idx, date_tokens):
                    label, span = "DAT", len(date_tokens)
                    break

        if label is None:
            new_tags.append("O")
        else:
            new_tags.append("B-" + label)
            new_tags.extend(["I-" + label] * (span - 1))

        # Any non-item token ends the current item entity, keeping the BIO sequence valid.
        prev_item_tag = False
        idx += span

    return list(zip(words, new_tags))

# Tag every row: assign_entity_types is called once per row (axis=1) and its
# return value is stored in the new 'tagged_entities' column.
df['tagged_entities'] = df.apply(assign_entity_types, axis =1)

In [296]:
# Test sentences
# df['text_split'][13] = ['I', 'have', 'two', 'hundred', 'and', 'twenty', 'dogs', 'and', 'two', 'hundred', 'and', 'fifty-seven', 
#                         'cats', 'ugly', 'two', 'hundred', 'and', 'twenty-one', 'on', '11', 'Jan', '2020', '01-11-2020']
# assign_entity_types(df.iloc[13])
# print(df['text_split'][13][22] in df['date_vars_first_token'][13])
# print('01-11-2020'.split())

In [297]:
# Print the tokens of one sample row that received an entity tag (anything but "O")
SAMPLE_NO = 8
sample_tags = df.iloc[SAMPLE_NO]['tagged_entities']
for token, tag in sample_tags:
    if tag == "O":
        continue
    print(token, tag)

# Show the raw text of the same sample for comparison
df['text'][SAMPLE_NO]

Adopt B-ORG
a I-ORG
Beach I-ORG
284 B-WEI
kgs B-UNT
Thessaloníki B-LOC
Thessaloníki B-LOC
April B-DAT
20 I-DAT
, I-DAT
2015 I-DAT
Adopt B-ORG
a I-ORG
Beach I-ORG
plastic B-ITM
Thessaloníki B-LOC
284 B-WEI
kgs B-UNT
aluminium B-ITM
blister I-ITM
packs I-ITM
disposable B-ITM
food I-ITM
containers I-ITM
plastic B-ITM
bottle I-ITM
caps I-ITM
Adopt B-ORG
a I-ORG
Beach I-ORG
plastic B-ITM
Adopt B-ORG
a I-ORG
Beach I-ORG
plastic B-ITM


'Adopt a Beach Cleanup results in 284 kgs of Waste Removed from Thessaloníki Beach\n\nThessaloníki, April 20, 2015 - Adopt a Beach, an initiative to reduce the amount of plastic waste in the ocean, is proud to report that a recent cleanup in Thessaloníki successfully removed 284 kgs of waste, including aluminium blister packs, disposable food containers, and plastic bottle caps.\n\nThis achievement was made possible through the dedication of over 200 volunteers and the generous support of several local businesses. In addition, the local community provided valuable support by collecting and disposing of the waste in a safe and sustainable manner.\n\n"This is an important milestone for our initiative," said Adopt a Beach spokesperson John Doe. "We are proud to have made a tangible difference in the amount of plastic waste in the ocean. We hope that this will serve as an example of the positive impact that can be made when the community comes together to tackle a common problem."\n\nAdopt

In [299]:

# Use stanza tokenizer to determine sentence chunks.
# Only the 'tokenize' processor of the English pipeline is requested.
nlp = stanza.Pipeline(lang='en', processors='tokenize')
# Run the pipeline on each text and keep the returned object per row;
# its .sentences attribute is read when the sentences are compiled.
df['text_stanza_tokenize'] = df['text'].apply(lambda x: nlp(x))

# Compiles all sentences into a single list of lists (sentences) of word-pairs (word, NER tag)
def get_all_sentences(df):
    """Cut each row's tagged token list into sentences using the stanza sentence split.

    The tagged tokens and the stanza tokens come from different tokenizers, so
    positions can drift. Before each sentence the running index is nudged by up
    to two positions if that lines it up with the sentence's first token.

    Args:
        df: DataFrame with a 'text_stanza_tokenize' column (objects exposing
            `.sentences`, each sentence exposing `.tokens` and `.words`) and a
            'tagged_entities' column (list of (word, tag) pairs per row).

    Returns:
        A flat list of sentences over all rows; each sentence is a list of
        (word, tag) pairs. Sentences that end up with no tokens are skipped.
    """
    all_sentences = []
    for doc, tagged in zip(df['text_stanza_tokenize'], df['tagged_entities']):
        n_tagged = len(tagged)
        idx = 0
        for sentence in doc.sentences:
            first_word = sentence.tokens[0].text
            aligned = idx < n_tagged and tagged[idx][0] == first_word
            if not aligned:
                # Look a couple of tokens either side, staying inside the list so a
                # negative position never wraps around to the end of the document.
                for adj in (-2, -1, 1, 2):
                    candidate = idx + adj
                    if 0 <= candidate < n_tagged and tagged[candidate][0] == first_word:
                        idx = candidate
                        break  # apply one correction only

            sentence_length = len(sentence.words)
            # Slicing clamps at the end of the list, so the final token is kept.
            new_sentence = list(tagged[idx:idx + sentence_length])
            if new_sentence:
                all_sentences.append(new_sentence)
            idx += sentence_length
    return all_sentences

# Collect the tagged sentences of every row of df into one flat list
all_sentences = get_all_sentences(df)
# print(len(all_sentences))


2023-02-23 13:40:57 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-02-23 13:40:57 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2023-02-23 13:40:57 INFO: Use device: cpu
2023-02-23 13:40:57 INFO: Loading: tokenize
2023-02-23 13:40:57 INFO: Done loading processors!


In [275]:
# Divide data into datasets = (train_sentences, dev_sentences, test_sentences)

DEV_SPLIT = 0.1
TEST_SPLIT = 0.1

# Seed the generator, then shuffle in place. Note that all_sentences itself is
# reordered, so re-running this cell shuffles the already-shuffled list again.
random.seed(1234)
random.shuffle(all_sentences)

# Slice boundaries: train takes the leading share, test the trailing TEST_SPLIT share,
# and dev sits in between.
n_sentences = len(all_sentences)
train_end = int(n_sentences*(1-DEV_SPLIT-TEST_SPLIT))
dev_end = int(n_sentences*(1-TEST_SPLIT))

train_sentences = all_sentences[:train_end]
dev_sentences = all_sentences[train_end:dev_end]
test_sentences = all_sentences[dev_end:]

datasets = (train_sentences, dev_sentences, test_sentences)

In [276]:
# Convert file and write to JSON file needed for Stanza modelling.
# Files are written to the current working directory; the logged output below
# shows them named TOC_Test.<split>.bio and TOC_Test.<split>.json.
out_directory = os.getcwd()
write_dataset(datasets, out_directory, "TOC_Test")

Converting /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.train.bio to /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.train.json
95 examples loaded from /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.train.bio
Generated json file /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.train.json
Converting /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.dev.bio to /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.dev.json
12 examples loaded from /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.dev.bio
Generated json file /Users/josephjamison/Documents/Joe_Documents/Stanford/

In [None]:
# Convert to JSON file needed by Stanza model
# There is a conversion script called several times in prepare_ner_dataset.py which converts IOB format to our internal NER format:
# import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file

# prepare_ner_file.process_dataset(input_iob, output_json)