In [27]:
import math
import numpy as np
import nltk
nltk.download('punkt')
import os
import pandas as pd
import random
import re
import stanza

from stanza.utils.datasets.ner.utils import write_dataset
from transform_weight_date import number_to_words, date_to_formats

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [28]:
# Specify data to be read in from CSV using output
toy_data_path = '../../data/data/generated/data_230124-172021.csv'
synthetic_data_1_path = '../../../xplore-the-ocean-cleanup/data-generation/data/generated/data_230221-205202.csv'
synthetic_data_2_path = '../../../xplore-the-ocean-cleanup/data-generation/data/generated/data_230222-092039.csv'

# ------------------------------------------------------------------------
DATA_SELECTION = "synth2"
# ------------------------------------------------------------------------

if DATA_SELECTION == "toy":
    data_path = toy_data_path
if DATA_SELECTION == "synth1":
    data_path = synthetic_data_1_path
if DATA_SELECTION == "synth2":
    data_path = synthetic_data_2_path

df = pd.read_csv(data_path)

In [29]:
# Use re to replace any instances of "####kg" with "#### kg" where #### is any continuous 
# sequence of numbers and unit is one of those listed below
def separate_weight_unit(row):
    return re.sub(r'([0-9]+)(kgs|kg|lbs|lb|pounds|kilograms)', r"\1 \2", row)

# Function to remove spaces (e.g. "Take 3" -> "Take3")
def remove_spaces(text):
    return text.replace(" ", "")

# Function to replace long hyphen ASCII code with short hyphen '-' ASCII code
def character_norm(text):
    return text.replace(chr(8211), "-")

# Word tokenizer splits ',' into separate token, so we have this function to do the same
def add_comma_token(text):
    return text.replace(",", " ,")

# Split '/' into its own token   JOE TO UPDATE THIS TINY EDGE CASE
def add_slash_token(text):
    return text.replace(chr(47), " / ")

# Word tokenizer splits ',' into separate token, so we have this function to do the same for our dates list
def add_date_var_comma_token(list):
    new_list = []
    for i in list:
        new_list.append(add_comma_token(i))
    return new_list

# Gets the first token of each date variation, to allow for faster downstream computation 
def get_first_token_set(list):
    new_set = set()
    for i in list:
        new_set.add(i.split()[0])
    return new_set

def get_item_set(row):
    item_set = set([])
    for i in row['item1'].split():
        item_set.add(i)
    for j in row['item2'].split():
        item_set.add(j)
    if 'nan' in item_set:
        item_set.remove('nan')
    return item_set


In [30]:
# Assign appropriate types
string_cols = ["item1", "item2", "location", "organization", "date"]
df[string_cols] = df[string_cols].astype(str)
int_cols = ["weight1", "weight2"]
for i in int_cols:
    df[i] = df[i].astype('Int64')

# Normalize text columns to match tokenizer 
df['text'] = df['text'].apply(lambda x: separate_weight_unit(x))
for i in string_cols:
    df[i] = df[i].apply(lambda x: character_norm(x))
df['text'] = df['text'].apply(lambda x: x.strip())

# Tokenize text
df['text_split'] = df['text'].apply(lambda x: nltk.word_tokenize(x))

# Preprocess orgs and locations 
df['org_no_space'] = df['organization'].apply(lambda x: remove_spaces(x))
df['loc_no_space'] = df['location'].apply(lambda x: remove_spaces(x))

# Preprocess ',' and '/' tokens
for i in string_cols:
    df[i] = df[i].apply(lambda x: add_comma_token(x))
for i in ["item1", "item2"]:
    df[i] = df[i].apply(lambda x: add_slash_token(x))

# Create set of trash items of interest for each text
df['item_set'] = df.apply(get_item_set, axis = 1)

# Compute variations of date and weight formats and preprocess into desired formats
df['date_vars'] = df['date'].apply(lambda x: date_to_formats(x) if x != 'nan' else str(x))
df['weight1_text'] = df['weight1'].apply(lambda x: number_to_words(x)[1] if pd.notnull(x) else "")
df['weight2_text'] = df['weight2'].apply(lambda x: number_to_words(x)[1] if pd.notnull(x) else "")
df['date_vars'] = df['date_vars'].apply(lambda x: add_date_var_comma_token(x))
df['date_vars_first_token'] = df['date_vars'].apply(lambda x: get_first_token_set(x))

# Make string columns lowercase for downstream comparisons
lowercase_cols = string_cols + ['organization', 'org_no_space', 'loc_no_space', 'weight1_text', 'weight2_text']
for i in lowercase_cols:
    df[i] = df[i].apply(lambda x: x.lower())

df.head(10)
# df.info()
# print(df.iloc[12]['date_vars'])
# df[(df['item1'].str.contains("cigarette butts"))]

Unnamed: 0,location,organization,type,date,unit,weight1,item1,prompt,text,weight2,item2,text_split,org_no_space,loc_no_space,item_set,date_vars,weight1_text,weight2_text,date_vars_first_token
0,playa del cocal,beach warriors,press release,2021-08-07,pounds,500.0,plastic,Generate a press release for a beach cleanup w...,BEACH WARRIORS MAKE A MAJOR IMPACT WITH 500 PO...,,,"[BEACH, WARRIORS, MAKE, A, MAJOR, IMPACT, WITH...",beachwarriors,playadelcocal,{plastic},"[2021-08-07, August 7 , 2021, august 7 , 2021,...",five hundred,,"{08/07/21, 2021-08-07, 7th, Aug, august, 7, au..."
1,playa palma real,solid waste management and development corp,instagram caption,2019-09-01,pounds,40.0,light sticks,Generate an instagram caption for a beach clea...,We just spent the day at Playa Palma Real and ...,,,"[We, just, spent, the, day, at, Playa, Palma, ...",solidwastemanagementanddevelopmentcorp,playapalmareal,"{light, sticks}","[2019-09-01, September 1 , 2019, september 1 ,...",forty,,"{09/01/19, sep, september, 09-01-2019, 09-01, ..."
2,isla playa grande,community builders exchange inc,press release,2017-10-11,units,200.0,single-use carrier bags,Generate a press release for a beach cleanup w...,"Isla Playa Grande, October 11th, 2017 – Commun...",,,"[Isla, Playa, Grande, ,, October, 11th, ,, 201...",communitybuildersexchangeinc,islaplayagrande,"{bags, carrier, single-use}","[2017-10-11, October 11 , 2017, october 11 , 2...",two hundred,,"{10/11, Oct, october, oct, 10-11, 11th, 10/11/..."
3,beach ridge,black oaks center for sustainable renewable li...,instagram caption,2022-10-14,,,trash,Generate an instagram caption for a beach clea...,We had an amazing time today at Beach Ridge fo...,,,"[We, had, an, amazing, time, today, at, Beach,...",blackoakscenterforsustainablerenewablelivingnfp,beachridge,{trash},"[2022-10-14, October 14 , 2022, october 14 , 2...",,,"{14, 14th, october, 10/14, oct, 10-14, 10/14/2..."
4,,wolf river preservation association,press release,2019-05-02,kgs,200.0,trash,Generate a press release for a beach cleanup w...,FOR IMMEDIATE RELEASE\n\nThe Wolf River Preser...,,,"[FOR, IMMEDIATE, RELEASE, The, Wolf, River, Pr...",wolfriverpreservationassociation,,{trash},"[2019-05-02, May 2 , 2019, may 2 , 2019, May 2...",two hundred,,"{2019-05-02, 05-02-2019, 05/02, 2, 05-02, 05/0..."
5,manama,summit downtown inc,instagram caption,2022-10-04,lbs,420.0,plastic,Generate an instagram caption for a beach clea...,"This #WorldAnimalDay, @SummitDowntownInc joine...",,,"[This, #, WorldAnimalDay, ,, @, SummitDowntown...",summitdowntowninc,manama,{plastic},"[2022-10-04, October 4 , 2022, october 4 , 202...",four hundred and twenty,,"{4, 10-04, 2022-10-04, 10/04, october, 10-04-2..."
6,playa del rey,environmental recycling inc,press release,,pounds,190.0,plastic,Generate a press release for a beach cleanup w...,Environmental Recycling Inc Makes Major Impact...,,,"[Environmental, Recycling, Inc, Makes, Major, ...",environmentalrecyclinginc,playadelrey,{plastic},"[n, a, n]",one hundred and ninety,,"{a, n}"
7,carolinas heights,clear blue skies inc,instagram caption,2021-08-30,pounds,190.0,plastic,Generate an instagram caption for a beach clea...,We just made a difference at Carolinas Heights...,,,"[We, just, made, a, difference, at, Carolinas,...",clearblueskiesinc,carolinasheights,{plastic},"[2021-08-30, August 30 , 2021, august 30 , 202...",one hundred and ninety,,"{08/30/21, Aug, 08/30, 2021-08-30, 08-30, augu..."
8,onitsha,union de residentes para la proteccion ambient...,press release,2016-05-16,units,490.0,plastified paper bags and diapers,Generate a press release for a beach cleanup w...,"For Immediate Release\n\nOn Sunday, May 16th, ...",,,"[For, Immediate, Release, On, Sunday, ,, May, ...",unionderesidentesparalaproteccionambientaldeva...,onitsha,"{bags, plastified, and, paper, diapers}","[2016-05-16, May 16 , 2016, may 16 , 2016, May...",four hundred and ninety,,"{2016-05-16, 16th, 05-16-2016, 05/16, 05/16/16..."
9,katsina,trout unlimited,press release,,kilograms,400.0,normal papers,Generate a press release for a beach cleanup w...,"Katsina, Nigeria – On Saturday, Trout Unlimite...",,,"[Katsina, ,, Nigeria, –, On, Saturday, ,, Trou...",troutunlimited,katsina,"{normal, papers}","[n, a, n]",four hundred,,"{a, n}"


In [31]:
# Convert data into list of words with associated 'B - entity', 'I - entity' or 'O'

units = set(["kilograms", "kilogram", "kgs", "kg", "lb", "lbs", "pounds", "pound"])
filler_words = set(["and", "the", "a", "an", ",", "/"])

def assign_entity_types(row):
    words = row['text_split']
    new_tags = []
    prev_item_tag = False

    idx = 0
    while (idx < len(words)):
        loc_length = len(row['location'].split())
        org_length = len(row['organization'].split())
        weight1_text_length = len(row['weight1_text'].split())
        if row['weight2_text'] != None:
            weight2_text_length = len(row['weight2_text'].split())
        else:
            weight2_text_length = -1
        
        # Assign location labels
        # Checks for consecutive word matching for full location name (normalizing all words to lowercase)
        # Does not handle extraneous locations not provided in prompt!
        if ((idx <= len(words) - loc_length) and 
            [x.lower() for x in words[idx : idx + loc_length]] == row['location'].split()):
            new_tags.append("B-LOC")
            idx += 1
            for i in range(1, loc_length):
                new_tags.append("I-LOC")
                idx += 1
        elif (words[idx].lower() == row['loc_no_space']):
            new_tags.append("B-LOC")
            idx += 1

        # Assign organization labels
        # Checks for consecutive word matching for full location name (normalizing all words to lowercase)
        elif ((idx <= len(words) - org_length) and 
            [x.lower() for x in words[idx : idx + org_length]] == (row['organization'].lower().split())):
            new_tags.append("B-ORG")            # idea for later: tag acronyms for Orgs?
            idx += 1                            
            for i in range(1, org_length):
                new_tags.append("I-ORG")
                idx += 1
        elif (words[idx].lower() == row['org_no_space']):
            new_tags.append("B-ORG")      
            idx += 1
            
        # Assign unit labels
        elif words[idx] in units:   
            new_tags.append("B-UNT")
            idx += 1
        
        # Assign weight labels for numeric and text numbers (consider '-' and non- '-' versions of written numbers?)
        elif (words[idx] == str(row['weight1']) or 
            (not pd.isna(row['weight2']) and words[idx] == str(row['weight2']))): 
            new_tags.append("B-WEI")
            idx += 1
        elif (not pd.isna(row['weight1']) and (idx <= len(words) - weight1_text_length) and 
                [x.lower() for x in words[idx : idx + weight1_text_length]] == row['weight1_text'].split()):
            new_tags.append("B-WEI")
            idx += 1
            for i in range(1, weight1_text_length):
                new_tags.append("I-WEI")
                idx += 1
        elif ((weight2_text_length > 0) and (idx <= len(words) - weight2_text_length) and 
                [x.lower() for x in words[idx : idx + weight2_text_length]] == row['weight2_text'].split()):
            new_tags.append("B-WEI")
            idx += 1
            for i in range(1, weight1_text_length):
                new_tags.append("I-WEI")
                idx += 1

        # Assign item labels (dont look for consecutive matches here)
        # Does not handle extraneous trash items not provided in prompt!
        elif (words[idx] in row['item_set'] and words[idx] not in filler_words):
            if prev_item_tag: 
                new_tags.append("I-ITM")
            else:
                new_tags.append("B-ITM")
                prev_item_tag = True
            idx += 1
        # Assign date labels (check only first token to minimize computation on each word)
        elif (words[idx] in row['date_vars_first_token']):
            # Check for complete consecutive match with any of the possible date variations 
            date_found = False
            for date_var in row['date_vars']:
                if ((idx <= len(words) - len(date_var.split())) and 
                    [x.lower() for x in words[idx : idx + len(date_var.split())]] == date_var.lower().split()):
                    new_tags.append("B-DAT")
                    idx += 1
                    for i in range(1, len(date_var.split())):
                        new_tags.append("I-DAT")
                        idx += 1
                    date_found = True
                    break
            # If the text matches with none of the date_vars, we need to append "O"
            if not date_found:
                new_tags.append("O")
                prev_item_tag = False
                idx += 1
        
        else:
            new_tags.append("O")
            prev_item_tag = False
            idx += 1

    return list(zip(words, new_tags))

df['tagged_entities'] = df.apply(assign_entity_types, axis =1)
# assign_entity_types(df.iloc[25])

In [32]:
# Test sentences
# df['text_split'][13] = ['I', 'have', 'two', 'hundred', 'and', 'twenty', 'dogs', 'and', 'two', 'hundred', 'and', 'fifty-seven', 
#                         'cats', 'ugly', 'two', 'hundred', 'and', 'twenty-one', 'on', '11', 'Jan', '2020', '01-11-2020']
# assign_entity_types(df.iloc[13])
# print(df['text_split'][13][22] in df['date_vars_first_token'][13])
# print('01-11-2020'.split())

In [33]:
# Review newly assigned non-"O" tags
SAMPLE_NO = 5
for i in df.iloc[SAMPLE_NO]['tagged_entities']:
    if i[1] != "O":
        print(i[0], i[1])

# print(df['tagged_entities'][SAMPLE_NO])
# print(df['item1'][SAMPLE_NO])
df['text'][SAMPLE_NO]

SummitDowntownInc B-ORG
Manama B-LOC
420 B-WEI
lbs B-UNT
plastic B-ITM


'This #WorldAnimalDay, @SummitDowntownInc joined forces with Manama for an epic beach cleanup! We’re proud to announce that 420 lbs of plastic was collected and will be recycled! #ManamaCares #CleanBeaches #MoveTheNeedle #PlasticFreeWorld'

In [34]:
# Use stanza tokenizer to determine sentence chunks 

# nlp = stanza.Pipeline(lang='en', processors='tokenize')
# df['text_stanza_tokenize'] = df['text'].apply(lambda x: nlp(x))

In [35]:
# Compiles all sentences into a single list of lists (sentences) of word-pairs (word, NER tag)
def get_all_sentences1(df):
    all_sentences = []
    for i in range(len(df)):
        idx = 0
        for sentence in df.iloc[i]['text_stanza_tokenize'].sentences:
            # Check for first word in stanza-tokenized sentence and adjust index within small range to correct
            # starting word (Problem: may result in 1 or 2 tokens being truncated from front or end of sentences, 
            # though this adjustment doesn't happen in every document and only 2-3 times per document when it does)
            first_word = sentence.tokens[0].text
            try:
                if (first_word != df.iloc[i]['tagged_entities'][idx][0]):
                    for adj in [-2, -1, 1, 2]:
                        if (first_word == df.iloc[i]['tagged_entities'][idx + adj][0]):
                            idx = idx + adj
            except IndexError:
                pass
            
            end_sentence_limit = min(idx+len(sentence.words), len(df.iloc[i]['tagged_entities'])-1)
            new_sentence = list(df.iloc[i]['tagged_entities'][idx:end_sentence_limit])
            all_sentences.append(new_sentence)
            idx += len(sentence.words)
    return all_sentences

end_sentence = set(['.', '!', '?', '\n'])

# Method to split sentences based on punctuation marks, not based on Stanza-chunked sentences. 
# Splits text into fewer, longer sentences than get_all_sentences1, but is less likely to truncate a sentence.
def get_all_sentences2(df):
    all_sentences = []
    for i in range(len(df)):
        idx = 0
        text_length = len(df.iloc[i]['tagged_entities'])
        # print("text length:", text_length)
        while idx < text_length:
            end = text_length - 1
            for j in range(idx, text_length):
                if df.iloc[i]['tagged_entities'][j][0] in end_sentence:
                    end = j
                    # print(j)
                    break
            
            # print("end", end)
            new_sentence = list(df.iloc[i]['tagged_entities'][idx : end + 1])
            all_sentences.append(new_sentence)
            idx = end + 1
    return all_sentences

# print(get_all_sentences2(df.iloc[5:7])[2])
all_sentences = get_all_sentences2(df)

print("# of sentences tagged: ", len(all_sentences))


# of sentences tagged:  4990


In [36]:
# Divide data into datasets = (train_sentences, dev_sentences, test_sentences)

DEV_SPLIT = 0.1
TEST_SPLIT = 0.1

random.seed(1234)
random.shuffle(all_sentences)

train_sentences = all_sentences[ : int(len(all_sentences)*(1-DEV_SPLIT-TEST_SPLIT))]
dev_sentences = all_sentences[int(len(all_sentences)*(1-DEV_SPLIT-TEST_SPLIT)) : int(len(all_sentences)*(1-TEST_SPLIT))]
test_sentences = all_sentences[int(len(all_sentences)*(1-TEST_SPLIT)) : ]

# print(len(train_sentences))
# print(len(dev_sentences))
# print(len(test_sentences))
# print(len(all_sentences))

datasets = (train_sentences, dev_sentences, test_sentences)

In [37]:
# Convert file and write to JSON file needed for Stanza modelling
out_directory = os.getcwd() + '/Processed_Data'
write_dataset(datasets, out_directory, DATA_SELECTION)

Converting /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_Data/synth2.train.bio to /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_Data/synth2.train.json
3992 examples loaded from /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_Data/synth2.train.bio
Generated json file /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_Data/synth2.train.json
Converting /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_Data/synth2.dev.bio to /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_Data/synth2.dev.json
499 examples loaded from /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_