In [52]:
import math
import numpy as np
import nltk
nltk.download('punkt')
import os
import pandas as pd
import random
import re
import stanza

from stanza.utils.datasets.ner.utils import write_dataset
from transform_weight_date import number_to_words, date_to_formats

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [53]:
# Specify data to be read in from CSV using output
toy_data_path = '../../data/data/generated/data_230124-172021.csv'
synthetic_data_1_path = '../../../xplore-the-ocean-cleanup/data-generation/data/generated/data_230221-205202.csv'
synthetic_data_2_path = '../../../xplore-the-ocean-cleanup/data-generation/data/generated/data_230222-092039.csv'

DATA_SELECTION = "synth1"

if DATA_SELECTION == "toy":
    data_path = toy_data_path
if DATA_SELECTION == "synth1":
    data_path = synthetic_data_1_path
if DATA_SELECTION == "synth2":
    data_path = synthetic_data_2_path

df = pd.read_csv(data_path)

In [61]:
# Use re to replace any instances of "####kg" with "#### kg" where #### is any continuous 
# sequence of numbers and unit is one of those listed below
def separate_weight_unit(row):
    return re.sub(r'([0-9]+)(kgs|kg|lbs|lb|pounds|kilograms)', r"\1 \2", row)

# Function to remove spaces (e.g. "Take 3" -> "Take3")
def remove_spaces(text):
    return text.replace(" ", "")

# Function to replace long hyphen ASCII code with short hyphen '-' ASCII code
def character_norm(text):
    return text.replace(chr(8211), "-")

# Word tokenizer splits ',' into separate token, so we have this function to do the same
def add_comma_token(text):
    return text.replace(",", " ,")

# Split '/' into its own token   JOE TO UPDATE THIS TINY EDGE CASE
def add_slash_token(text):
    return text.replace(chr(47), " / ")

# Word tokenizer splits ',' into separate token, so we have this function to do the same for our dates list
def add_date_var_comma_token(list):
    new_list = []
    for i in list:
        new_list.append(add_comma_token(i))
    return new_list

# Gets the first token of each date variation, to allow for faster downstream computation 
def get_first_token_set(list):
    new_set = set()
    for i in list:
        new_set.add(i.split()[0])
    return new_set

df['item1'][186][5]

'/'

In [55]:
# Assign appropriate types
string_cols = ["item1", "item2", "location", "organization", "date"]
df[string_cols] = df[string_cols].astype(str)
int_cols = ["weight1", "weight2"]
for i in int_cols:
    df[i] = df[i].astype('Int64')

# Normalize text columns to match tokenizer 
df['text'] = df['text'].apply(lambda x: separate_weight_unit(x))
for i in string_cols:
    df[i] = df[i].apply(lambda x: character_norm(x))
df['text'] = df['text'].apply(lambda x: x.strip())

# Tokenize text
df['text_split'] = df['text'].apply(lambda x: nltk.word_tokenize(x))

# Preprocess orgs and locations 
df['org_no_space'] = df['organization'].apply(lambda x: remove_spaces(x))
df['loc_no_space'] = df['location'].apply(lambda x: remove_spaces(x))

# Preprocess ',' and '/' tokens
for i in string_cols:
    df[i] = df[i].apply(lambda x: add_comma_token(x))
    if i in ['item1', 'item2']:
        df[i].apply(lambda x: add_slash_token(x))

# Compute variations of date and weight formats and preprocess into desired formats
df['date_vars'] = df['date'].apply(lambda x: date_to_formats(x) if x != 'nan' else str(x))
df['weight1_text'] = df['weight1'].apply(lambda x: number_to_words(x)[1] if pd.notnull(x) else "")
df['weight2_text'] = df['weight2'].apply(lambda x: number_to_words(x)[1] if pd.notnull(x) else "")
df['date_vars'] = df['date_vars'].apply(lambda x: add_date_var_comma_token(x))
df['date_vars_first_token'] = df['date_vars'].apply(lambda x: get_first_token_set(x))

# Make string columns lowercase for downstream comparisons
lowercase_cols = string_cols + ['organization', 'org_no_space', 'loc_no_space', 'weight1_text', 'weight2_text']
for i in lowercase_cols:
    df[i] = df[i].apply(lambda x: x.lower())

df.head()
# df.info()
# print(df.iloc[12])
# df['prompt'][13]
# df['date_vars_first_token'][13]
# df[df['weight2'] != ""].iloc[0:4]
# df[df['weight1'] == 70]

Unnamed: 0.1,Unnamed: 0,location,organization,type,date,unit,weight1,item1,prompt,text,weight2,item2,text_split,org_no_space,loc_no_space,date_vars,weight1_text,weight2_text,date_vars_first_token
0,0,smathers beach,industrial surplus foundation,instagram caption,2016-10-22,pounds,381,bait bags/containers and foam cups,Generate an instagram caption for a beach clea...,What a productive day at Smathers Beach! We co...,,,"[What, a, productive, day, at, Smathers, Beach...",industrialsurplusfoundation,smathersbeach,"[2016-10-22, October 22 , 2016, october 22 , 2...",three hundred and eighty-one,,"{10-22-2016, 2016-10-22, October, Oct, october..."
1,1,benedict beach,independent bakers association inc,instagram caption,2015-12-15,kilograms,200,plastic,Generate an instagram caption for a beach clea...,"Today, the Independent Bakers Association Inc....",,,"[Today, ,, the, Independent, Bakers, Associati...",independentbakersassociationinc,benedictbeach,"[2015-12-15, December 15 , 2015, december 15 ,...",two hundred,,"{15, December, dec, 12-15-2015, 12/15/15, 2015..."
2,2,wamberal beach,trout unlimited,press release,2015-02-25,lbs,309,plastic,Generate a press release for a beach cleanup w...,TROUT UNLIMITED ANNOUNCES SUCCESSFUL WAMBERAL ...,,,"[TROUT, UNLIMITED, ANNOUNCES, SUCCESSFUL, WAMB...",troutunlimited,wamberalbeach,"[2015-02-25, February 25 , 2015, february 25 ,...",three hundred and nine,,"{feb, 02-25-2015, february, February, 02/25/15..."
3,3,wagner spur,el rey de gloria mision,press release,2017-06-06,pounds,20,"dog poop bags , rope , and glass cups",Generate a press release for a beach cleanup w...,El Rey De Gloria Mision Takes Local Beach Clea...,,,"[El, Rey, De, Gloria, Mision, Takes, Local, Be...",elreydegloriamision,wagnerspur,"[2017-06-06, June 6 , 2017, june 6 , 2017, Jun...",twenty,,"{June, jun, 06-06-2017, Jun, 06/06/17, 6th, 06..."
4,4,owerri,friends of the upper delaware river,instagram caption,2016-05-07,units,381,plastic films and foam fragments,Generate an instagram caption for a beach clea...,Friends of the Upper Delaware River made a hug...,,,"[Friends, of, the, Upper, Delaware, River, mad...",friendsoftheupperdelawareriver,owerri,"[2016-05-07, May 7 , 2016, may 7 , 2016, May 7...",three hundred and eighty-one,,"{05-07-2016, 7th, May, 2016-05-07, 7, may, 05/..."


In [56]:
# Convert data into list of words with associated 'B - entity', 'I - entity' or 'O'

units = set(["kilograms", "kilogram", "kgs", "kg", "lb", "lbs", "pounds", "pound"])
filler_words = set(["and", "the", "a", "an", ",", "/"])

def assign_entity_types(row):
    words = row['text_split']
    new_tags = []
    prev_item_tag = False

    idx = 0
    while (idx < len(words)):
        loc_length = len(row['location'].split())
        org_length = len(row['organization'].split())
        weight1_text_length = len(row['weight1_text'].split())
        if row['weight2_text'] != None:
            weight2_text_length = len(row['weight2_text'].split())
        else:
            weight2_text_length = -1
        
        # Assign location labels
        # Checks for consecutive word matching for full location name (normalizing all words to lowercase)
        # Does not handle extraneous locations not provided in prompt!
        if ((idx <= len(words) - loc_length) and 
            [x.lower() for x in words[idx : idx + loc_length]] == row['location'].split()):
            new_tags.append("B-LOC")
            idx += 1
            for i in range(1, loc_length):
                new_tags.append("I-LOC")
                idx += 1
        elif (words[idx].lower() == row['loc_no_space']):
            new_tags.append("B-LOC")
            idx += 1

        # Assign organization labels
        # Checks for consecutive word matching for full location name (normalizing all words to lowercase)
        elif ((idx <= len(words) - org_length) and 
            [x.lower() for x in words[idx : idx + org_length]] == (row['organization'].lower().split())):
            new_tags.append("B-ORG")            # idea for later: tag acronyms for Orgs?
            idx += 1                            
            for i in range(1, org_length):
                new_tags.append("I-ORG")
                idx += 1
        elif (words[idx].lower() == row['org_no_space']):
            new_tags.append("B-ORG")      
            idx += 1
            
        # Assign unit labels
        elif words[idx] in units:   
            new_tags.append("B-UNT")
            idx += 1
        
        # Assign weight labels for numeric and text numbers (consider '-' and non- '-' versions of written numbers?)
        elif (words[idx] == str(row['weight1']) or 
            (not pd.isna(row['weight2']) and words[idx] == str(row['weight2']))): 
            new_tags.append("B-WEI")
            idx += 1
        elif (not pd.isna(row['weight1']) and (idx <= len(words) - weight1_text_length) and 
                [x.lower() for x in words[idx : idx + weight1_text_length]] == row['weight1_text'].split()):
            new_tags.append("B-WEI")
            idx += 1
            for i in range(1, weight1_text_length):
                new_tags.append("I-WEI")
                idx += 1
        elif ((weight2_text_length > 0) and (idx <= len(words) - weight2_text_length) and 
                [x.lower() for x in words[idx : idx + weight2_text_length]] == row['weight2_text'].split()):
            new_tags.append("B-WEI")
            idx += 1
            for i in range(1, weight1_text_length):
                new_tags.append("I-WEI")
                idx += 1

        # Assign item labels (dont look for consecutive matches here)
        # Does not handle extraneous trash items not provided in prompt!
        elif ((any(words[idx] == word for word in row['item1'].split()) or 
             (row['item2'] != None and any(words[idx] == word for word in row['item2'].split()))) and
             words[idx] not in filler_words):
            if prev_item_tag: 
                new_tags.append("I-ITM")
            else:
                new_tags.append("B-ITM")
                prev_item_tag = True
            idx += 1
        
        # Assign date labels (check only first token to minimize computation on each word)
        elif (words[idx] in row['date_vars_first_token']):
            # Check for complete consecutive match with any of the possible date variations 
            date_found = False
            for date_var in row['date_vars']:
                if ((idx <= len(words) - len(date_var.split())) and 
                    [x.lower() for x in words[idx : idx + len(date_var.split())]] == date_var.lower().split()):
                    new_tags.append("B-DAT")
                    idx += 1
                    for i in range(1, len(date_var.split())):
                        new_tags.append("I-DAT")
                        idx += 1
                    date_found = True
                    break
            # If the text matches with none of the date_vars, we need to append "O"
            if not date_found:
                new_tags.append("O")
                prev_item_tag = False
                idx += 1
        
        else:
            new_tags.append("O")
            prev_item_tag = False
            idx += 1

    return list(zip(words, new_tags))

df['tagged_entities'] = df.apply(assign_entity_types, axis =1)
# assign_entity_types(df.iloc[25])

In [57]:
# Test sentences
# df['text_split'][13] = ['I', 'have', 'two', 'hundred', 'and', 'twenty', 'dogs', 'and', 'two', 'hundred', 'and', 'fifty-seven', 
#                         'cats', 'ugly', 'two', 'hundred', 'and', 'twenty-one', 'on', '11', 'Jan', '2020', '01-11-2020']
# assign_entity_types(df.iloc[13])
# print(df['text_split'][13][22] in df['date_vars_first_token'][13])
# print('01-11-2020'.split())

In [58]:
# Review newly assigned non-"O" tags
SAMPLE_NO = 186
for i in df.iloc[SAMPLE_NO]['tagged_entities']:
    if i[1] != "O":
        print(i[0], i[1])

# print(df['tagged_entities'][SAMPLE_NO])
print(df['item1'][SAMPLE_NO])
df['text'][SAMPLE_NO]

Al B-LOC
Marj I-LOC
70 B-WEI
lbs B-UNT
paper/wood fragments/pieces


'Today at Al Marj Beach, the Reuse Center of the Treasure Coast Inc. made a huge difference by cleaning up 70 lbs of paper, wood fragments, and pieces! #protectourplanet #cleanup #treasurecoast #reusecenter #beachcleanup'

In [59]:
# Use stanza tokenizer to determine sentence chunks 
nlp = stanza.Pipeline(lang='en', processors='tokenize')
df['text_stanza_tokenize'] = df['text'].apply(lambda x: nlp(x))

2023-02-23 15:48:59 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-02-23 15:48:59 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2023-02-23 15:48:59 INFO: Use device: cpu
2023-02-23 15:48:59 INFO: Loading: tokenize
2023-02-23 15:48:59 INFO: Done loading processors!


KeyboardInterrupt: 

In [None]:
# Compiles all sentences into a single list of lists (sentences) of word-pairs (word, NER tag)
def get_all_sentences(df):
    all_sentences = []
    for i in range(len(df)):
        idx = 0
        for sentence in df.iloc[i]['text_stanza_tokenize'].sentences:
            # Check for first word in stanza-tokenized sentence and adjust index within small range to correct
            # starting word (Problem: may result in 1 or 2 tokens being truncated from front or end of sentences, 
            # though this adjustment doesn't happen in every document and only 2-3 times per document when it does)
            first_word = sentence.tokens[0].text
            try:
                if (first_word != df.iloc[i]['tagged_entities'][idx][0]):
                    for adj in [-2, -1, 1, 2]:
                        if (first_word == df.iloc[i]['tagged_entities'][idx + adj][0]):
                            idx = idx + adj
            except IndexError:
                pass
            
            end_sentence_limit = min(idx+len(sentence.words), len(df.iloc[i]['tagged_entities'])-1)
            new_sentence = list(df.iloc[i]['tagged_entities'][idx:end_sentence_limit])
            all_sentences.append(new_sentence)
            idx += len(sentence.words)
    return all_sentences

# get_all_sentences(df.iloc[38])
all_sentences = get_all_sentences(df)
print("# of sentences tagged: ", len(all_sentences))


# of sentences tagged:  4819


In [None]:
# Divide data into datasets = (train_sentences, dev_sentences, test_sentences)

DEV_SPLIT = 0.1
TEST_SPLIT = 0.1

random.seed(1234)
random.shuffle(all_sentences)

train_sentences = all_sentences[ : int(len(all_sentences)*(1-DEV_SPLIT-TEST_SPLIT))]
dev_sentences = all_sentences[int(len(all_sentences)*(1-DEV_SPLIT-TEST_SPLIT)) : int(len(all_sentences)*(1-TEST_SPLIT))]
test_sentences = all_sentences[int(len(all_sentences)*(1-TEST_SPLIT)) : ]

# print(len(train_sentences))
# print(len(dev_sentences))
# print(len(test_sentences))
# print(len(all_sentences))

datasets = (train_sentences, dev_sentences, test_sentences)

In [None]:
# Convert file and write to JSON file needed for Stanza modelling
out_directory = os.getcwd() + '/Processed_Data'
write_dataset(datasets, out_directory, DATA_SELECTION)

Converting /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_Data/synth1.train.bio to /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_Data/synth1.train.json
3825 examples loaded from /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_Data/synth1.train.bio
Generated json file /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_Data/synth1.train.json
Converting /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_Data/synth1.dev.bio to /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_Data/synth1.dev.json
479 examples loaded from /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/Processed_