In [71]:
import numpy as np
import nltk
nltk.download('punkt')
import os
import pandas as pd
import random
import re
import stanza
from stanza.utils.datasets.ner.utils import write_dataset

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [72]:
# Load the synthetic (GPT-generated) beach-cleanup texts together with their gold values
SYNTHETIC_DATA_CSV = '../../data/data/generated/data_230124-172021.csv'
df = pd.read_csv(SYNTHETIC_DATA_CSV)

In [73]:
# Use re to replace any instance of "####kg" with "#### kg", where #### is any continuous
# sequence of digits and the unit is one of the weight units listed in the pattern below.
def separate_weight_unit(row):
    """Insert a space between a number and a weight unit glued directly onto it.

    Parameters
    ----------
    row : str
        The text to clean (one cell of the ``text`` column).

    Returns
    -------
    str
        ``row`` with every ``<digits><unit>`` occurrence rewritten as
        ``<digits> <unit>``; text that is already separated is left untouched.
    """
    # Longer alternatives come first so that e.g. "kilograms" is not cut short.
    # The singular forms ("kilogram", "pound") are included so the pattern covers
    # the same units the tagger looks for. The trailing \b ensures the unit is a
    # whole word: "5kgx" is left alone instead of becoming "5 kgx".
    return re.sub(
        r'([0-9]+)(kilograms|kilogram|kgs|kg|pounds|pound|lbs|lb)\b',
        r"\1 \2",
        row,
    )


In [74]:
# Tokenise the generated text into words.
# TODO: consider using the Stanza tokenizer instead of nltk.word_tokenize
# (see csv_to_BIO_stanzaTokenize.ipynb file).

# Put a space between a number and its weight unit ("200kg" -> "200 kg") so that
# the weight and the unit end up as separate tokens.
df['text'] = df['text'].apply(separate_weight_unit)
# Strip surrounding whitespace, then split into word tokens.
df['text_split'] = df['text'].apply(lambda text: nltk.word_tokenize(text.strip()))

# The item columns must be strings because the tagger calls .split() on them.
# Fill missing values first: astype(str) on its own turns NaN into the literal
# string "nan", which would then be matched against the text as an item word.
string_cols = ["item1", "item2"]
df[string_cols] = df[string_cols].fillna("").astype(str)
df.head()

Unnamed: 0,location,organization,type,date,unit,weight1,item1,prompt,text,weight2,item2,text_split
0,Adams Rocks,Take 3,instagram caption,2017-04-07,kilograms,200,spread tubs,Generate an instagram caption for a beach clea...,It was inspiring to witness so many people com...,,,"[It, was, inspiring, to, witness, so, many, pe..."
1,Grahams Beach,Global Alliance Against Marine Pollution,instagram caption,2018-09-04,pounds,100,trash,Generate an instagram caption for a beach clea...,We just made a huge difference at Grahams Beac...,,,"[We, just, made, a, huge, difference, at, Grah..."
2,Norfolk Island,Plastic Pollution Coalition Australia,press release,2021-05-25,kilograms,386,glass bottles,Generate a press release for a beach cleanup w...,\nPlastic Pollution Coalition Australia (PPCA)...,,,"[Plastic, Pollution, Coalition, Australia, (, ..."
3,Playa Grande de Saboga,Take 3,press release,2016-12-30,kgs,332,tupperwares,Generate a press release for a beach cleanup w...,Take 3 Celebrates a Successful Beach Cleanup i...,,,"[Take, 3, Celebrates, a, Successful, Beach, Cl..."
4,Playa de Chachalacas,Rameau Project,press release,2022-04-15,lbs,168,plastic,Generate a press release for a beach cleanup w...,FOR IMMEDIATE RELEASE\n\nThe Rameau Project Ce...,,,"[FOR, IMMEDIATE, RELEASE, The, Rameau, Project..."


In [75]:
# Convert data into list of words with associated 'B-<entity>', 'I-<entity>' or 'O' tags.
# Look at other preprocessing steps in read_datasets function in convert_bn_daffodil

units = ["kilograms", "kilogram", "kgs", "kg", "lb", "lbs", "pounds", "pound"]


def _is_missing(value):
    """Return True when a gold value is absent.

    Covers None/NaN as well as the strings ("nan", "None", "") that such values
    become once a DataFrame column has been cast with astype(str).
    """
    return str(value).strip().lower() in ("", "nan", "none", "<na>")


def _gold_words(value):
    """Split a gold value into the set of whitespace-separated words to match."""
    return set() if _is_missing(value) else set(str(value).split())


def _weight_tokens(row):
    """Collect the token spellings of the gold weights (weight1 and weight2)."""
    tokens = set()
    for key in ("weight1", "weight2"):
        value = row[key]
        if _is_missing(value):
            continue
        tokens.add(str(value))
        # A column holding NaN is stored as float, so 50 arrives as 50.0 and
        # str() gives "50.0"; also accept the integer spelling found in text.
        if isinstance(value, float) and value.is_integer():
            tokens.add(str(int(value)))
    return tokens


def assign_entity_types(row):
    """Tag every token of ``row['text_split']`` with a BIO entity label.

    Only the gold values given in the prompt are searched for; other locations,
    organizations, etc. that appear in the generated text are deliberately
    ignored because they are not our targets.

    Parameters
    ----------
    row : mapping (DataFrame row or dict)
        Must provide ``text_split`` (list of tokens), ``location``,
        ``organization``, ``weight1``, ``weight2``, ``item1`` and ``item2``.
        Missing (None/NaN/"nan") gold values are skipped.

    Returns
    -------
    list of (token, tag) tuples
        ``tag`` is one of ``B-``/``I-`` + ``LOC``, ``ORG``, ``UNT``, ``WEI``,
        ``ITM``, or ``"O"``. Matching priority is LOC, ORG, UNT, WEI, ITM.

    Notes
    -----
    * LOC requires the full location name as consecutive tokens; the other
      entity types match token by token (no consecutive-match check).
    * Matching is case-sensitive.
    * Open TODOs: lower-case before comparing, handle orgs written without
      spaces (e.g. "@Take3") and acronyms, number words ("two hundred"),
      and date tagging.
    """
    words = row['text_split']

    loc_words = [] if _is_missing(row['location']) else str(row['location']).split()
    org_words = _gold_words(row['organization'])
    unit_words = set(units)
    weight_words = _weight_tokens(row)
    item_words = _gold_words(row['item1']) | _gold_words(row['item2'])

    new_tags = []
    # Entity type of the previous token (None after an "O"). A token continues
    # an entity ("I-") only when the token right before it has the same type;
    # this keeps the output a valid BIO sequence.
    prev_entity = None

    idx = 0
    while idx < len(words):
        # Full, consecutive match of the location name. Slicing past the end
        # just yields a shorter list, so no explicit bounds check is needed and
        # a location that ends the text is matched as well. The truthiness
        # guard stops an empty location from matching at every position.
        if loc_words and words[idx:idx + len(loc_words)] == loc_words:
            new_tags.append("B-LOC")
            new_tags.extend(["I-LOC"] * (len(loc_words) - 1))
            idx += len(loc_words)
            prev_entity = "LOC"
            continue

        word = words[idx]
        if word in org_words:
            entity = "ORG"
        elif word in unit_words:
            entity = "UNT"
        elif word in weight_words:
            entity = "WEI"
        elif word in item_words:
            entity = "ITM"
        else:
            entity = None

        if entity is None:
            new_tags.append("O")
        elif entity == prev_entity:
            new_tags.append("I-" + entity)
        else:
            new_tags.append("B-" + entity)
        prev_entity = entity
        idx += 1

    return list(zip(words, new_tags))

# Tag every row; each cell of the new column holds that row's list of (word, tag) pairs
df['tagged_entities'] = df.apply(assign_entity_types, axis=1)


In [76]:
# TESTING

# df_test = df.iloc[2]
# print(df_test)
# df_test['tagged_entities'] = assign_entity_types(df_test)

In [79]:
# Spot-check one sample: list every token that received a non-"O" tag,
# then show the full tagged sequence and the raw text for comparison
SAMPLE_NO = 0
for token, tag in df.iloc[SAMPLE_NO]['tagged_entities']:
    if tag == "O":
        continue
    print(token, tag)

print(df['tagged_entities'][SAMPLE_NO])
df['text'][SAMPLE_NO]

Adams B-LOC
Rocks I-LOC
200 B-WEI
kgs B-UNT
[('It', 'O'), ('was', 'O'), ('inspiring', 'O'), ('to', 'O'), ('witness', 'O'), ('so', 'O'), ('many', 'O'), ('people', 'O'), ('come', 'O'), ('together', 'O'), ('to', 'O'), ('clean', 'O'), ('up', 'O'), ('Adams', 'B-LOC'), ('Rocks', 'I-LOC'), ('beach', 'O'), ('!', 'O'), ('We', 'O'), ('removed', 'O'), ('200', 'B-WEI'), ('kgs', 'B-UNT'), ('of', 'O'), ('debris', 'O'), (',', 'O'), ('thanks', 'O'), ('to', 'O'), ('the', 'O'), ('incredible', 'O'), ('efforts', 'O'), ('of', 'O'), ('@', 'O'), ('Take3', 'O'), ('.', 'O'), ('#', 'O'), ('Take3ForTheSea', 'O'), ('#', 'O'), ('CleanBeaches', 'O'), ('#', 'O'), ('AdamsRocks', 'O')]


'It was inspiring to witness so many people come together to clean up Adams Rocks beach! We removed 200 kgs of debris, thanks to the incredible efforts of @Take3. #Take3ForTheSea #CleanBeaches #AdamsRocks'

In [245]:
# Compile all sentences into a single list of sentences, each sentence being a list of
# (word, NER tag) pairs.
#
# Open question: this method is not very robust. Sentence boundaries come from the Stanza
# tokenizer, but the (word, tag) pairs come from a different tokenization of the same text,
# so the two token streams might not line up 1-1.
#   Possible solution: use find() to locate the first word of each sentence instead of
#   blindly indexing into the paragraph's token list.
#
# The method is also not efficient, but this script only has to run once.

nlp = stanza.Pipeline(lang='en', processors='tokenize')
df['text_stanza_tokenize'] = [nlp(text) for text in df['text']]

all_sentences = []
for doc, tagged in zip(df['text_stanza_tokenize'], df['tagged_entities']):
    start = 0
    for sentence in doc.sentences:
        # Take as many (word, tag) pairs as Stanza counted words in this sentence
        stop = start + len(sentence.words)
        all_sentences.append(list(tagged[start:stop]))
        start = stop

# print(all_sentences[10])


2023-02-20 16:34:04 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-02-20 16:34:04 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2023-02-20 16:34:04 INFO: Use device: cpu
2023-02-20 16:34:04 INFO: Loading: tokenize
2023-02-20 16:34:04 INFO: Done loading processors!


In [246]:
# Divide data into datasets = (train_sentences, dev_sentences, test_sentences)

DEV_SPLIT = 0.1
TEST_SPLIT = 0.1

# Shuffle a copy rather than all_sentences itself. An in-place shuffle makes this
# cell non-idempotent: re-running it would start from the already shuffled list
# and produce a different split even though the seed is the same.
shuffled_sentences = list(all_sentences)
random.seed(1234)
random.shuffle(shuffled_sentences)

# Split boundaries: [0, train_end) -> train, [train_end, dev_end) -> dev, rest -> test
n_sentences = len(shuffled_sentences)
train_end = int(n_sentences * (1 - DEV_SPLIT - TEST_SPLIT))
dev_end = int(n_sentences * (1 - TEST_SPLIT))

train_sentences = shuffled_sentences[:train_end]
dev_sentences = shuffled_sentences[train_end:dev_end]
test_sentences = shuffled_sentences[dev_end:]

# Every sentence must land in exactly one split
assert len(train_sentences) + len(dev_sentences) + len(test_sentences) == n_sentences

datasets = (train_sentences, dev_sentences, test_sentences)

In [247]:
# Write the train/dev/test splits into the current working directory under the
# "TOC_Test" short name (.bio files plus the JSON files needed for Stanza modelling)
out_directory = os.getcwd()
dataset_short_name = "TOC_Test"
write_dataset(datasets, out_directory, dataset_short_name)

Converting /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.train.bio to /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.train.json
95 examples loaded from /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.train.bio
Generated json file /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.train.json
Converting /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.dev.bio to /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.dev.json
12 examples loaded from /Users/josephjamison/Documents/Joe_Documents/Stanford/CME291/stanza-custom-model/stanza/TOC_Utility/TOC_Test.dev.bio
Generated json file /Users/josephjamison/Documents/Joe_Documents/Stanford/

In [248]:
# Convert to JSON file needed by Stanza model
# There is a conversion script called several times in prepare_ner_dataset.py which converts IOB format to our internal NER format:
# import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file

# prepare_ner_file.process_dataset(input_iob, output_json)