In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import os
import re
import collections
from typing import List, Tuple
from tqdm import tqdm
from nltk import word_tokenize

In [16]:
# Location of the task-2 practice CSV on the author's machine.
original_task2_filepath = Path('/media/sarthak/HDD/data_science/fnp_resources/data/task2/practice/fnp2020-fincausal2-task2.csv')
# The separator '; ' is longer than one character, which the default C parser
# cannot handle. pandas would silently fall back to the python engine and emit a
# ParserWarning; naming the engine explicitly keeps the result and drops the warning.
task2_original_df = pd.read_csv(original_task2_filepath, sep='; ', encoding='utf-8', engine='python')
print('shape of data is: {}'.format(task2_original_df.shape))
task2_original_df.head()

  


shape of data is: (1109, 11)


Unnamed: 0,Index,Text,Cause,Effect,Offset_Sentence2,Offset_Sentence3,Cause_Start,Cause_End,Effect_Start,Effect_End,Sentence
0,0001.00005.1,The Sunshine State drew in a net influx of abo...,It is consistently one of the most popular des...,The Sunshine State drew in a net influx of abo...,159.0,264.0,160,264,0,159,<e2>The Sunshine State drew in a net influx of...
1,0001.00005.2,The Sunshine State drew in a net influx of abo...,It is consistently one of the most popular des...,Florida's $17.7 billion in net AGI dwarves the...,159.0,264.0,160,264,265,421,The Sunshine State drew in a net influx of abo...
2,0001.00007,Florida is unique in that it also draws a larg...,Florida is unique in that it also draws a larg...,more than 85 percent of its net inflow of inco...,,,0,90,95,190,<e1>Florida is unique in that it also draws a ...
3,0001.00009,"Connecticut, Pennsylvania, New Jersey, Illinoi...",the wealthy were picking up and leaving.,"Connecticut, Pennsylvania, New Jersey, Illinoi...",,,147,187,0,131,"<e2>Connecticut, Pennsylvania, New Jersey, Ill..."
4,0001.00011,CLICK HERE TO GET THE FOX BUSINESS APP Data f...,CLICK HERE TO GET THE FOX BUSINESS APP Data f...,New York's outflows to the Sunshine State were...,,,0,150,152,229,<e1>CLICK HERE TO GET THE FOX BUSINESS APP Da...


In [17]:
class Tokenizer:
    """Small wrapper that exposes nltk's word_tokenize behind a `tokenize` method."""

    def __init__(self):
        # callable that turns a string into a sequence of tokens
        self.tokenizer = word_tokenize

    def tokenize(self, string: str) -> List[Tuple[str, int]]:
        """
        Split a continuous string into tokens and return one tuple per token.

        The first element of each tuple is the token. The second element is the
        slot reserved for the token's offset in the string; it is currently
        always None.
        """
        token_offset_pairs = []
        for token in self.tokenizer(string):
            token_offset_pairs.append((token, None))
        return token_offset_pairs
# Module-level tokenizer instance; convert_task2_data_to_list_of_tokens_with_labels reads this global.
tokenizer = Tokenizer()

In [18]:
def get_substring(string: str, offset_start: int, offset_end: int):
    """
    Slice `string` from offset_start through offset_end, both ends included.

    An end offset smaller than the start offset yields the empty string.
    """
    is_empty_range = offset_end < offset_start
    # slicing drops its upper bound, hence the +1 to keep offset_end in the result
    return '' if is_empty_range else string[offset_start:offset_end+1]

In [19]:
def tag_iobes(list_of_labels: List[str]):
    """
    Turn a run of labels belonging to one segment into IOBES tags.

    Only the first label of the list is used (upper-cased) as the tag suffix;
    the remaining entries contribute only to the length of the run.

    :param list_of_labels: one label per token of the segment
    :return: a list of the same length: ``['S-X']`` for a single token,
             otherwise ``['B-X', 'I-X', ..., 'E-X']``; an empty list for
             empty input
    """
    run_length = len(list_of_labels)
    if run_length == 0:
        # an empty segment has nothing to tag (previously an IndexError)
        return []

    suffix = list_of_labels[0].upper()
    if run_length == 1:
        return ['S-' + suffix]

    # first token opens the span, last token closes it, everything else is inside
    return ['B-' + suffix] + ['I-' + suffix] * (run_length - 2) + ['E-' + suffix]

In [20]:
def convert_task2_data_to_list_of_tokens_with_labels(text: str, 
                                                     cause_offsets: Tuple[int, int],
                                                     effect_offsets: Tuple[int, int]):
    """
    Split `text` into five segments and return its tokens with IOBES labels.

    The segments are the cause ('C'), the effect ('E'), and the text before
    ('B'), between ('M') and after ('A') those two spans. Each segment is
    stripped, tokenized with the module-level `tokenizer`, and tagged by
    `tag_iobes`; segments that produce no tokens are skipped.

    :param text: the full passage
    :param cause_offsets: (start, end) character offsets of the cause, end inclusive
    :param effect_offsets: (start, end) character offsets of the effect, end inclusive
    :return: list of (token, label) tuples ordered by segment offsets
    """
    # Work out which annotated span comes first (plain tuple comparison).
    if cause_offsets < effect_offsets:
        first_span, second_span = cause_offsets, effect_offsets
    else:
        first_span, second_span = effect_offsets, cause_offsets

    # The three filler segments surround the two annotated spans.
    segment_before_offsets = (0, first_span[0]-1)
    segment_between_offsets = (first_span[1]+1, second_span[0]-1)
    segment_after_offsets = (second_span[1]+1, len(text)-1)

    # Offsets -> label, plus the inverse lookup used to fetch each segment's text.
    span_to_label = {
        cause_offsets: 'C',
        effect_offsets: 'E',
        segment_before_offsets: 'B',
        segment_between_offsets: 'M',
        segment_after_offsets: 'A'
    }
    label_to_span = dict((label, span) for span, label in span_to_label.items())

    def _segment_text(label):
        # stripped text of the segment carrying `label`
        span = label_to_span[label]
        return get_substring(string=text,
                             offset_start=span[0],
                             offset_end=span[1]).strip()

    cause_text = _segment_text('C')
    effect_text = _segment_text('E')
    before_text = _segment_text('B')
    after_text = _segment_text('A')
    between_text = _segment_text('M')

    # Tokenize every segment; `tokenizer` is the instance declared outside this function.
    spans_with_text = (
        (cause_offsets, cause_text),
        (effect_offsets, effect_text),
        (segment_before_offsets, before_text),
        (segment_after_offsets, after_text),
        (segment_between_offsets, between_text),
    )
    span_to_tokens = {}
    for span, segment_text in spans_with_text:
        span_to_tokens[span] = [pair[0] for pair in tokenizer.tokenize(segment_text)]

    # Walk the segments in offset order and emit (token, tag) pairs.
    text_tokenized_labeled = []
    for span in sorted(span_to_label.keys()):
        segment_tokens = span_to_tokens[span]
        if not segment_tokens:
            continue
        segment_tags = tag_iobes([span_to_label[span]] * len(segment_tokens))
        text_tokenized_labeled.extend(zip(segment_tokens, segment_tags))

    return text_tokenized_labeled

In [21]:
# 1. get a text
# Run a single row through the converter as a sanity check.
# `.loc[row, column]` replaces the chained `df[column][row]` lookups.
sample_row = 0
sample_text = task2_original_df.loc[sample_row, 'Text']
sample_cause = task2_original_df.loc[sample_row, 'Cause']
sample_effect = task2_original_df.loc[sample_row, 'Effect']
sample_cause_offsets = (task2_original_df.loc[sample_row, 'Cause_Start'],
                        task2_original_df.loc[sample_row, 'Cause_End'])
sample_effect_offsets = (task2_original_df.loc[sample_row, 'Effect_Start'],
                         task2_original_df.loc[sample_row, 'Effect_End'])

print('sample text: {}'.format(sample_text))
print('sample cause: {}'.format(sample_cause))
print('sample effect: {}'.format(sample_effect))
sample_text_tokenized_labeled = convert_task2_data_to_list_of_tokens_with_labels(text=sample_text,
                                                                                 cause_offsets=sample_cause_offsets,
                                                                                 effect_offsets=sample_effect_offsets)
# label text fixed: it previously read 'okenized and labeled sentence'
print('tokenized and labeled sentence: {}'.format(sample_text_tokenized_labeled))


sample text: The Sunshine State drew in a net influx of about $17.7 billion in adjusted gross income (AGI)  -  most of which (72 percent) came from those aged 55 and older. It is consistently one of the most popular destinations for retirees due to affordability and low taxes. Florida's $17.7 billion in net AGI dwarves the remaining 19 states that saw a positive net influx of income  -  which combined for a total of $19.4 billion.
sample cause: It is consistently one of the most popular destinations for retirees due to affordability and low taxes.
sample effect: The Sunshine State drew in a net influx of about $17.7 billion in adjusted gross income (AGI)  -  most of which (72 percent) came from those aged 55 and older.
okenized and labeled sentence: [('The', 'B-E'), ('Sunshine', 'I-E'), ('State', 'I-E'), ('drew', 'I-E'), ('in', 'I-E'), ('a', 'I-E'), ('net', 'I-E'), ('influx', 'I-E'), ('of', 'I-E'), ('about', 'I-E'), ('$', 'I-E'), ('17.7', 'I-E'), ('billion', 'I-E'), ('in', 'I-E'), ('ad

In [22]:
# Convert every row: value of the 'Index' column -> list of (token, label) tuples.
# NOTE: despite its name this is a dict; the name is kept because `convert` reads it.
list_of_texts_tokenized_and_labeled = {}
# iterrows() is a generator without a length, so tqdm needs `total` to show percentage and ETA
for row_idx, row in tqdm(task2_original_df.iterrows(), total=len(task2_original_df)):
    list_of_texts_tokenized_and_labeled[row['Index']] = convert_task2_data_to_list_of_tokens_with_labels(
        text=row['Text'],
        cause_offsets=(row['Cause_Start'], row['Cause_End']),
        effect_offsets=(row['Effect_Start'], row['Effect_End']))

1109it [00:00, 1473.72it/s]


In [23]:
# Preview only the first entry; echoing the whole mapping prints every token of every row.
dict(list(list_of_texts_tokenized_and_labeled.items())[:1])

{'0001.00005.1': [('The', 'B-E'),
  ('Sunshine', 'I-E'),
  ('State', 'I-E'),
  ('drew', 'I-E'),
  ('in', 'I-E'),
  ('a', 'I-E'),
  ('net', 'I-E'),
  ('influx', 'I-E'),
  ('of', 'I-E'),
  ('about', 'I-E'),
  ('$', 'I-E'),
  ('17.7', 'I-E'),
  ('billion', 'I-E'),
  ('in', 'I-E'),
  ('adjusted', 'I-E'),
  ('gross', 'I-E'),
  ('income', 'I-E'),
  ('(', 'I-E'),
  ('AGI', 'I-E'),
  (')', 'I-E'),
  ('-', 'I-E'),
  ('most', 'I-E'),
  ('of', 'I-E'),
  ('which', 'I-E'),
  ('(', 'I-E'),
  ('72', 'I-E'),
  ('percent', 'I-E'),
  (')', 'I-E'),
  ('came', 'I-E'),
  ('from', 'I-E'),
  ('those', 'I-E'),
  ('aged', 'I-E'),
  ('55', 'I-E'),
  ('and', 'I-E'),
  ('older', 'I-E'),
  ('.', 'E-E'),
  ('It', 'B-C'),
  ('is', 'I-C'),
  ('consistently', 'I-C'),
  ('one', 'I-C'),
  ('of', 'I-C'),
  ('the', 'I-C'),
  ('most', 'I-C'),
  ('popular', 'I-C'),
  ('destinations', 'I-C'),
  ('for', 'I-C'),
  ('retirees', 'I-C'),
  ('due', 'I-C'),
  ('to', 'I-C'),
  ('affordability', 'I-C'),
  ('and', 'I-C'),
  ('low', 'I

# Convert for Thesis code

In [24]:
def convert(out_folder: str, cebam: bool=True, texts_tokenized_and_labeled=None):
    """
    Write the tokenized, labeled texts as three line-aligned files in `out_folder`.

    Line i of ``ID.txt`` holds the id, line i of ``sentences.txt`` the
    space-joined tokens and line i of ``labels.txt`` the space-joined labels of
    the same text.

    :param out_folder: target directory; created if it does not exist
    :param cebam: if True, labels are written unchanged. If False, every label
                  whose part after the first '-' is neither 'C' nor 'E' is
                  written as 'O'.
    :param texts_tokenized_and_labeled: mapping id -> list of (token, label)
                  tuples. Defaults to the notebook-level
                  `list_of_texts_tokenized_and_labeled`, which keeps existing
                  two-argument calls working.
    """
    if texts_tokenized_and_labeled is None:
        # backward-compatible default: the mapping built earlier in the notebook
        texts_tokenized_and_labeled = list_of_texts_tokenized_and_labeled

    # open(..., 'w') does not create missing directories
    os.makedirs(out_folder, exist_ok=True)

    sentences_file = os.path.join(out_folder, 'sentences.txt')
    labels_file = os.path.join(out_folder, 'labels.txt')
    id_file = os.path.join(out_folder, 'ID.txt')

    # The source CSV is read as UTF-8; write UTF-8 explicitly instead of relying
    # on the platform default encoding.
    with open(sentences_file, 'w', encoding='utf-8') as s_f, \
         open(labels_file, 'w', encoding='utf-8') as l_f, \
         open(id_file, 'w', encoding='utf-8') as i_f:
        for uid, tokens_with_labels in texts_tokenized_and_labeled.items():
            tokens = [str(token) for token, _ in tokens_with_labels]
            labels = [label for _, label in tokens_with_labels]
            if not cebam:
                # keep only cause/effect tags, collapse everything else to 'O'
                labels = [label if label.split('-')[1] in ('C', 'E') else 'O' for label in labels]

            i_f.write(str(uid) + '\n')
            s_f.write(" ".join(tokens) + '\n')
            l_f.write(" ".join(labels) + '\n')

In [27]:
# Export with cebam=True: labels are written exactly as stored in the mapping.
convert('/media/sarthak/HDD/TUM/Thesis/thesis-sarthak/src/ner/datasets/fincausal_task2v2_iobes_CEBAM/test', cebam=True)

In [28]:
# Export with cebam=False: every label whose suffix is not 'C' or 'E' is written as 'O'.
convert('/media/sarthak/HDD/TUM/Thesis/thesis-sarthak/src/ner/datasets/fincausal_task2v2_iobes_CEO/test', cebam=False)