## Transforming into NER Dataset
Now we have the data into the format we want, we still need to put it into the format of an NER datast. We need it in the format IOB (Inside, Out, Beginning) format for each word. But first we need to create the tags that we want.

In [4]:
import json
import pandas as pd


In [5]:
charities_train = pd.read_csv("realKieData/charities/train.csv")
charities_validation = pd.read_csv("realKieData/charities/val.csv")
charities_test = pd.read_csv("realKieData/charities/test.csv")

nda_train = pd.read_csv("realKieData/nda/train.csv")
nda_validation = pd.read_csv("realKieData/nda/val.csv")
nda_test = pd.read_csv("realKieData/nda/test.csv")

fcc_train = pd.read_csv("realKieData/fcc/train.csv")
fcc_validation = pd.read_csv("realKieData/fcc/val.csv")
fcc_test = pd.read_csv("realKieData/fcc/test.csv")

rc_train = pd.read_csv("realKieData/rc/train.csv")
rc_validation = pd.read_csv("realKieData/rc/val.csv")
rc_test = pd.read_csv("realKieData/rc/test.csv")

s1_train = pd.read_csv("realKieData/s1/train.csv")
s1_validation = pd.read_csv("realKieData/s1/val.csv")
s1_test = pd.read_csv("realKieData/s1/test.csv")


In [6]:
charities_train['labels'] = charities_train['labels'].apply(json.loads)
charities_validation['labels'] = charities_validation['labels'].apply(json.loads)
charities_test['labels'] = charities_test['labels'].apply(json.loads)

nda_train['labels'] = nda_train['labels'].apply(json.loads)
nda_validation['labels'] = nda_validation['labels'].apply(json.loads)
nda_test['labels'] = nda_test['labels'].apply(json.loads)

fcc_train['labels'] = fcc_train['labels'].apply(json.loads)
fcc_validation['labels'] = fcc_validation['labels'].apply(json.loads)
fcc_test['labels'] = fcc_test['labels'].apply(json.loads)

rc_train['labels'] = rc_train['labels'].apply(json.loads)
rc_validation['labels'] = rc_validation['labels'].apply(json.loads)
rc_test['labels'] = rc_test['labels'].apply(json.loads)

s1_train['labels'] = s1_train['labels'].apply(json.loads)
s1_validation['labels'] = s1_validation['labels'].apply(json.loads)
s1_test['labels'] = s1_test['labels'].apply(json.loads)

In [7]:
charities_train.iloc[0]['labels'][0]


{'label': 'Charity Name',
 'start': 2950,
 'end': 2970,
 'text': 'MICKLEHAM ALMSHOUSES'}

In [8]:
charities_train.text[0][2950:2970]

'MICKLEHAM ALMSHOUSES'

## Investigating different entities 

In [9]:
from collections import Counter
def count_labels(dataset, column_name='labels'):
    all_labels = []
    
    # Iterate over each row and parse the JSON-like strings
    for row in dataset[column_name]:
        try:
            # Parse the JSON-like string
            # Extract labels and add to the all_labels list
            for item in row:
                all_labels.append(item['label'])
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in row: {row} - {e}")
    
    # Count the occurrences of each label
    label_counts = Counter(all_labels)
    
    # Print the counts
    for label, count in label_counts.items():
        print(f"{label}: {count}")

In [10]:
count_labels(charities_train)

Charity Name: 4250
Charity Registered Number: 729
Year Ended: 3644
Accounting Basis: 231
Independent Examiner Name: 398
Examination Date: 211
Independent Examiner Street Address: 534
Independent Examiner City: 549
Independent Examiner Postal Code: 515
Net Income at Current Year End: 162
Net Income at Previous Year End: 165
Net Assets at Current Year End: 242
Net Assets at Previous Year End: 231
Cash In Hand at Current Year End: 281
Cash In Hand at Previous Year End: 273
Principal Office Street Address: 294
Principal Office City: 295
Principal Office Postal Code: 283
Trustee Name: 3510
Trustee Title: 1051
Objectives and Activities: 270
Independent Examiner Company: 447
Bank Name: 233
Named Donor: 445
Company Number: 181
Named Employee: 52
Event Name: 136
Project Name: 107


In [11]:
count_labels(nda_train)

Jurisdiction: 243
Party: 522
Effective Date: 237


In [12]:
count_labels(fcc_train)

Line Item - Description: 10170
Line Item - Days: 8810
Line Item - Rate: 11980
Line Item - Start Date: 11889
Line Item - End Date: 6010
Agency: 399
Advertiser: 608
Gross Total: 496
Net Amount Due: 375
Agency Commission: 224
Payment Terms: 254


In [13]:
count_labels(rc_train)

(Header) Contract Area Description: 144
Contract Area Description: 574
(Header)Governing law: 140
Governing law: 143
(Header)Hardship clause or force majeure: 118
Hardship clause or force majeure: 158
(Header)Reporting requirements: 301
Reporting requirements: 762
(Header)Environmental protections: 124
Environmental protections: 296
(Header)Income tax: rate: 101
Income tax: rate: 101
(Header)Term: 165
Term: 217
Renewal or extension of term: 271
Type: 202
Date Signed: 171
Participants: 518
Country: 243
Project: 117
Water use: 72
Signatories, company: 283
(Header)Water use: 19


In [14]:
count_labels(s1_train)

Company Officer: 1528
Company Officer Title: 1547
Risk Clauses: 14487
(Header) Risks To The Business: 194
(Header) Description of Securities: 220
Description of Securities (1st Para): 226
(Header) Dividend Policy: 180
Dividend Policy (1st Para): 181
(Header) Prospectus Summary: 191
Prospectus Summary (1st Para): 1852
Joint Book Runners: 352
Title of Security Registered: 537
Amount Registered: 524
Max Price: 280
Date of Prospectus: 187
Company Name: 195
Company Address: 192
Agent Name: 190
Agent Address: 190
Agent Telephone: 183
EIN: 187
Attorney Names: 738
Law Firm Name: 382
Law Firm Address: 527


# Cleaning text, removing blank lines and adjusting offsets.

In [15]:
def remove_empty_labels(df, dataset_name, split):
    # Filter out rows where labels are equal to '[]' and print them
    empty_labels_rows = df[df['labels'] == '[]']
    if not empty_labels_rows.empty:
        print(f"Removed rows from {dataset_name} {split}:")
        #print(empty_labels_rows)
    
    return df[df['labels'] != '[]']

In [16]:
charities_train = remove_empty_labels(charities_train, 'charities', 'train')
charities_validation = remove_empty_labels(charities_validation, 'charities', 'validation')
charities_test = remove_empty_labels(charities_test, 'charities', 'test')

nda_train = remove_empty_labels(nda_train, 'nda', 'train')
nda_validation = remove_empty_labels(nda_validation, 'nda', 'validation')
nda_test = remove_empty_labels(nda_test, 'nda', 'test')

fcc_train = remove_empty_labels(fcc_train, 'fcc', 'train')
fcc_validation = remove_empty_labels(fcc_validation, 'fcc', 'validation')
fcc_test = remove_empty_labels(fcc_test, 'fcc', 'test')

rc_train = remove_empty_labels(rc_train, 'rc', 'train')
rc_validation = remove_empty_labels(rc_validation, 'rc', 'validation')
rc_test = remove_empty_labels(rc_test, 'rc', 'test')

s1_train = remove_empty_labels(s1_train, 's1', 'train')
s1_validation = remove_empty_labels(s1_validation, 's1', 'validation')
s1_test = remove_empty_labels(s1_test, 's1', 'test')

In [17]:
#drop all columns apart from text and labels
charities_train = charities_train[['text', 'labels']]
charities_validation = charities_validation[['text', 'labels']]
charities_test = charities_test[['text', 'labels']]

nda_train = nda_train[['text', 'labels']]
nda_validation = nda_validation[['text', 'labels']]
nda_test = nda_test[['text', 'labels']]

fcc_train = fcc_train[['text', 'labels']]
fcc_validation = fcc_validation[['text', 'labels']]
fcc_test = fcc_test[['text', 'labels']]

rc_train = rc_train[['text', 'labels']]
rc_validation = rc_validation[['text', 'labels']]
rc_test = rc_test[['text', 'labels']]

s1_train = s1_train[['text', 'labels']]
s1_validation = s1_validation[['text', 'labels']]
s1_test = s1_test[['text', 'labels']]

# Create the entities 
We only use some of the labels to create 4 entities for the model initally. PER (Person), ORG (Orginisation), LOC (Location), FIN (Finance - monetary amounts)

Need to make the label to entity mappings

In [19]:
label_to_entity = {
    "Charity Name": "ORG",
    "Bank Name": "ORG",
    "Party": "ORG",
    "Agency": "ORG",
    "Participants": "ORG",
    "Law Firm Name": "ORG",
    "Company Name": "ORG",
    "Joint Book Runners": "ORG",
    "Company Officer": "PER",
    "Attorney names" : "PER",
    "Independent Examiner Name": "PER",
    "Trustee Name": "PER",
    "Signatories": "PER",
    "Company": "PER",
    "Date Signed": "DAT",
    "Effective Date": "DAT",
    "Examination Date": "DAT",
    "Year Ended": "DAT",
    "Date of Prospectus": "DAT",
    "Line Item - Start Date": "DAT",
    "Line Item - End Date": "DAT",
    "Independent Examiner City": "LOC",
    "Principal Office City": "LOC",
    "Jurisdiction": "LOC",
    "Country": "LOC",
    "Governing Law": "LOC",
    "Net Income at Current Year End": "FIN",
    "Net Income at Previous Year End": "FIN",
    "Net Assets at Current Year End": "FIN",
    "Net Assets at Previous Year End": "FIN",
    "Cash In Hand at Current Year End": "FIN",
    "Cash In Hand at Previous Year End": "FIN",
    "Max Price": "FIN",
    "Net Amount Due": "FIN",
    "Gross Total": "FIN"
}

In [20]:
import pandas as pd
from transformers import BertTokenizerFast

#Using BERT fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def label_tokens(labels, tokenized_inputs, token_labels):
    for label in labels:
        start_char = label['start']
        end_char = label['end']
        label_text = label['label']
        
        if label_text in label_to_entity:
            entity = label_to_entity[label_text]
            
            for idx, (start, end) in enumerate(tokenized_inputs['offset_mapping']):
                if start >= start_char and end <= end_char:
                    # Assign B-<entity> for the beginning token and I-<entity> for inside tokens
                    if start == start_char:
                        token_labels[idx] = f"B-{entity}"
                    else:
                        token_labels[idx] = f"I-{entity}"
                    
    return token_labels

def process_dataframe(dataframe):
    results = []

    for index, row in dataframe.iterrows():
        passage_id = index  # Using the index as the passage ID
        text = row['text']
        labels = row['labels']
        
        tokenized_inputs = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=512)
        
        token_labels = ["O"] * len(tokenized_inputs['input_ids'])
        
        token_labels = label_tokens(labels, tokenized_inputs, token_labels)
        
        for token, label in zip(tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids']), token_labels):
            results.append({
                'Passage_ID': passage_id,
                'Token': token,
                'Label': label
            })

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)
    
    return results_df



In [21]:
charities_test_labeled = process_dataframe(charities_test)
charities_train_labeled = process_dataframe(charities_train)
charities_validation_labeled = process_dataframe(charities_validation)

nda_test_labeled = process_dataframe(nda_test)
nda_train_labeled = process_dataframe(nda_train)
nda_validation_labeled = process_dataframe(nda_validation)

fcc_test_labeled = process_dataframe(fcc_test)
fcc_train_labeled = process_dataframe(fcc_train)
fcc_validation_labeled = process_dataframe(fcc_validation)

rc_test_labeled = process_dataframe(rc_test)
rc_train_labeled = process_dataframe(rc_train)
rc_validation_labeled = process_dataframe(rc_validation)

s1_test_labeled = process_dataframe(s1_test)
s1_train_labeled = process_dataframe(s1_train)
s1_validation_labeled = process_dataframe(s1_validation)

In [22]:
charities_train_labeled = process_dataframe(charities_train)

In [23]:
charities_train_labeled

Unnamed: 0,Passage_ID,Token,Label
0,0,[CLS],B-ORG
1,0,mick,B-ORG
2,0,##le,I-ORG
3,0,##ham,I-ORG
4,0,al,I-ORG
...,...,...,...
163385,321,ms,O
163386,321,.,O
163387,321,diane,O
163388,321,ru,O


In [24]:
s1_train_labeled.Label.value_counts()

O        93942
I-ORG     3035
B-ORG      576
I-DAT      565
B-DAT      184
B-PER        1
I-PER        1
Name: Label, dtype: int64

# Next Step: Build Datasets
Here we build our Ner numerical labels, we also construct the dataset for trainign BERT. Most of the code below for building things like input_ids, attention masks and paddings are taken care of by the HF transformers library, but it was good for my own learning to do this myself and understand BERT deeper.

In [45]:
label_to_index = {
    'O': 0,
    'B-PER': 1,
    'I-PER': 2,
    'B-ORG': 3,
    'I-ORG': 4,
    'B-LOC': 5,
    'I-LOC': 6,
    'B-DAT': 7,
    'I-DAT': 8,
    'B-FIN': 9,
    'I-FIN': 10
}

In [69]:
import pandas as pd
from transformers import BertTokenizerFast

# Initialize the tokenize
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def group_tokens_to_sentences(df, max_length=512):
    sentences = []
    labels = []
    attention_masks = []
    token_type_ids = []
    original_sentences = []
    ner_tags = []

    current_sentence = []
    current_labels = []
    current_original_sentence = []
    current_ner_tags = []
    current_length = 0

    for _, row in df.iterrows():
        token = row['Token']
        label = row['Label']
        
        tokenized = tokenizer.tokenize(token)
        token_length = len(tokenized)
        
        if current_length + token_length + 2 > max_length:
            input_ids = tokenizer.convert_tokens_to_ids(current_sentence)
            attention_mask = [1] * len(input_ids)
            token_type_id = [0] * len(input_ids)
            
            # Padding
            padding_length = max_length - len(input_ids)
            input_ids += [tokenizer.pad_token_id] * padding_length
            attention_mask += [0] * padding_length
            token_type_id += [0] * padding_length
            current_labels += [-100] * padding_length  # -100 for padding tokens
            
            sentences.append(input_ids)
            labels.append(current_labels)
            attention_masks.append(attention_mask)
            token_type_ids.append(token_type_id)
            original_sentences.append(current_original_sentence)
            ner_tags.append(current_ner_tags)
            
            current_sentence = []
            current_labels = []
            current_original_sentence = []
            current_ner_tags = []
            current_length = 0
        
        current_sentence.extend(tokenized)
        current_labels.extend([label_to_index[label]] * token_length)
        current_original_sentence.append(token)
        current_ner_tags.append(label)
        current_length += token_length
    
    if current_sentence:
        input_ids = tokenizer.convert_tokens_to_ids(current_sentence)
        attention_mask = [1] * len(input_ids)
        token_type_id = [0] * len(input_ids)
        
        # Padding
        padding_length = max_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding_length
        attention_mask += [0] * padding_length
        token_type_id += [0] * padding_length
        current_labels += [-100] * padding_length  # -100 for padding tokens
        
        sentences.append(input_ids)
        labels.append(current_labels)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        original_sentences.append(current_original_sentence)
        ner_tags.append(current_ner_tags)
    
    sentence_df = pd.DataFrame({
        'input_ids': sentences,
        'labels': labels,
        'attention_masks': attention_masks,
        'token_ids': token_type_ids,
        'sentence': original_sentences,
        'ner_tags': ner_tags
    })
    
    return sentence_df
       

In [70]:
charities_test_gold = group_tokens_to_sentences(charities_test_labeled)
charities_train_gold = group_tokens_to_sentences(charities_train_labeled)
charities_validation_gold = group_tokens_to_sentences(charities_validation_labeled)

nda_test_gold = group_tokens_to_sentences(nda_test_labeled)
nda_train_gold = group_tokens_to_sentences(nda_train_labeled)
nda_validation_gold = group_tokens_to_sentences(nda_validation_labeled)

fcc_test_gold = group_tokens_to_sentences(fcc_test_labeled)
fcc_train_gold = group_tokens_to_sentences(fcc_train_labeled)
fcc_validation_gold = group_tokens_to_sentences(fcc_validation_labeled)

rc_test_gold = group_tokens_to_sentences(rc_test_labeled)
rc_train_gold = group_tokens_to_sentences(rc_train_labeled)
rc_validation_gold = group_tokens_to_sentences(rc_validation_labeled)

s1_test_gold = group_tokens_to_sentences(s1_test_labeled)
s1_train_gold = group_tokens_to_sentences(s1_train_labeled)
s1_validation_gold = group_tokens_to_sentences(s1_validation_labeled)


In [71]:
def save_to_json(df, filename):
    df.to_json(filename, orient='records', lines=True)

save_to_json(charities_test_gold, 'charities_test_gold.json')
save_to_json(charities_train_gold, 'charities_train_gold.json')
save_to_json(charities_validation_gold, 'charities_validation_gold.json')

save_to_json(nda_test_gold, 'nda_test_gold.json')
save_to_json(nda_train_gold, 'nda_train_gold.json')
save_to_json(nda_validation_gold, 'nda_validation_gold.json')

save_to_json(fcc_test_gold, 'fcc_test_gold.json')
save_to_json(fcc_train_gold, 'fcc_train_gold.json')
save_to_json(fcc_validation_gold, 'fcc_validation_gold.json')

save_to_json(rc_test_gold, 'rc_test_gold.json')
save_to_json(rc_train_gold, 'rc_train_gold.json')
save_to_json(rc_validation_gold, 'rc_validation_gold.json')

save_to_json(s1_test_gold, 's1_test_gold.json')
save_to_json(s1_train_gold, 's1_train_gold.json')
save_to_json(s1_validation_gold, 's1_validation_gold.json')


## Notes
- input_ids passed to BERT for trainning
- attention_masks also passed to help BERT understand what tokens to pay attention too
- labels (the ner labels also passed to BERT)

In [68]:
charities_test_labeled_lists

Unnamed: 0,input_ids,ner_labels,attention_masks,token_ids,sentence,ner_tags
0,"[101, 5952, 3222, 2193, 1011, 2432, 1001, 1001...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[CLS], charity, commission, number, -, 2004, ...","[O, O, O, O, O, O, O, B-ORG, I-ORG, I-ORG, I-O..."
1,"[10615, 1998, 2968, 1996, 5952, 2001, 2511, 20...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[governance, and, management, the, charity, wa...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[5952, 2076, 1996, 2558, 2020, 2004, 4076, 102...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 2, 2, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[charity, during, the, period, were, as, follo...","[O, O, O, O, O, O, O, O, B-PER, I-PER, I-PER, ..."
3,"[1001, 1001, 2002, 2050, 1001, 1001, 16215, 23...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[##hea, ##th, village, hall, for, use, by, loc...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[13226, 2502, 1001, 1001, 28177, 1006, 2013, 3...","[1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[vanessa, big, ##gs, (, from, 2nd, april, 2019...","[B-PER, I-PER, I-PER, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...,...,...
124,"[2004, 2256, 3472, 3855, 1999, 2197, 2095, 100...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[as, our, chairman, mentioned, in, last, year,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
125,"[20964, 1001, 1001, 1055, 2410, 2000, 2403, 48...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[auditor, ##s, 13, to, 14, statement, of, fina...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
126,"[9360, 11496, 7291, 15730, 5170, 23848, 1001, ...","[0, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[trustees, stephanie, owen, joanna, philip, ma...","[O, B-PER, I-PER, B-PER, I-PER, B-PER, I-PER, ..."
127,"[2043, 2057, 2056, 13407, 2000, 2256, 2336, 32...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[when, we, said, farewell, to, our, children, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
