## RealKIE: New Dataset with 5 datasets for entity extraction 

- S1: Dataset of S1 filings
- Charities: Kleister Charity Dataset
- Nda: Kleister NDA Dataset
- FCC: FCC Invoices
- Resource Contracts: Contracts of agreements for resources Dataset

To Do: For each dataset, for each doc, for each label, get the line number(s) the entities are on

In [2]:
import pandas as pd

In [155]:
# Every dataset ships the same three CSV splits; each line below loads them
# in (val, test, train) order into the matching df_<dataset>_<split> names.

# NDA
df_nda_val, df_nda_test, df_nda_train = (
    pd.read_csv(f"nda/{split}.csv") for split in ("val", "test", "train")
)

# Charity
df_charity_val, df_charity_test, df_charity_train = (
    pd.read_csv(f"charities/{split}.csv") for split in ("val", "test", "train")
)

# FCC invoices
df_fcc_val, df_fcc_test, df_fcc_train = (
    pd.read_csv(f"fcc/{split}.csv") for split in ("val", "test", "train")
)

# S1
df_s1_val, df_s1_test, df_s1_train = (
    pd.read_csv(f"s1/{split}.csv") for split in ("val", "test", "train")
)

# Resource contracts
df_rc_val, df_rc_test, df_rc_train = (
    pd.read_csv(f"rc/{split}.csv") for split in ("val", "test", "train")
)

**Important Note:** For these datasets, we only want the retriever models to be evaluated on finding the instances that the dataset has labelled for us, as these are the "proper" instances. Otherwise, since some names are mentioned many times, it would be unfair to evaluate a retriever model by requiring it to find all 10 instances of a name.

Also, we don't want our model returning the name on line 300, which is formatted weirdly. We want the proper, full instance of the name as labelled by these datasets.

**Note:** Some of the labels go over multiple lines, which is good for us.


## Removing Empty Labels

In [156]:
def remove_empty_labels(df, dataset_name, split):
    """Return the rows of ``df`` whose ``labels`` cell is not the string '[]'.

    When at least one row is dropped, a one-line notice naming the dataset
    and split is printed. ``df`` itself is not modified.
    """
    # Compare against the serialized empty list once and reuse the mask.
    has_no_labels = df['labels'] == '[]'
    if has_no_labels.any():
        print(f"Removed rows from {dataset_name} {split}:")

    return df[~has_no_labels]

# Drop label-less documents from every split of every dataset.
# Calls run in (val, test, train) order per dataset so the printed notices
# appear in the same sequence as before.
df_nda_val, df_nda_test, df_nda_train = (
    remove_empty_labels(frame, "NDA", split)
    for frame, split in (
        (df_nda_val, "val"), (df_nda_test, "test"), (df_nda_train, "train"),
    )
)

df_charity_val, df_charity_test, df_charity_train = (
    remove_empty_labels(frame, "Charity", split)
    for frame, split in (
        (df_charity_val, "val"), (df_charity_test, "test"), (df_charity_train, "train"),
    )
)

df_fcc_val, df_fcc_test, df_fcc_train = (
    remove_empty_labels(frame, "FCC", split)
    for frame, split in (
        (df_fcc_val, "val"), (df_fcc_test, "test"), (df_fcc_train, "train"),
    )
)

df_s1_val, df_s1_test, df_s1_train = (
    remove_empty_labels(frame, "S1", split)
    for frame, split in (
        (df_s1_val, "val"), (df_s1_test, "test"), (df_s1_train, "train"),
    )
)

df_rc_val, df_rc_test, df_rc_train = (
    remove_empty_labels(frame, "RC", split)
    for frame, split in (
        (df_rc_val, "val"), (df_rc_test, "test"), (df_rc_train, "train"),
    )
)


Removed rows from Charity test:
Removed rows from Charity train:


## New Code adjusting for blank lines

In [4]:
import json

def _line_offsets(lines):
    """Return [{"start", "end"}] character spans for ``lines`` joined by '\\n'.

    Each span is half-open and its ``end`` includes the trailing newline, so
    consecutive spans tile the joined text without gaps.
    """
    spans = []
    cursor = 0
    for line in lines:
        end = cursor + len(line) + 1  # +1 for the newline separator
        spans.append({"start": cursor, "end": end})
        cursor = end
    return spans


def process_line_numbers(text, entities, index):
    """Re-anchor entity spans onto ``text`` with its blank lines removed.

    Args:
        text: Document text; it is split on '\\n' into lines.
        entities: Iterable of dicts with at least integer "start"/"end"
            character offsets into ``text`` (end exclusive) and a "label".
        index: Row identifier, used only in diagnostic messages.

    Returns:
        A dict with:
          "text":   list of the non-blank lines of ``text``.
          "labels": copies of the entities with "start"/"end" rewritten to
                    offsets into ``"\\n".join(text_lines_without_blanks)``,
                    "text" set to that slice, and "line_numbers" set to the
                    (blank-free) line indices the entity covers.

    Entities that do not overlap any non-blank line (zero-length spans at a
    line start, whitespace-only spans on blank lines, out-of-range offsets)
    cannot be mapped to a line; they are reported via ``print`` and omitted
    instead of raising ``IndexError`` or yielding an empty label.
    """
    text_lines = text.split('\n')
    offsets = _line_offsets(text_lines)

    # n_blank_above[i] = number of blank lines strictly before line i, so a
    # non-blank line i ends up at index i - n_blank_above[i] after filtering.
    n_blank_above = []
    n_blank_lines = 0
    for line in text_lines:
        n_blank_above.append(n_blank_lines)
        if not line:
            n_blank_lines += 1

    new_text_lines = [line for line in text_lines if line]
    new_offsets = _line_offsets(new_text_lines)
    # Built once: rebuilding the joined text per entity is quadratic on
    # long documents with many labels.
    new_text = "\n".join(new_text_lines)

    processed_entities = []
    for entity in entities:
        # Only non-blank lines survive in the output, so only those can
        # anchor the entity. Anchoring on a blank first/last line would map
        # to the wrong line in the filtered text.
        line_numbers = [
            i
            for i, offset in enumerate(offsets)
            if text_lines[i]
            and entity["start"] < offset["end"]
            and entity["end"] > offset["start"]
        ]
        if not line_numbers:
            print(
                f"Row: {index}, Query: {entity.get('label')}, "
                f"Entity: {repr(entity.get('text'))}, "
                f"Skipped: no non-blank line overlaps "
                f"[{entity['start']}, {entity['end']})"
            )
            continue

        first_line, last_line = line_numbers[0], line_numbers[-1]

        # Clamp the span to the anchoring lines so leading/trailing blank
        # lines (which no longer exist) are trimmed off the entity.
        old_start = max(entity["start"], offsets[first_line]["start"])
        old_end = min(entity["end"], offsets[last_line]["end"])

        new_line_number_start = first_line - n_blank_above[first_line]
        new_line_number_end = last_line - n_blank_above[last_line]

        # Line contents are unchanged, so the in-line displacement carries
        # over directly to the filtered text.
        new_text_start = (
            new_offsets[new_line_number_start]["start"]
            + (old_start - offsets[first_line]["start"])
        )
        new_text_end = (
            new_offsets[new_line_number_end]["start"]
            + (old_end - offsets[last_line]["start"])
        )

        new_entity = entity.copy()
        new_entity["text"] = new_text[new_text_start:new_text_end]
        new_entity["line_numbers"] = list(
            range(new_line_number_start, new_line_number_end + 1)
        )
        new_entity["start"] = new_text_start
        new_entity["end"] = new_text_end
        processed_entities.append(new_entity)

        # Sanity check: the extracted text must lie within the reported lines.
        predicted_text = "\n".join(
            [new_text_lines[n] for n in new_entity["line_numbers"]]
        )
        is_correct = new_entity['text'] in predicted_text
        if not is_correct:
            print(f"Row: {index}, Query: {new_entity['label']}, Entity: {repr(new_entity['text'])}, Found: {predicted_text}, Correct: {is_correct}")

    return {
        "text": new_text_lines,
        "labels": processed_entities
    }

'''
# Example processing loop
processed_data = []
# Do it for all datasets
for i in range(len(df_rc_train)):
    row = df_rc_train.iloc[i]
    text = row["text"]
    entities = json.loads(row["labels"])
    processed_entry = process_line_numbers(text, entities,i)
    processed_data.append(processed_entry)

processed_data
'''

NameError: name 'df_rc_train' is not defined

## Saving gold labelled

In [154]:
import pandas as pd

# Build one gold record (blank-free lines + re-anchored labels) per document
# of the Charity validation split.
processed_data = []
for i, (text, raw_labels) in enumerate(
    zip(df_charity_val["text"], df_charity_val["labels"])
):
    processed_data.append(process_line_numbers(text, json.loads(raw_labels), i))

# One DataFrame row per document, written out as JSON Lines.
df_processed_data = pd.DataFrame(processed_data)
df_processed_data.head()
df_processed_data.to_json("charities/val_gold.jsonl", orient='records', lines=True)


In [107]:
# Load charities/train_gold.jsonl: one JSON document per line.
# Lines that fail to parse are reported and skipped.
gold_path = 'charities/train_gold.jsonl'
data = []
with open(gold_path, "r") as file:
    for line in file:
        try:
            record = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from line in {gold_path}: {e}")
        else:
            data.append(record)


## Code for transforming: Now in actual codebase 

In [None]:
# I think this could work for all datasets if it is consistent
def transform_data(data):
    """Turn one gold record into per-query binary line masks.

    Args:
        data: dict with
            'text'   - the document as a list of lines, and
            'labels' - a list of dicts, each with a 'label' (used as the
                       query) and 'line_numbers' (indices into 'text').

    Returns:
        dict with
            'document' - the list of lines, passed through unchanged,
            'queries'  - distinct labels in first-seen order,
            'labels'   - one 0/1 list per query, the same length as the
                         document, with 1 on every line any entity of that
                         label covers.
        A record with no labels yields empty 'queries' and 'labels' lists.
    """
    document = data['text']
    label_info = data['labels']
    print(label_info)

    n_lines = len(document)
    query_labels = {}

    ## MAKE QUERY EXTRACTION ANOTHER FUNCTION SO CAN DO QUERY ENGINEERING IN FUTURE
    for label in label_info:
        query = label['label']

        # First time this query is seen: start from an all-zero mask.
        if query not in query_labels:
            query_labels[query] = [0] * n_lines

        mask = query_labels[query]
        for line_number in label['line_numbers']:
            mask[line_number] = 1

    queries = list(query_labels)
    labels = [query_labels[query] for query in queries]

    # Documents without any labels exist in the gold files; indexing
    # labels[0] unconditionally raised IndexError for them.
    if labels:
        print(n_lines, len(labels[0]))
        assert n_lines == len(labels[0]), "Document and label lengths do not match"

    return {
        'document': document,  # already a list of lines
        'queries': queries,
        'labels': labels
    }

# Transform the data
#transformed_data = transform_data(data[0])
#print(data[0]['labels'])
#print(transformed_data)
#transformed_data['labels']

# Transform every gold record in order, echoing its position in `data`
# after each one so progress (and the last successful row) is visible.
transformed_data = []
for i, record in enumerate(data):
    transformed_data.append(transform_data(record))
    print(i)

'''
print(transformed_data['queries'][0]) #change the index
for i in range(len(transformed_data['labels'][1])): #charity name (doesnt matter what goes in the [] they all have same length)
    if transformed_data['labels'][5][i] == 1: #change the index
        print(transformed_data['document'][i])
'''     

In [118]:
print(len(data[209]['labels']))

0


In [120]:
test = pd.read_csv('charities/train.csv')

In [135]:
# Report the position (and the empty list itself) of every record in `data`
# that carries no labels.
for i, record in enumerate(data):
    if len(record['labels']) != 0:
        continue
    print(i)
    print(record['labels'])
        

209
[]
240
[]
245
[]


In [94]:
print(data[0]['labels'])

[{'label': 'Charity Name', 'start': 13628, 'end': 13650, 'text': 'ROTARY CLUB OF FARNHAM', 'line_numbers': [228]}, {'label': 'Year Ended', 'start': 13720, 'end': 13734, 'text': '30TH JUNE 2018', 'line_numbers': [231]}, {'label': 'Net Income at Current Year End', 'start': 14451, 'end': 14457, 'text': '£6,564', 'line_numbers': [260]}, {'label': 'Net Income at Previous Year End', 'start': 14458, 'end': 14464, 'text': '£9,176', 'line_numbers': [260]}, {'label': 'Charity Name', 'start': 736, 'end': 758, 'text': 'ROTARY CLUB OF FARNHAM', 'line_numbers': [44]}, {'label': 'Year Ended', 'start': 811, 'end': 823, 'text': '30 JUNE 2018', 'line_numbers': [47]}, {'label': 'Year Ended', 'start': 913, 'end': 925, 'text': '30 June 2018', 'line_numbers': [49]}, {'label': 'Trustee Name', 'start': 1331, 'end': 1349, 'text': 'M Atkinson CMG MBE', 'line_numbers': [55]}, {'label': 'Trustee Name', 'start': 1351, 'end': 1372, 'text': 'R W H Crawford OBE MA', 'line_numbers': [55]}, {'label': 'Trustee Name', 's

In [None]:
for label in data['labels']:
    print(label)

In [61]:
transformed_data['queries'][1]

'Year Ended'

In [58]:
transformed_data['document'][0]

['Charity Commission',
 'Number - 200435',
 'ROTARY CLUB OF FARNHAM',
 'BENEVOLENT FUND',
 'TRUSTEES REPORT AND ACCOUNTS',
 'FOR THE YEAR ENDED 30TH JUNE 2018',
 'ROTARY CLUB OF FARNHAM',
 'BENEVOLENT FUND',
 'CONTENTS',
 'Page',
 'Legal and administrative information I',
 "Trustees' report",
 "Statement of trustees' responsibilities",
 "Independent examiner's report",
 'Statement of financial activities',
 'Balance sheet',
 'Notes to the accounts',
 '2-4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'ROTARY CLUB OF FARNHAM',
 'BENEVOLENT FUND',
 'LEGAL AND ADMINISTRATIVE INFORMATION',
 'Trustees',
 'M. Atkinson, CMG, MBE',
 'R. W. H. Crawford, OBE, MA',
 'G. Hutton',
 'I. C. Sargeant, FCA',
 'Charity Number 200435',
 'Independent Examiner',
 'Bankers',
 'R. J. Smith , FCA',
 '88A West Street,',
 'Farnham,',
 'Surrey,',
 'GU9 7EN.',
 'Lloyds Bank plc',
 '75 Castle Street,',
 'Farnham,',
 'Surrey,',
 'GU9 7LT.',
 'ROTARY CLUB OF FARNHAM',
 'BENEVOLENT FUND',
 "TRUSTEES' REPORT",
 'FOR THE YEAR ENDED

In [65]:
len(transformed_data['labels'][1])

347

In [48]:
transformed_data['document'][0][346]

'_9'

In [20]:
data[0]

KeyError: 0

## Inspecting Charity

## Inspecting S1

### Note: If we split the text on newlines and some of the resulting lines are blank, how does that affect line indexing, i.e. how do we deal with the blank lines?

In [66]:
df_s1_val.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,document_path,image_files,labels,ocr,original_filename,text
0,0,0,0,0,s1/files/de7697b186b5cbf65c3cf7ffc448dc9c.pdf,"[""s1/images/de7697b186b5cbf65c3cf7ffc448dc9c/p...","[{""label"": ""Risk Clauses"", ""start"": 251249, ""e...",s1/ocr/de7697b186b5cbf65c3cf7ffc448dc9c.json.gz,e2371_s-1.pdf,As filed with the Securities and Exchange Comm...
1,1,1,1,1,s1/files/acc08fbe572bf2d3be146e1b0643ae42.pdf,"[""s1/images/acc08fbe572bf2d3be146e1b0643ae42/p...","[{""label"": ""(Header) Prospectus Summary"", ""sta...",s1/ocr/acc08fbe572bf2d3be146e1b0643ae42.json.gz,d142494ds1.pdf,Table of Contents\n\nAs filed with the Securit...
2,2,2,2,2,s1/files/9804840ee3128079d72094fed575431c.pdf,"[""s1/images/9804840ee3128079d72094fed575431c/p...","[{""label"": ""Prospectus Summary (1st Para)"", ""s...",s1/ocr/9804840ee3128079d72094fed575431c.json.gz,tm217792d1_s-1.pdf,As filed with the U.S. Securities and Exchange...
3,3,3,3,3,s1/files/049d0ff3d62fd2affd042c15ef9409df.pdf,"[""s1/images/049d0ff3d62fd2affd042c15ef9409df/p...","[{""label"": ""Company Officer"", ""start"": 512384,...",s1/ocr/049d0ff3d62fd2affd042c15ef9409df.json.gz,d113914ds1.pdf,Table of Contents\n\nAs filed with the U.S. Se...
4,4,4,4,4,s1/files/d1ba3c37f57000e999b092156b0a2f5b.pdf,"[""s1/images/d1ba3c37f57000e999b092156b0a2f5b/p...","[{""label"": ""Company Officer"", ""start"": 277118,...",s1/ocr/d1ba3c37f57000e999b092156b0a2f5b.json.gz,gmgi_s1.pdf,As filed with the U.S. Securities and Exchange...


In [78]:
len(df_s1_val), len(df_s1_test), len(df_s1_train)

(65, 65, 192)

In [200]:
# Show every "(Header) Prospectus Summary" label in the S1 validation split,
# prefixed with the index of the row it belongs to.
for i, raw_labels in df_s1_val["labels"].items():
    for label in json.loads(raw_labels):
        if label["label"] != "(Header) Prospectus Summary":
            continue
        print(i, label)

0 {'label': '(Header) Prospectus Summary', 'start': 36004, 'end': 36022, 'text': 'PROSPECTUS SUMMARY'}
1 {'label': '(Header) Prospectus Summary', 'start': 19918, 'end': 19925, 'text': 'SUMMARY'}
2 {'label': '(Header) Prospectus Summary', 'start': 13683, 'end': 13690, 'text': 'SUMMARY'}
3 {'label': '(Header) Prospectus Summary', 'start': 16338, 'end': 16345, 'text': 'SUMMARY'}
5 {'label': '(Header) Prospectus Summary', 'start': 16430, 'end': 16437, 'text': 'SUMMARY'}
6 {'label': '(Header) Prospectus Summary', 'start': 15099, 'end': 15106, 'text': 'SUMMARY'}
7 {'label': '(Header) Prospectus Summary', 'start': 12487, 'end': 12494, 'text': 'SUMMARY'}
8 {'label': '(Header) Prospectus Summary', 'start': 9239, 'end': 9257, 'text': 'Prospectus summary'}
9 {'label': '(Header) Prospectus Summary', 'start': 16034, 'end': 16041, 'text': 'SUMMARY'}
10 {'label': '(Header) Prospectus Summary', 'start': 26511, 'end': 26518, 'text': 'SUMMARY'}
11 {'label': '(Header) Prospectus Summary', 'start': 15170,