In [1]:
import re
import numpy as np
import unicodedata
from tqdm import tqdm
from nltk.tag import CRFTagger
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from gensim.corpora import Dictionary
from functools import partial
import nltk

In [2]:
# Load data function
def load_data(filename):
    data = []
    sentence = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.strip().split()
            if not tokens:
                if sentence:
                    data.append(sentence)
                    sentence = []
            elif len(tokens) == 4:
                token, pos, chunk, ne = tokens
                sentence.append((token, pos, chunk, ne))
    if sentence:
        data.append(sentence)
    return data

# Load datasets
fin3_data = load_data('task3_fin3')
fin5_data = load_data('task3_fin5')
combined_data = fin3_data + fin5_data

In [3]:
# Split the data
train_set, test_set = train_test_split(combined_data, train_size=0.80, test_size=0.20, random_state=101)

# Separate tokens and tags
def separate_tokens_tags(data):
    toks, tags = [], []
    for sentence in data:
        sentence_toks, sentence_tags = zip(*[(token, ne) for token, pos, chunk, ne in sentence])
        toks.append(sentence_toks)
        tags.append(sentence_tags)
    return toks, tags

train_toks, train_tags = separate_tokens_tags(train_set)
test_toks, test_tags = separate_tokens_tags(test_set)


In [4]:
# Custom CRF Tagger with feature extraction including POS tags
class CustomCRFTagger(CRFTagger):
    def _get_features(self, tokens, idx):
        token = tokens[idx]
        feature_list = []

        if not token:
            return feature_list

        # Capitalization
        if token[0].isupper():
            feature_list.append("CAPITALIZATION")

        # Number
        if re.search(r'\d', token) is not None:
            feature_list.append("HAS_NUM")

        # Punctuation
        punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"}
        if all(unicodedata.category(x) in punc_cat for x in token):
            feature_list.append("PUNCTUATION")

        # Suffix up to length 3
        if len(token) > 1:
            feature_list.append("SUF_" + token[-1:])
        if len(token) > 2:
            feature_list.append("SUF_" + token[-2:])
        if len(token) > 3:
            feature_list.append("SUF_" + token[-3:])

        # Current word
        feature_list.append("WORD_" + token)

        # Previous and Next Words
        if idx > 0:
            feature_list.append("PREVWORD_" + tokens[idx - 1])
        if idx < len(tokens) - 1:
            feature_list.append("NEXTWORD_" + tokens[idx + 1])

        # POS Tag
        if hasattr(self, '_pos_tags') and self._pos_tags:
            feature_list.append("POS_" + self._pos_tags[self._current_sentence_idx][idx])
        
        return feature_list

    def tag_sents_with_pos(self, sentences):
        # Get POS tags for each sentence
        pos_tagged_sents = [nltk.pos_tag(sentence) for sentence in sentences]
        self._pos_tags = [[tag for _, tag in pos_tagged_sent] for pos_tagged_sent in pos_tagged_sents]
        
        result = []
        for i, tokens in enumerate(sentences):
            self._current_sentence_idx = i
            result.append(self.tag(tokens))
        return result

In [5]:
# Training function
def train_CustomCRF_NER_tagger(train_set):
    tagger = CustomCRFTagger()
    tagger._feature_func = partial(CustomCRFTagger._get_features, tagger)
    tagger.train(train_set, 'model.crf.tagger')
    return tagger

# Prepare the training data with the custom feature extractor
train_set_features = [[(token, tag) for token, tag in zip(sentence, tags)] for sentence, tags in zip(train_toks, train_tags)]

In [6]:
# Train the CRF model
crf_tagger = train_CustomCRF_NER_tagger(train_set_features)

# Prepare the test data in the same format
test_set_features = [[(token, tag) for token, tag in zip(sentence, tags)] for sentence, tags in zip(test_toks, test_tags)]

# Predict tags for the test set
predicted_tags = crf_tagger.tag_sents_with_pos(test_toks)

# Debugging: Print some predictions to verify
print("Sample Predictions:")
for i in range(2):
    print(f"Tokens: {test_toks[i]}")
    print(f"True Tags: {test_tags[i]}")
    print(f"Predicted Tags: {predicted_tags[i]}")

Sample Predictions:
Tokens: ('No', 'Investment', 'shall', 'be', 'deemed', 'to', 'be', 'a', 'security', 'within', 'the', 'meaning', 'of', 'the', 'Securities', 'Act', 'of', '1933', 'or', 'the', 'Securities', 'Exchange', 'Act', 'of', '1934', '.')
True Tags: ('O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O')
Predicted Tags: [('No', 'O'), ('Investment', 'O'), ('shall', 'O'), ('be', 'O'), ('deemed', 'O'), ('to', 'O'), ('be', 'O'), ('a', 'O'), ('security', 'O'), ('within', 'O'), ('the', 'O'), ('meaning', 'O'), ('of', 'O'), ('the', 'O'), ('Securities', 'O'), ('Act', 'O'), ('of', 'O'), ('1933', 'O'), ('or', 'O'), ('the', 'O'), ('Securities', 'O'), ('Exchange', 'O'), ('Act', 'O'), ('of', 'O'), ('1934', 'O'), ('.', 'O')]
Tokens: ('"', 'EQUIPMENT', 'ADVANCE', '"', 'is', 'defined', 'in', 'Section', '2', '.', '1', '.', '1', '.')
True Tags: ('O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O')
Predicted T

In [10]:
# Adjust test_set and predicted_tags to have the correct format
test_set_correct_format = [[(token, tag) for token, tag in zip(tokens, tags)] for tokens, tags in zip(test_toks, test_tags)]
predicted_tags_correct_format = [[(token, tag) for token, tag in zip(tokens, [tag for token, tag in sentence])] for tokens, sentence in zip(test_toks, predicted_tags)]

cal_span_level_f1(test_set_correct_format, predicted_tags_correct_format)

# Token-level evaluation using sklearn classification report
flat_true_tags = [tag for sentence in test_tags for tag in sentence]
flat_pred_tags = [tag for sentence in predicted_tags for token, tag in sentence]

print(classification_report(flat_true_tags, flat_pred_tags, zero_division=0))


              precision    recall  f1-score   support

       I-LOC       0.98      0.34      0.50       134
      I-MISC       0.00      0.00      0.00         2
       I-ORG       0.78      0.59      0.68        79
       I-PER       0.99      0.94      0.97       197
           O       0.99      1.00      0.99     11303

    accuracy                           0.99     11715
   macro avg       0.75      0.57      0.63     11715
weighted avg       0.99      0.99      0.99     11715



## Method Implementation and Rationale

To implement the method, I first loaded and preprocessed the dataset containing financial agreements, ensuring each token was labeled with one of four NE types: location (LOC), miscellaneous (MISC), organization (ORG), and person (PER). Then split the dataset into training and testing sets using an 80-20 split.

For training, I employed a custom Conditional Random Fields (CRF) tagger with an extensive feature set, including word-level features and POS tags. The training process involved feature extraction, model training, and hyperparameter optimization. Finally, I tested the trained model on the test set and evaluated its performance using standard NER metrics.

Explain how your chosen method works and its main strengths and limitations. The CRF model is a probabilistic graphical model used for structured prediction. It considers the context of the entire sentence to predict the label for each token, making it well-suited for NER tasks. The main strengths of the CRF model include its ability to handle sequential data and incorporate various features for improved accuracy.
However, CRFs also have limitations, such as the potential for overfitting with high-dimensional feature spaces and the complexity involved in tuning hyperparameters. Additionally, CRFs may struggle with long-range dependencies that are better captured by models like transformers.

Detail the features you have chosen and explain why you chose them. I selected a diverse set of features to capture various linguistic and contextual cues:

• Capitalization: Indicates proper nouns, often used in names.

• Presence of numbers: Identifies dates, monetary values, and other numeric entities.

• Punctuation: Helps differentiate between tokens.

• Suffixes: Common suffixes can indicate specific types of entities.

• Current word, previous, and next words: Provides local context.

• POS tags: Adds syntactic information to improve predictions.

These features were chosen to capture both local and global contextual information, which is crucial for accurately identifying named entities. The financial agreements dataset poses several challenges:

• Complexity of financial language: Requires capturing domain-specific terminology and context.

• Long-range dependencies: Some entities span multiple tokens, making it difficult for simpler models to capture.

• Imbalanced NE types: Certain NE types are underrepresented, leading to potential biases in the model.


## Evaluation, Interpretation, and Discussion of Results


I used token-level precision, recall, and F1-score to evaluate the model’s performance. These metrics are standard in NER tasks and provide a clear indication of how well the model identifies and classifies named entities.


Results:

• Overall Accuracy: The overall accuracy of the model is very high at 99%. This is expected given that the majority of tokens belong to the ’O’ class (non-entity), which the model handles well.

• I-LOC: The precision for location entities is high (0.98), but the recall is relatively low (0.34), indicating that while the model is very precise when it predicts a location, it misses many actual locations.

• I-MISC: The model struggles with the ’I-MISC’ category, showing 0 precision, recall, and F1-score. This suggests that the model either did not predict any ’I-MISC’ entities or that its predictions were completely incorrect.

• I-ORG: For organization entities, the model achieves moderate precision (0.78) and recall (0.59), resulting in an F1-score of 0.68. This indicates a balanced performance but still room for improve- ment.

• I-PER: The model performs exceptionally well on person entities, with high precision (0.99), recall (0.94), and F1-score (0.97), suggesting it effectively identifies and classifies person names.

• Macro Avg: The macro-average F1-score is 0.63, indicating that the model’s performance varies significantly across different entity types.

• Weighted Avg: The weighted average F1-score of 0.99 reflects the high performance on the majority class ’O’, which skews the overall performance metrics.

Possible Areas for Improvement:

Advanced Feature Engineering:

Word Embeddings: Advanced embeddings such as BERT, GloVe, or Word2Vec to capture richer semantic relationships between words. These embeddings can help the model better understand the context of entities in financial agreements (Turton et al., 2021).

Model Enhancement:

• Transformer-based Models: Transformer-based models like BERT or RoBERTa, which have shown superior performance in various NLP tasks, including NER. These models can better handle the nuances and complexities of financial texts (Huneman, 2023).

• Experiment with ensemble methods combining multiple models.

• Experimental Process Improvement:

Hyperparameter Tuning: Extensive hyperparameter tuning using techniques like Grid Search or Random Search to find the optimal settings for the models could be employed. This could signifi- cantly improve model performance by finding the best configuration for the dataset (Arden Safitri, 2022).
Cross-Validation: Implementation of k-fold cross-validation could ensure that the model’s perfor- mance is robust and generalizable across different subsets of the dataset. This helps in mitigating the risk of overfitting and provides a more reliable estimate of model performance (Aghbalou et al., 2022).