# CS 475/675 Machine Learning: Project
## Goals:
### 4.1 Must accomplish
- Implement a robust data preprocessing pipeline to handle tokenization, feature extraction, and label encoding.
- Develop and train a machine learning model capable of accurately detecting PII types in student essays, achieving a competitive score on the evaluation metric.
- Generate predictions for the test set essays and submit them in the required format for evaluation.

### 4.2 Expect to accomplish
- Fine-tune the model architecture and hyperparameters to optimize performance on the provided training data.
- Conduct error analysis and model interpretation to identify common misclassifications and areas for improvement.
- Investigate the use of external datasets or pre-trained language models to enhance the model’s generalization capabilities.

### 4.3 Would like to accomplish
- Implement ensemble learning techniques, such as model averaging or stacking, to combine multiple base models and further boost detection accuracy and robustness.
- Investigate methods for handling imbalance class distributions, particularly for rare PII types.
- Develop visualization tools and techniques to facilitate the interpretation of model predictions.


# PreTrained DistilBERT Model

In [None]:
!pip install -U transformers datasets accelerate

Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)


## Data loading

In [2]:
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

2024-04-26 17:50:28.709545: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-26 17:50:28.709647: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-26 17:50:28.841715: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
import json
import torch
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split

# Load your dataset
with open('/kaggle/input/pii-detection-removal-from-educational-data/train.json', 'r') as file:
    data = json.load(file)
    
# Extract tokens and labels from your data
tokens = [entry['tokens'] for entry in data]
labels = [entry['labels'] for entry in data]

# Create a dictionary for the Dataset
data_dict = {
    'tokens': tokens,
    'labels': labels
}

In [4]:
# Convert to Hugging Face dataset format
dataset = Dataset.from_dict(data_dict)

# Split the dataset into training and validation sets
train_test_split = dataset.train_test_split(test_size=0.2)
dataset = DatasetDict(train=train_test_split['train'], validation=train_test_split['test'])

# Define labels and map them
label_list = ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 
               'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 
               'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 
               'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

In [5]:
print(id2label)

{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


## Tokenization

In [6]:
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'], 
        max_length=512,  # Set maximum sequence length (DistilBERT's typical max)
        truncation=True,  # Ensure truncation is applied
        padding='max_length',  # Apply padding
        is_split_into_words=True
    )
    
    labels = []
    for i, doc_labels in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_id = None
        label_ids = []
        for word_id in word_ids:
            # Only add labels for non-special tokens and avoid duplicated labels
            if word_id is None:
                label_ids.append(-100)  # For special tokens like [CLS], [SEP], [PAD]
            elif word_id != previous_word_id:
                label_ids.append(label2id.get(doc_labels[word_id], -100))
            else:
                # Use -100 for repeated subword tokens to ignore during loss calculation
                label_ids.append(-100) 
            previous_word_id = word_id
        labels.append(label_ids)
        
    tokenized_inputs['labels'] = labels
    return tokenized_inputs


# Apply tokenization and label alignment
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, 
                                remove_columns=dataset['train'].column_names)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/5445 [00:00<?, ? examples/s]

Map:   0%|          | 0/1362 [00:00<?, ? examples/s]

## Data Collation and Model Building

In [7]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, 
                                                        num_labels=len(label_list), 
                                                        id2label=id2label, label2id=label2id)

data_collator = DataCollatorForTokenClassification(tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training Metrics

In [8]:
!pip install evaluate
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=1cd8638270bb8359c76407975ef4825f7a54fad740d1b3e77a7a1476124ca42d
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [11]:
# Metrics for evaluation
import evaluate
from seqeval.metrics import precision_score, recall_score, f1_score
from seqeval.metrics import classification_report, accuracy_score

metric = evaluate.load('seqeval')

def compute_fbeta(precision, recall, beta=5):
    return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)

# better compute metrics with f beta and filtering for non O labels
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Convert logits to label names
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Remove 'O' labels from evaluation
    non_o_true_labels = [[label for label in doc if label != 'O'] for doc in true_labels]
    non_o_true_predictions = [[pred for pred, true in zip(doc, true_labels[i]) if true != 'O']
                              for i, doc in enumerate(true_predictions)]
    
    # Calculate the required metrics
    precision = precision_score(non_o_true_labels, non_o_true_predictions)
    recall = recall_score(non_o_true_labels, non_o_true_predictions)
    f1 = f1_score(non_o_true_labels, non_o_true_predictions)
    fbeta = compute_fbeta(precision, recall, beta=5)
    accuracy = accuracy_score(non_o_true_labels, non_o_true_predictions)
    
    # Calculate the required metrics
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "fbeta": fbeta,
        "accuracy": accuracy
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [12]:
# Start training
trainer.train()
# api key: 1e55439721e9c3f55077d1b7f5205ccf93924ca6

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Fbeta,Accuracy
1,No log,0.00119,0.855556,0.729858,0.787724,0.734005,0.823684
2,0.001100,0.0009,0.912281,0.739336,0.816754,0.744767,0.810526
3,0.000400,0.000911,0.911765,0.734597,0.813648,0.740129,0.813158


TrainOutput(global_step=1023, training_loss=0.0007578530171312545, metrics={'train_runtime': 508.6144, 'train_samples_per_second': 32.117, 'train_steps_per_second': 2.011, 'total_flos': 2134642871116800.0, 'train_loss': 0.0007578530171312545, 'epoch': 3.0})

In [13]:
trainer.evaluate()

{'eval_loss': 0.0009110897080972791,
 'eval_precision': 0.9117647058823529,
 'eval_recall': 0.7345971563981043,
 'eval_f1': 0.8136482939632546,
 'eval_fbeta': 0.7401285583103765,
 'eval_accuracy': 0.8131578947368421,
 'eval_runtime': 14.9012,
 'eval_samples_per_second': 91.402,
 'eval_steps_per_second': 5.771,
 'epoch': 3.0}

## Pipeline

In [14]:
from transformers import pipeline
nlp = pipeline('ner', model=model, tokenizer=tokenizer)

## Analysis

In [21]:
from collections import Counter

def error_analysis(predictions, true_labels):
    errors = Counter()
    for pred, true in zip(predictions, true_labels):
        for p, t in zip(pred, true):
            if p != t:
                errors[(t, p)] += 1
    return errors

predictions, true_labels = get_predictions(tokenized_dataset['validation'])
error_counts = error_analysis(predictions, true_labels)

# Display the most common errors
print("Most common misclassifications:")
for (true, pred), count in error_counts.most_common(10):
    print(f"True: {true}, Predicted: {pred}, Count: {count}")

KeyboardInterrupt: 

# CRF Model

In [1]:
!pip install sklearn_crfsuite

[0m[31mERROR: Could not find a version that satisfies the requirement sklearn_crfsuite (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for sklearn_crfsuite[0m[31m
[0m

## Data Loading and Preprocessing

In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Load data
with open("/kaggle/input/pii-detection-removal-from-educational-data/train.json", "r") as file:
    data = json.load(file)

# Data extraction: Keeping tokens and labels grouped by documents
documents = [{'tokens': entry['tokens'], 'labels': entry['labels']} for entry in data]

# Split data into training and validation sets
train_docs, val_docs = train_test_split(documents, test_size=0.2, random_state=42)

## Feature Extraction

In [2]:
def token_features(token, index, tokens):
    """ Generate features for a single token """
    token_lower = token.lower()
    features = {
        'bias': 1.0,
        'token': token,
        'token.lower()': token_lower,
        'is_first': index == 0,
        'is_last': index == len(tokens) - 1,
        'is_capitalized': token[0].upper() == token[0],
        'is_all_caps': token.upper() == token,
        'is_all_lower': token.lower() == token,
        'prefix-1': token[0],
        'prefix-2': token[:2] if len(token) > 1 else token[0],
        'suffix-1': token[-1],
        'suffix-2': token[-2:] if len(token) > 1 else token[-1],
        'has_hyphen': '-' in token,
        'is_numeric': token.isdigit(),
    }
    if index > 0:
        token1 = tokens[index - 1]
        features.update({
            '-1:token': token1,
            '-1:token.lower()': token1.lower(),
        })
    if index < len(tokens) - 1:
        token1 = tokens[index + 1]
        features.update({
            '+1:token': token1,
            '+1:token.lower()': token1.lower(),
        })
    return features

In [3]:
# Helper function to generate features for CRF
def extract_features(doc):
    return [token_features(token, i, doc['tokens']) for i, token in enumerate(doc['tokens'])]

def extract_labels(doc):
    return doc['labels']

X_train = [extract_features(doc) for doc in train_docs]
y_train = [extract_labels(doc) for doc in train_docs]
X_val = [extract_features(doc) for doc in val_docs]
y_val = [extract_labels(doc) for doc in val_docs]

## Model Training

In [4]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.10 sklearn_crfsuite-0.3.6


In [5]:
import sklearn_crfsuite
from sklearn_crfsuite import CRF
    
class SafeCRF(CRF):
    def __repr__(self):
        return "SafeCRF()"

# Initialize and train the CRF with the SafeCRF class
crf = SafeCRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass

## Evaluation

In [10]:
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report
from sklearn_crfsuite.utils import flatten

# Predict on validation data
y_pred = crf.predict(X_val)

# Determine which labels correspond to non-'O' categories
labels = list(crf.classes_)
non_o_labels = [label for label in labels if label != 'O']

y_pred_flat = flatten(y_pred)
y_val_flat = flatten(y_val)

# Since we're working with CRFsuite, ensure labels are handled correctly
label_ids = [labels.index(label) for label in non_o_labels]

# Print classification report excluding 'O' label
print(classification_report(
    y_val_flat, 
    y_pred_flat, 
    labels=non_o_labels,  # Ensure only non-'O' labels are considered
    target_names=non_o_labels  # This will provide label names in the output
))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                  precision    recall  f1-score   support

  B-NAME_STUDENT       0.88      0.65      0.75       263
  I-NAME_STUDENT       0.91      0.69      0.79       244
        B-ID_NUM       0.75      0.30      0.43        10
  B-URL_PERSONAL       0.76      0.68      0.72        28
         B-EMAIL       1.00      0.67      0.80         3
      B-USERNAME       0.00      0.00      0.00         0
     B-PHONE_NUM       0.00      0.00      0.00         2
     I-PHONE_NUM       0.00      0.00      0.00         3
  I-URL_PERSONAL       0.00      0.00      0.00         0
        I-ID_NUM       0.00      0.00      0.00         0
B-STREET_ADDRESS       0.00      0.00      0.00         1
I-STREET_ADDRESS       0.00      0.00      0.00        10

       micro avg       0.89      0.64      0.75       564
       macro avg       0.36      0.25      0.29       564
    weighted avg       0.86      0.64      0.74       564



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Calculate F1 score for non-'O' labels
f1_score_non_o = metrics.flat_f1_score(y_val, y_pred, average='weighted', labels=non_o_labels)
print(f"F1 Score for Non-'O' labels: {f1_score_non_o}")

F1 Score for Non-'O' labels: 0.7352858761368722


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [13]:
# Precision and Recall for Non-'O' labels
precision = metrics.flat_precision_score(y_val, y_pred, average='weighted', labels=non_o_labels)
recall = metrics.flat_recall_score(y_val, y_pred, average='weighted', labels=non_o_labels)

print(f"Precision for Non-'O' labels: {precision}")
print(f"Recall for Non-'O' labels: {recall}")

  _warn_prf(average, modifier, msg_start, len(result))


Precision for Non-'O' labels: 0.8602719466780521
Recall for Non-'O' labels: 0.6436170212765957


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
from sklearn.metrics import fbeta_score

# Calculate F beta score with beta=5 (recall weighted more heavily than precision)
f_beta = fbeta_score(y_val_flat, y_pred_flat, labels=non_o_labels, beta=5, average='weighted')

print(f"F-beta score with beta=5 for non-'O' labels: {f_beta}")

F-beta score with beta=5 for non-'O' labels: 0.6498175749001541


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


## Error Analysis and Explainability

In [37]:
from sklearn_crfsuite.utils import flatten

# Flatten the predictions and true labels
y_pred_flat = flatten(y_pred)
y_val_flat = flatten(y_val)

# Extracting tokens for the validation set
# Assuming val_docs is a list of dictionaries with 'tokens' and 'labels' keys
tokens_val = [doc['tokens'] for doc in val_docs]  # List of lists of tokens for validation docs
labels_val = [doc['labels'] for doc in val_docs]  # List of lists of labels for validation docs

# Now flatten these for direct comparisons in mismatches (if needed)
tokens_val_flat = flatten(tokens_val)
labels_val_flat = flatten(labels_val)


# Find indices where predictions and true values differ
mismatches = [i for i, (y_pred, y_true) in enumerate(zip(y_pred_flat, y_val_flat)) if y_pred != y_true]

# Print some mismatches for review
print("Showing some mismatches:")
for i in mismatches[:20]:  # Show first 10 mismatches
    print(f"Token: '{tokens_val_flat[i]}', Predicted: '{y_pred_flat[i]}', True: '{y_val_flat[i]}'")

5884
Showing some mismatches:
Token: 'Tony', Predicted: 'O', True: 'B-NAME_STUDENT'
Token: 'Flores', Predicted: 'O', True: 'I-NAME_STUDENT'
Token: 'Tony', Predicted: 'O', True: 'B-NAME_STUDENT'
Token: 'Flores', Predicted: 'O', True: 'I-NAME_STUDENT'
Token: 'Hussain', Predicted: 'O', True: 'B-NAME_STUDENT'
Token: 'Mohammed', Predicted: 'B-NAME_STUDENT', True: 'I-NAME_STUDENT'
Token: 'Hussain', Predicted: 'O', True: 'B-NAME_STUDENT'
Token: 'Mohammed', Predicted: 'B-NAME_STUDENT', True: 'I-NAME_STUDENT'
Token: 'Nweze', Predicted: 'O', True: 'B-NAME_STUDENT'
Token: 'Stanley', Predicted: 'O', True: 'I-NAME_STUDENT'
Token: 'Sjoerd', Predicted: 'O', True: 'B-NAME_STUDENT'
Token: 'Van', Predicted: 'O', True: 'I-NAME_STUDENT'
Token: 'Der', Predicted: 'O', True: 'I-NAME_STUDENT'
Token: 'Wal', Predicted: 'O', True: 'I-NAME_STUDENT'
Token: 'Sergio', Predicted: 'B-NAME_STUDENT', True: 'O'
Token: 'Cruz', Predicted: 'I-NAME_STUDENT', True: 'O'
Token: 'Easyblood', Predicted: 'B-NAME_STUDENT', True: 'O

In [56]:
from sklearn_crfsuite.utils import flatten

# Function to explain predictions for a specific document
def explain_prediction(index):
    doc = X_val[index]
    true_labels = y_val[index]
    pred_labels = y_pred[index]

    print("Token\tTrue\tPred\tFeatures")
    for token, true_label, pred_label in zip(doc, true_labels, pred_labels):
        # Only display explanations for errors or upon specific conditions
        if true_label != pred_label:
            features = [f"{k}={v}" for k, v in token.items()]
            print(f"{token['token']}\t{true_label}\t{pred_label}\t{' '.join(features)}\n")

# Example usage: Explain predictions for the first document where an error occurs
for i in range(len(X_val)):
    if any(true_label != pred_label for true_label, pred_label in zip(y_val[i], y_pred[i])):
        print(f"Errors in document {i}:")
        explain_prediction(i)
        break # remove to explain more than 1 document

Errors in document 8:
Token	True	Pred	Features
Tony	B-NAME_STUDENT	O	bias=1.0 token=Tony token.lower()=tony is_first=True is_last=False is_capitalized=True is_all_caps=False is_all_lower=False prefix-1=T prefix-2=To suffix-1=y suffix-2=ny has_hyphen=False is_numeric=False +1:token=Flores +1:token.lower()=flores

Flores	I-NAME_STUDENT	O	bias=1.0 token=Flores token.lower()=flores is_first=False is_last=False is_capitalized=True is_all_caps=False is_all_lower=False prefix-1=F prefix-2=Fl suffix-1=s suffix-2=es has_hyphen=False is_numeric=False -1:token=Tony -1:token.lower()=tony +1:token=| +1:token.lower()=|

Tony	B-NAME_STUDENT	O	bias=1.0 token=Tony token.lower()=tony is_first=False is_last=False is_capitalized=True is_all_caps=False is_all_lower=False prefix-1=T prefix-2=To suffix-1=y suffix-2=ny has_hyphen=False is_numeric=False -1:token=

 -1:token.lower()=

 +1:token=Flores +1:token.lower()=flores

Flores	I-NAME_STUDENT	O	bias=1.0 token=Flores token.lower()=flores is_first=False is_l

## Submission on test data

In [62]:
# Load data
with open("/kaggle/input/pii-detection-removal-from-educational-data/test.json", "r") as file:
    test_data = json.load(file)

# Data extraction: Keeping tokens and labels grouped by documents
test_documents = [{'tokens': entry['tokens'], 'document': entry['document']} for entry in test_data]

In [63]:
def extract_features_for_test(doc):
    return [token_features(token, i, doc['tokens']) for i, token in enumerate(doc['tokens'])]

X_test = [extract_features_for_test(doc) for doc in test_documents]

In [64]:
# Assuming 'crf' is your trained CRF model
y_pred_test = [crf.predict_single(xseq) for xseq in X_test]

In [73]:
import csv

# Create submission.csv file
with open('/kaggle/working/submission.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['row_id', 'document', 'token', 'label'])
    
    row_id = 0
    for doc_idx, (doc, pred_labels) in enumerate(zip(test_documents, y_pred_test)):
        document_id = doc['document']
        for token_idx, label in enumerate(pred_labels):
            if label != 'O':  # We include only PII labels
                writer.writerow([row_id, document_id, token_idx, label])
                row_id += 1