In [1]:
!pip install transformers torch tqdm nltk sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [15 lines of output]
  The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  rather than 'sklearn' for pip commands.
  
  Here is how to fix this error in the main use cases:
  - use 'pip install scikit-learn' rather than 'pip install sklearn'
  - replace 'sklearn' by 'scikit-learn' in your pip requirements files
    (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  - if the 'sklearn' package is used by one of your dependencies,
    it would be great if you take some time to track which package uses
    'sklearn' instead of 'scikit-learn' and report it to their issue tracker
  - as a last resort, set the environment variable
    SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
  
  More information is available at
  https://github.com/scikit-learn/sklearn-pypi-package
  [end of output]
  
  note: This error originates from a subpr

In [3]:
!pip uninstall numpy

^C


In [9]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Function to read file content
def read_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read()

# Function to parse annotations from .ann file
def parse_ann(ann_content):
    annotations = []
    for line in ann_content.strip().split('\n'):
        if line.startswith('T'):
            parts = line.split('\t')
            ann_id = parts[0]
            label_info = parts[1]
            text = parts[2]
            label_info_parts = label_info.split()
            label = label_info_parts[0]
            start = int(label_info_parts[1].split(';')[0])
            end = int(label_info_parts[2].split(';')[0])
            annotations.append({
                'id': ann_id,
                'label': label,
                'start': start,
                'end': end,
                'text': text
            })
    return annotations

# Function to preprocess text and remove stop words and punctuation
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words and token not in string.punctuation]
    return ' '.join(filtered_tokens)

# Function to format text and annotations for BioBERT input
def format_biobert_input(text, annotations):
    tokens = word_tokenize(text)
    token_annotations = ['O'] * len(tokens)
    text_offset = 0

    for ann in annotations:
        ann_tokens = word_tokenize(ann['text'])
        ann_label = ann['label']

        while text_offset < len(tokens):
            if tokens[text_offset] == ann_tokens[0]:
                match = True
                for i in range(len(ann_tokens)):
                    if text_offset + i >= len(tokens) or tokens[text_offset + i] != ann_tokens[i]:
                        match = False
                        break
                if match:
                    for i in range(len(ann_tokens)):
                        if i == 0:
                            token_annotations[text_offset + i] = f'B-{ann_label}'
                        else:
                            token_annotations[text_offset + i] = f'I-{ann_label}'
                    text_offset += len(ann_tokens)
                    break
            text_offset += 1

    return tokens, token_annotations

# Function to process text and annotation files
def process_files(txt_file, ann_file):
    text = read_file(txt_file)
    ann_content = read_file(ann_file)
    annotations = parse_ann(ann_content)
    processed_text = preprocess_text(text)
    tokens, labels = format_biobert_input(processed_text, annotations)
    return tokens, labels

# Lists to store processed tokens and labels
tokend_text = []
cor_labels = []

# Function to process all files in a directory
def process_all_files(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            txt_file = os.path.join(directory, filename)
            ann_file = txt_file.replace(".txt", ".ann")
            if os.path.exists(ann_file):
                tokens, labels = process_files(txt_file, ann_file)
                tokend_text.append(tokens)
                cor_labels.append(labels)

# Example usage:
directory = 'n2c2/n2c2/part2'
process_all_files(directory)

# Prepare the tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-base-cased-v1.1', num_labels=len(set(label for doc in cor_labels for label in doc)))

# Map labels to IDs
unique_labels = set(label for doc_labels in cor_labels for label in doc_labels)
label_map = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label_map.items()}

# Convert tokens and labels to strings for tokenizer
texts = [" ".join(tokens) for tokens in tokend_text]
labels = [[label_map[label] for label in doc_labels] for doc_labels in cor_labels]

# Create a custom dataset class
class NERDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        word_labels = self.labels[index]

        encoding = self.tokenizer(text,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len,
                                  is_split_into_words=True,
                                  return_tensors="pt")

        word_ids = encoding.word_ids(batch_index=0)

        # Create a mask and label array for the tokens
        labels = [-100 if word_id is None else word_labels[word_id] for word_id in word_ids]

        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(labels, dtype=torch.long)

        return item

# Prepare the datasets and dataloaders
MAX_LEN = 128
BATCH_SIZE = 8

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

train_dataset = NERDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = NERDataset(val_texts, val_labels, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

print("Training completed.")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [8]:
!pip install accelerate -U









In [3]:
import os
import torch
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report

# Define the labels
labels = ["O", "B-Drug", "I-Drug", "B-Strength", "I-Strength", "B-Form", "I-Form", "B-Dosage", "I-Dosage",
          "B-Duration", "I-Duration", "B-Frequency", "I-Frequency", "B-Route", "I-Route", "B-ADE", "I-ADE",
          "B-Reason", "I-Reason"]
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

# Load the BioBERT tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-v1.1')
model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-v1.1', num_labels=len(labels))

# Function to read data and create datasets
def read_data(text_file, ann_file):
    with open(text_file, 'r') as f:
        text = f.read()

    entities = []
    with open(ann_file, 'r') as f:
        for line in f:
            if line.startswith('T'):
                parts = line.strip().split('\t')
                entity_info = parts[1].split()
                label = entity_info[0]
                start = int(entity_info[1].split(';')[0])  # Handle potential non-integer values
                end = int(entity_info[2].split(';')[0])  # Handle potential non-integer values
                entities.append((start, end, label))

    tokens = tokenizer.tokenize(text)
    token_offsets = tokenizer(text, return_offsets_mapping=True).offset_mapping
    token_labels = ["O"] * len(tokens)

    for start, end, label in entities:
        token_start = None
        token_end = None
        for i, (offset_start, offset_end) in enumerate(token_offsets):
            if offset_start == start:
                token_start = i
            if offset_end == end:
                token_end = i
            if token_start is not None and token_end is not None:
                break

        if token_start is not None and token_end is not None:
            token_labels[token_start] = f"B-{label}"
            for i in range(token_start + 1, token_end + 1):
                token_labels[i] = f"I-{label}"

    return tokens, token_labels

# Read all data files
def read_all_data(data_dir):
    all_tokens = []
    all_labels = []

    for filename in os.listdir(data_dir):
        if filename.endswith('.txt'):
            base_name = filename[:-4]
            text_file = os.path.join(data_dir, filename)
            ann_file = os.path.join(data_dir, base_name + '.ann')
            tokens, labels = read_data(text_file, ann_file)
            all_tokens.append(tokens)
            all_labels.append(labels)

    return all_tokens, all_labels

# Prepare data
data_dir = 'n2c2/n2c2/part2'  # replace with your folder path
all_tokens, all_labels = read_all_data(data_dir)

max_length = 512

# Function to encode the tokens and labels
def encode_tokens_and_labels(tokens, labels, max_length):
    encodings = tokenizer(tokens, is_split_into_words=True, truncation=True, padding='max_length', max_length=max_length)
    encoded_labels = [label2id[label] for label in labels] + [label2id['O']] * (max_length - len(labels))
    encodings['labels'] = encoded_labels
    return encodings

encoded_data = [encode_tokens_and_labels(tokens, labels, max_length) for tokens, labels in zip(all_tokens, all_labels)]

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings[idx].items()}
        return item

dataset = NERDataset(encoded_data)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,  # reduce if you encounter memory issues
    per_device_eval_batch_size=8,   # reduce if you encounter memory issues
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model
predictions, true_labels, _ = trainer.predict(val_dataset)
predictions = np.argmax(predictions, axis=2)

# Convert predictions and labels to the appropriate format for evaluation
true_labels = [[id2label[label_id.item()] for label_id in labels] for labels in true_labels]
true_predictions = [[id2label[p] for p in prediction] for prediction in predictions]

# Remove padding
true_labels = [[label for label in labels if label != 'O'] for labels in true_labels]
true_predictions = [[pred for pred in prediction if pred != 'O'] for prediction in true_predictions]

print(classification_report(true_labels, true_predictions))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (6016 > 512). Running this sequence through the model will result in indexing errors
  np.object,


AttributeError: module 'numpy' has no attribute 'object'.
`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe. 
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [5]:
import os
import torch
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report

# Define the labels
labels = ["O", "B-Drug", "I-Drug", "B-Strength", "I-Strength", "B-Form", "I-Form", "B-Dosage", "I-Dosage",
          "B-Duration", "I-Duration", "B-Frequency", "I-Frequency", "B-Route", "I-Route", "B-ADE", "I-ADE",
          "B-Reason", "I-Reason"]
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

# Load the BioBERT tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-v1.1')
model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-v1.1', num_labels=len(labels))

# Function to read data and create datasets
def read_data(text_file, ann_file):
    with open(text_file, 'r') as f:
        text = f.read()

    entities = []
    with open(ann_file, 'r') as f:
        for line in f:
            if line.startswith('T'):
                parts = line.strip().split('\t')
                entity_info = parts[1].split()
                label = entity_info[0]
                start = int(entity_info[1].split(';')[0])  # Handle potential non-integer values
                end = int(entity_info[2].split(';')[0])  # Handle potential non-integer values
                entities.append((start, end, label))

    tokens = tokenizer.tokenize(text)
    token_offsets = tokenizer(text, return_offsets_mapping=True).offset_mapping
    token_labels = ["O"] * len(tokens)

    for start, end, label in entities:
        token_start = None
        token_end = None
        for i, (offset_start, offset_end) in enumerate(token_offsets):
            if offset_start == start:
                token_start = i
            if offset_end == end:
                token_end = i
            if token_start is not None and token_end is not None:
                break

        if token_start is not None and token_end is not None:
            token_labels[token_start] = f"B-{label}"
            for i in range(token_start + 1, token_end + 1):
                token_labels[i] = f"I-{label}"

    return tokens, token_labels

# Read all data files
def read_all_data(data_dir):
    all_tokens = []
    all_labels = []

    for filename in os.listdir(data_dir):
        if filename.endswith('.txt'):
            base_name = filename[:-4]
            text_file = os.path.join(data_dir, filename)
            ann_file = os.path.join(data_dir, base_name + '.ann')
            tokens, labels = read_data(text_file, ann_file)
            all_tokens.append(tokens)
            all_labels.append(labels)

    return all_tokens, all_labels

# Prepare data
data_dir = 'n2c2/n2c2/part2'  # replace with your folder path
all_tokens, all_labels = read_all_data(data_dir)

max_length = 512

# Function to encode the tokens and labels
def encode_tokens_and_labels(tokens, labels, max_length):
    encodings = tokenizer(tokens, is_split_into_words=True, truncation=True, padding='max_length', max_length=max_length)
    encoded_labels = [label2id[label] for label in labels] + [label2id['O']] * (max_length - len(labels))
    encodings['labels'] = encoded_labels
    return encodings

encoded_data = [encode_tokens_and_labels(tokens, labels, max_length) for tokens, labels in zip(all_tokens, all_labels)]

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val) for key, val in self.encodings[idx].items()}
        return item

dataset = NERDataset(encoded_data)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,  # reduce if you encounter memory issues
    per_device_eval_batch_size=8,   # reduce if you encounter memory issues
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model
predictions, true_labels, _ = trainer.predict(val_dataset)
predictions = np.argmax(predictions, axis=2)

# Convert predictions and labels to the appropriate format for evaluation
true_labels = [[id2label[label_id.item()] for label_id in labels] for labels in true_labels]
true_predictions = [[id2label[pred] for pred in prediction] for prediction in predictions]

# Remove padding from predictions and labels for evaluation
true_labels = [[label for label in label_seq if label != 'O'] for label_seq in true_labels]
true_predictions = [[pred for pred in pred_seq if pred != 'O'] for pred_seq in true_predictions]

print(classification_report(true_labels, true_predictions))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (6016 > 512). Running this sequence through the model will result in indexing errors
  np.object,


AttributeError: module 'numpy' has no attribute 'object'.
`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe. 
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [4]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-base-cased-v1.1')

# Function to read file content
def read_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read()

# Function to parse annotations from .ann file
def parse_ann(ann_content):
    annotations = []
    for line in ann_content.strip().split('\n'):
        if line.startswith('T'):
            parts = line.split('\t')
            ann_id = parts[0]
            label_info = parts[1]
            text = parts[2]
            label_info_parts = label_info.split()
            label = label_info_parts[0]
            start = int(label_info_parts[1].split(';')[0])
            end = int(label_info_parts[2].split(';')[0])
            annotations.append({
                'id': ann_id,
                'label': label,
                'start': start,
                'end': end,
                'text': text
            })
    return annotations

# Function to preprocess text and remove stop words and punctuation
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words and token not in string.punctuation]
    return filtered_tokens

# Function to format text and annotations for BioBERT input
def format_biobert_input(text, annotations):
    tokens = preprocess_text(text)
    token_annotations = ['O'] * len(tokens)
    text_offset = 0

    for ann in annotations:
        ann_tokens = word_tokenize(ann['text'])
        ann_label = ann['label']

        while text_offset < len(tokens):
            try:
                if tokens[text_offset] == ann_tokens[0]:
                    match = True
                    for i in range(len(ann_tokens)):
                        if text_offset + i >= len(tokens) or tokens[text_offset + i] != ann_tokens[i]:
                            match = False
                            break
                    if match:
                        for i in range(len(ann_tokens)):
                            if i == 0:
                                token_annotations[text_offset + i] = f'B-{ann_label}'
                            else:
                                token_annotations[text_offset + i] = f'I-{ann_label}'
                        text_offset += len(ann_tokens)
                        break
                text_offset += 1
            except:
                print(ann_tokens)
                print(tokens[text_offset])

    return tokens, token_annotations

# Function to split tokens and labels into chunks of up to max_length
def split_into_chunks(tokens, labels, max_length=509):
    chunks = []
    chunk_labels = []
    current_chunk = []
    current_chunk_labels = []
    current_length = 0

    for i in range(len(tokens)):
        current_chunk.append(tokens[i])
        current_chunk_labels.append(labels[i])
        current_length += 1

        if current_length >= max_length:
            # Ensure that we do not split entities and the last label is 'O'
            while i < len(tokens) and not labels[i] == 'O':
                current_chunk.append(tokens[i])
                current_chunk_labels.append(labels[i])
                current_length += 1
                i += 1

            chunks.append(current_chunk)
            chunk_labels.append(current_chunk_labels)
            current_chunk = []
            current_chunk_labels = []
            current_length = 0

    if current_chunk:
        chunks.append(current_chunk)
        chunk_labels.append(current_chunk_labels)

    return chunks, chunk_labels

# Function to process text and annotation files
def process_files(txt_file, ann_file):
    text = read_file(txt_file)
    ann_content = read_file(ann_file)
    annotations = parse_ann(ann_content)
    tokens, labels = format_biobert_input(text, annotations)
    token_chunks, label_chunks = split_into_chunks(tokens, labels)
    return token_chunks, label_chunks

# Lists to store processed tokens and labels
tokend_text = []
cor_labels = []

# Function to process all files in a directory
def process_all_files(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            txt_file = os.path.join(directory, filename)
            ann_file = txt_file.replace(".txt", ".ann")
            if os.path.exists(ann_file):
                token_chunks, label_chunks = process_files(txt_file, ann_file)
                tokend_text.extend(token_chunks)
                cor_labels.extend(label_chunks)

# Example usage:
directory = '/content/data'
process_all_files(directory)

# Prepare the tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-base-cased-v1.1', num_labels=len(set(label for doc in cor_labels for label in doc)))

# Define labels
labels = ["O", "B-Drug", "I-Drug", "B-Strength", "I-Strength", "B-Form", "I-Form", "B-Dosage", "I-Dosage",
          "B-Duration", "I-Duration", "B-Frequency", "I-Frequency", "B-Route", "I-Route", "B-ADE", "I-ADE",
          "B-Reason", "I-Reason"]

# Map labels to IDs
label_map = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label_map.items()}

# Convert tokens and labels to strings for tokenizer
texts = tokend_text
labels = [[label_map[label] for label in doc_labels] for doc_labels in cor_labels]

# Create a custom dataset class
class NERDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        word_labels = self.labels[index]

        encoding = self.tokenizer(text,
                                  truncation=True,
                                  padding='max_length',  # Ensure padding
                                  max_length=self.max_len,
                                  is_split_into_words=True,
                                  return_tensors='pt')

        word_ids = encoding.word_ids(batch_index=0)

        # Create a mask and label array for the tokens
        labels = [-100 if word_id is None else word_labels[word_id] for word_id in word_ids]

        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(labels, dtype=torch.long)

        return item

# Prepare the datasets and dataloaders
MAX_LEN = 512
BATCH_SIZE = 16

# 10-fold cross-validation setup
kf = KFold(n_splits=10, shuffle=True, random_state=42)
all_metrics = {
    'accuracy': [],
    'precision_micro': [],
    'recall_micro': [],
    'f1_micro': [],
    'precision_macro': [],
    'recall_macro': [],
    'f1_macro': []
}

for fold, (train_index, val_index) in enumerate(kf.split(texts)):
    print(f"Fold {fold + 1}")

    train_texts = [texts[i] for i in train_index]
    val_texts = [texts[i] for i in val_index]
    train_labels = [labels[i] for i in train_index]
    val_labels = [labels[i] for i in val_index]

    train_dataset = NERDataset(train_texts, train_labels, tokenizer, MAX_LEN)
    val_dataset = NERDataset(val_texts, val_labels, tokenizer, MAX_LEN)

    training_args = TrainingArguments(
        output_dir=f'./results/fold_{fold + 1}',
        num_train_epochs=3,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f'./logs/fold_{fold + 1}',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer
    )

    trainer.train()
    # Save the model after training each fold
    trainer.save_model(f'./model/fold_{fold + 1}')

    # Evaluation
    predictions, labels, _ = trainer.predict(val_dataset)
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    pred_labels = []

    for i in range(len(labels)):
        true_labels.extend(labels[i])
        pred_labels.extend(predictions[i])

    # Remove ignored index (-100)
    true_labels = [label for label in true_labels if label != -100]
    pred_labels = [pred for label, pred in zip(true_labels, pred_labels) if label != -100]

    accuracy = accuracy_score(true_labels, pred_labels)
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(true_labels, pred_labels, average='micro')
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(true_labels, pred_labels, average='macro')

    all_metrics['accuracy'].append(accuracy)
    all_metrics['precision_micro'].append(precision_micro)
    all_metrics['recall_micro'].append(recall_micro)
    all_metrics['f1_micro'].append(f1_micro)
    all_metrics['precision_macro'].append(precision_macro)
    all_metrics['recall_macro'].append(recall_macro)
    all_metrics['f1_macro'].append(f1_macro)

    print(f"Accuracy: {accuracy}")
    print(f"Micro Precision: {precision_micro}, Micro Recall: {recall_micro}, Micro F1: {f1_micro}")
    print(f"Macro Precision: {precision_macro}, Macro Recall: {recall_macro}, Macro F1: {f1_macro}")

# Calculate mean and standard deviation of metrics
metrics_mean_std = {metric: (np.mean(all_metrics[metric]), np.std(all_metrics[metric])) for metric in all_metrics}

print("\nMetrics Mean and Standard Deviation:")
for metric, (mean, std) in metrics_mean_std.items():
    print(f"{metric.capitalize()} - Mean: {mean}, Std: {std}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1


RuntimeError: false INTERNAL ASSERT FAILED at "C:\\cb\\pytorch_1000000000000\\work\\c10/cuda/CUDAGraphsC10Utils.h":74, please report a bug to PyTorch. Unknown CUDA graph CaptureStatus1893637952

In [1]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-base-cased-v1.1')

# Function to read file content
def read_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read()

# Function to parse annotations from .ann file
def parse_ann(ann_content):
    annotations = []
    for line in ann_content.strip().split('\n'):
        if line.startswith('T'):
            parts = line.split('\t')
            ann_id = parts[0]
            label_info = parts[1]
            text = parts[2]
            label_info_parts = label_info.split()
            label = label_info_parts[0]
            start = int(label_info_parts[1].split(';')[0])
            end = int(label_info_parts[2].split(';')[0])
            annotations.append({
                'id': ann_id,
                'label': label,
                'start': start,
                'end': end,
                'text': text
            })
    return annotations

# Function to preprocess text and remove stop words and punctuation
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words and token not in string.punctuation]
    return filtered_tokens

# Function to format text and annotations for BioBERT input
def format_biobert_input(text, annotations):
    tokens = preprocess_text(text)
    token_annotations = ['O'] * len(tokens)
    text_offset = 0

    for ann in annotations:
        ann_tokens = word_tokenize(ann['text'])
        ann_label = ann['label']

        while text_offset < len(tokens):
            try:
                if tokens[text_offset] == ann_tokens[0]:
                    match = True
                    for i in range(len(ann_tokens)):
                        if text_offset + i >= len(tokens) or tokens[text_offset + i] != ann_tokens[i]:
                            match = False
                            break
                    if match:
                        for i in range(len(ann_tokens)):
                            if i == 0:
                                token_annotations[text_offset + i] = f'B-{ann_label}'
                            else:
                                token_annotations[text_offset + i] = f'I-{ann_label}'
                        text_offset += len(ann_tokens)
                        break
                text_offset += 1
            except:
                print(ann_tokens)
                print(tokens[text_offset])

    return tokens, token_annotations

# Function to split tokens and labels into chunks of up to max_length
def split_into_chunks(tokens, labels, max_length=509):
    chunks = []
    chunk_labels = []
    current_chunk = []
    current_chunk_labels = []
    current_length = 0

    for i in range(len(tokens)):
        current_chunk.append(tokens[i])
        current_chunk_labels.append(labels[i])
        current_length += 1

        if current_length >= max_length:
            # Ensure that we do not split entities and the last label is 'O'
            while i < len(tokens) and not labels[i] == 'O':
                current_chunk.append(tokens[i])
                current_chunk_labels.append(labels[i])
                current_length += 1
                i += 1

            chunks.append(current_chunk)
            chunk_labels.append(current_chunk_labels)
            current_chunk = []
            current_chunk_labels = []
            current_length = 0

    if current_chunk:
        chunks.append(current_chunk)
        chunk_labels.append(current_chunk_labels)

    return chunks, chunk_labels

# Function to process text and annotation files
def process_files(txt_file, ann_file):
    text = read_file(txt_file)
    ann_content = read_file(ann_file)
    annotations = parse_ann(ann_content)
    tokens, labels = format_biobert_input(text, annotations)
    token_chunks, label_chunks = split_into_chunks(tokens, labels)
    return token_chunks, label_chunks

# Lists to store processed tokens and labels
tokend_text = []
cor_labels = []

# Function to process all files in a directory
def process_all_files(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            txt_file = os.path.join(directory, filename)
            ann_file = txt_file.replace(".txt", ".ann")
            if os.path.exists(ann_file):
                token_chunks, label_chunks = process_files(txt_file, ann_file)
                tokend_text.extend(token_chunks)
                cor_labels.extend(label_chunks)

# Example usage:
directory = 'n2c2/n2c2/part2'
process_all_files(directory)

# Prepare the tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-base-cased-v1.1', num_labels=len(set(label for doc in cor_labels for label in doc)))

# Define labels
labels = ["O", "B-Drug", "I-Drug", "B-Strength", "I-Strength", "B-Form", "I-Form", "B-Dosage", "I-Dosage",
          "B-Duration", "I-Duration", "B-Frequency", "I-Frequency", "B-Route", "I-Route", "B-ADE", "I-ADE",
          "B-Reason", "I-Reason"]

# Map labels to IDs
label_map = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label_map.items()}

# Convert tokens and labels to strings for tokenizer
texts = tokend_text
labels = [[label_map[label] for label in doc_labels] for doc_labels in cor_labels]

# Create a custom dataset class
class NERDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        word_labels = self.labels[index]

        encoding = self.tokenizer(text,
                                  truncation=True,
                                  padding='max_length',  # Ensure padding
                                  max_length=self.max_len,
                                  is_split_into_words=True,
                                  return_tensors='pt')

        word_ids = encoding.word_ids(batch_index=0)

        # Create a mask and label array for the tokens
        labels = [-100 if word_id is None else word_labels[word_id] for word_id in word_ids]

        item = {key: val.squeeze().to(device) for key, val in encoding.items()}  # Move to GPU
        item['labels'] = torch.tensor(labels, dtype=torch.long).to(device)  # Move to GPU

        return item

# Prepare the datasets and dataloaders
MAX_LEN = 512
BATCH_SIZE = 16

# Custom training loop with 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
all_metrics = {
    'accuracy': [],
    'precision_micro': [],
    'recall_micro': [],
    'f1_micro': [],
    'precision_macro': [],
    'recall_macro': [],
    'f1_macro': []
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def train_one_fold(train_dataloader, val_dataloader, fold):
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    total_steps = len(train_dataloader) * 3
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    model.train()
    for epoch in range(3):
        total_loss = 0
        for batch in train_dataloader:
            model.zero_grad()
            inputs = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        print(f"Fold {fold}, Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")

def evaluate_one_fold(val_dataloader):
    model.eval()
    true_labels = []
    pred_labels = []

    for batch in val_dataloader:
        inputs = {key: val.to(device) for key, val in batch.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        labels = inputs['labels']

        for i in range(len(labels)):
            true_labels.extend(labels[i].cpu().numpy())
            pred_labels.extend(predictions[i].cpu().numpy())

    true_labels = [label for label in true_labels if label != -100]
    pred_labels = [pred for label, pred in zip(true_labels, pred_labels) if label != -100]

    accuracy = accuracy_score(true_labels, pred_labels)
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(true_labels, pred_labels, average='micro')
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(true_labels, pred_labels, average='macro')

    return accuracy, precision_micro, recall_micro, f1_micro, precision_macro, recall_macro, f1_macro

for fold, (train_index, val_index) in enumerate(kf.split(texts)):
    print(f"Fold {fold + 1}")

    train_texts = [texts[i] for i in train_index]
    val_texts = [texts[i] for i in val_index]
    train_labels = [labels[i] for i in train_index]
    val_labels = [labels[i] for i in val_index]

    train_dataset = NERDataset(train_texts, train_labels, tokenizer, MAX_LEN)
    val_dataset = NERDataset(val_texts, val_labels, tokenizer, MAX_LEN)

    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=BATCH_SIZE)
    val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=BATCH_SIZE)

    train_one_fold(train_dataloader, val_dataloader, fold + 1)

    accuracy, precision_micro, recall_micro, f1_micro, precision_macro, recall_macro, f1_macro = evaluate_one_fold(val_dataloader)

    all_metrics['accuracy'].append(accuracy)
    all_metrics['precision_micro'].append(precision_micro)
    all_metrics['recall_micro'].append(recall_micro)
    all_metrics['f1_micro'].append(f1_micro)
    all_metrics['precision_macro'].append(precision_macro)
    all_metrics['recall_macro'].append(recall_macro)
    all_metrics['f1_macro'].append(f1_macro)

    # Save the model after training each fold
    model_save_path = f'./model/fold_{fold + 1}'
    os.makedirs(model_save_path, exist_ok=True)
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

    print(f"Accuracy: {accuracy}")
    print(f"Micro Precision: {precision_micro}, Micro Recall: {recall_micro}, Micro F1: {f1_micro}")
    print(f"Macro Precision: {precision_macro}, Macro Recall: {recall_macro}, Macro F1: {f1_macro}")

# Calculate mean and standard deviation of metrics
metrics_mean_std = {metric: (np.mean(all_metrics[metric]), np.std(all_metrics[metric])) for metric in all_metrics}

print("\nMetrics Mean and Standard Deviation:")
for metric, (mean, std) in metrics_mean_std.items():
    print(f"{metric.capitalize()} - Mean: {mean}, Std: {std}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1




RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.