In [15]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Function to parse a single XML file
def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    text = root.find('TEXT').text.strip()
    tags = {tag.tag: tag.attrib['met'] for tag in root.find('TAGS')}
    return text, tags

# Directory containing the XML files
xml_dir = 'n2c2/n2c2/part1'

# List to store parsed data
data = []

# Parse all XML files
for file_name in os.listdir(xml_dir):
    if file_name.endswith('.xml'):
        file_path = os.path.join(xml_dir, file_name)
        text, tags = parse_xml(file_path)
        filtered_tags = {key: tags[key] for key in ['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']}
        filtered_tags['text'] = text
        data.append(filtered_tags)

# Convert the list to a pandas DataFrame
df = pd.DataFrame(data)


In [12]:
from transformers import BertTokenizer

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
df['clean_text'] = df['text'].apply(lambda x: tokenizer(x, padding='max_length', truncation=True, return_tensors='pt'))

# Encode the labels
df['ABDOMINAL_encoded'] = df['ABDOMINAL'].apply(lambda x: 1 if x == 'met' else 0)
df['CREATININE_encoded'] = df['CREATININE'].apply(lambda x: 1 if x == 'met' else 0)
df['MAJOR-DIABETES_encoded'] = df['MAJOR-DIABETES'].apply(lambda x: 1 if x == 'met' else 0)




In [13]:
from torch.utils.data import Dataset, DataLoader
import torch

class MedicalDataset(Dataset):
    def __init__(self, encoded_texts, labels):
        self.encoded_texts = encoded_texts
        self.labels = labels

    def __len__(self):
        return len(self.encoded_texts)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encoded_texts.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Split the data into training and testing sets
train_size = int(0.9 * len(df))
test_size = len(df) - train_size

train_texts, test_texts = df['encoded_text'][:train_size], df['encoded_text'][train_size:]
train_labels, test_labels = df[['ABDOMINAL_encoded', 'CREATININE_encoded', 'MAJOR-DIABETES_encoded']].values[:train_size], df[['ABDOMINAL_encoded', 'CREATININE_encoded', 'MAJOR-DIABETES_encoded']].values[train_size:]

# Stack tokenized tensors into a single tensor for each set
train_encoded_texts = {key: torch.cat([item[key] for item in train_texts], dim=0) for key in train_texts[0].keys()}
test_encoded_texts = {key: torch.cat([item[key] for item in test_texts], dim=0) for key in test_texts[0].keys()}

train_dataset = MedicalDataset(train_encoded_texts, train_labels)
test_dataset = MedicalDataset(test_encoded_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


KeyError: 0

In [14]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Initialize the BERT model for multi-label classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_loader) * 4  # Assuming 4 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
epochs = 4
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyError: 'Invalid key. Only three types of key are available: (1) string, (2) integers for backend Encoding, and (3) slices for data subsetting.'

In [5]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
import string
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import classification_report
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Function to parse a single XML file
def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    text = root.find('TEXT').text.strip()
    tags = {tag.tag: tag.attrib['met'] for tag in root.find('TAGS')}
    return text, tags

# Directory containing the XML files
xml_dir = 'n2c2/n2c2/part1'
# List to store parsed data
data = []

# Parse all XML files
for file_name in os.listdir(xml_dir):
    if file_name.endswith('.xml'):
        file_path = os.path.join(xml_dir, file_name)
        text, tags = parse_xml(file_path)
        filtered_tags = {key: tags[key] for key in ['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']}
        filtered_tags['text'] = text
        data.append(filtered_tags)

# Convert the list to a pandas DataFrame
df = pd.DataFrame(data)

# Text preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation but keep numbers
    text = text.translate(str.maketrans('', '', string.punctuation.replace('-', '')))
    # Tokenize
    words = nltk.word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_texts(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

# Apply preprocessing to the text column and tokenize
df['clean_text'] = df['text'].apply(preprocess_text)
df['encoded_text'] = df['clean_text'].apply(lambda x: tokenize_texts(x))

# Encode the labels
df['ABDOMINAL_encoded'] = df['ABDOMINAL'].apply(lambda x: 1 if x == 'met' else 0)
df['CREATININE_encoded'] = df['CREATININE'].apply(lambda x: 1 if x == 'met' else 0)
df['MAJOR-DIABETES_encoded'] = df['MAJOR-DIABETES'].apply(lambda x: 1 if x == 'met' else 0)

# Convert encoded labels to numpy arrays
labels = df[['ABDOMINAL_encoded', 'CREATININE_encoded', 'MAJOR-DIABETES_encoded']].values

# Split the data into training and testing sets
train_size = int(0.9 * len(df))
test_size = len(df) - train_size

train_texts, test_texts = df['encoded_text'][:train_size], df['encoded_text'][train_size:]
train_labels, test_labels = labels[:train_size], labels[train_size:]

# Convert the DataFrame of tokenized texts to a dictionary of tensors
def stack_tokenized_tensors(tokenized_texts):
    keys = tokenized_texts.iloc[0].keys()
    stacked_tensors = {key: torch.cat([tokenized_texts.iloc[i][key] for i in range(len(tokenized_texts))], dim=0) for key in keys}
    return stacked_tensors

train_encoded_texts = stack_tokenized_tensors(train_texts)
test_encoded_texts = stack_tokenized_tensors(test_texts)

class MedicalDataset(Dataset):
    def __init__(self, encoded_texts, labels):
        self.encoded_texts = encoded_texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encoded_texts.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

train_dataset = MedicalDataset(train_encoded_texts, train_labels)
test_dataset = MedicalDataset(test_encoded_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Initialize the BERT model for multi-label classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_loader) * 4  # Assuming 4 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
epochs = 4
device = torch.device('cuda')
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")

# Evaluate the model
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions.append(logits.cpu().numpy())
        true_labels.append(batch['labels'].cpu().numpy())

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

pred_labels = (predictions > 0.5).astype(int)

# Print classification report
print(classification_report(true_labels, pred_labels, target_names=['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']))

# Predict new data
new_data = ["New patient condition description including numbers like 120/80 for blood pressure."]
new_data_tokenized = tokenize_texts(new_data)

model.eval()
with torch.no_grad():
    new_data_tokenized = {key: val.to(device) for key, val in new_data_tokenized.items()}
    outputs = model(**new_data_tokenized)
    logits = outputs.logits
    pred_labels = (logits > 0.5).int()
    print(pred_labels)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 8.00 GiB total capacity; 13.75 GiB already allocated; 0 bytes free; 13.80 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [3]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import classification_report
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Function to parse a single XML file
def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    text = root.find('TEXT').text.strip()
    tags = {tag.tag: tag.attrib['met'] for tag in root.find('TAGS')}
    return text, tags

# Directory containing the XML files
xml_dir = 'n2c2/n2c2/part1'


# List to store parsed data
data = []

# Parse all XML files
for file_name in os.listdir(xml_dir):
    if file_name.endswith('.xml'):
        file_path = os.path.join(xml_dir, file_name)
        text, tags = parse_xml(file_path)
        filtered_tags = {key: tags[key] for key in ['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']}
        filtered_tags['text'] = text
        data.append(filtered_tags)

# Convert the list to a pandas DataFrame
df = pd.DataFrame(data)

# Text preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation but keep numbers
    text = text.translate(str.maketrans('', '', string.punctuation.replace('-', '')))
    # Tokenize
    words = nltk.word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_texts(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

# Apply preprocessing to the text column and tokenize
df['clean_text'] = df['text'].apply(preprocess_text)
df['encoded_text'] = df['clean_text'].apply(lambda x: tokenize_texts(x))

# Encode the labels
df['ABDOMINAL_encoded'] = df['ABDOMINAL'].apply(lambda x: 1 if x == 'met' else 0)
df['CREATININE_encoded'] = df['CREATININE'].apply(lambda x: 1 if x == 'met' else 0)
df['MAJOR-DIABETES_encoded'] = df['MAJOR-DIABETES'].apply(lambda x: 1 if x == 'met' else 0)

# Convert encoded labels to numpy arrays
labels = df[['ABDOMINAL_encoded', 'CREATININE_encoded', 'MAJOR-DIABETES_encoded']].values

# Split the data into training and testing sets
train_size = int(0.9 * len(df))
test_size = len(df) - train_size

train_texts, test_texts = df['encoded_text'][:train_size], df['encoded_text'][train_size:]
train_labels, test_labels = labels[:train_size], labels[train_size:]

# Convert the DataFrame of tokenized texts to a dictionary of tensors
def stack_tokenized_tensors(tokenized_texts):
    keys = tokenized_texts.iloc[0].keys()
    stacked_tensors = {key: torch.cat([tokenized_texts.iloc[i][key] for i in range(len(tokenized_texts))], dim=0) for key in keys}
    return stacked_tensors

train_encoded_texts = stack_tokenized_tensors(train_texts)
test_encoded_texts = stack_tokenized_tensors(test_texts)

class MedicalDataset(Dataset):
    def __init__(self, encoded_texts, labels):
        self.encoded_texts = encoded_texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encoded_texts.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

train_dataset = MedicalDataset(train_encoded_texts, train_labels)
test_dataset = MedicalDataset(test_encoded_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize the BERT model for multi-label classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_loader) * 4  # Assuming 4 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
epochs = 4
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**batch, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")

# Evaluate the model
model.eval()
predictions, true_labels = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**batch)
        logits = outputs.logits
        predictions.append(logits.cpu().numpy())
        true_labels.append(labels.cpu().numpy())

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

pred_labels = (predictions > 0.5).astype(int)

# Print classification report
print(classification_report(true_labels, pred_labels, target_names=['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyError: 'labels'

In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Function to parse a single XML file
def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    text = root.find('TEXT').text.strip()
    tags = {tag.tag: tag.attrib['met'] for tag in root.find('TAGS')}
    return text, tags

# Directory containing the XML files
xml_dir = 'n2c2/n2c2/part1'

# List to store parsed data
data = []

# Parse all XML files
for file_name in os.listdir(xml_dir):
    if file_name.endswith('.xml'):
        file_path = os.path.join(xml_dir, file_name)
        text, tags = parse_xml(file_path)
        filtered_tags = {key: tags[key] for key in ['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']}
        filtered_tags['text'] = text
        data.append(filtered_tags)

# Convert the list to a pandas DataFrame
df = pd.DataFrame(data)

# Text preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation but keep numbers
    text = text.translate(str.maketrans('', '', string.punctuation.replace('-', '')))
    # Tokenize
    words = nltk.word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data without specifying max_length
def tokenize_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Apply preprocessing to the text column and tokenize
df['clean_text'] = df['text'].apply(preprocess_text)
df['encoded_text'] = df['clean_text'].apply(lambda x: tokenize_texts(x))

# Encode the labels
df['ABDOMINAL_encoded'] = df['ABDOMINAL'].apply(lambda x: 1 if x == 'met' else 0)
df['CREATININE_encoded'] = df['CREATININE'].apply(lambda x: 1 if x == 'met' else 0)
df['MAJOR-DIABETES_encoded'] = df['MAJOR-DIABETES'].apply(lambda x: 1 if x == 'met' else 0)

# Convert encoded labels to numpy arrays
labels = df[['ABDOMINAL_encoded', 'CREATININE_encoded', 'MAJOR-DIABETES_encoded']].values

# Split the data into training and testing sets
train_size = int(0.9 * len(df))
test_size = len(df) - train_size

train_texts, test_texts = df['encoded_text'][:train_size], df['encoded_text'][train_size:]
train_labels, test_labels = labels[:train_size], labels[train_size:]

# Convert the DataFrame of tokenized texts to a dictionary of tensors
def stack_tokenized_tensors(tokenized_texts):
    keys = tokenized_texts.iloc[0].keys()
    stacked_tensors = {key: torch.cat([tokenized_texts.iloc[i][key] for i in range(len(tokenized_texts))], dim=0) for key in keys}
    return stacked_tensors

train_encoded_texts = stack_tokenized_tensors(train_texts)
test_encoded_texts = stack_tokenized_tensors(test_texts)

class MedicalDataset(Dataset):
    def __init__(self, encoded_texts, labels):
        self.encoded_texts = encoded_texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encoded_texts.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

train_dataset = MedicalDataset(train_encoded_texts, train_labels)
test_dataset = MedicalDataset(test_encoded_texts, test_labels)

# Define the data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create DataLoader objects with the data collator
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=data_collator, num_workers=3)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=data_collator, num_workers=3)

# Initialize the BERT model for multi-label classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_loader) * 4  # Assuming 4 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
epochs = 4
device = torch.device('cuda')
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**batch, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")

# Evaluate the model
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**batch)
        logits = outputs.logits
        predictions.append(logits.cpu().numpy())
        true_labels.append(labels.cpu().numpy())

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

pred_labels = (predictions > 0.5).astype(int)

# Print classification report
print(classification_report(true_labels, pred_labels, target_names=['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']))

# # Predict new data
# new_data = ["New patient condition description including numbers like 120/80 for blood pressure."]
# new_data_tokenized = tokenize_texts(new_data)
#
# model.eval()
# with torch.no_grad():
#     new_data_tokenized = {key: val.to(device) for key, val in new_data_tokenized.items()}
#     outputs = model(**new_data_tokenized)
#     logits = outputs.logits
#     pred_labels = (logits > 0.5).int()
#     print(pred_labels)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.cuda.amp import autocast, GradScaler
import string

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Function to parse a single XML file
def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    text = root.find('TEXT').text.strip()
    tags = {tag.tag: tag.attrib['met'] for tag in root.find('TAGS')}
    return text, tags

# Directory containing the XML files
xml_dir = 'n2c2/n2c2/part1'

# List to store parsed data
data = []

# Parse all XML files
for file_name in os.listdir(xml_dir):
    if file_name.endswith('.xml'):
        file_path = os.path.join(xml_dir, file_name)
        text, tags = parse_xml(file_path)
        filtered_tags = {key: tags[key] for key in ['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']}
        filtered_tags['text'] = text
        data.append(filtered_tags)

# Convert the list to a pandas DataFrame
df = pd.DataFrame(data)

# Text preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation but keep numbers
    text = text.translate(str.maketrans('', '', string.punctuation.replace('-', '')))
    # Tokenize
    words = nltk.word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Initialize the BioBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')

# Tokenize the text data without specifying max_length
def tokenize_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Apply preprocessing to the text column and tokenize
df['clean_text'] = df['text'].apply(preprocess_text)
df['encoded_text'] = df['clean_text'].apply(lambda x: tokenize_texts(x))

# Encode the labels
df['ABDOMINAL_encoded'] = df['ABDOMINAL'].apply(lambda x: 1 if x == 'met' else 0)
df['CREATININE_encoded'] = df['CREATININE'].apply(lambda x: 1 if x == 'met' else 0)
df['MAJOR-DIABETES_encoded'] = df['MAJOR-DIABETES'].apply(lambda x: 1 if x == 'met' else 0)

# Convert encoded labels to numpy arrays
labels = df[['ABDOMINAL_encoded', 'CREATININE_encoded', 'MAJOR-DIABETES_encoded']].values

# Split the data into training and testing sets
train_size = int(0.9 * len(df))
test_size = len(df) - train_size

train_texts, test_texts = df['encoded_text'][:train_size], df['encoded_text'][train_size:]
train_labels, test_labels = labels[:train_size], labels[train_size:]

# Convert the DataFrame of tokenized texts to a dictionary of tensors
def stack_tokenized_tensors(tokenized_texts):
    keys = tokenized_texts.iloc[0].keys()
    stacked_tensors = {key: torch.cat([tokenized_texts.iloc[i][key] for i in range(len(tokenized_texts))], dim=0) for key in keys}
    return stacked_tensors

train_encoded_texts = stack_tokenized_tensors(train_texts)
test_encoded_texts = stack_tokenized_tensors(test_texts)

class MedicalDataset(Dataset):
    def __init__(self, encoded_texts, labels):
        self.encoded_texts = encoded_texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encoded_texts.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

train_dataset = MedicalDataset(train_encoded_texts, train_labels)
test_dataset = MedicalDataset(test_encoded_texts, test_labels)

# Define the data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create DataLoader objects with the data collator
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=data_collator, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=data_collator, num_workers=4)

# Initialize the BioBERT model for multi-label classification
model = BertForSequenceClassification.from_pretrained('dmis-lab/biobert-v1.1', num_labels=3)

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_loader) * 4  # Assuming 4 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop with mixed precision
epochs = 4
device = torch.device('cuda')
model.to(device)

scaler = GradScaler()  # For mixed precision

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        with autocast():  # Mixed precision
            outputs = model(**batch, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")

# Evaluate the model
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)

        with autocast():  # Mixed precision
            outputs = model(**batch)
            logits = outputs.logits

        predictions.append(logits.cpu().numpy())
        true_labels.append(labels.cpu().numpy())

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

pred_labels = (predictions > 0.5).astype(int)

# Print classification report
print(classification_report(true_labels, pred_labels, target_names=['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']))

# # Predict new data
# new_data = ["New patient condition description including numbers like 120/80 for blood pressure."]
# new_data_tokenized = tokenize_texts(new_data)
#
# model.eval()
# with torch.no_grad():
#     new_data_tokenized = {key: val.to(device) for key, val in new_data_tokenized.items()}
#     with autocast():  # Mixed precision
#         outputs = model(**new_data_tokenized)
#         logits = outputs.logits
#     pred_labels = (logits > 0.5).int()
#     print(pred_labels)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import string
import numpy as np
import os
import xml.etree.ElementTree as ET
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import DataCollatorWithPadding
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, KFold

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Function to parse a single XML file
def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    text = root.find('TEXT').text.strip()
    tags = {tag.tag: tag.attrib['met'] for tag in root.find('TAGS')}
    return text, tags

# Directory containing the XML files
xml_dir = 'n2c2/n2c2/part1'
# List to store parsed data
data = []

# Parse all XML files
for file_name in os.listdir(xml_dir):
    if file_name.endswith('.xml'):
        file_path = os.path.join(xml_dir, file_name)
        text, tags = parse_xml(file_path)
        filtered_tags = {key: tags[key] for key in ['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']}
        filtered_tags['text'] = text
        data.append(filtered_tags)

# Convert the list to a pandas DataFrame
df = pd.DataFrame(data)

# Text preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation but keep numbers
    text = text.translate(str.maketrans('', '', string.punctuation.replace('-', '')))
    # Tokenize
    words = nltk.word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Apply preprocessing to the text column
df['clean_text'] = df['text'].apply(preprocess_text)

# Encode the labels
df['ABDOMINAL_encoded'] = df['ABDOMINAL'].apply(lambda x: 1 if x == 'met' else 0)
df['CREATININE_encoded'] = df['CREATININE'].apply(lambda x: 1 if x == 'met' else 0)
df['MAJOR-DIABETES_encoded'] = df['MAJOR-DIABETES'].apply(lambda x: 1 if x == 'met' else 0)

# Convert encoded labels to numpy arrays
labels = df[['ABDOMINAL_encoded', 'CREATININE_encoded', 'MAJOR-DIABETES_encoded']].values
texts = df['clean_text'].tolist()

# Initialize the BioBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')

# Tokenize the text data without specifying max_length
def tokenize_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

class MedicalDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.texts.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Define the data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Function to train and evaluate the model
def train_and_evaluate(train_index, val_index, model, tokenizer, device, optimizer, scheduler):
    labels = df[['ABDOMINAL_encoded', 'CREATININE_encoded', 'MAJOR-DIABETES_encoded']].values
    texts = df['clean_text'].tolist()

    # Split the data into training and validation sets
    train_texts, val_texts = [texts[i] for i in train_index], [texts[i] for i in val_index]
    train_labels, val_labels = labels[train_index], labels[val_index]

    train_encoded_texts = tokenize_texts(train_texts)
    val_encoded_texts = tokenize_texts(val_texts)

    train_dataset = MedicalDataset(train_encoded_texts, train_labels)
    val_dataset = MedicalDataset(val_encoded_texts, val_labels)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator, num_workers=4, pin_memory=True, prefetch_factor=2)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=data_collator, num_workers=4, pin_memory=True, prefetch_factor=2)

    model.to(device)

    # Number of gradient accumulation steps
    accumulation_steps = 4  # Adjust based on your GPU memory

    # Training loop
    epochs = 100
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        optimizer.zero_grad()  # Move optimizer.zero_grad() outside the batch loop

        for step, batch in enumerate(train_loader):
            # Move the batch to the device
            labels = batch.pop('labels').to(device)
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss = loss / accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")

    # Evaluate the model on validation set
    val_report = evaluate(model, val_loader, device)
    print(val_report)

# Save the trained model and tokenizer
def save_model(model, tokenizer, model_save_path):
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

# Function to evaluate the model
def evaluate(model, data_loader, device):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            labels = batch.pop('labels').to(device)
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch, labels=labels)
            logits = outputs.logits

            predictions.append(logits.cpu().numpy())
            true_labels.append(labels.cpu().numpy())

    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)

    pred_labels = (predictions > 0.5).astype(int)

    return classification_report(true_labels, pred_labels, target_names=['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES'])

# Initialize the model, optimizer, and scheduler
model = BertForSequenceClassification.from_pretrained('dmis-lab/biobert-v1.1', num_labels=3)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(texts) * 4 // 8  # Adjust for effective batch size
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
for fold, (train_index, val_index) in enumerate(kf.split(texts)):
    print(f"Fold {fold + 1}")
    train_and_evaluate(train_index, val_index, model, tokenizer, device, optimizer, scheduler)

# Save the model after cross-validation
save_model(model, tokenizer, 'path_to_save_final_model_after_cv')

# Split the data into training, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.125, random_state=42)  # 0.125 * 0.8 = 0.1

train_encoded_texts = tokenize_texts(train_texts)
val_encoded_texts = tokenize_texts(val_texts)
test_encoded_texts = tokenize_texts(test_texts)

train_dataset = MedicalDataset(train_encoded_texts, train_labels)
val_dataset = MedicalDataset(val_encoded_texts, val_labels)
test_dataset = MedicalDataset(test_encoded_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator, num_workers=4, pin_memory=True, prefetch_factor=2)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=data_collator, num_workers=4, pin_memory=True, prefetch_factor=2)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=data_collator, num_workers=4, pin_memory=True, prefetch_factor=2)

# Function to train on the combined training and validation sets
def train_final_model(model, train_loader, val_loader, optimizer, scheduler, device):
    epochs = 4
    accumulation_steps = 4  # Adjust based on your GPU memory

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            labels = batch.pop('labels').to(device)
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss = loss / accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")

        # Evaluate on the validation set
        val_report = evaluate(model, val_loader, device)
        print(val_report)

# Train the model on the combined training and validation sets
train_final_model(model, train_loader, val_loader, optimizer, scheduler, device)

# Save the final model
save_model(model, tokenizer, 'path_to_save_final_model')

# Evaluate on the test set
test_report = evaluate(model, test_loader, device)
print(test_report)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1
