In [2]:
pip install transformers datasets torch seqeval scikit-learn


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [49]:
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
from seqeval.metrics import classification_report
from sklearn.model_selection import train_test_split

In [50]:
# ✅ Load the BIO-labeled dataset
bio_dataset_path = r"C:\Users\subha\OneDrive\Desktop\sdoh_bio_dataset_corrected.csv"
df = pd.read_csv(bio_dataset_path)

# ✅ Ensure dataset has correct columns
required_columns = {"Sentence_ID", "Token", "BIO_Tag"}
if not required_columns.issubset(df.columns):
    raise ValueError(f"Dataset must contain columns: {required_columns}")

# ✅ Remove NaN values in the Token column
df = df.dropna(subset=["Token"])  # Drop rows where Token is NaN

In [51]:
# ✅ Convert dataset into a format suitable for training
sentence_group = df.groupby("Sentence_ID")
sentences = sentence_group["Token"].apply(list).tolist()
labels = sentence_group["BIO_Tag"].apply(list).tolist()

# ✅ Mapping BIO labels to IDs
unique_labels = sorted(set(df["BIO_Tag"].dropna()))  # Drop NaN from label list
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# ✅ Convert labels to numerical IDs
label_ids = [[label2id[label] for label in sent_labels] for sent_labels in labels]

# ✅ Remove any empty sentences after processing
filtered_sentences = []
filtered_labels = []
for sent, lbl in zip(sentences, label_ids):
    if sent:  # Avoid empty sentence lists
        filtered_sentences.append(sent)
        filtered_labels.append(lbl)


In [52]:
# ✅ Train-Test Split
train_texts, val_texts, train_labels, val_labels = train_test_split(filtered_sentences, filtered_labels, test_size=0.2)

# ✅ Load BioBERT Tokenizer
model_checkpoint = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [53]:
# ✅ Tokenization Function
def tokenize_and_align_labels(texts, labels):
    tokenized_inputs = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
    )

    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to words
        previous_word = None
        new_labels = []

        for word_idx in word_ids:
            if word_idx is None:
                new_labels.append(-100)  # Ignore padding
            elif word_idx != previous_word:
                new_labels.append(label[word_idx])  # Assign correct label
            else:
                new_labels.append(label[word_idx])  # Keep same label for subwords
            previous_word = word_idx

        aligned_labels.append(new_labels)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

# ✅ Tokenize Data
train_data = tokenize_and_align_labels(train_texts, train_labels)
val_data = tokenize_and_align_labels(val_texts, val_labels)

In [54]:
# ✅ Convert to Hugging Face Dataset
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

# ✅ Load BioBERT model for token classification
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(unique_labels))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
# ✅ Training Arguments
training_args = TrainingArguments(
    output_dir="./biobert_ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    load_best_model_at_end=True,
)



In [56]:
# ✅ Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# ✅ Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [57]:
# ✅ Train the model
trainer.train()


                                                
 20%|██        | 21/105 [06:16<21:47, 15.56s/it]

{'eval_loss': 0.5440341234207153, 'eval_runtime': 29.0827, 'eval_samples_per_second': 2.888, 'eval_steps_per_second': 0.206, 'epoch': 1.0}


                                                
 40%|████      | 42/105 [12:27<16:13, 15.45s/it]

{'eval_loss': 0.25504136085510254, 'eval_runtime': 29.126, 'eval_samples_per_second': 2.884, 'eval_steps_per_second': 0.206, 'epoch': 2.0}


                                                
 60%|██████    | 63/105 [18:44<10:47, 15.43s/it]

{'eval_loss': 0.17006881535053253, 'eval_runtime': 29.1767, 'eval_samples_per_second': 2.879, 'eval_steps_per_second': 0.206, 'epoch': 3.0}


                                                
 80%|████████  | 84/105 [24:54<05:23, 15.42s/it]

{'eval_loss': 0.15451069176197052, 'eval_runtime': 29.1449, 'eval_samples_per_second': 2.882, 'eval_steps_per_second': 0.206, 'epoch': 4.0}


                                                 
100%|██████████| 105/105 [31:09<00:00, 15.63s/it]

{'eval_loss': 0.15062932670116425, 'eval_runtime': 29.093, 'eval_samples_per_second': 2.887, 'eval_steps_per_second': 0.206, 'epoch': 5.0}


100%|██████████| 105/105 [31:12<00:00, 17.83s/it]

{'train_runtime': 1872.5267, 'train_samples_per_second': 0.889, 'train_steps_per_second': 0.056, 'train_loss': 0.338874998546782, 'epoch': 5.0}





TrainOutput(global_step=105, training_loss=0.338874998546782, metrics={'train_runtime': 1872.5267, 'train_samples_per_second': 0.889, 'train_steps_per_second': 0.056, 'total_flos': 108777558355200.0, 'train_loss': 0.338874998546782, 'epoch': 5.0})

In [59]:
# ✅ Save the trained model
trainer.save_model(r"C:\Users\subha\Documents\MyTrainedModels\biobert_ner_trained")
tokenizer.save_pretrained(r"C:\Users\subha\Documents\MyTrainedModels\biobert_ner_trained")

('C:\\Users\\subha\\Documents\\MyTrainedModels\\biobert_ner_trained\\tokenizer_config.json',
 'C:\\Users\\subha\\Documents\\MyTrainedModels\\biobert_ner_trained\\special_tokens_map.json',
 'C:\\Users\\subha\\Documents\\MyTrainedModels\\biobert_ner_trained\\vocab.txt',
 'C:\\Users\\subha\\Documents\\MyTrainedModels\\biobert_ner_trained\\added_tokens.json',
 'C:\\Users\\subha\\Documents\\MyTrainedModels\\biobert_ner_trained\\tokenizer.json')

In [60]:
from transformers import pipeline

# ✅ Load trained BioBERT NER model
model_path = r"C:\Users\subha\Documents\MyTrainedModels\biobert_ner_trained"
ner_pipeline = pipeline("ner", model=model_path, tokenizer=model_path)

# ✅ Sample test sentence (modify with real examples)
test_text = "The patient has a history of smoking and lacks stable housing."

# ✅ Run inference
predictions = ner_pipeline(test_text)

# ✅ Print Raw Predictions
print("🔍 Raw Predictions:")
for pred in predictions:
    print(pred)





Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


🔍 Raw Predictions:
{'entity': 'LABEL_6', 'score': 0.8570857, 'index': 1, 'word': 'the', 'start': 0, 'end': 3}
{'entity': 'LABEL_13', 'score': 0.40886027, 'index': 2, 'word': 'patient', 'start': 4, 'end': 11}
{'entity': 'LABEL_13', 'score': 0.9901569, 'index': 3, 'word': 'has', 'start': 12, 'end': 15}
{'entity': 'LABEL_13', 'score': 0.991353, 'index': 4, 'word': 'a', 'start': 16, 'end': 17}
{'entity': 'LABEL_13', 'score': 0.99341375, 'index': 5, 'word': 'history', 'start': 18, 'end': 25}
{'entity': 'LABEL_13', 'score': 0.99405825, 'index': 6, 'word': 'of', 'start': 26, 'end': 28}
{'entity': 'LABEL_13', 'score': 0.99385756, 'index': 7, 'word': 'smoking', 'start': 29, 'end': 36}
{'entity': 'LABEL_13', 'score': 0.99123317, 'index': 8, 'word': 'and', 'start': 37, 'end': 40}
{'entity': 'LABEL_13', 'score': 0.98907906, 'index': 9, 'word': 'lacks', 'start': 41, 'end': 46}
{'entity': 'LABEL_13', 'score': 0.9870452, 'index': 10, 'word': 'stable', 'start': 47, 'end': 53}
{'entity': 'LABEL_13', 's

In [61]:
# ✅ Mapping Label IDs to Actual Labels (if necessary)
id2label = {0: "O", 1: "B-SOCIAL_SUPPORT", 2: "I-SOCIAL_SUPPORT", 3: "B-ALCOHOL_USE", 4: "I-ALCOHOL_USE", 5: "B-HOUSING", 6: "I-HOUSING", 7: "B-MENTAL_HEALTH", 8: "I-MENTAL_HEALTH", 9: "B-EMPLOYMENT", 10: "I-EMPLOYMENT", 11: "B-TRANSPORTATION", 12: "I-TRANSPORTATION", 13: "B-INCOME", 14: "I-INCOME", 15: "B-FOOD_SECURITY", 16: "I-FOOD_SECURITY"}

# ✅ Format output
formatted_predictions = []
for entity in predictions:
    formatted_predictions.append({
        "word": entity["word"],
        "start": entity["start"],
        "end": entity["end"],
        "score": entity["score"],
        "entity": id2label.get(int(entity["entity"].replace("LABEL_", "")), "UNKNOWN")  # Convert LABEL_X to readable
    })

# ✅ Display formatted results
print("\n✅ **Formatted Predictions:**")
for pred in formatted_predictions:
    print(pred)



✅ **Formatted Predictions:**
{'word': 'the', 'start': 0, 'end': 3, 'score': 0.8570857, 'entity': 'I-HOUSING'}
{'word': 'patient', 'start': 4, 'end': 11, 'score': 0.40886027, 'entity': 'B-INCOME'}
{'word': 'has', 'start': 12, 'end': 15, 'score': 0.9901569, 'entity': 'B-INCOME'}
{'word': 'a', 'start': 16, 'end': 17, 'score': 0.991353, 'entity': 'B-INCOME'}
{'word': 'history', 'start': 18, 'end': 25, 'score': 0.99341375, 'entity': 'B-INCOME'}
{'word': 'of', 'start': 26, 'end': 28, 'score': 0.99405825, 'entity': 'B-INCOME'}
{'word': 'smoking', 'start': 29, 'end': 36, 'score': 0.99385756, 'entity': 'B-INCOME'}
{'word': 'and', 'start': 37, 'end': 40, 'score': 0.99123317, 'entity': 'B-INCOME'}
{'word': 'lacks', 'start': 41, 'end': 46, 'score': 0.98907906, 'entity': 'B-INCOME'}
{'word': 'stable', 'start': 47, 'end': 53, 'score': 0.9870452, 'entity': 'B-INCOME'}
{'word': 'housing', 'start': 54, 'end': 61, 'score': 0.9906048, 'entity': 'B-INCOME'}
{'word': '.', 'start': 61, 'end': 62, 'score': 

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [None]:
#CHECKSSSSS
from sklearn.model_selection import train_test_split

# 🔍 Debug Split
train_texts, val_texts, train_labels, val_labels = train_test_split(sentences, labels, test_size=0.2)

print(f"✅ train_texts sample: {train_texts[:3]}")
print(f"✅ val_texts sample: {val_texts[:3]}")
print(f"✅ Total train texts: {len(train_texts)}, Total val texts: {len(val_texts)}")


✅ train_texts sample: [['he', 'has', 'no', 'history', 'of', 'drinking'], ['heroin'], ['test', 'results', 'given', 'to', 'patient', 'or', 'family', 'member', nan, nan, 'patient', 'or', 'family', 'member', 'assisted', 'with']]
✅ val_texts sample: [[nan, 'homeless.'], ['n', nan, 'recommendations', nan, 'patient', 'and', 'family', 'verbalize', 'understanding?'], ['heroin']]
✅ Total train texts: 333, Total val texts: 84


In [48]:
import numpy as np

# ✅ Remove NaN tokens
clean_sentences = [[token for token in sent if isinstance(token, str)] for sent in sentences]
clean_labels = [[label for label in sent_labels] for sent_labels in labels]  # Labels shouldn't have NaN

# ✅ Remove empty sentences
filtered_sentences = []
filtered_labels = []
for sent, lbl in zip(clean_sentences, clean_labels):
    if sent:  # Only add if sentence is non-empty
        filtered_sentences.append(sent)
        filtered_labels.append(lbl)

# ✅ Re-run train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(filtered_sentences, filtered_labels, test_size=0.2)

print(f"✅ Total train texts: {len(train_texts)}, Total val texts: {len(val_texts)}")


✅ Total train texts: 333, Total val texts: 84
