# In [None]:
import os
import re
import json
import torch
import pandas as pd
import subprocess
import multiprocessing
import uuid
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, pipeline
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report

# ✅ CONFIGURATION (SET YOUR PATHS)
# All paths below are absolute, machine-specific Windows paths; edit them
# before running on another machine.
# NOTE(review): input_root / output_root / scrubber_exe are not referenced in
# the visible code — presumably consumed by STEP 1 (de-identification), whose
# code is not shown here.
input_root = r"C:\Users\subha\Box\Input"
output_root = r"C:\Users\subha\Box\Output_Deidentified_Notes"
scrubber_exe = r"C:\Users\subha\Downloads\scrubber\scrubber.19.0411W\scrubber.19.0411W.exe"

# NOTE(review): cleaned_notes_folder is presumably used by STEP 2/3 (not shown).
# sdoh_output_csv is read below as the input to the BIO conversion;
# bio_output_csv is where the BIO token table is written and later re-read by
# train_biobert(); model_save_path is where train_biobert() saves the model.
cleaned_notes_folder = r"C:\Users\subha\Box\Pipeline\Cleaned_Notes"
sdoh_output_csv = r"C:\Users\subha\Box\Pipeline\sdoh_aria_results.csv"
bio_output_csv = r"C:\Users\subha\OneDrive\Desktop\sdoh_bio_dataset_corrected.csv"
model_save_path = r"C:\Users\subha\Documents\MyTrainedModels\biobert_ner_trained"

# Hugging Face Hub checkpoint used for both the tokenizer and the model weights.
model_checkpoint = "dmis-lab/biobert-base-cased-v1.1"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ Load NER Model
# The module-level `tokenizer` is the one used by tokenize_and_align_labels().
# The module-level `model` is loaded without `num_labels`; train_biobert()
# loads its own, separately configured model instead of reusing this one.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
model.to(device)

# ✅ STEP 1: RUN NLM SCRUBBER FOR DE-IDENTIFICATION
# (Code remains unchanged)

# ✅ STEP 2: CLEAN NOTES (REMOVE METADATA)
# (Code remains unchanged)

# ✅ STEP 3: EXTRACT SDOH ENTITIES
# (Code remains unchanged)

# ✅ STEP 4: CONVERT TO BIO FORMAT

def clean_token(token):
    """Return *token* with every disallowed character deleted.

    Only ASCII letters, ASCII digits and the punctuation marks
    ``, . ! ? -`` are kept; anything else (quotes, brackets, accented
    letters, whitespace, ...) is dropped. The result may be an empty string
    when the token consisted solely of disallowed characters.
    """
    disallowed_chars = re.compile(r"[^a-zA-Z0-9,.!?-]")
    return disallowed_chars.sub("", token)

def convert_to_bio_csv(df, output_csv_path):
    """Convert extracted SDOH spans into a token-level BIO table and save it as CSV.

    Every usable row of *df* is split on whitespace, each token is passed
    through ``clean_token``, and the surviving tokens are tagged
    ``B-<category>`` (first token) / ``I-<category>`` (remaining tokens).

    Sentence IDs start at 1 and advance whenever the text differs from the
    previous usable row; consecutive rows with identical text therefore share
    one ``Sentence_ID`` and their tokens are appended to the same sentence.

    Rows are skipped when the text or category is missing (NaN/None) or
    blank, or when no token survives cleaning.

    Args:
        df: DataFrame with ``Extracted_Text`` and ``SDOH_Category`` columns.
        output_csv_path: Destination of the CSV, written with the columns
            ``Sentence_ID``, ``Token``, ``BIO_Tag`` (UTF-8, no index).

    Raises:
        KeyError: If either required column is absent from *df*.
    """
    bio_data = []
    sentence_id = 0
    last_text = None

    for raw_text, raw_category in zip(df["Extracted_Text"], df["SDOH_Category"]):
        # Missing values must be detected BEFORE str(): str(NaN) is the
        # literal "nan", which would otherwise be emitted as a real token
        # (or as the bogus tag "B-nan").
        if pd.isna(raw_text) or pd.isna(raw_category):
            continue

        text = str(raw_text).strip()
        category = str(raw_category).strip()
        if not text or not category:
            continue  # Blank text, or a category that would yield the tag "B-"

        # Increment sentence_id only when text changes
        if text != last_text:
            sentence_id += 1
            last_text = text

        # Filter on the CLEANED token: a token made only of stripped symbols
        # (e.g. "(" or "*") collapses to "" and must not reach the CSV, where
        # it would be read back as a missing value.
        tokens = [cleaned for cleaned in map(clean_token, text.split()) if cleaned]
        if not tokens:
            continue  # Skip if no valid tokens remain

        bio_tags = [f"B-{category}"] + [f"I-{category}"] * (len(tokens) - 1)
        bio_data.extend(
            [sentence_id, token, tag] for token, tag in zip(tokens, bio_tags)
        )

    bio_df = pd.DataFrame(bio_data, columns=["Sentence_ID", "Token", "BIO_Tag"])
    bio_df.to_csv(output_csv_path, index=False, encoding="utf-8")
    print(f"✅ BIO dataset saved as CSV at: {output_csv_path}")

# ✅ Convert dataset to BIO format and save as CSV
# Runs as soon as this cell/module executes: loads the SDOH extraction results
# from `sdoh_output_csv` into the module-level `df`, then writes the BIO token
# table to `bio_output_csv` (the file train_biobert() reads back later).
df = pd.read_csv(sdoh_output_csv)
convert_to_bio_csv(df, bio_output_csv)

# ✅ Tokenization Function
def tokenize_and_align_labels(texts, labels, *, max_length=128, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level label ids to sub-tokens.

    Uses the module-level ``tokenizer``. Each sentence is truncated/padded to
    ``max_length`` sub-tokens. Positions that map to no word (special tokens
    and padding) receive the label ``-100`` so the loss function ignores them.

    Args:
        texts: List of sentences, each a list of word strings.
        labels: List of per-sentence label-id lists, parallel to *texts*
            (one integer id per word).
        max_length: Fixed sequence length after truncation and padding.
        label_all_tokens: When True (default, the original behaviour) every
            sub-token of a word carries that word's label. When False only
            the first sub-token is labelled and the continuation sub-tokens
            get ``-100``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        aligned label list per sentence.

    Raises:
        ValueError: If *texts* and *labels* contain a different number of
            sentences.
    """
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must be parallel: got {len(texts)} sentences "
            f"but {len(labels)} label sequences"
        )

    tokenized_inputs = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        is_split_into_words=True,
    )

    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to words
        previous_word = None
        new_labels = []

        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens and padding belong to no word: ignore in loss.
                new_labels.append(-100)
            elif label_all_tokens or word_idx != previous_word:
                # First sub-token of a word, or any sub-token when all
                # sub-tokens are to be labelled.
                new_labels.append(label[word_idx])
            else:
                # Continuation sub-token that should not contribute to loss.
                new_labels.append(-100)
            previous_word = word_idx

        aligned_labels.append(new_labels)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

# ✅ STEP 5: TRAIN BIOBERT
def train_biobert(test_size=0.2, random_state=42):
    """Fine-tune BioBERT for token classification on the BIO CSV and save it.

    Reads ``bio_output_csv``, groups tokens and tags by ``Sentence_ID``,
    splits the sentences into train/validation sets, fine-tunes
    ``model_checkpoint`` for 5 epochs and writes the model and the tokenizer
    to ``model_save_path``.

    Args:
        test_size: Fraction of sentences held out for validation.
        random_state: Seed for the train/validation split so that runs are
            reproducible; pass ``None`` for a different split on every run.

    Raises:
        ValueError: If the BIO CSV contains no usable token rows.
    """
    # keep_default_na=False + dtype=str: by default pandas turns tokens such
    # as "NA", "null", "None" or "nan" into float NaN, which is not a valid
    # word for the tokenizer. Read every token/tag verbatim as text instead.
    df = pd.read_csv(
        bio_output_csv,
        dtype={"Token": str, "BIO_Tag": str},
        keep_default_na=False,
    )
    # An empty token carries no text to tokenize; drop such rows.
    df = df[df["Token"] != ""]
    if df.empty:
        raise ValueError(f"No usable token rows found in {bio_output_csv}")

    grouped = df.groupby("Sentence_ID")
    sentences = grouped["Token"].apply(list).tolist()
    labels = grouped["BIO_Tag"].apply(list).tolist()

    # Sorted so the tag -> id mapping is deterministic for a given tag set.
    label2id = {label: i for i, label in enumerate(sorted(set(df["BIO_Tag"])))}
    id2label = {i: label for label, i in label2id.items()}
    label_ids = [[label2id[tag] for tag in sentence_tags] for sentence_tags in labels]

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        sentences, label_ids, test_size=test_size, random_state=random_state
    )

    # ✅ Tokenize Data
    train_data = tokenize_and_align_labels(train_texts, train_labels)
    val_data = tokenize_and_align_labels(val_texts, val_labels)

    train_dataset = Dataset.from_dict(train_data)
    val_dataset = Dataset.from_dict(val_data)

    # Store the tag names in the model config so the saved model reports
    # real BIO tags instead of the generic "LABEL_<n>" placeholders.
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
    )
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    training_args = TrainingArguments(
        output_dir="./biobert_ner",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    trainer.train()
    trainer.save_model(model_save_path)
    # The Trainer was not given the tokenizer, so save it explicitly; this
    # makes the output directory loadable on its own.
    tokenizer.save_pretrained(model_save_path)
    print(f"✅ Model saved at {model_save_path}")
