In [1]:
!pip install pyarrow==6.0.1 --quiet
!pip install datasets transformers torch  seqeval --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.17.0 requires pyarrow>=10.0.1, but you have pyarrow 6.0.1 which is incompatible.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 6.0.1 which is incompatible.
datasets 3.0.0 requires pyarrow>=15.0.0, but you have pyarrow 6.0.1 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.
ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.[0m[31m
[0m

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import pandas as pd
import numpy as np
from typing import Optional
from sklearn.model_selection import train_test_split

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create a synthetic dataset
def create_synthetic_dataset(n_samples=1000):
    diagnoses = ['Diabetes', 'Hypertension', 'Asthma', 'Migraine', 'Flu']
    symptoms = {
        'Diabetes': ['Frequent urination', 'Excessive thirst', 'Unexplained weight loss', 'Increased hunger', 'Blurred vision'],
        'Hypertension': ['Headache', 'Shortness of breath', 'Nosebleeds', 'Chest pain', 'Dizziness'],
        'Asthma': ['Wheezing', 'Coughing', 'Chest tightness', 'Shortness of breath', 'Rapid breathing'],
        'Migraine': ['Severe headache', 'Nausea', 'Sensitivity to light', 'Sensitivity to sound', 'Vision changes'],
        'Flu': ['Fever', 'Cough', 'Sore throat', 'Runny nose', 'Body aches']
    }

    data = []
    for _ in range(n_samples):
        diagnosis = np.random.choice(diagnoses)
        symptom_list = symptoms[diagnosis]
        data.append({
            'Symptom1': symptom_list[0],
            'Symptom2': symptom_list[1],
            'Symptom3': symptom_list[2],
            'Symptom4': symptom_list[3],
            'Symptom5': symptom_list[4],
            'Diagnosis': diagnosis
        })

    return pd.DataFrame(data)

# Create the synthetic dataset
df = create_synthetic_dataset()
print(df.head())

# Choose a specific diagnosis to focus on (e.g., 'Diabetes')
target_diagnosis = 'Diabetes'

# Prepare binary labels
df['label'] = (df['Diagnosis'] == target_diagnosis).astype(int)

# Split the dataset
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

# Define the model
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Custom Dataset
class MedicalDiagnosisDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        symptoms = f"Symptom1: {row['Symptom1']}, Symptom2: {row['Symptom2']}, Symptom3: {row['Symptom3']}, Symptom4: {row['Symptom4']}, Symptom5: {row['Symptom5']}"
        label = row['label']

        encoding = self.tokenizer.encode_plus(
            symptoms,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
train_dataset = MedicalDiagnosisDataset(train_df, tokenizer)
val_dataset = MedicalDiagnosisDataset(val_df, tokenizer)
test_dataset = MedicalDiagnosisDataset(test_df, tokenizer)

# Define metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Custom Trainer class
class ContiguousTrainer(Trainer):
    def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
        if self.model is None:
            raise ValueError("Model is not set. Train or load a model before saving.")

        # Ensure all tensors are contiguous
        for name, param in self.model.named_parameters():
            if not param.is_contiguous():
                param.data = param.data.contiguous()

        # Call the parent class's save_model method
        super().save_model(output_dir, _internal_call)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    gradient_accumulation_steps=4,
    fp16=True,
    max_grad_norm=1.0,
)

# Initialize Trainer
trainer = ContiguousTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./fine_tuned_medical_model")

# Evaluate the model on the test set
test_loader = DataLoader(test_dataset, batch_size=16)
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Print classification report
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=['Not Diabetes', 'Diabetes']))

# Test the model on sample texts
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from typing import Optional


# Function to make a prediction
def predict(symptoms):
    inputs = tokenizer(symptoms, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
    return 'Diabetes' if predicted_class == 1 else 'Not Diabetes', probabilities[0][predicted_class].item()

# Test the model on sample texts with expected outputs
sample_symptoms_and_diagnoses = [
    {
        "symptoms": "Symptom1: Frequent urination, Symptom2: Excessive thirst, Symptom3: Unexplained weight loss, Symptom4: Increased hunger, Symptom5: Blurred vision",
        "expected": "Diabetes"
    },
    {
        "symptoms": "Symptom1: Fever, Symptom2: Cough, Symptom3: Fatigue, Symptom4: Loss of taste, Symptom5: Shortness of breath",
        "expected": "Not Diabetes"
    },
    {
        "symptoms": "Symptom1: Chest pain, Symptom2: Shortness of breath, Symptom3: Nausea, Symptom4: Lightheadedness, Symptom5: Cold sweat",
        "expected": "Not Diabetes"
    },
    {
        "symptoms": "Symptom1: Increased thirst, Symptom2: Frequent urination, Symptom3: Slow-healing sores, Symptom4: Blurred vision, Symptom5: Unexplained weight loss",
        "expected": "Diabetes"
    }
]

model.eval()
correct_predictions = 0
for sample in sample_symptoms_and_diagnoses:
    symptoms = sample["symptoms"]
    expected = sample["expected"]
    prediction, confidence = predict(symptoms)

    print(f"Symptoms: {symptoms}")
    print(f"Expected: {expected}")
    print(f"Predicted: {prediction}")
    print(f"Confidence: {confidence:.4f}")
    print(f"Correct: {'Yes' if prediction == expected else 'No'}")
    print("---")

    if prediction == expected:
        correct_predictions += 1

# Calculate overall accuracy on sample data
accuracy = correct_predictions / len(sample_symptoms_and_diagnoses)
print(f"\nOverall accuracy on sample data: {accuracy:.2%}")

Using device: cuda
          Symptom1  Symptom2              Symptom3              Symptom4  \
0  Severe headache    Nausea  Sensitivity to light  Sensitivity to sound   
1            Fever     Cough           Sore throat            Runny nose   
2            Fever     Cough           Sore throat            Runny nose   
3         Wheezing  Coughing       Chest tightness   Shortness of breath   
4            Fever     Cough           Sore throat            Runny nose   

          Symptom5 Diagnosis  
0   Vision changes  Migraine  
1       Body aches       Flu  
2       Body aches       Flu  
3  Rapid breathing    Asthma  
4       Body aches       Flu  
Training set size: 700
Validation set size: 150
Test set size: 150


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.0163,0.007841,1.0,1.0,1.0,1.0



Classification Report:
              precision    recall  f1-score   support

Not Diabetes       1.00      1.00      1.00       118
    Diabetes       1.00      1.00      1.00        32

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150

Symptoms: Symptom1: Frequent urination, Symptom2: Excessive thirst, Symptom3: Unexplained weight loss, Symptom4: Increased hunger, Symptom5: Blurred vision
Expected: Diabetes
Predicted: Diabetes
Confidence: 0.9822
Correct: Yes
---
Symptoms: Symptom1: Fever, Symptom2: Cough, Symptom3: Fatigue, Symptom4: Loss of taste, Symptom5: Shortness of breath
Expected: Not Diabetes
Predicted: Not Diabetes
Confidence: 0.9940
Correct: Yes
---
Symptoms: Symptom1: Chest pain, Symptom2: Shortness of breath, Symptom3: Nausea, Symptom4: Lightheadedness, Symptom5: Cold sweat
Expected: Not Diabetes
Predicted: Not Diabetes
Confidence: 0.9942
Correct: Yes
---
Sympt