In [3]:
!pip install transformers datasets torch scikit-learn



In [4]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split

In [5]:
from google.colab import files
uploaded = files.upload()

Saving hybrid_dataset.csv to hybrid_dataset.csv


In [6]:
df = pd.read_csv('hybrid_dataset.csv')

In [7]:
df.head()

Unnamed: 0,device_id,source,device_name,manufacturer,processor,memory,form_factor,os,programming_language,frameworks,...,tasks,input_format,preprocessing,storage,output_format,ui_type,ui_features,hospital_systems,apis,approval_year
0,synthetic_0000,synthetic,Brady and Sons CT Vision,Davis-Lowe,Intel Xeon Gold 6330,128GB,Hybrid cloud,Linux Ubuntu 22.04,Java,TensorFlow;PyTorch,...,Organ boundary detection;MRI brain segmentation,DICOM,Segmentation;Resizing,Cloud PACS,FHIR;HL7,Desktop Client (Linux),Visualization;Reporting;3D Reconstruction,EMR,FHIR API,2025
1,synthetic_0001,synthetic,Fleming-Mcclure UltraScan AI,Myers LLC,ARM Cortex-A72,8GB,Embedded device,QNX Neutrino RTOS,C++,TensorFlow,...,CT scan anomaly detection;Image classification,DICOM,Normalization;Segmentation;Noise Reduction,Local,HL7,Touch Control Panel,Reporting;3D Reconstruction,EHR;RIS,DICOM API,2018
2,synthetic_0002,synthetic,Perez Group MRI Assist,Brown Ltd,NVIDIA H100,128GB,Cloud service,Cloud-native Kubernetes,Python,ONNX Runtime;Keras,...,Lesion detection;Tumor localization,DICOM,Segmentation;Resizing;Noise Reduction,Local,FHIR;JSON,Web Dashboard,Visualization;Reporting;3D Reconstruction,EHR;EMR,Custom REST API;FHIR API,2023
3,synthetic_0003,synthetic,Townsend-Castillo CardioTrack,"Williams, Salinas and Martinez",AMD EPYC 7763,64GB,Rack server,Windows Server 2019,Python,PyTorch,...,Cross-modality analysis;Image + signal fusion,DICOM,Normalization;Segmentation,Local,HL7,Web Dashboard,Visualization;Reporting;Alert System,PACS,FHIR API;HL7 API,2021
4,synthetic_0004,synthetic,"Williams, Stewart and Mendoza CardioTrack",Warren Ltd,,32GB,Software only,Linux Ubuntu 22.04,Python,PyTorch;ONNX Runtime,...,Tumor localization;Lesion detection,DICOM,Normalization;Segmentation;Noise Reduction,Cloud PACS,FHIR,Web Dashboard,Reporting;3D Reconstruction;Alert System,EHR,FHIR API;HL7 API;Custom REST API,2019


In [8]:
# Example: we’ll predict "form_factor" based on device description
df["text"] = (
    df["device_name"] + " " + df["manufacturer"] + " " +
    df["processor"] + " " + df["frameworks"]
)
df["label"] = df["form_factor"].astype("category").cat.codes   # convert to numeric

In [9]:
# Remove rows with NaN in the 'text' column before splitting
df.dropna(subset=['text'], inplace=True)

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    random_state=42
)

In [10]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

class FDA_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = FDA_Dataset(train_encodings, train_labels)
val_dataset = FDA_Dataset(val_encodings, val_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
# Load BERT Model
num_labels = len(set(df["label"]))
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Training
training_args = TrainingArguments(
    output_dir="./results",  # Ensure output directory is set
    logging_dir="./logs",    # Explicitly set logging directory
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs= 3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5396,0.546054
2,0.537,0.539653
3,0.5275,0.533617


TrainOutput(global_step=7086, training_loss=0.53713621515828, metrics={'train_runtime': 1458.8217, 'train_samples_per_second': 77.691, 'train_steps_per_second': 4.857, 'total_flos': 2155092760373616.0, 'train_loss': 0.53713621515828, 'epoch': 3.0})

In [None]:
model.save_pretrained("./fda_bert_model")
tokenizer.save_pretrained("./fda_bert_model")

In [None]:
!zip -r fda_bert_model.zip fda_bert_model

# Download to local machine
from google.colab import files
files.download("fda_bert_model.zip")