In [None]:
# Enable autoreload for updated modules
%load_ext autoreload
%autoreload 2

from torch.utils.data import DataLoader
import torch

import sys

# check whether run in Colab
if 'google.colab' in sys.modules:
    print('Running in Colab.')
    !git clone https://github.com/semilleroCV/breastcatt.git
    sys.path.append('./breastcatt')
else:
    sys.path.append('..')
from transformers import AutoModel, AutoTokenizer, AutoConfig
from breastcatt.dataset import PromptDataset
from breastcatt.linear_probe import evaluate_linear_probe

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import os

# Switches forwarded to PromptDataset; each one toggles a section of the
# clinical record in the generated text prompt (presumably — see the example
# prompts in the notes below). Change them and re-run this cell before each
# evaluation so `dataloader` matches the configuration being measured.
include_demographic=True
include_personal_history=True
include_medical_history=True
include_protocol=False

# Root of the data directory. Set the BREASTCATT_DATA_DIR environment variable
# to run on another machine (e.g. Colab); the default is the original local path.
data_dir = os.environ.get("BREASTCATT_DATA_DIR",
                          r"/home/guillermo/ssd/Github/BreastCATT/data")

dataset = PromptDataset(os.path.join(data_dir, "prompts"),
                        os.path.join(data_dir, "patient_labels.json"),
                        include_demographic=include_demographic,
                        include_personal_history=include_personal_history,
                        include_medical_history=include_medical_history,
                        include_protocol=include_protocol)
# Sanity check: show one sample (a dict with 'prompt' and 'label').
print(dataset[10])

# Use DataLoader to load the data in batches (one prompt per batch, shuffled).
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

{'prompt': 'Patient is 79 years old of white race. Has eating habits low in fat. Complaints reported were no. Additionally, patient reported no. No information provided for radiotherapy, use of hormone replacement.', 'label': 0}


## GatorTron-Base

In [3]:
# Pick the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Load tokenizer, config and weights of the pretrained GatorTron-Base
# checkpoint, then move the model onto the selected device.
checkpoint = 'UFNLP/gatortron-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
config = AutoConfig.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint).to(device)

Con edad y raza. Ejemplo: 'Patient is 73 years old of white race.'

In [8]:
# Linear-probe evaluation (10 iterations) of the currently loaded model and
# tokenizer (GatorTron-Base in this section); the two returned values are
# presumably the mean and standard deviation of the accuracy printed in the log.
# NOTE(review): `dataloader` comes from the dataset cell — its include_* flags
# must match the prompt configuration described just above before running this.
avg_acc, std_acc = evaluate_linear_probe(10, dataloader, model, tokenizer, device)

Iteration 1/10 - Accuracy: 53.57%
Iteration 2/10 - Accuracy: 70.24%
Iteration 3/10 - Accuracy: 70.24%
Iteration 4/10 - Accuracy: 59.52%
Iteration 5/10 - Accuracy: 69.05%
Iteration 6/10 - Accuracy: 71.43%
Iteration 7/10 - Accuracy: 71.43%
Iteration 8/10 - Accuracy: 69.05%
Iteration 9/10 - Accuracy: 71.43%
Iteration 10/10 - Accuracy: 71.43%
Average Accuracy: 67.74% ± 5.82%


Con edad, raza e historia personal. Ejemplo: 'Patient is 73 years old of white race. Has eating habits fatty diet. Menarche occurred at 13 years old. Complaints reported were pain when using cream deodorant. Additionally, patient reported no and presence of glands lymph nodes in the left breast.'

In [6]:
# Same 10-iteration linear probe on the loaded model (GatorTron-Base in this
# section), for the prompt configuration described just above.
# NOTE(review): relies on `dataloader` having been rebuilt in the dataset cell
# with matching include_* flags — nothing in this cell enforces that.
avg_acc, std_acc = evaluate_linear_probe(10, dataloader, model, tokenizer, device)

Iteration 1/10 - Accuracy: 89.29%
Iteration 2/10 - Accuracy: 95.24%
Iteration 3/10 - Accuracy: 89.29%
Iteration 4/10 - Accuracy: 90.48%
Iteration 5/10 - Accuracy: 92.86%
Iteration 6/10 - Accuracy: 95.24%
Iteration 7/10 - Accuracy: 85.71%
Iteration 8/10 - Accuracy: 94.05%
Iteration 9/10 - Accuracy: 94.05%
Iteration 10/10 - Accuracy: 91.67%
Average Accuracy: 91.79% ± 2.94%


Con edad, raza, historia personal e historia médica. Ejemplo: 'Patient is 44 years old of black race. Has a high-fat diet. Last menstruation was at 34 years old and menarche occurred at 15 years old. Additionally, patient reported wart. No information provided for mammography, biopsy, plastic surgery, prosthesis, radiotherapy, use of hormone replacement, nipple changes.'

In [15]:
# Same 10-iteration linear probe on the loaded model (GatorTron-Base in this
# section), for the prompt configuration described just above.
# NOTE(review): relies on `dataloader` having been rebuilt in the dataset cell
# with matching include_* flags — nothing in this cell enforces that.
avg_acc, std_acc = evaluate_linear_probe(10, dataloader, model, tokenizer, device)

Iteration 1/10 - Accuracy: 89.29%
Iteration 2/10 - Accuracy: 91.67%
Iteration 3/10 - Accuracy: 91.67%
Iteration 4/10 - Accuracy: 90.48%
Iteration 5/10 - Accuracy: 88.10%
Iteration 6/10 - Accuracy: 90.48%
Iteration 7/10 - Accuracy: 88.10%
Iteration 8/10 - Accuracy: 89.29%
Iteration 9/10 - Accuracy: 90.48%
Iteration 10/10 - Accuracy: 90.48%
Average Accuracy: 90.00% ± 1.21%


Con edad, raza, historia personal, historia médica e información protocolaria. Ejemplo: 'Patient is 69 years old of mestizo race. Has eating habits low in fat. Last menstruation was at 60 years old. Complaints reported were yes. Additionally, patient reported no lumps were found in the touch exam and feels an uncomfortable to lie down. No information provided for nipple changes. In the clinical history, mammography was reported as yes, biopsy was no, plastic surgery was no, prosthesis was no, radiotherapy was no, use of hormone replacement was yes. Regarding the protocol, a body temperature of 36.50°C, alcohol consumption was no, coffee drinking was no, smoking was no, physical exercise was no.'

In [20]:
# Same 10-iteration linear probe on the loaded model (GatorTron-Base in this
# section), for the prompt configuration described just above.
# NOTE(review): relies on `dataloader` having been rebuilt in the dataset cell
# with matching include_* flags — nothing in this cell enforces that.
avg_acc, std_acc = evaluate_linear_probe(10, dataloader, model, tokenizer, device)

Iteration 1/10 - Accuracy: 86.90%
Iteration 2/10 - Accuracy: 90.48%
Iteration 3/10 - Accuracy: 88.10%
Iteration 4/10 - Accuracy: 88.10%
Iteration 5/10 - Accuracy: 86.90%
Iteration 6/10 - Accuracy: 86.90%
Iteration 7/10 - Accuracy: 88.10%
Iteration 8/10 - Accuracy: 85.71%
Iteration 9/10 - Accuracy: 89.29%
Iteration 10/10 - Accuracy: 90.48%
Average Accuracy: 88.10% ± 1.51%


Con características seleccionadas basándonos en la sección 2.3 del paper "Multi-input convolutional neural network for breast cancer detection using thermal images and clinical data".

In [4]:
# Same 10-iteration linear probe on the loaded model (GatorTron-Base in this
# section), using the selected-feature prompts mentioned in the note above.
# NOTE(review): the dataset settings that produce this feature selection are
# not shown in the notebook — record them so the result can be reproduced.
avg_acc, std_acc = evaluate_linear_probe(10, dataloader, model, tokenizer, device)

Iteration 1/10 - Accuracy: 84.52%
Iteration 2/10 - Accuracy: 88.10%
Iteration 3/10 - Accuracy: 91.67%
Iteration 4/10 - Accuracy: 91.67%
Iteration 5/10 - Accuracy: 95.24%
Iteration 6/10 - Accuracy: 90.48%
Iteration 7/10 - Accuracy: 89.29%
Iteration 8/10 - Accuracy: 83.33%
Iteration 9/10 - Accuracy: 88.10%
Iteration 10/10 - Accuracy: 92.86%
Average Accuracy: 89.52% ± 3.48%


## CLIP

In [41]:
from transformers import CLIPTokenizer, CLIPModel

# Pick the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Load the pretrained CLIP tokenizer and model, replacing the previously
# loaded `tokenizer`/`model`, then move the model onto the selected device.
checkpoint = "openai/clip-vit-base-patch32"
tokenizer = CLIPTokenizer.from_pretrained(checkpoint)
model = CLIPModel.from_pretrained(checkpoint).to(device)

Con edad y raza. Ejemplo: 'Patient is 73 years old of white race.'

In [28]:
# 10-iteration linear probe on the currently loaded model and tokenizer (CLIP
# in this section), for the prompt configuration described just above.
# NOTE(review): relies on `dataloader` having been rebuilt in the dataset cell
# with matching include_* flags — nothing in this cell enforces that.
avg_acc, std_acc = evaluate_linear_probe(10, dataloader, model, tokenizer, device)

Iteration 1/10 - Accuracy: 70.24%
Iteration 2/10 - Accuracy: 65.48%
Iteration 3/10 - Accuracy: 71.43%
Iteration 4/10 - Accuracy: 70.24%
Iteration 5/10 - Accuracy: 66.67%
Iteration 6/10 - Accuracy: 65.48%
Iteration 7/10 - Accuracy: 70.24%
Iteration 8/10 - Accuracy: 55.95%
Iteration 9/10 - Accuracy: 67.86%
Iteration 10/10 - Accuracy: 67.86%
Average Accuracy: 67.14% ± 4.23%


Con edad, raza e historia personal. Ejemplo: 'Patient is 73 years old of white race. Has eating habits fatty diet. Menarche occurred at 13 years old. Complaints reported were pain when using cream deodorant. Additionally, patient reported no and presence of glands lymph nodes in the left breast.'

In [42]:
# 10-iteration linear probe on the currently loaded model and tokenizer (CLIP
# in this section), for the prompt configuration described just above.
# NOTE(review): the tokenizer warns below that at least one prompt is 78 tokens
# long, over CLIP's 77-token limit — confirm that evaluate_linear_probe
# truncates its inputs, otherwise those samples may be handled incorrectly.
avg_acc, std_acc = evaluate_linear_probe(10, dataloader, model, tokenizer, device)

Token indices sequence length is longer than the specified maximum sequence length for this model (78 > 77). Running this sequence through the model will result in indexing errors


Iteration 1/10 - Accuracy: 91.67%
Iteration 2/10 - Accuracy: 90.48%
Iteration 3/10 - Accuracy: 89.29%
Iteration 4/10 - Accuracy: 85.71%
Iteration 5/10 - Accuracy: 88.10%
Iteration 6/10 - Accuracy: 88.10%
Iteration 7/10 - Accuracy: 91.67%
Iteration 8/10 - Accuracy: 82.14%
Iteration 9/10 - Accuracy: 88.10%
Iteration 10/10 - Accuracy: 92.86%
Average Accuracy: 88.81% ± 3.02%


No es posible probar con prompts más largos debido al límite `max_length=77` impuesto por CLIP.

## Bio_ClinicalBERT

In [43]:
# Pick the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Load tokenizer, config and weights of the pretrained Bio_ClinicalBERT
# checkpoint, replacing the previously loaded objects, and move the model
# onto the selected device.
checkpoint = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
config = AutoConfig.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint).to(device)

Con edad y raza. Ejemplo: 'Patient is 73 years old of white race.'

In [46]:
# 10-iteration linear probe on the currently loaded model and tokenizer
# (Bio_ClinicalBERT in this section), for the prompt configuration described
# just above.
# NOTE(review): relies on `dataloader` having been rebuilt in the dataset cell
# with matching include_* flags — nothing in this cell enforces that.
avg_acc, std_acc = evaluate_linear_probe(10, dataloader, model, tokenizer, device)

Iteration 1/10 - Accuracy: 65.48%
Iteration 2/10 - Accuracy: 67.86%
Iteration 3/10 - Accuracy: 69.05%
Iteration 4/10 - Accuracy: 69.05%
Iteration 5/10 - Accuracy: 66.67%
Iteration 6/10 - Accuracy: 66.67%
Iteration 7/10 - Accuracy: 65.48%
Iteration 8/10 - Accuracy: 66.67%
Iteration 9/10 - Accuracy: 67.86%
Iteration 10/10 - Accuracy: 67.86%
Average Accuracy: 67.26% ± 1.22%


Con edad, raza e historia personal. Ejemplo: 'Patient is 73 years old of white race. Has eating habits fatty diet. Menarche occurred at 13 years old. Complaints reported were pain when using cream deodorant. Additionally, patient reported no and presence of glands lymph nodes in the left breast.'

In [49]:
# 10-iteration linear probe on the currently loaded model and tokenizer
# (Bio_ClinicalBERT in this section), for the prompt configuration described
# just above.
# NOTE(review): relies on `dataloader` having been rebuilt in the dataset cell
# with matching include_* flags — nothing in this cell enforces that.
avg_acc, std_acc = evaluate_linear_probe(10, dataloader, model, tokenizer, device)

Iteration 1/10 - Accuracy: 84.52%
Iteration 2/10 - Accuracy: 90.48%
Iteration 3/10 - Accuracy: 82.14%
Iteration 4/10 - Accuracy: 86.90%
Iteration 5/10 - Accuracy: 89.29%
Iteration 6/10 - Accuracy: 95.24%
Iteration 7/10 - Accuracy: 88.10%
Iteration 8/10 - Accuracy: 85.71%
Iteration 9/10 - Accuracy: 86.90%
Iteration 10/10 - Accuracy: 91.67%
Average Accuracy: 88.10% ± 3.57%


Con edad, raza, historia personal e historia médica. Ejemplo: 'Patient is 44 years old of black race. Has a high-fat diet. Last menstruation was at 34 years old and menarche occurred at 15 years old. Additionally, patient reported wart. No information provided for mammography, biopsy, plastic surgery, prosthesis, radiotherapy, use of hormone replacement, nipple changes.'

In [52]:
# 10-iteration linear probe on the currently loaded model and tokenizer
# (Bio_ClinicalBERT in this section), for the prompt configuration described
# just above.
# NOTE(review): relies on `dataloader` having been rebuilt in the dataset cell
# with matching include_* flags — nothing in this cell enforces that.
avg_acc, std_acc = evaluate_linear_probe(10, dataloader, model, tokenizer, device)

Iteration 1/10 - Accuracy: 78.57%
Iteration 2/10 - Accuracy: 79.76%
Iteration 3/10 - Accuracy: 82.14%
Iteration 4/10 - Accuracy: 78.57%
Iteration 5/10 - Accuracy: 80.95%
Iteration 6/10 - Accuracy: 80.95%
Iteration 7/10 - Accuracy: 84.52%
Iteration 8/10 - Accuracy: 79.76%
Iteration 9/10 - Accuracy: 75.00%
Iteration 10/10 - Accuracy: 89.29%
Average Accuracy: 80.95% ± 3.65%


Con edad, raza, historia personal, historia médica e información protocolaria. Ejemplo: 'Patient is 69 years old of mestizo race. Has eating habits low in fat. Last menstruation was at 60 years old. Complaints reported were yes. Additionally, patient reported no lumps were found in the touch exam and feels an uncomfortable to lie down. No information provided for nipple changes. In the clinical history, mammography was reported as yes, biopsy was no, plastic surgery was no, prosthesis was no, radiotherapy was no, use of hormone replacement was yes. Regarding the protocol, a body temperature of 36.50°C, alcohol consumption was no, coffee drinking was no, smoking was no, physical exercise was no.'

In [55]:
# 10-iteration linear probe on the currently loaded model and tokenizer
# (Bio_ClinicalBERT in this section), for the prompt configuration described
# just above.
# NOTE(review): relies on `dataloader` having been rebuilt in the dataset cell
# with matching include_* flags — nothing in this cell enforces that.
avg_acc, std_acc = evaluate_linear_probe(10, dataloader, model, tokenizer, device)

Iteration 1/10 - Accuracy: 80.95%
Iteration 2/10 - Accuracy: 85.71%
Iteration 3/10 - Accuracy: 75.00%
Iteration 4/10 - Accuracy: 76.19%
Iteration 5/10 - Accuracy: 75.00%
Iteration 6/10 - Accuracy: 75.00%
Iteration 7/10 - Accuracy: 76.19%
Iteration 8/10 - Accuracy: 86.90%
Iteration 9/10 - Accuracy: 76.19%
Iteration 10/10 - Accuracy: 80.95%
Average Accuracy: 78.81% ± 4.32%
