In [85]:
# Enable autoreload for updated modules
%load_ext autoreload
%autoreload 2

from src.utils import generate_category_prompts
import pandas as pd
from src.dataset import MultiPromptDataset
from src.clip import ClipOpenai
from torch.utils.data import DataLoader
import torch
import numpy as np  

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  from .autonotebook import tqdm as notebook_tqdm


In [86]:
# Load a single prompt JSON file and render it as one text prompt per category.
# The last expression is left bare so the resulting dict is displayed.
example_prompt_file = r"/home/guillermo/ssd/Github/BreastCATT/data/prompts/1.json"
data = pd.read_json(example_prompt_file)
generate_category_prompts(data)

{'risk_factors': 'The patient is 76 years old. Her eating habits are low in fat. Radiotherapy treatment is reported as no. Hormone replacement therapy is indicated as no.',
 'complementary_features': 'Prosthesis status is no. Signals of warts on the breasts are described as yes, both breasts.',
 'protocol_features': 'Body temperature is recorded as 34.90°C.'}

In [87]:
# Build the dataset from the prompts directory and the patient label file.
dataset = MultiPromptDataset('/home/guillermo/ssd/Github/BreastCATT/data/prompts', '/home/guillermo/ssd/Github/BreastCATT/data/patient_labels.json')

# Serve one sample per batch, in a fixed order.
# This loader is only iterated to extract features (nothing is trained on it),
# so shuffling brings no benefit. An unseeded shuffle would change the row order
# of the extracted features on every run, which in turn makes a seeded
# train/test split pick different samples each time (non-reproducible results).
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

In [40]:
# Spot-check a single sample: the item is a dict with the three category
# prompts under 'prompts' and the class under 'label'.
# NOTE(review): a bare number is printed before the result — presumably a
# leftover print inside the dataset's item access; confirm and remove it there.
dataset[10]

163


{'prompts': {'risk_factors': 'The patient is 79 years old. Her eating habits are described as low in fat. Menarche occurred at 14 years old. Menopause occurred yes,.',
  'complementary_features': 'No complementary features were reported.',
  'protocol_features': 'No protocol features were reported.'},
 'label': 0}

Algunas pacientes poseen verrugas, pero esto aparece en la columna "signs"; sin embargo, no se "unificaron" en una sola columna (por ejemplo, la paciente 363).

Algunas sí lo tienen explícito en la columna "is there signal of wart on breast" (por ejemplo, la paciente 1).

In [88]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Instantiate the project's CLIP text model, handing it the selected device.
model = ClipOpenai(device)

In [89]:
# Calculate features: encode every sample's prompts with the text model and
# stack the embeddings and labels into two NumPy arrays.

# Order in which the per-category embeddings are laid out inside a feature row.
PROMPT_CATEGORIES = ('risk_factors', 'complementary_features', 'protocol_features')

feature_batches = []
label_batches = []

# The loop variable is `batch` (not `data`) so the DataFrame loaded earlier in
# the notebook is not silently overwritten.
for batch in dataloader:
    # After collation each category holds a list with one prompt string per
    # sample in the batch (assumes the DataLoader's default collate — the
    # loader is built without a custom collate_fn).
    prompts_by_category = batch["prompts"]
    batch_size = len(prompts_by_category[PROMPT_CATEGORIES[0]])

    # Flatten category-major: every risk-factor prompt, then every
    # complementary prompt, then every protocol prompt. For batch_size == 1
    # this is exactly the three-element list the model has always received.
    prompts = [
        prompt
        for category in PROMPT_CATEGORIES
        for prompt in prompts_by_category[category]
    ]

    with torch.no_grad():
        # The model receives a list of prompts and is assumed to return one
        # embedding row per prompt, in input order — TODO confirm for
        # batch_size > 1.
        text_features = model(prompts)

    # Regroup (category, sample, dim) -> (sample, category * dim) so each row
    # is the concatenation of that sample's category embeddings. With a batch
    # of one this is identical to the previous `reshape(1, -1)`; with larger
    # batches it no longer merges different samples into a single row.
    text_features = (
        text_features
        .reshape(len(PROMPT_CATEGORIES), batch_size, -1)
        .permute(1, 0, 2)
        .reshape(batch_size, -1)
    )

    feature_batches.append(text_features)
    # Keep the whole label tensor of the batch (one entry per sample) instead
    # of only its first element.
    label_batches.append(batch["label"])

all_features = torch.cat(feature_batches).cpu().float().numpy()
all_labels = torch.cat(label_batches).cpu().numpy()
print(all_features.shape, all_labels.shape)

(278, 1536) (278,)


In [90]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. The split is seeded and stratified
# on the labels, so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    all_labels,
    test_size=0.3,
    stratify=all_labels,
    random_state=42,
)

In [82]:
# Count how many samples of each class ended up in each partition.
unique_train, counts_train = np.unique(y_train, return_counts=True)
unique_test, counts_test = np.unique(y_test, return_counts=True)

# Print one "Label <class>: <n> samples" line per class, training set first.
partitions = (
    ("Training set distribution:", unique_train, counts_train),
    ("\nTest set distribution:", unique_test, counts_test),
)
for heading, classes, counts in partitions:
    print(heading)
    for label, count in zip(classes, counts):
        print(f"Label {label}: {count} samples")

Training set distribution:
Label 0: 124 samples
Label 1: 70 samples

Test set distribution:
Label 0: 54 samples
Label 1: 30 samples


In [91]:
from sklearn.linear_model import LogisticRegression

# Linear probe on the extracted text features: a logistic regression with
# C=0.4 and a fixed seed. verbose=1 makes the solver print its optimisation log.
classifier = LogisticRegression(
    C=0.4,
    max_iter=1000,
    random_state=0,
    verbose=1,
)
classifier.fit(X_train, y_train)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1537     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  9.23989D-01

At iterate   50    f=  4.50325D-01    |proj g|=  9.73568D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 1537     86     99      1     0     0   5.335D-05   4.502D-01
  F =  0.45022349114999510     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


 This problem is unconstrained.


In [92]:
# Test-set accuracy in percent: the share of held-out samples whose predicted
# label equals the true label.
predictions = classifier.predict(X_test)
is_correct = predictions == y_test
accuracy = 100. * is_correct.astype(float).mean()
print(accuracy)

78.57142857142857
