In [None]:
import os
import torch
from google.cloud import storage
from transformers import BloomTokenizerFast, BloomForCausalLM
from transformers import BloomTokenizerFast
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import json
import torch.nn as nn
from transformers import BloomForSequenceClassification
from joblib import load

In [None]:
# Point the Google Cloud client libraries at the service-account key file.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': '/content/steel-climber-398320-634bf855feea.json'})

In [None]:
def download_folder(bucket_name, source_folder, destination_dir, project='steel-climber-398320'):
    """Download every object under a bucket "folder" into a local directory.

    The folder structure below ``source_folder`` is recreated under
    ``destination_dir``.

    Args:
        bucket_name: Name of the Cloud Storage bucket.
        source_folder: Folder (object-name prefix) to download. A trailing
            ``/`` is optional; an empty string selects the whole bucket.
        destination_dir: Local directory that receives the files.
        project: Project passed to ``storage.Client``. Defaults to the
            project this notebook was written for.

    Returns:
        None. One line is printed per downloaded blob.
    """
    # Normalise the prefix so it always denotes a folder. Without the trailing
    # slash the prefix would also match sibling folders (e.g. "models_other/")
    # and the computed relative path would start with "/", which makes
    # os.path.join() discard destination_dir entirely.
    prefix = source_folder
    if prefix and not prefix.endswith('/'):
        prefix += '/'

    storage_client = storage.Client(project=project)
    bucket = storage_client.bucket(bucket_name)

    # Listing with a prefix avoids enumerating the whole bucket.
    for blob in bucket.list_blobs(prefix=prefix):
        # Names ending in "/" are directory placeholders, not files.
        if blob.name.endswith('/'):
            continue

        # Path of the object relative to the requested folder; strip any
        # leading slash so the result can never become an absolute path.
        relative_path = blob.name[len(prefix):].lstrip('/')
        local_file_path = os.path.join(destination_dir, relative_path)

        # os.makedirs('') raises, so only create a parent when there is one.
        parent_dir = os.path.dirname(local_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        blob.download_to_filename(local_file_path)
        # Keep track
        print(f"Blob {blob.name} downloaded to {local_file_path}.")

# Fetch the fine-tuned model artifacts from Cloud Storage into the local runtime.
bucket_name = 'finetunned_760m_bloom'
source_folder = "bloom_model_fusion/"
destination_dir = "/content/finetuned_model"

download_folder(
    bucket_name=bucket_name,
    source_folder=source_folder,
    destination_dir=destination_dir,
)


Blob bloom_model_fusion/custom_bloom_model.bin downloaded to /content/finetuned_model/custom_bloom_model.bin.
Blob bloom_model_fusion/label_encoder_mapping.json downloaded to /content/finetuned_model/label_encoder_mapping.json.
Blob bloom_model_fusion/preprocessor.joblib downloaded to /content/finetuned_model/preprocessor.joblib.
Blob bloom_model_fusion/special_tokens_map.json downloaded to /content/finetuned_model/special_tokens_map.json.
Blob bloom_model_fusion/tokenizer.json downloaded to /content/finetuned_model/tokenizer.json.
Blob bloom_model_fusion/tokenizer_config.json downloaded to /content/finetuned_model/tokenizer_config.json.


In [None]:
# CustomBloomModel definition.
# The class has to be defined here because the checkpoint stores only the weights
# (a state dict), not the architecture, and the fusion model uses a custom architecture.
class CustomBloomModel(nn.Module):
    """Fusion classifier: BLOOM sequence-classification logits + extra features.

    The wrapped model's logits are concatenated with a vector of additional
    features and passed through a small feed-forward head that produces the
    final class logits.

    Args:
        bloom_model: Module called as ``bloom_model(input_ids, attention_mask=...)``
            whose output exposes a ``.logits`` tensor with ``num_labels`` columns.
        num_additional_features: Number of columns in ``additional_features``.
        num_labels: Number of output classes.
    """

    def __init__(self, bloom_model, num_additional_features, num_labels):
        super().__init__()
        self.bloom_model = bloom_model
        self.num_additional_features = num_additional_features
        self.num_labels = num_labels

        # The head consumes the concatenation [logits | additional_features],
        # so its input width is derived from the two sizes rather than
        # hard-coded (48 labels + 3 features gives the previous value of 51).
        self.additional_nn = nn.Sequential(
            nn.Linear(num_labels + num_additional_features, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, num_labels)
        )

    def forward(self, input_ids, attention_mask, additional_features, labels=None):
        """Run the fusion model.

        Args:
            input_ids: Token ids forwarded to the wrapped model.
            attention_mask: Attention mask forwarded to the wrapped model.
            additional_features: Tensor concatenated to the logits along
                dim 1; must have ``num_additional_features`` columns.
            labels: Optional class indices. When given, the cross-entropy
                loss is computed as well.

        Returns:
            ``final_logits`` when ``labels`` is None, otherwise the tuple
            ``(loss, final_logits)``.
        """
        model_output = self.bloom_model(input_ids, attention_mask=attention_mask)
        logits = model_output.logits

        # Concatenate Bloom model output with additional features
        combined_features = torch.cat((logits, additional_features), dim=1)
        final_logits = self.additional_nn(combined_features)

        if labels is None:
            return final_logits

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(final_logits.view(-1, self.num_labels), labels.view(-1))
        return loss, final_logits


# Sizes used to build the fusion model.
# NOTE(review): presumably these must match the values the checkpoint was trained with — confirm.
num_additional_features = 3
num_labels = 48

# Location of the fine-tuned weights (a state dict for CustomBloomModel).
model_path = "/content/finetuned_model/custom_bloom_model.bin"

# Tokenizer files live in the downloaded model folder.
tokenizer = BloomTokenizerFast.from_pretrained("/content/finetuned_model")

# Fitted preprocessor used to transform the 'age' and 'gender' columns at inference time.
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source.
preprocessor = load('/content/finetuned_model/preprocessor.joblib')

# Base BLOOM classifier that provides the architecture for the wrapped model.
bloom_pretrained_model = BloomForSequenceClassification.from_pretrained(
    "bigscience/bloom-560m",
    num_labels=num_labels,
)

# Build the fusion model, then load the fine-tuned state dict into it on CPU.
custom_model = CustomBloomModel(
    bloom_model=bloom_pretrained_model,
    num_additional_features=num_additional_features,
    num_labels=num_labels,
)
custom_model.load_state_dict(
    torch.load(model_path, map_location="cpu")
)
# Switch to inference mode (disables dropout).
custom_model.eval()

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomBloomModel(
  (bloom_model): BloomForSequenceClassification(
    (transformer): BloomModel(
      (word_embeddings): Embedding(250880, 1024)
      (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (h): ModuleList(
        (0-23): 24 x BloomBlock(
          (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (self_attention): BloomAttention(
            (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): BloomMLP(
            (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu_impl): BloomGelu()
            (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
          )
     

In [None]:
# Example patient record used for a single prediction.
input_data = {
    "symptoms": "I have a head ache and have back pain",
    "age": 30,
    "gender": "M"
}

# One-row DataFrame so the fitted preprocessor can transform the tabular columns.
df = pd.DataFrame([input_data])
age_sex_data = preprocessor.transform(df[['age', 'gender']])

# Tokenize the free-text symptoms, padding/truncating to a fixed length.
max_length = 512  # Example length, adjust as needed
tokenized_input = tokenizer(
    input_data["symptoms"],
    return_tensors="pt",
    padding="max_length",
    max_length=max_length,
    truncation=True,
)

# Copy the tokenizer output into a plain dict. No device transfer happens here:
# the tensors stay where the tokenizer created them.
tokenized_input = dict(tokenized_input.items())

# Forward pass without gradient tracking; the result is the final class logits.
with torch.no_grad():
    logits = custom_model(
        input_ids=tokenized_input['input_ids'],
        attention_mask=tokenized_input['attention_mask'],
        additional_features=torch.tensor(age_sex_data, dtype=torch.float32),
    )

tensor([[-2.6361e+00, -4.9862e-01,  3.5639e-01, -4.0827e+00, -2.0414e+00,
         -1.6602e+00, -1.6674e+00, -1.5698e-02, -6.7632e+00, -7.1057e+00,
         -8.1212e+00, -1.9796e+00, -5.7114e+00,  1.1391e+00, -1.8258e+00,
          1.2902e+00, -2.0986e+00, -1.4231e+00, -7.1882e+00,  3.1911e+00,
          7.9038e+00,  8.2600e+00,  3.9708e+00, -7.1225e+00, -7.2012e+00,
          1.2772e-03,  9.4341e-01, -4.8472e+00, -3.1579e+00, -4.3232e-01,
         -1.9563e+00, -5.2866e+00, -1.5424e+00,  7.5999e+00, -3.8996e+00,
         -1.5304e+00, -6.5662e+00, -4.7386e-01, -1.7548e+00,  1.2431e+00,
         -3.8188e+00, -1.0732e+00, -1.8186e+00, -4.1354e+00, -1.2537e+01,
         -1.2128e+00, -4.1330e+00,  5.0212e+00]])


In [None]:
# Read the saved label-encoder mapping.
# NOTE(review): presumably label -> class index, given the inversion below — confirm.
with open('/content/finetuned_model/label_encoder_mapping.json', 'r') as mapping_file:
    label_encoder_mapping = json.load(mapping_file)

# Swap keys and values so a predicted index can be looked up.
index_to_label_mapping = dict(
    zip(label_encoder_mapping.values(), label_encoder_mapping.keys())
)

# Turn the logits into a probability distribution over the classes.
probabilities = torch.softmax(logits, dim=-1)

# Keep the five most probable classes of the first (only) example.
top5_prob, top5_indices = torch.topk(probabilities, 5)
top5_indices = top5_indices[0].tolist()
top5_prob = top5_prob[0].tolist()

# Report the ranked predictions with their probabilities.
print("Top 5 likely diagnoses:")
for rank, (class_index, prob) in enumerate(zip(top5_indices, top5_prob), start=1):
    label = index_to_label_mapping[class_index]
    print(f"{rank}: ICD-10 Code: {label}, Probability: {prob:.4f}")


Top 5 likely diagnoses:
1: ICD-10 Code: J45, Probability: 0.4382
2: ICD-10 Code: J38.5, Probability: 0.3069
3: ICD-10 Code: a15, Probability: 0.2265
4: ICD-10 Code: j44.1, Probability: 0.0172
5: ICD-10 Code: J47, Probability: 0.0060
