In [1]:
import torch
import torchvision.transforms as transforms
from transformers import AutoTokenizer
from PIL import Image
import pandas as pd
import os
from io import BytesIO
import base64

# Load the dataset
df = pd.read_csv("/content/multimodal_dataset_with_images.csv")

TEXT_COLUMN = "Symptoms_Description"
IMAGE_COLUMN = "Image_Data_Base64"  # Use 'Image_Data_Base64' for base64 encoded images

# Initialize tokenizer for BERT
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text data
def tokenize_text(text):
    return tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Preprocess images from base64
def preprocess_base64_image(base64_string):
    image_bytes = base64.b64decode(base64_string)
    image = Image.open(BytesIO(image_bytes)).convert("RGB")
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    return transform(image).unsqueeze(0)

# Example usage
text_tokens = tokenize_text(df[TEXT_COLUMN].iloc[0])  # Tokenize text
image_tensor = preprocess_base64_image(df[IMAGE_COLUMN].iloc[0])  # Preprocess base64 image

print("Image Shape:", image_tensor.shape, "Text Tokens:", text_tokens)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Image Shape: torch.Size([1, 3, 224, 224]) Text Tokens: {'input_ids': tensor([[  101,  5729, 14978,  1998,  4487, 29212,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [2]:
from transformers import AutoModel
import torch
import torch.nn as nn

class MultiModalModel(nn.Module):
    def __init__(self, text_model_name, image_feature_size, numerical_input_size, output_classes):
        super().__init__()

        # Load pre-trained text encoder (e.g., BERT)
        self.text_encoder = AutoModel.from_pretrained(text_model_name)

        # Define a simple image feature extractor
        self.image_fc = nn.Linear(image_feature_size, 512)

        # Numerical feature processing
        self.fc_numeric = nn.Linear(numerical_input_size, 128)

        # Final classifier combining all modalities
        self.fc_combined = nn.Linear(768 + 512 + 128, output_classes)

    def forward(self, text_tokens, image_features, numerical_data):
        # Encode text using the text model
        text_features = self.text_encoder(**text_tokens).last_hidden_state[:, 0, :]

        # Process image features
        image_features = self.image_fc(image_features)

        # Process numerical data
        numeric_features = self.fc_numeric(numerical_data)

        # Concatenate all features
        combined = torch.cat((text_features, image_features, numeric_features), dim=1)

        # Pass through the final classifier
        return self.fc_combined(combined)

# Example of defining the model
text_model_name = "bert-base-uncased"
image_feature_size = 2048  # Example feature size from an image encoder like ResNet
numerical_input_size = 3  # Number of numerical features (Heart Rate, Temperature, WBC Count)
output_classes = 10  # Example number of output classes

# Create the model instance
model = MultiModalModel(
    text_model_name=text_model_name,
    image_feature_size=image_feature_size,
    numerical_input_size=numerical_input_size,
    output_classes=output_classes
)

print("Model Ready:", model)


Model Ready: MultiModalModel(
  (text_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),

In [3]:
import os
import torch

# Save the trained model
def save_model(model, file_name="multi_modal_model.pth"):
    torch.save(model.state_dict(), file_name)
    print("Model saved successfully")

# Verify model file
def verify_model_file(file_name="multi_modal_model.pth"):
    if os.path.exists(file_name):
        print("Model file found")
    else:
        print("Model file is missing. Train and save it again.")

# Save and verify the model
save_model(model, "multi_modal_model.pth")
verify_model_file("multi_modal_model.pth")


Model saved successfully
Model file found


In [17]:
streamlit_code = """
import streamlit as st
import torch
import base64
from PIL import Image
import torchvision.transforms as transforms
from transformers import AutoTokenizer
from io import BytesIO
import pandas as pd

st.title("Multi-Modal Prediction")

# Define numerical columns
NUMERICAL_COLUMNS = ["Heart_Rate_bpm", "Body_Temperature_F", "WBC_Count_10^3/uL"]

# Load trained model
class MultiModalModel(torch.nn.Module):
    def __init__(self, text_model_name, numerical_input_size, image_feature_size, output_classes):
        super(MultiModalModel, self).__init__()
        self.fc_combined = torch.nn.Linear(numerical_input_size + image_feature_size, output_classes)

    def forward(self, text_tokens, image_tensor, numerical_data):
        combined_features = torch.cat((image_tensor.view(image_tensor.size(0), -1), numerical_data), dim=1)
        return self.fc_combined(combined_features)

model = MultiModalModel(
    text_model_name="bert-base-uncased",
    numerical_input_size=len(NUMERICAL_COLUMNS),
    image_feature_size=2048,  # Adjusted based on model checkpoint
    output_classes=10  # Adjusted based on model checkpoint
)

model.load_state_dict(torch.load("multi_modal_model.pth", map_location=torch.device("cpu")), strict=False)
model.eval()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Input fields
symptoms = st.text_area("Enter Symptoms")
heart_rate = st.number_input("Heart Rate (bpm)", min_value=50, max_value=200, value=80)
temperature = st.number_input("Body Temperature (°F)", min_value=90.0, max_value=110.0, value=98.6)
wbc_count = st.number_input("WBC Count (10^3/uL)", min_value=2.0, max_value=20.0, value=7.0)
base64_image = st.text_area("Paste Base64 Image Data")

def decode_base64_image(base64_string):
    try:
        image_data = base64.b64decode(base64_string)
        image = Image.open(BytesIO(image_data)).convert("RGB")
        return image
    except Exception:
        st.error("Invalid Base64 image data")
        return None

if st.button("Predict"):
    if symptoms and base64_image:
        image = decode_base64_image(base64_image)
        if image:
            transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
            image_tensor = transform(image).unsqueeze(0)

            text_tokens = tokenizer(symptoms, return_tensors="pt", padding=True, truncation=True)
            numerical_data = torch.tensor([[heart_rate, temperature, wbc_count]], dtype=torch.float32)

            with torch.no_grad():
                prediction = model(text_tokens, image_tensor, numerical_data)

            diagnosis = torch.argmax(prediction, dim=1).item()
            st.success(f"Prediction Class: {diagnosis}")
    else:
        st.warning("Please enter symptoms and provide an image in Base64 format")
"""

# Save the script
with open("streamlit_app.py", "w") as f:
    f.write(streamlit_code)

print("streamlit_app.py has been created successfully!")


streamlit_app.py has been created successfully!


In [11]:
import streamlit as st
import torch
import base64
from PIL import Image
import torchvision.transforms as transforms
from transformers import AutoTokenizer
from io import BytesIO
import pandas as pd

st.title("Multi-Modal Prediction")

# Define numerical columns
NUMERICAL_COLUMNS = ["Heart_Rate_bpm", "Body_Temperature_F", "WBC_Count_10^3/uL"]

# Load dataset for reference
#df = pd.read_csv("multimodal_dataset_with_images.csv")

# Load trained model
model = MultiModalModel(
    text_model_name="bert-base-uncased",
    numerical_input_size=len(NUMERICAL_COLUMNS),
    image_feature_size=2048,  # Adjusted based on model checkpoint error
    output_classes=10  # Adjusted based on model checkpoint error
)

model.load_state_dict(torch.load("multi_modal_model.pth", map_location=torch.device("cpu")), strict=False)
model.eval()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Input fields
symptoms = st.text_area("Enter Symptoms")
heart_rate = st.number_input("Heart Rate (bpm)", min_value=50, max_value=200, value=80)
temperature = st.number_input("Body Temperature (°F)", min_value=90.0, max_value=110.0, value=98.6)
wbc_count = st.number_input("WBC Count (10^3/uL)", min_value=2.0, max_value=20.0, value=7.0)
base64_image = st.text_area("Paste Base64 Image Data")

def decode_base64_image(base64_string):
    try:
        image_data = base64.b64decode(base64_string)
        image = Image.open(BytesIO(image_data)).convert("RGB")
        return image
    except Exception as e:
        st.error("Invalid Base64 image data")
        return None

if st.button("Predict"):
    if symptoms and base64_image:
        image = decode_base64_image(base64_image)
        if image:
            transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
            image_tensor = transform(image).unsqueeze(0)

            text_tokens = tokenizer(symptoms, return_tensors="pt", padding=True, truncation=True)
            numerical_data = torch.tensor([[heart_rate, temperature, wbc_count]], dtype=torch.float32)

            with torch.no_grad():
                prediction = model(text_tokens, image_tensor, numerical_data)

            diagnosis = torch.argmax(prediction, dim=1).item()
            st.success(f"Prediction Class: {diagnosis}")
    else:
        st.warning("Please enter symptoms and provide an image in Base64 format")

  model.load_state_dict(torch.load("multi_modal_model.pth", map_location=torch.device("cpu")), strict=False)
2025-02-07 05:10:04.145 Session state does not function when running a script without `streamlit run`


In [None]:
!streamlit run streamlit_app.py
