In [None]:
import os   # handling the files
import pickle # storing numpy features
import numpy as np
from tqdm.notebook import tqdm # how much data is process till now

from tensorflow.keras.applications.vgg16 import VGG16 , preprocess_input # extract features from image data.
from tensorflow.keras.preprocessing.image import load_img , img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input , Dense , LSTM , Embedding , Dropout , add

In [None]:
! kaggle datasets download -d virajbagal/roco-dataset


In [None]:
!unzip roco-dataset.zip

In [None]:
# Root of the radiology training split; later cells read `images/`, traindata.csv and output.txt from here.
BASE_DIR = '/content/all_data/train/radiology'
# Directory where generated artifacts (features.pkl, the saved caption model) are written.
WORKING_DIR = '/content/'

In [None]:
import csv

# Path to the CSV file
input_csv_path = '/content/all_data/train/radiology/traindata.csv'
# Path to the output text file
output_txt_path = '/content/all_data/train/radiology/output.txt'

# newline='' lets the csv module parse quoted fields that contain line breaks.
with open(input_csv_path, 'r', newline='') as csv_file, open(output_txt_path, 'w') as txt_file:
    csv_reader = csv.DictReader(csv_file)  # Rows are dicts keyed by the header names

    for row in csv_reader:
        name = row['name']
        # Collapse all whitespace (including embedded newlines) so each record
        # occupies exactly one line; a missing caption becomes an empty string.
        caption = " ".join((row['caption'] or '').split())

        # One record per line, in the format: name,caption
        txt_file.write(f"{name},{caption}\n")

print(f"Name and caption have been written to {output_txt_path}.")


In [None]:
# Load VGG16 with the Keras default arguments
model = VGG16()

# Restructure the model so it outputs the activations of the second-to-last layer
# instead of the final layer.
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# Summarize. summary() prints the table itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
model.summary()

In [None]:
# extract features from image
MAX_IMAGES = 50  # upper bound on how many files from the image folder are processed

features = {}
directory = os.path.join(BASE_DIR, 'images')

# Slice the listing up front so the progress bar total matches the work actually done.
for img_name in tqdm(os.listdir(directory)[:MAX_IMAGES]):
    # load the image from file, resized to 224x224
    img_path = os.path.join(directory, img_name)
    image = load_img(img_path, target_size=(224, 224))
    # convert image pixels to numpy array
    image = img_to_array(image)
    # add a leading batch axis of size 1
    image = np.expand_dims(image, axis=0)
    # preprocess image for vgg
    image = preprocess_input(image)
    # extract features
    feature = model.predict(image, verbose=0)
    # get image ID
    # NOTE(review): this keeps only the text before the FIRST '.', so two files that share
    # that prefix map to the same ID and the later one overwrites the earlier one.
    # The same derivation is used by the caption-mapping and display cells, so it is kept here.
    image_id = img_name.split('.')[0]
    # store feature
    features[image_id] = feature

In [None]:
# store features in pickle; the context manager guarantees the file is flushed and closed
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb') as f:
    pickle.dump(features, f)

In [None]:
# load features from pickle
# NOTE(review): pickle.load can execute arbitrary code from the file; only load a
# features.pkl that this notebook wrote itself.
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'rb') as f:
    features = pickle.load(f)

In [None]:
# output.txt is produced by the export cell above from the CSV data rows only (no header
# line is written), so the whole file is read; skipping the first line would discard a record.
with open(os.path.join(BASE_DIR, 'output.txt'), 'r') as f:
    captions_doc = f.read()

In [None]:
# Build {image_id: [caption, ...]} for every image that has extracted features
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    # ignore empty / one-character lines
    if len(line) < 2:
        continue

    fields = line.split(',')
    # the image ID is the file name up to its first '.'
    image_id = fields[0].split('.')[0]
    if image_id not in features:
        continue

    # everything after the first comma is caption text; the pieces that were
    # split on commas are rejoined with single spaces
    caption = " ".join(fields[1:])
    mapping.setdefault(image_id, []).append(caption)

In [None]:
mapping

In [None]:
import re


def clean(mapping):
    """Normalize every caption in `mapping` in place.

    Each caption is lowercased, stripped of every character that is not an
    ASCII letter or whitespace, reduced to its words longer than one character,
    and wrapped in the 'startseq' / 'endseq' markers.

    Args:
        mapping: dict whose values are lists of caption strings; the lists are
            modified in place.

    Returns:
        None.
    """
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            # convert to lowercase
            caption = caption.lower()
            # delete digits, special chars, etc. A real regex substitution is required:
            # str.replace() would look for the literal text of the pattern and change nothing.
            # Whitespace is kept so that words stay separated.
            caption = re.sub(r'[^a-z\s]', '', caption)
            # split()/join collapses any run of whitespace into a single space
            words = [word for word in caption.split() if len(word) > 1]
            # add start and end tags to the caption
            captions[i] = 'startseq ' + " ".join(words) + ' endseq'

In [None]:
# before preprocess of text
mapping['PMC3639690_CRIM']

In [None]:
# preprocess the text
clean(mapping)

In [None]:
# after preprocess of text (clean(mapping) has been applied in the previous cell)
mapping['PMC3639690_CRIM']

In [None]:
# Flatten the per-image caption lists into one list, in mapping order
all_captions = [text for image_key in mapping for text in mapping[image_key]]

In [None]:
len(all_captions)

In [None]:
all_captions[:10]

In [None]:
# tokenize the text: fit the vocabulary on every cleaned caption
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# one more than the number of distinct words; the extra slot is index 0, which the
# Embedding layer defined later treats as padding (mask_zero=True)
vocab_size = len(tokenizer.word_index) + 1

In [None]:
vocab_size

In [None]:
# longest caption, counted in whitespace-separated words
max_length = max(map(lambda text: len(text.split()), all_captions))
max_length

In [None]:
# Order-based split of the image IDs: first 90% -> train, remaining 10% -> test.
# NOTE(review): `train` and `test` are not referenced again in the visible cells;
# the training and evaluation cells use train_keys / val_keys instead.
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.90)
train = image_ids[:split]
test = image_ids[split:]

In [None]:
# encoder model
# image feature layers: 4096-value feature vector -> dropout -> 256-unit dense projection

inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)


# sequence feature layers: token ids of length max_length -> 256-d embedding -> dropout -> LSTM(256)
inputs2 = Input(shape=(max_length,))
# mask_zero=True makes index 0 a padding value that downstream layers skip
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# decoder model: sum the image and sequence branches, then predict a distribution over the vocabulary
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# NOTE: this rebinds `model`, which until now referred to the VGG16 feature extractor
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# plot the model
plot_model(model, show_shapes=True)

In [None]:
from sklearn.model_selection import train_test_split

dataset_keys = list(mapping.keys())  # Keys for all images in the dataset

# Split the keys into two sets: training (80%) and validation (20%); random_state pins the shuffle
train_keys, val_keys = train_test_split(dataset_keys, test_size=0.2, random_state=42)


In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield next-word training batches built from image features and captions.

    Every caption is expanded into (prefix -> next token) samples: for a token
    sequence s, each i in 1..len(s)-1 gives input s[:i] (padded to `max_length`)
    and target s[i] (one-hot over `vocab_size`). A batch is emitted after every
    `batch_size` images, so the number of rows per batch varies with caption length.

    Args:
        data_keys: image IDs to iterate over, repeated forever.
        mapping: dict image ID -> list of caption strings.
        features: dict image ID -> feature array; row 0 of each entry is used.
        tokenizer: object providing texts_to_sequences().
        max_length: length every input sequence is padded to.
        vocab_size: number of classes for the one-hot targets.
        batch_size: number of images (not samples) per yielded batch.

    Yields:
        ((X1, X2), y) as float32 tensors: image features, padded input
        sequences and one-hot targets, one row per sample.

    Raises:
        ValueError: if `data_keys` is empty or `batch_size` < 1 (either would
            otherwise make the generator loop forever without yielding).
    """
    if len(data_keys) == 0:
        raise ValueError("data_keys must not be empty")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Initialize storage
    X1, X2, y = [], [], []
    n = 0  # images accumulated into the current batch

    while True:  # Infinite generator
        for key in data_keys:
            for caption in mapping[key]:
                # Encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    # Collect inputs and outputs
                    X1.append(features[key][0])  # Features
                    X2.append(in_seq)  # Input sequence
                    y.append(out_seq)  # Output sequence

            n += 1
            if n < batch_size:
                continue

            # Reset the image counter unconditionally. If it were left untouched when the
            # batch turned out empty, it would overshoot batch_size and never match again.
            n = 0
            if not X1:  # Skip empty batches
                continue

            # Convert to TensorFlow tensors
            X1_tensor = tf.convert_to_tensor(np.array(X1), dtype=tf.float32)
            X2_tensor = tf.convert_to_tensor(np.array(X2), dtype=tf.float32)
            y_tensor = tf.convert_to_tensor(np.array(y), dtype=tf.float32)

            yield (X1_tensor, X2_tensor), y_tensor

            # Reset storage
            X1, X2, y = [], [], []


# Element spec describing one batch from data_generator:
# ((image features, padded input sequences), one-hot targets).
# The leading None leaves the number of rows per batch unspecified.
output_signature = (
    (
        tf.TensorSpec(shape=(None, 4096), dtype=tf.float32),  # Features
        tf.TensorSpec(shape=(None, max_length), dtype=tf.float32),  # Input sequences
    ),
    tf.TensorSpec(shape=(None, vocab_size), dtype=tf.float32),  # Output sequences
)
# Passed to data_generator as its batch_size argument
batch_size=32
def safe_data_generator():
    """Zero-argument generator over the training data for tf.data.Dataset.from_generator.

    Delegates to data_generator with the notebook-level training objects. Any
    exception is logged and then re-raised: swallowing it would silently end the
    stream and hide the real cause from the training loop.
    """
    try:
        yield from data_generator(train_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    except Exception as e:
        print(f"Error in generator: {e}")
        raise
# Create a dataset from the generator, reusing the `output_signature` spec defined
# earlier in this cell instead of spelling out a second copy that could drift from it
train_dataset = tf.data.Dataset.from_generator(
    safe_data_generator,
    output_signature=output_signature,
)



# One step consumes one generator batch, i.e. `batch_size` images. Use the same
# batch_size the generator was given, and run at least one step so that a training
# set smaller than one batch does not produce zero steps.
steps_per_epoch = max(1, len(train_keys) // batch_size)
model.fit(train_dataset, epochs=200, steps_per_epoch=steps_per_epoch, verbose=1)


In [None]:
# save the model; os.path.join avoids the doubled slash that WORKING_DIR + '/...' produces
# when WORKING_DIR already ends with '/'
model.save(os.path.join(WORKING_DIR, 'best_model.keras'))

In [None]:
def idx_to_word(integer, tokenizer):
    """Return the word whose index is `integer`, or None if there is none.

    Args:
        integer: token index to look up.
        tokenizer: object with a `word_index` dict (word -> index); if it also
            exposes an `index_word` dict (index -> word), that is consulted first
            so a hit is a single dict lookup.

    Returns:
        The matching word, or None when no word has that index.
    """
    reverse_index = getattr(tokenizer, 'index_word', None)
    if reverse_index:
        word = reverse_index.get(integer)
        if word is not None:
            return word
    # Fallback: scan the forward index (tokenizers without a reverse index, or a miss above)
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import keras

def load_trained_model(model_path):
    """Return the model stored at `model_path`, loaded with safe_mode=True."""
    return load_model(model_path, safe_mode=True)

In [None]:
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption for one image.

    Starting from 'startseq', the text generated so far is repeatedly tokenized
    and padded to `max_length`, the model is asked for the next word, and the
    highest-scoring word is appended. Decoding stops after `max_length` steps,
    when 'endseq' is produced, or when the predicted index has no word.

    Args:
        model: trained caption model; its predict() is called with [image, sequence].
            The caller's model is used as passed, so it is not reloaded from disk on
            every call and a different model can be supplied.
        image: feature array for the image, passed to the model unchanged.
        tokenizer: tokenizer used for training (texts_to_sequences + word lookup).
        max_length: padded sequence length and maximum number of decoding steps.

    Returns:
        The generated text, including the leading 'startseq' and, if reached,
        the trailing 'endseq'.
    """
    # add start tag for generation process
    in_text = 'startseq'
    # iterate over the max length of sequence
    for _ in range(max_length):
        # encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad the sequence
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        yhat = model.predict([image, sequence], verbose=0)
        # get index with high probability
        yhat = np.argmax(yhat)
        # convert index to word
        word = idx_to_word(yhat, tokenizer)
        # stop if word not found
        if word is None:
            break
        # append word as input for generating next word
        in_text += " " + word
        # stop if we reach end tag
        if word == 'endseq':
            break
    return in_text

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Validate on the held-out keys. Scoring on train_keys would measure how well the
# model memorized its training captions rather than how well it generalizes.
actual, predicted = [], []

for key in tqdm(val_keys):
    # reference captions for this image
    captions = mapping[key]
    # predict the caption for image
    y_pred = predict_caption(model, features[key], tokenizer, max_length)
    # split references and prediction into word lists
    actual.append([caption.split() for caption in captions])
    predicted.append(y_pred.split())

# calculate BLEU score
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))


In [None]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(image_name):
    """Print the reference and predicted captions for one image and draw the image.

    `image_name` is a file name inside BASE_DIR/images; the part before its
    first '.' is the key used to look up captions and features.
    """
    image_id = image_name.partition('.')[0]
    image = Image.open(os.path.join(BASE_DIR, "images", image_name))
    references = mapping[image_id]

    print('---------------------Actual---------------------')
    for reference in references:
        print(reference)

    # predict the caption from the stored features
    predicted_text = predict_caption(model, features[image_id], tokenizer, max_length)
    print('--------------------Predicted--------------------')
    print(predicted_text)

    plt.imshow(image)

In [None]:
import os

# Folder whose contents are previewed
directory_path = '/content/all_data/train/radiology/images/'

# Report a missing folder first; otherwise show the first five entries
if not os.path.exists(directory_path):
    print(f"Directory not found: {directory_path}")
else:
    all_files = os.listdir(directory_path)

    print("First 5 files or directories:")
    for item in all_files[:5]:
        print(item)


In [None]:
features

In [None]:
generate_caption("PMC3353704_DRJ-9-233-g004.jpg")

In [None]:
generate_caption("PMC3639690_CRIM.EM2013-198617.003.jpg")

In [None]:
generate_caption("PMC5603107_CRIM2017-3531823.001.jpg")

In [None]:
generate_caption("PMC3854574_10-1055-s-0033-1337123-i1200056-3.jpg")

In [None]:
# Install required packages
!pip install -q flask flask-ngrok python-dotenv google-generativeai pillow pyngrok

from flask import Flask, render_template_string, request, jsonify
from flask_ngrok import run_with_ngrok
import google.generativeai as genai
from PIL import Image
import io
import base64
from pyngrok import ngrok

import os

# Initialize Flask app
app = Flask(__name__)


def _require_env(name):
    """Return the non-empty value of environment variable `name`, or raise RuntimeError."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Set the {name} environment variable before running this cell.")
    return value


# Credentials are read from the environment so they are never stored in the notebook.
# Any token or key that was previously committed in this cell should be revoked and rotated.
NGROK_AUTH_TOKEN = _require_env("NGROK_AUTH_TOKEN")
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Initialize ngrok tunnel
public_url = ngrok.connect(5000).public_url
print(" * Public URL:", public_url)
# NOTE(review): a pyngrok tunnel is already opened above; run_with_ngrok comes from a
# second package (flask_ngrok) and may be redundant - confirm before removing either.
run_with_ngrok(app)

# Configure Gemini
GOOGLE_API_KEY = _require_env("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

# Set up the model
# NOTE(review): this rebinds the global `model`, which earlier cells use for the Keras
# networks; the caption helpers from those cells will not work after this cell runs.
model = genai.GenerativeModel('gemini-1.5-flash')

# HTML template with improved functionality
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8"/>
  <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
  <title>AI Medical Image Captioning</title>
  <style>
    .home {
      text-decoration: none;
      background-color: #007bff;
      color: white;
      padding: 10px 20px;
      border-radius: 8px;
      font-family: Arial, sans-serif;
      font-size: 16px;
      position: absolute;
      top: 20px;
      left: 20px;
    }

    body {
      font-family: 'Poppins', sans-serif;
      background: linear-gradient(135deg, #0a0a0a, #363535);
      color: rgb(20, 20, 20);
      margin: 0;
      padding: 0;
      text-align: center;
      background-image: url("https://static.vecteezy.com/system/resources/previews/037/246/957/large_2x/ai-generated-medical-advertisment-background-with-copy-space-free-photo.jpg");
      background-size: cover;
      background-position: center;
      min-height: 100vh;
    }

    .container {
      background: rgba(116, 103, 103, 0.2);
      padding: 30px;
      border-radius: 15px;
      max-width: 800px;
      margin: 80px auto;
      box-shadow: 0px 4px 10px rgba(0, 0, 0, 0.3);
      backdrop-filter: blur(10px);
    }

    h1 {
      font-size: 32px;
      margin-bottom: 5px;
      color: #fff;
    }

    .subtitle {
      font-size: 14px;
      margin-bottom: 20px;
      font-style: italic;
      color: #f0f0f0;
    }

    .upload-box {
      background: rgba(249, 236, 236, 0.3);
      padding: 30px;
      border-radius: 10px;
      margin-bottom: 20px;
    }

    input[type="file"] {
      background: rgb(247, 237, 237);
      padding: 15px;
      border-radius: 5px;
      border: none;
      cursor: pointer;
      display: block;
      margin: 20px auto;
      width: 90%;
      max-width: 400px;
    }

    .btn {
      background: #4CAF50;
      color: white;
      padding: 12px 25px;
      border: none;
      cursor: pointer;
      border-radius: 5px;
      font-size: 16px;
      transition: 0.3s;
      margin: 10px 5px;
    }

    .language-selector {
      background: #007bff;
      color: white;
      padding: 10px 15px;
      border: none;
      border-radius: 5px;
      font-size: 16px;
      margin: 10px 0;
    }

    .btn:hover {
      background: #45a049;
      transform: scale(1.05);
    }

    .btn:disabled {
      background: #cccccc;
      cursor: not-allowed;
    }

    #status {
      font-size: 16px;
      margin: 15px 0;
      color: #fff;
      min-height: 20px;
    }

    .result {
      display: flex;
      flex-wrap: wrap;
      gap: 20px;
      justify-content: center;
      align-items: flex-start;
      margin-top: 20px;
    }

    #preview {
      max-width: 100%;
      max-height: 300px;
      border-radius: 10px;
      margin-top: 15px;
      box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.3);
      display: none;
    }

    .caption-text {
      font-size: 16px;
      color: white;
      background-color: rgba(255, 255, 255, 0.1);
      padding: 20px;
      border-radius: 10px;
      max-width: 400px;
      text-align: left;
      margin-top: 15px;
    }

    .loading {
      border: 4px solid #f3f3f3;
      border-top: 4px solid #3498db;
      border-radius: 50%;
      width: 30px;
      height: 30px;
      animation: spin 1s linear infinite;
      margin: 20px auto;
      display: none;
    }

    @keyframes spin {
      0% { transform: rotate(0deg); }
      100% { transform: rotate(360deg); }
    }
  </style>
</head>

<body>


  <div class="container">
    <h1>🩺 AI Medical Image Captioning</h1>
    <p class="subtitle">Upload an image and generate accurate medical captions</p>

    <div class="upload-box">
      <!-- Language Selection Dropdown -->
      <select id="language" class="language-selector">
        <option value="english">English</option>
        <option value="hindi">हिंदी (Hindi)</option>
      </select>

      <input type="file" id="imageInput" accept="image/*">
      <button class="btn" id="uploadBtn">Upload Image</button>
      <button class="btn" id="generateBtn" disabled>Generate Caption</button>
      <div class="loading" id="loadingSpinner"></div>
      <div id="status"></div>
    </div>

    <div class="result">
      <img id="preview"/>
      <div class="caption-text">
        <strong>Caption:</strong> <span id="caption"></span>
      </div>
    </div>
  </div>

  <script>
    let uploadedImage = null;

    document.getElementById('imageInput').addEventListener('change', function(e) {
      const file = e.target.files[0];
      if (file) {
        const reader = new FileReader();
        reader.onload = function(event) {
          document.getElementById('preview').src = event.target.result;
          document.getElementById('preview').style.display = 'block';
          document.getElementById('generateBtn').disabled = false;
          uploadedImage = file;
          document.getElementById('status').textContent = 'Image ready for caption generation';
        };
        reader.readAsDataURL(file);
      }
    });

    document.getElementById('generateBtn').addEventListener('click', async function() {
      if (!uploadedImage) return;

      const generateBtn = document.getElementById('generateBtn');
      const loadingSpinner = document.getElementById('loadingSpinner');
      const status = document.getElementById('status');
      const language = document.getElementById('language').value;

      generateBtn.disabled = true;
      loadingSpinner.style.display = 'block';
      status.textContent = 'Generating caption...';

      try {
        const formData = new FormData();
        formData.append('file', uploadedImage);
        formData.append('language', language);

        const response = await fetch('/generate', {
          method: 'POST',
          body: formData
        });

        const result = await response.json();

        if (result.error) {
          status.textContent = 'Error: ' + result.error;
        } else {
          document.getElementById('caption').textContent = result.caption;
          status.textContent = language === 'english'
            ? 'Caption generated successfully!'
            : 'कैप्शन सफलतापूर्वक उत्पन्न हुआ!';
        }
      } catch (error) {
        status.textContent = 'Error: ' + error.message;
      } finally {
        loadingSpinner.style.display = 'none';
        generateBtn.disabled = false;
      }
    });

    document.getElementById('uploadBtn').addEventListener('click', function() {
      document.getElementById('imageInput').click();
    });
  </script>
</body>
</html>
"""

@app.route('/')
def home():
    """Serve the upload page rendered from HTML_TEMPLATE."""
    return render_template_string(HTML_TEMPLATE)

@app.route('/generate', methods=['POST'])
def generate_caption():
    """Handle POST /generate: caption an uploaded image with the generative model.

    Expects multipart form data with a 'file' part and an optional 'language'
    field ('english' by default, or 'hindi').

    Returns:
        JSON {'caption': ...} on success; JSON {'error': ...} with status 400 for a
        missing file, empty file name or unsupported language, and status 500 when
        image decoding or generation fails.

    NOTE(review): this function reuses the name of the notebook helper
    generate_caption(image_name) defined in an earlier cell, so running this cell
    replaces that helper at module level.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file uploaded'}), 400

    file = request.files['file']
    language = request.form.get('language', 'english')

    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    # Language-specific instructions; the Hindi one is the English one plus a
    # translation request.
    base_prompt = " You are a medical image captioning assistant trained on the ROCO dataset, which contains radiology images and their associated clinical captions. Generate a detailed and clinically relevant caption similar in style to the ROCO dataset, such as: startseq an ap chest x ray in the trauma bay showed no obvious pneumothorax hemothorax or bullet fragment endseq Please maintain the medical terminology and tone used in ROCO-style captions"
    instructions = {
        'english': base_prompt,
        'hindi': base_prompt + " translate the caption generated in hindi",
    }

    # Reject unknown languages as a client error instead of letting a KeyError
    # surface as a 500.
    if language not in instructions:
        return jsonify({'error': 'Unsupported language'}), 400

    try:
        # Process image
        img_bytes = file.read()
        img = Image.open(io.BytesIO(img_bytes))

        # The image is sent with the instruction for EVERY language; without it the
        # model would have to invent a caption without seeing the upload.
        response = model.generate_content([instructions[language], img])

        # Get the text response
        if response.candidates and response.candidates[0].content.parts:
            caption = response.candidates[0].content.parts[0].text
        else:
            caption = "Could not generate caption for this image." if language == 'english' else "इस छवि के लिए कैप्शन उत्पन्न नहीं किया जा सका"

        return jsonify({'caption': caption})

    except Exception as e:
        error_msg = str(e) if language == 'english' else "त्रुटि: कैप्शन उत्पन्न करने में विफल"
        return jsonify({'error': error_msg}), 500

# Start the Flask server for `app` (run_with_ngrok(app) was applied earlier in this cell).
print("\n\n * IMPORTANT: Click the ngrok link below to access your app!")
app.run()