In [1]:
# import packages


from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel
from torch.utils.data import Dataset
from torchtext.data import get_tokenizer
import requests
import torch
import numpy as np
from PIL import Image
from torchvision import transforms
from datasets import load_dataset
import torch.nn as nn
from transformers import ViTFeatureExtractor
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Hub checkpoint that supplies both the image pre-processing config and the
# GPT-2 tokenizer; naming it once keeps the two loaders pointed at the same source.
PRETRAINED_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

# Turns an input image into the `pixel_values` tensor consumed by the model.
image_processor = ViTImageProcessor.from_pretrained(PRETRAINED_CHECKPOINT)
# Decodes the generated token ids back into caption text.
tokenizer = GPT2TokenizerFast.from_pretrained(PRETRAINED_CHECKPOINT)

In [7]:
from transformers import VisionEncoderDecoderModel

# Location of the fine-tuned captioning weights, resolved relative to the
# notebook's working directory.
# NOTE(review): presumably a folder written by `save_pretrained` -- confirm.
model_path = "fine_tuned_model"

# Load the fine-tuned vision-encoder / text-decoder model used for captioning.
model_trained = VisionEncoderDecoderModel.from_pretrained(
    model_path,
)

In [9]:
import gradio as gr

def my_function(image):
    """Generate a caption for a single image with the fine-tuned model.

    Parameters
    ----------
    image : image accepted by ``image_processor``, or None
        The picture handed over by the Gradio ``"image"`` input. Gradio
        passes ``None`` when the input is empty or has been cleared.

    Returns
    -------
    str
        The decoded caption, or an empty string when no image was supplied.
    """
    # The interface runs with live=True, so this callback also fires when the
    # user clears the image; bail out instead of crashing in the processor.
    if image is None:
        return ""

    # Convert the image into the `pixel_values` tensor the model expects.
    # The image is intentionally not re-plotted with matplotlib here: the
    # Gradio UI already shows the upload, and pyplot's global state is not
    # safe to drive from the worker thread Gradio runs callbacks in.
    pixel_values = image_processor(image, return_tensors="pt").pixel_values

    # Sample a caption from the fine-tuned model (top-k sampling, so the
    # caption can differ between calls for the same image).
    generated_ids = model_trained.generate(
        pixel_values,
        do_sample=True,
        max_new_tokens=30,
        top_k=5,
    )

    # A single image goes in, so only the first decoded sequence is returned.
    generated_text = tokenizer.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0]

    return generated_text

# Build a one-page Gradio app: an image goes in, my_function's caption comes
# out as plain text.
iface = gr.Interface(
    my_function,
    "image",  # input component: the user uploads an image
    "text",   # output component: the generated caption
    live=True,  # re-run my_function automatically whenever the input changes
    theme="default",
    title="Image Caption",
    description="Description of my interface",
)

# Start the local web server that serves the interface.
iface.launch()


Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.


