In [23]:
import re

from langchain.callbacks import StreamingStdOutCallbackHandler
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import BaseOutputParser
from langchain.schema import StrOutputParser
# Chat model used by the chains below: low sampling temperature, with a
# callback handler that writes streamed output to stdout.
chat = ChatOpenAI(
    streaming=True,
    temperature=0.1,
    callbacks=[StreamingStdOutCallbackHandler()],
)

In [32]:
class CustomOutputParser(BaseOutputParser):
    """Turn a hyphen-bulleted ``- key: value`` listing into a dictionary."""

    def parse(self, text):
        """Extract ``key: value`` pairs from bulleted text.

        The text is cut at bullet hyphens, and every resulting chunk that
        contains a colon is split at its first colon into a key and a value.
        Chunks without a colon are ignored. Keys and values are stripped of
        surrounding whitespace; if a key repeats, the last value wins.

        A hyphen is treated as a bullet only when it starts a line
        (optionally indented) or has whitespace on both sides. Hyphens
        inside words such as ``well-lit`` are therefore kept in the value
        instead of cutting it short.

        Args:
            text: Raw string to parse.

        Returns:
            dict mapping each stripped key to its stripped value; empty when
            no chunk contains a colon.
        """
        # Line-leading hyphen, or a hyphen with whitespace on both sides.
        bullet_pattern = r"(?m)^[ \t]*-|(?<=\s)-(?=\s)"

        parsed_result = {}
        for item in re.split(bullet_pattern, text):
            # partition() splits at the first colon only, so colons inside
            # the value (e.g. "Time: 10:30") are preserved.
            key, colon, value = item.partition(":")
            if not colon:
                # Blank chunks and chunks without a colon carry no pair.
                continue
            parsed_result[key.strip()] = value.strip()
        return parsed_result


In [33]:
# Prompt that casts the model as a vision encoder; the human message carries
# the text description that stands in for the image.
vision_encoder_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a ViT (Vision Transformer), and you are going to receive a prompt as an image description. Let's assume that this prompt is an image input. You have to turn this image into a vision feature. The output will be sent to LLM decoder.",
        ),
        ("human", "{description}"),
    ]
)

# prompt -> chat model -> string output -> CustomOutputParser
vision_encoder_chain = (
    vision_encoder_prompt
    | chat
    | StrOutputParser()
    | CustomOutputParser()
)

In [34]:
# Prompt that casts the model as a captioning decoder; the human message is
# filled with the "visual_features" value supplied by the caller.
llm_decoder_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a image captioning model. You will receive a dictionary of vision feature from vision encoder, and what you have to do is create a warm explanation for the blind. Based on the vision features, you have to create a description about the scene for the blind.",
        ),
        (
            "human",
            "Here are the visual features extracted from an image {visual_features}",
        ),
    ]
)

# prompt -> chat model -> string output
llm_decoder_chain = (
    llm_decoder_prompt
    | chat
    | StrOutputParser()
)

In [35]:
# The encoder chain's result is passed to the decoder chain under the
# "visual_features" key.
mllm_chain = {"visual_features": vision_encoder_chain} | llm_decoder_chain

# Run the combined chain on a sample description; this call is the cell's
# final expression, so its return value is shown as the cell output.
mllm_chain.invoke(
    {
        "description": "A heavy downpour in a city at night. The wet asphalt reflects the pink and blue neon signs of the shops. Cars are rushing by, leaving long trails of red taillights.",
    }
)

[The ViT processes the image input and extracts the following vision feature:]

- Scene: Heavy downpour in a city at night
- Elements: Wet asphalt reflecting pink and blue neon signs of shops, cars rushing by with long trails of red taillightsImagine standing in the middle of a bustling city on a rainy night. The sound of raindrops hitting the ground fills the air as you feel the wet asphalt beneath your feet. The neon signs of shops around you cast a pink and blue glow, reflecting off the shiny surface of the road. Cars speed past, leaving behind long trails of red taillights that blur together in the downpour. Despite the darkness of the night, the city is alive with movement and color, creating a vibrant and dynamic scene.

'Imagine standing in the middle of a bustling city on a rainy night. The sound of raindrops hitting the ground fills the air as you feel the wet asphalt beneath your feet. The neon signs of shops around you cast a pink and blue glow, reflecting off the shiny surface of the road. Cars speed past, leaving behind long trails of red taillights that blur together in the downpour. Despite the darkness of the night, the city is alive with movement and color, creating a vibrant and dynamic scene.'