In [1]:
# Step 1: Install Tesseract-OCR and necessary libraries
!pip install pytesseract
!pip install langchain==0.1.12
!pip install langchain-openai==0.0.8
!pip install langchain-community==0.0.29
!pip install streamlit==1.32.2
!pip install pytesseract==0.3.10 # For extracting text from images (OCR)
!pip install pillow==9.4.0 # To work with image files
!pip install chromadb==0.4.24 # For storing embeddings and vectors
!pip install pyngrok==7.1.5
!pip install cohere

Collecting protobuf<5,>=3.20 (from streamlit==1.32.2)
  Using cached protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Using cached protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.29.0
    Uninstalling protobuf-5.29.0:
      Successfully uninstalled protobuf-5.29.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opentelemetry-proto 1.28.2 requires protobuf<6.0,>=5.0, but you have protobuf 4.25.5 which is incompatible.[0m[31m
[0mSuccessfully installed protobuf-4.25.5
Collecting protobuf (from onnxruntime>=1.14.1->chromadb==0.4.24)
  Using cached protobuf-5.29.0-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Using cached protobuf-5.29.0-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
Installing collect

In [2]:
import locale


def _utf8_preferred_encoding():
    """Stand-in for locale.getpreferredencoding that always answers "UTF-8"."""
    return "UTF-8"


# Replace the locale lookup for the rest of the session so every caller of
# locale.getpreferredencoding() is told the encoding is UTF-8.
locale.getpreferredencoding = _utf8_preferred_encoding

from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import locale
import os


def _always_utf8():
    """Stand-in for locale.getpreferredencoding that always answers "UTF-8"."""
    return "UTF-8"


# Same override as in the Drive cell: force the reported encoding to UTF-8.
locale.getpreferredencoding = _always_utf8

# Export the Cohere key into this process's environment
# (replace the placeholder with your own key).
# os.environ['OPENAI_API_KEY'] = "YOUR_API_KEY"
os.environ.update({'COHERE_API_KEY': "YOUR_API_KEY"})

In [None]:
%%writefile app.py

# Required imports
import streamlit as st
import tempfile
import os
import pytesseract
from PIL import Image
from cohere import Client as CohereClient
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.embeddings import CohereEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
from langchain_core.callbacks.base import BaseCallbackHandler
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.chroma import Chroma
from operator import itemgetter
import streamlit as st
import tempfile
import os
import pandas as pd
from transformers import BlipProcessor, BlipForConditionalGeneration

# Set up the Streamlit page
st.set_page_config(page_title="Image QA Chatbot", page_icon="")
st.title("Welcome to Image QA RAG Chatbot")

# Cohere API key: read it from the environment the app was launched from.
# Assigning a literal here would overwrite a real key exported by the launcher
# and would put a credential into the source file.
cohere_api_key = os.getenv("COHERE_API_KEY")
if not cohere_api_key:
    # Fail early with a readable message instead of an authentication error
    # on the first question.
    st.error("COHERE_API_KEY is not set. Set it in the environment before starting the app.")
    st.stop()
cohere_client = CohereClient(api_key=cohere_api_key, timeout=60)

class StreamHandler(BaseCallbackHandler):
    """Callback handler that accumulates streamed tokens and re-renders them.

    Attributes are set directly in __init__:
      container -- object whose ``markdown`` method is called with the text so far
      text      -- the text accumulated so far (starts as ``initial_text``)
    """

    def __init__(self, container, initial_text=""):
        self.text = initial_text
        self.container = container

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Append ``token`` to the running text and redraw it as markdown."""
        self.text = self.text + token
        self.container.markdown(self.text)

def classify_image(image_path, min_text_chars=30):
    """Label an image as text-heavy or natural based on how much text OCR finds.

    Args:
        image_path: The image to inspect; handed unchanged to
            ``pytesseract.image_to_string``.
        min_text_chars: The OCR output (after stripping surrounding whitespace)
            must be strictly longer than this for the image to count as
            text-heavy. Defaults to 30, the heuristic value used so far.

    Returns:
        ``"text"`` when the stripped OCR output exceeds ``min_text_chars``
        characters, otherwise ``"natural"``.
    """
    # Use Tesseract OCR to detect text
    text = pytesseract.image_to_string(image_path)
    if len(text.strip()) > min_text_chars:  # Heuristic: significant text was found
        return "text"
    return "natural"

# Function to extract text from text-heavy images using Tesseract OCR
def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        The OCR output with leading and trailing whitespace removed.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR is done instead of lingering until garbage
    # collection.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(image)
    return text.strip()

# Function to extract image descriptions
def _load_blip_captioner(model_name):
    """Load the BLIP processor and captioning model published under ``model_name``."""
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)
    return processor, model


def get_image_description(image_path):
    """Generate a caption for an image with the BLIP captioning model.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption string, with special tokens skipped.
    """
    # Streamlit re-executes the whole script on every interaction, so loading
    # the weights inside this function would repeat the load on every rerun.
    # st.cache_resource keeps one loaded copy per model name and hands it back
    # on later calls.
    processor, model = st.cache_resource(_load_blip_captioner)(
        "Salesforce/blip-image-captioning-base"
    )

    # convert() returns a new image, so the file handle can be closed right
    # after it.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    inputs = processor(image, return_tensors="pt")
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)

# Function to generate responses using Cohere
def generate_text_with_cohere(prompt, model="command-xlarge-nightly", temperature=0.8):
    """Send ``prompt`` to the shared Cohere client and return the first generation.

    Args:
        prompt: Full prompt text to complete.
        model: Cohere model name passed through to the API call.
        temperature: Sampling temperature passed through to the API call.

    Returns:
        The text of the first generation, stripped of surrounding whitespace.
    """
    request = {
        "model": model,
        "prompt": prompt,
        "temperature": temperature,
        "max_tokens": 1000,  # Adjust based on token limits
    }
    completion = cohere_client.generate(**request)
    first_generation = completion.generations[0]
    return first_generation.text.strip()

# QA prompt template: {description} and {question} are filled in per request
qa_template = """
Analyze the provided text and context to answer the user's question.
Focus on interpreting any tabular data or numerical findings, if mentioned.
If the answer is not clear, respond with "I don't know."

Extracted Text or Image Description:
{description}

Question:
{question}

Answer:
"""

# Function to handle the QA logic
def qa_with_cohere(description, question):
    """Answer ``question`` about an image given its extracted text or caption.

    Args:
        description: OCR text or generated caption describing the image.
        question: The user's question.

    Returns:
        Whatever ``generate_text_with_cohere`` returns for the filled-in prompt.
    """
    slots = {"description": description, "question": question}
    filled_prompt = qa_template.format_map(slots)
    return generate_text_with_cohere(filled_prompt)

# Streamlit UI to accept image uploads (sidebar widget, PNG/JPEG only)
uploaded_file = st.sidebar.file_uploader("Upload an image file", type=["png", "jpg", "jpeg"])

# Nothing below can run without an image: show a hint and end this script run.
if not uploaded_file:
    st.info("Please upload an image to continue.")
    st.stop()

class PostMessageHandler(BaseCallbackHandler):
    """Callback handler that holds a message target and a list of sources.

    Only the constructor is defined here; no callback hooks are overridden.
    """

    def __init__(self, msg: st.write):
        super().__init__()
        self.sources = []
        self.msg = msg

# Conversation history lives in Streamlit session state under "langchain_messages"
streamlit_msg_history = StreamlitChatMessageHistory(key="langchain_messages")

# Replay every stored message as a chat bubble labelled with the message's type
for past_message in streamlit_msg_history.messages:
    bubble = st.chat_message(past_message.type)
    bubble.write(past_message.content)



# Save the uploaded file temporarily
temp_dir = tempfile.TemporaryDirectory()
# The file name comes from the client, so keep only its final path component;
# otherwise a name containing directory parts could place the file outside
# temp_dir. Fall back to a fixed name if nothing is left.
safe_filename = os.path.basename(uploaded_file.name) or "uploaded_image"
temp_filepath = os.path.join(temp_dir.name, safe_filename)
with open(temp_filepath, "wb") as f:
    f.write(uploaded_file.getvalue())

# Display the uploaded image
st.image(temp_filepath, caption="Uploaded Image", use_column_width=True)

# Analyze the image: classify it, then build the text that will be given to the
# QA prompt. The OCR / captioning step is the slow part, so it runs inside the
# spinner too instead of leaving the page without feedback while it works.
with st.spinner("Analyzing the image..."):
    image_type = classify_image(temp_filepath)

    if image_type == "text":
        # Extract text from the image
        extracted_text = extract_text_from_image(temp_filepath)
        input_for_qa = extracted_text
    else:
        # Generate an image description for natural images
        image_description = get_image_description(temp_filepath)
        input_for_qa = image_description

# Report completion once the spinner has been dismissed.
if image_type == "text":
    st.success("Text Analyzed!")
else:
    st.success("Image Analyzed!")

if user_question := st.chat_input("Ask a question about the image:"):
    st.chat_message("human").write(user_question)
    # This is where response from the LLM is shown
    with st.spinner("Generating response..."):
        response = qa_with_cohere(input_for_qa, user_question)
        # Display the AI's response
        st.chat_message("ai").write(response)
    # Record the completed exchange. Without this the history stays empty and
    # the conversation vanishes on the next rerun, because the render loop only
    # shows what is stored in streamlit_msg_history. Both messages are added
    # after the answer arrives so a failed request leaves no orphan question.
    streamlit_msg_history.add_user_message(user_question)
    streamlit_msg_history.add_ai_message(response)

Overwriting app.py


In [5]:
# Start the Streamlit server for app.py on port 6985 as a background job;
# its output is redirected to ./logs.txt (check that file if the app does not come up).
!streamlit run app.py --server.port=6985 &>./logs.txt &

In [None]:
from pyngrok import ngrok
import yaml

# Shut down any ngrok process (and its tunnels) left over from an earlier run
ngrok.kill()

# Register the ngrok authtoken (replace the placeholder with your own token)
ngrok.set_auth_token("YOUR_API_KEY")

# Open a tunnel to port 6985, the port passed to `streamlit run` via --server.port
ngrok_tunnel = ngrok.connect(6985)
public_url = ngrok_tunnel.public_url
print(f"Streamlit App: {public_url}")

Streamlit App: https://e942-104-196-217-108.ngrok-free.app
