In [None]:
import cv2
import numpy as np
from PIL import Image
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration

def find_and_warp_chessboard(image_path, debug=False):
    """
    Reads an image, detects the largest 4-sided contour (assumed to be the chessboard),
    and returns a warped (top-down) view of the board.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image from {image_path}")

    orig = image.copy()
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Edge detection
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blurred, 50, 150)

    # Find contours and sort by area (largest first)
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = sorted(contours, key=cv2.contourArea, reverse=True)

    board_contour = None
    for cnt in contours:
        perimeter = cv2.arcLength(cnt, True)
        approx = cv2.approxPolyDP(cnt, 0.02 * perimeter, True)
        if len(approx) == 4:
            board_contour = approx
            break

    if board_contour is None:
        raise ValueError("No 4-sided chessboard contour could be found.")

    # Reorder the corners for perspective transform
    pts = board_contour.reshape(4, 2).astype("float32")
    rect = np.zeros((4, 2), dtype="float32")
    s = np.sum(pts, axis=1)
    rect[0] = pts[np.argmin(s)]   # top-left
    rect[2] = pts[np.argmax(s)]   # bottom-right
    diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(diff)]  # top-right
    rect[3] = pts[np.argmax(diff)]  # bottom-left

    (tl, tr, br, bl) = rect
    widthA = np.linalg.norm(br - bl)
    widthB = np.linalg.norm(tr - tl)
    maxWidth = max(int(widthA), int(widthB))

    heightA = np.linalg.norm(tr - br)
    heightB = np.linalg.norm(tl - bl)
    maxHeight = max(int(heightA), int(heightB))

    dst = np.array([
        [0, 0],
        [maxWidth - 1, 0],
        [maxWidth - 1, maxHeight - 1],
        [0, maxHeight - 1]
    ], dtype="float32")

    M = cv2.getPerspectiveTransform(rect, dst)
    warped = cv2.warpPerspective(orig, M, (maxWidth, maxHeight))

    if debug:
        cv2.drawContours(orig, [board_contour], -1, (0, 255, 0), 3)
        cv2.imshow("Detected Board Contour", orig)
        cv2.imshow("Warped Board", warped)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return warped

def query_open_source_vqa(pil_image, question):
    """
    Uses the open-source BLIP-2 model (Salesforce/blip2-flan-t5-xl) to answer a question about the image.
    BLIP-2 is more advanced than BLIP-VQA-base and can better handle complex queries.
    """
    processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
    model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl",
                                                          torch_dtype=torch.float16,
                                                          device_map="auto")
    inputs = processor(images=pil_image, text=question, return_tensors="pt")
    generated_ids = model.generate(**inputs)
    answer = processor.decode(generated_ids[0], skip_special_tokens=True)
    return answer

if __name__ == "__main__":
    image_path = "chessboard.jpg"  # Replace with your chessboard image path

    # 1. Detect and warp the chessboard to obtain a top-down view
    warped_board = find_and_warp_chessboard(image_path, debug=False)

    # 2. Convert the warped board from OpenCV's BGR to a PIL RGB image
    warped_board_rgb = cv2.cvtColor(warped_board, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(warped_board_rgb)

    # 3. Query BLIP-2 with a prompt asking for the chessboard position in FEN notation
    question = (
        "Analyze this chessboard image and return the position of all the pieces in standard FEN notation. "
        "List empty squares as '-' and separate ranks with '/'."
    )
    answer = query_open_source_vqa(pil_image, question)

    print("BLIP-2 response:")
    print(answer)


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/128k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/5.81G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

BLIP-2 response:
a - b - c - d - e -


In [2]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.16.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.0 (from gradio)
  Downloading gradio_client-1.7.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.9.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [6]:
import cv2
import numpy as np
from PIL import Image
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import gradio as gr

def find_and_warp_chessboard_from_image(image, debug=False):
    """
    Given an input image (numpy array in BGR format), detect the chessboard
    and return a warped top-down view.
    """
    orig = image.copy()
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Edge detection
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blurred, 50, 150)

    # Find contours and sort by area (largest first)
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = sorted(contours, key=cv2.contourArea, reverse=True)

    board_contour = None
    for cnt in contours:
        perimeter = cv2.arcLength(cnt, True)
        approx = cv2.approxPolyDP(cnt, 0.02 * perimeter, True)
        if len(approx) == 4:
            board_contour = approx
            break

    if board_contour is None:
        raise ValueError("No 4-sided chessboard contour could be found.")

    # Reorder corners for a consistent perspective transform
    pts = board_contour.reshape(4, 2).astype("float32")
    rect = np.zeros((4, 2), dtype="float32")
    s = np.sum(pts, axis=1)
    rect[0] = pts[np.argmin(s)]   # top-left
    rect[2] = pts[np.argmax(s)]     # bottom-right
    diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(diff)]  # top-right
    rect[3] = pts[np.argmax(diff)]  # bottom-left

    (tl, tr, br, bl) = rect
    widthA = np.linalg.norm(br - bl)
    widthB = np.linalg.norm(tr - tl)
    maxWidth = max(int(widthA), int(widthB))

    heightA = np.linalg.norm(tr - br)
    heightB = np.linalg.norm(tl - bl)
    maxHeight = max(int(heightA), int(heightB))

    dst = np.array([
        [0, 0],
        [maxWidth - 1, 0],
        [maxWidth - 1, maxHeight - 1],
        [0, maxHeight - 1]
    ], dtype="float32")

    M = cv2.getPerspectiveTransform(rect, dst)
    warped = cv2.warpPerspective(orig, M, (maxWidth, maxHeight))

    if debug:
        cv2.drawContours(orig, [board_contour], -1, (0, 255, 0), 3)
        cv2.imshow("Detected Board Contour", orig)
        cv2.imshow("Warped Board", warped)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return warped

def query_blip2(pil_image, question):
    """
    Uses the open-source BLIP-2 model (Salesforce/blip2-flan-t5-xl) to answer a question about the image.
    """
    processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
    model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl",
                                                          torch_dtype=torch.float16,
                                                          device_map="auto")
    inputs = processor(images=pil_image, text=question, return_tensors="pt")
    generated_ids = model.generate(**inputs)
    answer = processor.decode(generated_ids[0], skip_special_tokens=True)
    return answer

def process_image(pil_img):
    """
    Processes an input PIL image: converts it to a BGR numpy array,
    detects and warps the chessboard, then queries BLIP-2 to extract the chess position.
    """
    # Convert PIL (RGB) to numpy array and then to BGR for OpenCV
    img_np = np.array(pil_img)
    img_bgr = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)

    try:
        warped_board = find_and_warp_chessboard_from_image(img_bgr, debug=False)
    except Exception as e:
        return f"Error processing chessboard: {e}"

    # Convert warped board to PIL image (RGB)
    warped_board_rgb = cv2.cvtColor(warped_board, cv2.COLOR_BGR2RGB)
    pil_warped = Image.fromarray(warped_board_rgb)

    # Define the prompt asking for the chessboard position in FEN notation
    question = (
        "Analyze this chessboard image and return the position of all the pieces in standard FEN notation. "
        "Use '-' for empty squares and separate ranks with '/'."
    )
    answer = query_blip2(pil_warped, question)
    return answer

iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Chessboard Position Extractor",
    description="Upload an image of a chessboard. The model will detect the board and use BLIP-2 to return the position in FEN notation."
)

if __name__ == "__main__":
    iface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d4642881c880e22f15.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
