Skip to content

Commit 71c413e

Browse files
authored
Re-compress image first and display grid lines
There was a VERY weird bug where the TIFF orientation of a photo could cause the wrong coordinates to be returned. This commit works around that bug by first round-tripping the image through custom canvas JPEG compression. It also adds grid lines to make debugging easier. It took many prompt sessions to get here; the most recent one is: https://gist.github.com/simonw/0a8b8e76cd73b1bfb72daa716641a57e
1 parent 9b6cdd5 commit 71c413e

File tree

1 file changed

+108
-10
lines changed

1 file changed

+108
-10
lines changed

gemini-bbox-tool.html

+108-10
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<head>
44
<meta charset="UTF-8">
55
<meta name="viewport" content="width=device-width, initial-scale=1.0">
6-
<title>Gemini API Image Processor with Bounding Box Visualization</title>
6+
<title>Gemini API Image Bounding Box Visualization</title>
77
<script type="module">
88
import { GoogleGenerativeAI } from "https://esm.run/@google/generative-ai";
99
import { marked } from "https://esm.run/marked";
@@ -38,6 +38,38 @@
3838
});
3939
}
4040

41+
/**
 * Re-encode an image file as a JPEG by drawing it onto a canvas, scaling it
 * down (aspect ratio preserved) when it is wider than `maxWidth`.
 *
 * The file is read as a data URL, decoded through an `Image`, drawn onto an
 * off-DOM canvas and exported with `canvas.toBlob`. Only the width is
 * constrained; a tall, narrow image keeps its full height.
 *
 * @param {Blob} file - Image file to re-encode (e.g. from an <input type="file">).
 * @param {Object} [options]
 * @param {number} [options.maxWidth=1000] - Maximum output width in pixels.
 * @param {number} [options.quality=0.7] - JPEG quality passed to `toBlob`.
 * @returns {Promise<File>} Resolves with a File named "compressed_image.jpg"
 *   of type "image/jpeg".
 * @throws {Error} The promise rejects if the file cannot be read, the image
 *   cannot be decoded, or the canvas cannot be encoded. Previously these
 *   failures left the promise pending forever.
 */
function resizeAndCompressImage(file, { maxWidth = 1000, quality = 0.7 } = {}) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onerror = () => {
      reject(reader.error || new Error('Failed to read image file'));
    };

    reader.onload = (event) => {
      const img = new Image();

      img.onerror = () => {
        reject(new Error('Failed to decode image'));
      };

      img.onload = () => {
        let width = img.width;
        let height = img.height;

        // Only ever scale down, and only based on width.
        if (width > maxWidth) {
          height = Math.round((height * maxWidth) / width);
          width = maxWidth;
        }

        const canvas = document.createElement('canvas');
        const ctx = canvas.getContext('2d');
        canvas.width = width;
        canvas.height = height;
        ctx.drawImage(img, 0, 0, width, height);

        canvas.toBlob((blob) => {
          // toBlob passes null when the image cannot be created.
          if (!blob) {
            reject(new Error('Failed to encode image as JPEG'));
            return;
          }
          resolve(new File([blob], "compressed_image.jpg", { type: "image/jpeg" }));
        }, 'image/jpeg', quality);
      };

      img.src = event.target.result;
    };

    reader.readAsDataURL(file);
  });
}
72+
4173
async function processImageAndPrompt() {
4274
const fileInput = document.getElementById('imageInput');
4375
const promptInput = document.getElementById('promptInput');
@@ -52,7 +84,8 @@
5284

5385
try {
5486
const model = await getGenerativeModel({ model: "gemini-1.5-pro" });
55-
const imagePart = await fileToGenerativePart(fileInput.files[0]);
87+
const compressedImage = await resizeAndCompressImage(fileInput.files[0]);
88+
const imagePart = await fileToGenerativePart(compressedImage);
5689

5790
const result = await model.generateContent([promptInput.value, imagePart]);
5891
const response = await result.response;
@@ -63,7 +96,7 @@
6396
// Extract coordinates from the response
6497
const coordinates = extractCoordinates(text);
6598
if (coordinates.length > 0) {
66-
displayImageWithBoundingBoxes(fileInput.files[0], coordinates);
99+
displayImageWithBoundingBoxes(compressedImage, coordinates);
67100
}
68101
} catch (error) {
69102
resultDiv.innerHTML = `Error: ${error.message}`;
@@ -82,21 +115,86 @@
82115
const image = new Image();
83116
image.onload = function() {
84117
const canvas = document.getElementById('canvas');
85-
canvas.width = image.width;
86-
canvas.height = image.height;
118+
canvas.width = image.width + 100;
119+
canvas.height = image.height + 100;
87120
const ctx = canvas.getContext('2d');
88-
ctx.drawImage(image, 0, 0);
121+
122+
// Draw the image
123+
ctx.drawImage(image, 80, 20);
89124

125+
// Draw grid lines
126+
ctx.strokeStyle = 'rgba(255, 0, 0, 0.5)'; // Red with 50% opacity
127+
ctx.lineWidth = 1;
128+
129+
// Vertical grid lines
130+
for (let i = 0; i <= 1000; i += 100) {
131+
const x = 80 + i / 1000 * image.width;
132+
ctx.beginPath();
133+
ctx.moveTo(x, 20);
134+
ctx.lineTo(x, image.height + 20);
135+
ctx.stroke();
136+
}
137+
138+
// Horizontal grid lines
139+
for (let i = 0; i <= 1000; i += 100) {
140+
const y = 20 + (1000 - i) / 1000 * image.height;
141+
ctx.beginPath();
142+
ctx.moveTo(80, y);
143+
ctx.lineTo(image.width + 80, y);
144+
ctx.stroke();
145+
}
146+
147+
// Draw bounding boxes
90148
const colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#FF00FF', '#00FFFF'];
91149
coordinates.forEach((box, index) => {
92-
const [ymin, xmin, ymax, xmax] = box.map(coord => (1000 - coord) / 1000);
150+
const [ymin, xmin, ymax, xmax] = box.map(coord => coord / 1000);
151+
93152
const width = (xmax - xmin) * image.width;
94153
const height = (ymax - ymin) * image.height;
95154

96155
ctx.strokeStyle = colors[index % colors.length];
97156
ctx.lineWidth = 5;
98-
ctx.strokeRect(xmin * image.width, ymin * image.height, width, height);
157+
ctx.strokeRect(xmin * image.width + 80, ymin * image.height + 20, width, height);
99158
});
159+
160+
// Draw axes and labels
161+
ctx.strokeStyle = '#000000';
162+
ctx.lineWidth = 1;
163+
ctx.font = '26px Arial';
164+
ctx.textAlign = 'right';
165+
166+
// Y-axis
167+
ctx.beginPath();
168+
ctx.moveTo(80, 20);
169+
ctx.lineTo(80, image.height + 20);
170+
ctx.stroke();
171+
172+
// Y-axis labels and ticks
173+
for (let i = 0; i <= 1000; i += 100) {
174+
const y = 20 + (1000 - i) / 1000 * image.height;
175+
ctx.fillText(i.toString(), 75, y + 5);
176+
ctx.beginPath();
177+
ctx.moveTo(75, y);
178+
ctx.lineTo(80, y);
179+
ctx.stroke();
180+
}
181+
182+
// X-axis
183+
ctx.beginPath();
184+
ctx.moveTo(80, image.height + 20);
185+
ctx.lineTo(image.width + 80, image.height + 20);
186+
ctx.stroke();
187+
188+
// X-axis labels and ticks
189+
ctx.textAlign = 'center';
190+
for (let i = 0; i <= 1000; i += 100) {
191+
const x = 80 + i / 1000 * image.width;
192+
ctx.fillText(i.toString(), x, image.height + 40);
193+
ctx.beginPath();
194+
ctx.moveTo(x, image.height + 20);
195+
ctx.lineTo(x, image.height + 25);
196+
ctx.stroke();
197+
}
100198
};
101199
image.src = event.target.result;
102200
};
@@ -129,9 +227,9 @@
129227
</style>
130228
</head>
131229
<body>
132-
<h1>Gemini API Image Processor with Bounding Box Visualization</h1>
230+
<h1>Optimized Gemini API Image Processor with Bounding Box Visualization</h1>
133231
<input type="file" id="imageInput" accept="image/*">
134-
<textarea id="promptInput">Return bounding boxes as [ymin, xmin, ymax, xmax]
232+
<textarea id="promptInput">Return bounding boxes as JSON arrays [ymin, xmin, ymax, xmax]
135233
</textarea>
136234
<button id="submitBtn">Process</button>
137235
<div id="result"></div>

0 commit comments

Comments
 (0)