In [3]:
import tkinter as tk
from tkinter import filedialog, messagebox
from PIL import Image, ImageTk
import numpy as np
import cv2
import pyttsx3

from modules.yolo_module import YOLODetector
from modules.midas_module import MiDaSDepth
from modules.smolvlm_module import describe_image

class VisionAssistTTS(tk.Tk):
    """Tk window that analyses a still image and reads the result aloud.

    Loading an image runs object detection (``self.yolo``), depth estimation
    (``self.midas``) and captioning (``describe_image``), writes a summary
    into the text box, opens a depth-map preview window and finally speaks the
    caption plus one guidance sentence about the object with the smallest
    depth value.
    """

    # Every loaded image is resized to this (width, height); it is also the
    # canvas size, so detections are in canvas coordinates.
    DISPLAY_SIZE = (640, 360)
    # Size of the depth-map thumbnail shown in its own window.
    DEPTH_PREVIEW_SIZE = (320, 180)

    def __init__(self):
        """Build the widgets and load the detection, depth and speech engines."""
        super().__init__()
        self.title("AI Vision Assist with Navigation")
        self.geometry("900x600")

        # UI elements
        self.btn_load = tk.Button(self, text="Load Image", command=self.load_file)
        self.btn_load.pack(pady=10)

        width, height = self.DISPLAY_SIZE
        self.canvas = tk.Canvas(self, width=width, height=height, bg="black")
        self.canvas.pack()

        self.txt_info = tk.Text(self, wrap="word", height=10)
        self.txt_info.pack(fill="both", expand=True, padx=10, pady=10)

        # Models
        self.yolo = YOLODetector()
        self.midas = MiDaSDepth()
        self.engine = pyttsx3.init()

    def load_file(self):
        """Ask the user for an image file and process it; do nothing on cancel."""
        path = filedialog.askopenfilename(
            filetypes=[("Image files", "*.jpg *.jpeg *.png"), ("All files", "*.*")]
        )
        if not path:
            return

        self.canvas.delete("all")
        self.txt_info.delete("1.0", tk.END)

        self.process_image(path)

    def speak(self, text):
        """Speak ``text`` synchronously (blocks until playback has finished)."""
        self.engine.say(text)
        self.engine.runAndWait()

    def process_image(self, path):
        """Analyse the image at ``path``, show the results and speak them.

        An unreadable file is reported in an error dialog instead of raising
        inside the Tk callback.
        """
        try:
            with Image.open(path) as src:
                # Normalise to RGB so palette / alpha / greyscale files reach
                # the models in the same layout as ordinary JPEGs.
                img = src.convert("RGB").resize(self.DISPLAY_SIZE)
        except (OSError, ValueError) as exc:
            messagebox.showerror("Error", f"Cannot open image:\n{exc}")
            return

        self.photo = ImageTk.PhotoImage(img, master=self)
        self.canvas.create_image(0, 0, anchor="nw", image=self.photo)

        # Run detection & depth
        objects = self.yolo.detect(img)
        depth_array, depth_vis = self.midas.estimate_depth(img)
        caption = describe_image(path)

        # assumes depth_array is a 2-D (rows, cols) array aligned with ``img``
        # -- TODO confirm against MiDaSDepth.estimate_depth
        depth_h, depth_w = depth_array.shape[:2]

        counts = {}
        measured = []  # (label, bbox centre x, depth value)
        for obj in objects:
            label = obj["label"]
            # Detector coordinates may be floats and may touch or exceed the
            # image border; cast and clamp so the lookup is always valid.
            x1, y1, x2, y2 = (int(v) for v in obj["bbox"])
            center_x = (x1 + x2) // 2
            center_y = (y1 + y2) // 2
            col = min(max(center_x, 0), depth_w - 1)
            row = min(max(center_y, 0), depth_h - 1)
            measured.append((label, center_x, float(depth_array[row, col])))
            counts[label] = counts.get(label, 0) + 1

        info = f"Scene Summary:\n{caption}\n\nObjects Summary:\n"
        for label, count in counts.items():
            info += f" - {label}: {count}\n"

        speech = caption
        if measured:
            # NOTE(review): values are reported as metres and "closest" is the
            # smallest value; verify that MiDaSDepth returns metric distance
            # rather than a relative / inverse depth.
            closest = min(measured, key=lambda item: item[2])
            farthest = max(measured, key=lambda item: item[2])
            info += f"\nClosest Object:\n - {closest[0]} at {closest[2]:.2f}m\n"
            info += f"Farthest Object:\n - {farthest[0]} at {farthest[2]:.2f}m\n"

            # Directional guidance: split the frame into three equal columns
            # (213 / 426 px for the 640 px default width).
            frame_w = self.DISPLAY_SIZE[0]
            if closest[1] < frame_w // 3:
                direction = "to your left"
            elif closest[1] < 2 * frame_w // 3:
                direction = "straight ahead"
            else:
                direction = "to your right"
            nav_msg = f"{closest[0].capitalize()} detected {direction}, about {closest[2]:.1f} meters away."
            info += f"\nGuidance:\n - {nav_msg}"
            speech = f"{caption}. {nav_msg}."

        self.txt_info.insert(tk.END, info)
        self._show_depth_map(depth_vis)

        # Speech blocks the Tk event loop, so flush pending redraws first;
        # otherwise the text appears only after the audio has ended.
        self.update_idletasks()
        self.speak(speech)

    def _show_depth_map(self, depth_vis):
        """Open a small window showing the depth visualisation."""
        depth_win = tk.Toplevel(self)
        depth_win.title("Depth Map")
        if isinstance(depth_vis, Image.Image):
            dv_img = depth_vis.resize(self.DEPTH_PREVIEW_SIZE)
        else:
            # presumably a float array scaled to [0, 1] -- verify against MiDaSDepth
            dv_img = Image.fromarray((depth_vis * 255).astype(np.uint8)).resize(self.DEPTH_PREVIEW_SIZE)

        dv_photo = ImageTk.PhotoImage(dv_img, master=depth_win)
        lbl = tk.Label(depth_win, image=dv_photo)
        lbl.image = dv_photo  # keep a reference so Tk does not lose the image
        lbl.pack()

# Launch the GUI only when this cell/script is the entry point; mainloop()
# hands control to Tk's event loop.
if __name__ == "__main__":
    app = VisionAssistTTS()
    app.mainloop()


Using cache found in C:\Users\hp/.cache\torch\hub\intel-isl_MiDaS_master
Using cache found in C:\Users\hp/.cache\torch\hub\intel-isl_MiDaS_master



0: 384x640 2 persons, 4 cars, 210.5ms
Speed: 1.9ms preprocess, 210.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)


In [24]:
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from PIL import Image, ImageTk
import numpy as np
import cv2
import pyttsx3
import math

from modules.yolo_module import YOLODetector
from modules.midas_module import MiDaSDepth
from modules.smolvlm_module import describe_image, describe_video, plan_navigation

# Helper to compute left/center/right direction based on x-coordinate
def compute_direction(cx, width):
    """Classify a horizontal position as "left", "center" or "right".

    ``cx`` is compared against 33% and 66% of ``width``; values strictly
    below the first bound are "left", strictly above the second are "right",
    and everything else (bounds included) is "center".
    """
    left_limit = width * 0.33
    right_limit = width * 0.66
    if cx < left_limit:
        return "left"
    return "right" if cx > right_limit else "center"

class VisionApp(tk.Tk):
    """Tk window for basic, object-by-object navigation hints.

    An image is captioned, run through the object detector and the depth
    estimator; the detected objects are offered in a drop-down. Selecting one
    writes and speaks where it is (left / center / right and its depth value)
    and warns when another object with a smaller depth value sits in the
    middle of the frame. Videos only get their first frame shown and a
    description.
    """

    def __init__(self):
        """Build the widgets, load the models and initialise empty state."""
        super().__init__()
        self.title("AI Vision Assistant - Basic Navigation")
        self.geometry("900x650")

        # UI Elements
        tk.Button(self, text="Load Image/Video", command=self.load_file).pack(pady=10)
        self.canvas = tk.Canvas(self, width=640, height=360, bg="black")
        self.canvas.pack()
        self.txt_info = tk.Text(self, wrap="word", height=10)
        self.txt_info.pack(fill="both", expand=True, padx=10, pady=10)
        self.obj_dropdown = ttk.Combobox(self, state="readonly")
        self.obj_dropdown.pack(pady=5)
        self.obj_dropdown.bind("<<ComboboxSelected>>", self.on_object_select)

        # Models and state
        self.yolo = YOLODetector()
        self.midas = MiDaSDepth()
        self.tts = pyttsx3.init()
        self.selected_image = None
        self.depth_map = None
        self.objects = []  # list of dicts {label, bbox}

    def load_file(self):
        """Ask for a media file and dispatch to the image or video handler."""
        path = filedialog.askopenfilename(
            filetypes=[("Media files", "*.jpg *.jpeg *.png *.mp4 *.avi"), ("All files", "*.*")]
        )
        if not path:
            return
        self._reset_state()
        if path.lower().endswith((".mp4", ".avi")):
            self.process_video(path)
        else:
            self.process_image(path)

    def _reset_state(self):
        """Clear widgets and per-file state so nothing from the last file lingers."""
        self.canvas.delete("all")
        self.txt_info.delete("1.0", tk.END)
        # Without this a video (or a failed load) would leave the previous
        # image's objects selectable in the drop-down.
        self.selected_image = None
        self.depth_map = None
        self.objects = []
        self.obj_dropdown['values'] = []
        self.obj_dropdown.set("")

    def _show_on_canvas(self, image):
        """Replace whatever the canvas shows with ``image`` (a PIL image)."""
        self.canvas.delete("all")  # avoid stacking one canvas item per redraw
        self.photo = ImageTk.PhotoImage(image, master=self)
        self.canvas.create_image(0, 0, anchor='nw', image=self.photo)

    def _center_depth(self, bbox):
        """Return ``(cx, depth)`` for the clamped centre pixel of ``bbox``."""
        x1, y1, x2, y2 = (int(v) for v in bbox)
        h, w = self.depth_map.shape
        cx = max(0, min((x1 + x2) // 2, w - 1))
        cy = max(0, min((y1 + y2) // 2, h - 1))
        return cx, float(self.depth_map[cy, cx])

    def process_image(self, path):
        """Detect objects, estimate depth and caption the image at ``path``.

        An unreadable file is reported in an error dialog instead of raising
        inside the Tk callback.
        """
        # Load and display image
        try:
            with Image.open(path) as src:
                img = src.convert("RGB").resize((640, 360))
        except (OSError, ValueError) as exc:
            messagebox.showerror("Error", f"Cannot open image:\n{exc}")
            return
        self.selected_image = img.copy()
        self._show_on_canvas(img)

        # Run YOLO detection
        detections = self.yolo.detect(img)
        self.objects = [{"label": d['label'], "bbox": d['bbox']} for d in detections]

        # Run MiDaS depth estimation
        depth_arr, depth_vis = self.midas.estimate_depth(img)
        self.depth_map = depth_arr

        # Show depth map
        depth_win = tk.Toplevel(self)
        depth_win.title("Depth Map")
        if isinstance(depth_vis, Image.Image):
            dv_img = depth_vis.resize((320, 180))
        else:
            # presumably a float array scaled to [0, 1] -- verify against MiDaSDepth
            dv_img = Image.fromarray((depth_vis * 255).astype(np.uint8)).resize((320, 180))
        dv_photo = ImageTk.PhotoImage(dv_img, master=depth_win)
        tk.Label(depth_win, image=dv_photo).pack()
        depth_win.image = dv_photo  # keep a reference so Tk does not lose the image

        # Generate scene caption
        caption = describe_image(path)
        info = f"Caption:\n{caption}\n\nObjects Detected:\n"
        for idx, obj in enumerate(self.objects):
            info += f"{idx}. {obj['label']} at {obj['bbox']}\n"
        self.txt_info.insert(tk.END, info)

        # Populate dropdown
        labels = [f"{obj['label']} ({i})" for i, obj in enumerate(self.objects)]
        self.obj_dropdown['values'] = labels
        # Blank the field when nothing was detected so no stale label remains.
        self.obj_dropdown.set(labels[0] if labels else "")

    def on_object_select(self, event):
        """Describe, highlight and speak the object chosen in the drop-down."""
        idx = self.obj_dropdown.current()
        if (idx < 0 or idx >= len(self.objects)
                or self.depth_map is None or self.selected_image is None):
            return
        obj = self.objects[idx]
        label = obj['label']
        # cv2.rectangle needs integer corners; the detector may return floats.
        x1, y1, x2, y2 = (int(v) for v in obj['bbox'])
        w = self.depth_map.shape[1]

        # Depth value at the object's centre (relative units)
        cx, depth = self._center_depth(obj['bbox'])
        dist = round(depth, 2)
        direction = compute_direction(cx, w)

        # Check for obstacle in center corridor (40%-60% width)
        warning = ""
        left_bound = w * 0.4
        right_bound = w * 0.6
        for other in self.objects:
            if other is obj:
                continue
            ocx, od = self._center_depth(other['bbox'])
            if od < dist and left_bound < ocx < right_bound:
                # Obstacle directly ahead: sidestep away from the half it occupies.
                side = 'left' if ocx > w/2 else 'right'
                # NOTE(review): "0.5m" is a fixed suggestion, not derived from
                # the depth map, whose values are relative units.
                warning = f"Obstacle ({other['label']}) {round(od,2)} units ahead. Step {side} 0.5m, then move forward."
                break

        # Build instruction
        if warning:
            instruction = warning + f" Then the {label} is {dist} units away to your {direction}."
        else:
            instruction = f"The {label} is {dist} units away to your {direction}."

        self.txt_info.insert(tk.END, f"\nNavigation:\n{instruction}\n")

        # Highlight selected object
        img_arr = np.array(self.selected_image)
        cv2.rectangle(img_arr, (x1, y1), (x2, y2), (0, 255, 0), 3)
        self._show_on_canvas(Image.fromarray(img_arr))

        # Speech blocks the event loop, so draw the text and highlight first.
        self.update_idletasks()
        self.tts.say(instruction)
        self.tts.runAndWait()

    def process_video(self, path):
        """Show the first frame of the video at ``path`` and its description."""
        cap = cv2.VideoCapture(path)
        ret, frame = cap.read()
        cap.release()
        if not ret:
            messagebox.showerror("Error", "Cannot read video.")
            return
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = Image.fromarray(frame).resize((640, 360))
        self._show_on_canvas(img)
        caption = describe_video(path)
        self.txt_info.insert(tk.END, f"Video Description:\n{caption}\n(Depth/object detection not available for video)")

# Launch the GUI only when this cell/script is the entry point; mainloop()
# hands control to Tk's event loop.
if __name__ == "__main__":
    app = VisionApp()
    app.mainloop()

AttributeError: '_tkinter.tkapp' object has no attribute 'on_object_select'

In [7]:
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from PIL import Image, ImageTk
import numpy as np
import cv2
import pyttsx3
import math

from modules.yolo_module import YOLODetector
from modules.midas_module import MiDaSDepth
from modules.smolvlm_module import describe_image, describe_video

# Compute left/center/right based on x coordinate
def compute_direction(cx, width):
    """Map an x coordinate to "left", "center" or "right" within ``width``.

    The frame is cut at 33% and 66% of ``width``; a coordinate lying exactly
    on either cut counts as "center".
    """
    if not cx < width * 0.33:
        if cx > width * 0.66:
            return "right"
        return "center"
    return "left"

class VisionApp(tk.Tk):
    """Tk window that turns detections into turn-by-turn walking steps.

    An image is captioned, run through the object detector and the depth
    estimator; detections become "landmarks" sorted by depth value. Picking a
    landmark in the drop-down produces turn / walk steps that visit every
    landmark up to and including the chosen one, shows them and speaks them.
    """

    # Position assumed for the user: bottom-centre of the 640x360 frame.
    USER_ORIGIN = (320, 360)
    # Turns smaller than this many degrees are not announced.
    MIN_TURN_DEG = 10

    def __init__(self):
        """Build the widgets, load the models and initialise empty state."""
        super().__init__()
        self.title("AI Vision Assistant - Guided Navigation")
        self.geometry("900x700")

        # UI
        tk.Button(self, text="Load Image/Video", command=self.load_file).pack(pady=10)
        self.canvas = tk.Canvas(self, width=640, height=360, bg="black")
        self.canvas.pack()
        self.txt_info = tk.Text(self, wrap="word", height=12)
        self.txt_info.pack(fill="both", expand=True, padx=10, pady=10)
        self.obj_dropdown = ttk.Combobox(self, state="readonly")
        self.obj_dropdown.pack(pady=5)
        self.obj_dropdown.bind("<<ComboboxSelected>>", self.on_object_select)

        # Models & state
        self.yolo = YOLODetector()
        self.midas = MiDaSDepth()
        self.tts = pyttsx3.init()
        self.selected_image = None
        self.depth_map = None
        self.landmarks = []  # each is dict with label, bbox, dist, cx, cy
        self.scene_overview = ""

    def load_file(self):
        """Ask for a file and process it as an image; do nothing on cancel."""
        path = filedialog.askopenfilename(filetypes=[("Media files","*.jpg *.jpeg *.png *.mp4 *.avi"),("All","*.*")])
        if not path:
            return
        self.canvas.delete("all")
        self.txt_info.delete("1.0", tk.END)
        self.process_image(path)

    def _show_on_canvas(self, image):
        """Replace whatever the canvas shows with ``image`` (a PIL image)."""
        self.canvas.delete("all")  # avoid stacking one canvas item per redraw
        self.photo = ImageTk.PhotoImage(image, master=self)
        self.canvas.create_image(0, 0, anchor='nw', image=self.photo)

    def process_image(self, path):
        """Caption the image at ``path`` and build the landmark list from it.

        Files that cannot be decoded as an image (including the video types
        offered by the file dialog) are reported in an error dialog instead of
        raising inside the Tk callback.
        """
        try:
            with Image.open(path) as src:
                img = src.convert("RGB").resize((640, 360))
        except (OSError, ValueError) as exc:
            messagebox.showerror("Error", f"Cannot open as an image:\n{exc}")
            return
        self.selected_image = img.copy()
        self._show_on_canvas(img)

        # YOLO + depth
        raw = self.yolo.detect(img)
        depth_arr, depth_vis = self.midas.estimate_depth(img)
        self.depth_map = depth_arr
        h, w = depth_arr.shape

        # Scene overview from SMOL
        self.scene_overview = describe_image(path)
        self.txt_info.insert(tk.END, f"Scene Overview:\n{self.scene_overview}\n\n")

        # Build landmarks with geometry
        self.landmarks = []
        for obj in raw:
            x1, y1, x2, y2 = map(int, obj['bbox'])
            # Clamp the centre into the depth map; plain ints keep the
            # landmark dicts free of numpy scalar types.
            cx = int(np.clip((x1 + x2) // 2, 0, w - 1))
            cy = int(np.clip((y1 + y2) // 2, 0, h - 1))
            dist = float(depth_arr[cy, cx])
            self.landmarks.append({'label': obj['label'], 'bbox': (x1, y1, x2, y2),
                                   'dist': round(dist, 2), 'cx': cx, 'cy': cy})
        # NOTE(review): sorting ascending treats small values as "near";
        # verify that MiDaSDepth returns distance and not inverse depth.
        self.landmarks.sort(key=lambda o: o['dist'])

        # list landmarks
        self.txt_info.insert(tk.END, "Landmarks (nearest first):\n")
        for i, lm in enumerate(self.landmarks):
            self.txt_info.insert(tk.END, f"{i}. {lm['label']} at {lm['dist']} m\n")

        # dropdown
        vals = [f"{lm['label']} ({i})" for i, lm in enumerate(self.landmarks)]
        self.obj_dropdown['values'] = vals
        # Blank the field when nothing was detected so no stale label remains.
        self.obj_dropdown.set(vals[0] if vals else "")

        # show depth map
        depth_win = tk.Toplevel(self)
        depth_win.title("Depth Map")
        if isinstance(depth_vis, Image.Image):
            dv_img = depth_vis.resize((320, 180))
        else:
            # presumably a float array scaled to [0, 1] -- verify against MiDaSDepth
            dv_img = Image.fromarray((depth_vis * 255).astype(np.uint8)).resize((320, 180))
        ph = ImageTk.PhotoImage(dv_img, master=depth_win)
        tk.Label(depth_win, image=ph).pack()
        depth_win.image = ph  # keep a reference so Tk does not lose the image

        # Speech blocks the event loop, so populate and draw the UI first.
        self.update_idletasks()
        self.tts.say(self.scene_overview)
        self.tts.runAndWait()

    def _plan_steps(self, idx):
        """Return the turn / walk instructions visiting landmarks 0..idx."""
        steps = []
        current_angle = 0.0
        user_x, user_y = self.USER_ORIGIN
        for lm in self.landmarks[:idx] + [self.landmarks[idx]]:
            dx = lm['cx'] - user_x
            dy = user_y - lm['cy']
            angle = math.degrees(math.atan2(dx, dy))
            # Wrap into [-180, 180) so the shorter rotation is announced.
            turn = (angle - current_angle + 180.0) % 360.0 - 180.0
            if abs(turn) > self.MIN_TURN_DEG:
                dirn = 'right' if turn > 0 else 'left'
                steps.append(f"Turn {abs(int(turn))}° {dirn} to face the {lm['label']}")
                current_angle = angle
            steps.append(f"Walk forward {lm['dist']:.1f} meters until the {lm['label']}")
            # The user is now (roughly) standing at this landmark.
            user_x, user_y = lm['cx'], lm['cy']
        return steps

    def on_object_select(self, event):
        """Show, highlight and speak the route to the chosen landmark."""
        idx = self.obj_dropdown.current()
        if idx < 0 or idx >= len(self.landmarks) or self.selected_image is None:
            return
        target = self.landmarks[idx]
        steps = self._plan_steps(idx)

        self.txt_info.insert(tk.END, "\nNavigation Steps:\n")
        for s in steps:
            self.txt_info.insert(tk.END, s + "\n")
            self.tts.say(s)

        # highlight
        x1, y1, x2, y2 = target['bbox']
        arr = np.array(self.selected_image)
        cv2.rectangle(arr, (x1, y1), (x2, y2), (0, 255, 0), 3)
        self._show_on_canvas(Image.fromarray(arr))

        # Speech blocks the event loop, so draw the text and highlight first.
        self.update_idletasks()
        self.tts.runAndWait()

# Launch the GUI only when this cell/script is the entry point; mainloop()
# hands control to Tk's event loop.
if __name__=="__main__":
    app=VisionApp(); app.mainloop()


Using cache found in C:\Users\hp/.cache\torch\hub\intel-isl_MiDaS_master
Using cache found in C:\Users\hp/.cache\torch\hub\intel-isl_MiDaS_master



0: 384x640 8 persons, 1 backpack, 7 chairs, 1 couch, 1 tv, 2 laptops, 158.0ms
Speed: 1.4ms preprocess, 158.0ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)


In [2]:
import tkinter as tk
from tkinter import ttk, filedialog
from PIL import Image, ImageTk
import numpy as np
import cv2
import pyttsx3
import math

from modules.yolo_module import YOLODetector
from modules.midas_module import MiDaSDepth
from modules.smolvlm_module import describe_image, plan_navigation 
from torchvision import transforms


def compute_direction(cx, width):
    """Return which third of the frame ``cx`` falls in.

    Positions strictly below 33% of ``width`` are "left", strictly above 66%
    are "right"; the band in between, bounds included, is "center".
    """
    return (
        "left" if cx < width * 0.33
        else "right" if cx > width * 0.66
        else "center"
    )


def build_geometry_steps(landmarks, target_idx, origin=(320, 360), min_turn_deg=5):
    """Build turn / walk instructions that visit landmarks up to a target.

    The route starts at ``origin`` facing "up" the image (heading 0 degrees)
    and goes through ``landmarks[0] .. landmarks[target_idx]`` in order. For
    each leg the bearing to the landmark centre is computed in image
    coordinates; a turn is announced only when it exceeds ``min_turn_deg``.

    Args:
        landmarks: sequence of dicts with keys ``'label'``, ``'cx'``, ``'cy'``
            (centre pixel) and ``'dist'`` (number printed as metres).
        target_idx: index of the destination landmark.
        origin: ``(x, y)`` pixel where the user is assumed to stand; the
            default is the bottom-centre of a 640x360 frame.
        min_turn_deg: smallest turn, in degrees, worth announcing.

    Returns:
        List of instruction strings, in the order they should be followed.

    Raises:
        IndexError: if ``target_idx`` is out of range for ``landmarks``.
    """
    steps = []
    heading = 0.0
    user_x, user_y = origin
    route = landmarks[:target_idx] + [landmarks[target_idx]]

    for lm in route:
        dx = lm['cx'] - user_x
        dy = user_y - lm['cy']  # image y grows downwards, so "ahead" is -y
        bearing = math.degrees(math.atan2(dx, dy))
        # Wrap into [-180, 180): without this, crossing the +/-180 boundary
        # yields e.g. "Turn 335 right" instead of the equivalent "Turn 24 left".
        turn = (bearing - heading + 180.0) % 360.0 - 180.0
        if abs(turn) > min_turn_deg:
            dirn = 'right' if turn > 0 else 'left'
            steps.append(f"Turn {abs(int(turn))}° {dirn} to face the {lm['label']}")
            heading = bearing
        # NOTE(review): 'dist' is whatever the caller stored for the landmark;
        # if that is a camera-relative depth it overstates legs that start
        # from a previous landmark -- confirm intended semantics.
        steps.append(f"Walk forward {lm['dist']:.1f} meters to the {lm['label']}")
        user_x, user_y = lm['cx'], lm['cy']
    return steps


class VisionApp(tk.Tk):
    """Tk window that speaks a natural-language route to a chosen object.

    An image is captioned, run through the object detector and the depth
    estimator; detections become "landmarks" sorted by depth value. Picking a
    landmark in the drop-down computes geometric steps with
    ``build_geometry_steps`` and passes them to ``plan_navigation`` for
    wording; the result is shown and spoken.
    """

    def __init__(self):
        """Build the widgets, load the models and initialise empty state."""
        super().__init__()
        self.title("AI Vision Assistant - Guided Navigation")
        self.geometry("900x700")

        # UI
        tk.Button(self, text="Load Image/Video", command=self.load_file).pack(pady=10)
        self.canvas = tk.Canvas(self, width=640, height=360, bg="black")
        self.canvas.pack()
        self.txt_info = tk.Text(self, wrap="word", height=12)
        self.txt_info.pack(fill="both", expand=True, padx=10, pady=10)
        self.obj_dropdown = ttk.Combobox(self, state="readonly")
        self.obj_dropdown.pack(pady=5)
        self.obj_dropdown.bind("<<ComboboxSelected>>", self.on_object_select)

        # Models & state
        self.yolo = YOLODetector()
        self.midas = MiDaSDepth()
        self.tts = pyttsx3.init()
        self.selected_image = None
        self.depth_map = None
        self.landmarks = []  # each is dict with label, bbox, dist, cx, cy
        self.scene_overview = ""

    def load_file(self):
        """Ask for a file and process it as an image; do nothing on cancel."""
        path = filedialog.askopenfilename(filetypes=[("Media files","*.jpg *.jpeg *.png *.mp4 *.avi"),("All","*.*")])
        if not path:
            return
        self.canvas.delete("all")
        self.txt_info.delete("1.0", tk.END)
        self.process_image(path)

    def _show_on_canvas(self, image):
        """Replace whatever the canvas shows with ``image`` (a PIL image)."""
        self.canvas.delete("all")  # avoid stacking one canvas item per redraw
        self.photo = ImageTk.PhotoImage(image, master=self)
        self.canvas.create_image(0, 0, anchor='nw', image=self.photo)

    def process_image(self, path):
        """Caption the image at ``path`` and build the landmark list from it.

        Files that cannot be decoded as an image (including the video types
        offered by the file dialog) are reported in an error dialog instead of
        raising inside the Tk callback.
        """
        try:
            with Image.open(path) as src:
                img = src.convert("RGB").resize((640, 360))
        except (OSError, ValueError) as exc:
            # messagebox is not imported at the top of this cell.
            from tkinter import messagebox
            messagebox.showerror("Error", f"Cannot open as an image:\n{exc}")
            return
        self.selected_image = img.copy()
        self._show_on_canvas(img)

        # YOLO + depth
        raw = self.yolo.detect(img)
        depth_arr, depth_vis = self.midas.estimate_depth(img)
        self.depth_map = depth_arr
        h, w = depth_arr.shape

        # Scene overview
        self.scene_overview = describe_image(path)
        self.txt_info.insert(tk.END, f"Scene Overview:\n{self.scene_overview}\n\n")

        # Build landmarks
        self.landmarks = []
        for obj in raw:
            x1, y1, x2, y2 = map(int, obj['bbox'])
            # Clamp the centre into the depth map; plain ints keep the
            # landmark dicts (handed to plan_navigation) free of numpy scalars.
            cx = int(np.clip((x1 + x2) // 2, 0, w - 1))
            cy = int(np.clip((y1 + y2) // 2, 0, h - 1))
            dist = float(depth_arr[cy, cx])
            self.landmarks.append({
                'label': obj['label'],
                'bbox': (x1, y1, x2, y2),
                'dist': round(dist, 2),
                'cx': cx,
                'cy': cy
            })
        # NOTE(review): sorting ascending treats small values as "near";
        # verify that MiDaSDepth returns distance and not inverse depth.
        self.landmarks.sort(key=lambda o: o['dist'])

        # List landmarks
        self.txt_info.insert(tk.END, "Landmarks (nearest first):\n")
        for i, lm in enumerate(self.landmarks):
            self.txt_info.insert(tk.END, f"{i}. {lm['label']} at {lm['dist']} m\n")

        # Dropdown
        vals = [f"{lm['label']} ({i})" for i, lm in enumerate(self.landmarks)]
        self.obj_dropdown['values'] = vals
        # Blank the field when nothing was detected so no stale label remains.
        self.obj_dropdown.set(vals[0] if vals else "")

        # Depth map preview
        depth_win = tk.Toplevel(self)
        depth_win.title("Depth Map")
        if isinstance(depth_vis, Image.Image):
            dv_img = depth_vis.resize((320, 180))
        else:
            # presumably a float array scaled to [0, 1] -- verify against MiDaSDepth
            dv_img = Image.fromarray((depth_vis * 255).astype(np.uint8)).resize((320, 180))
        ph = ImageTk.PhotoImage(dv_img, master=depth_win)
        tk.Label(depth_win, image=ph).pack()
        depth_win.image = ph  # keep a reference so Tk does not lose the image

        # Speech blocks the event loop, so populate and draw the UI first.
        self.update_idletasks()
        self.tts.say(self.scene_overview)
        self.tts.runAndWait()

    def on_object_select(self, event):
        """Show, highlight and speak the navigation plan to the chosen landmark."""
        idx = self.obj_dropdown.current()
        if idx < 0 or idx >= len(self.landmarks) or self.selected_image is None:
            return

        geometry_steps = build_geometry_steps(self.landmarks, idx)

        # Generate natural language from geometry steps
        natural_instruction = plan_navigation(self.landmarks, idx, geometry_steps)
        # presumably plan_navigation returns a string; fall back to the raw
        # geometric steps if it hands back nothing usable.
        if not isinstance(natural_instruction, str) or not natural_instruction.strip():
            natural_instruction = "\n".join(geometry_steps)

        # Display
        self.txt_info.insert(tk.END, "\nNavigation Plan:\n")
        self.txt_info.insert(tk.END, natural_instruction + "\n")

        # Highlight target object
        x1, y1, x2, y2 = self.landmarks[idx]['bbox']
        arr = np.array(self.selected_image)
        cv2.rectangle(arr, (x1, y1), (x2, y2), (0, 255, 0), 3)
        self._show_on_canvas(Image.fromarray(arr))

        # Speech blocks the event loop, so draw the text and highlight first.
        self.update_idletasks()
        self.tts.say(natural_instruction)
        self.tts.runAndWait()


# Launch the GUI only when this cell/script is the entry point; mainloop()
# hands control to Tk's event loop.
if __name__ == "__main__":
    app = VisionApp()
    app.mainloop()


Using cache found in C:\Users\hp/.cache\torch\hub\intel-isl_MiDaS_master
Using cache found in C:\Users\hp/.cache\torch\hub\intel-isl_MiDaS_master



0: 384x640 2 persons, 4 cars, 155.6ms
Speed: 1.5ms preprocess, 155.6ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)
