In [13]:
import apriltag
import cv2
import numpy as np
import math
import mediapipe as mp

In [14]:
def CreateUI():
	"""Render the static UI background and record where its boxes are.

	Draws three filled rectangles, evenly spaced horizontally and vertically
	centred, on a black 1900x1080 (width x height) BGR canvas.

	Side effect:
		Replaces the contents of the module-level ``BoxDims`` list with one
		``(x, y, width, height)`` tuple per box, in left-to-right order.
		``BoxDims`` must already exist when this function is called.

	Returns:
		np.ndarray: the ``(1080, 1900, 3)`` uint8 canvas with the boxes drawn.
	"""
	UIHeight = 1080
	UIWidth = 1900

	UIBG = np.zeros((UIHeight,UIWidth,3),dtype=np.uint8)

	BoxHeight = 1000
	BoxWidth = 400
	Gap = 175

	YStart = int((UIHeight-BoxHeight)/2)

	# Reset in place (keeps the same list object) so that calling CreateUI()
	# more than once does not leave duplicate entries behind.
	BoxDims.clear()

	for i in range(3):
		XStart = Gap + i*(BoxWidth+Gap)

		BoxTopLeft = (XStart,YStart)
		BoxBottomRight = (XStart+BoxWidth , YStart+BoxHeight)

		# Thickness -1 asks OpenCV for a filled rectangle.
		cv2.rectangle(UIBG,BoxTopLeft,BoxBottomRight,(192,242,30),-1)

		BoxDims.append((XStart,YStart,BoxWidth,BoxHeight))
	return UIBG

def CalculateUICorners(Tag):
	"""Derive the four corners of the projected UI quad from a detected tag.

	The quad is twice the tag's size along each of the tag's own edge
	directions, and its centre is displaced from the tag centre by two
	"left edge" vectors (top-left -> bottom-left).

	Args:
		Tag: object exposing ``corners`` (indexable, 4 points supporting numpy
			arithmetic; read here as top-left, top-right, bottom-right,
			bottom-left) and ``center``.

	Returns:
		np.ndarray: float32 array of shape (4, 2) ordered top-left, top-right,
		bottom-right, bottom-left.
	"""
	Corners = [Tag.corners[k] for k in range(4)]
	TopLeft, TopRight, BottomLeft = Corners[0], Corners[1], Corners[3]

	# Edge vectors of the tag; they define the UI's local axes.
	AlongTop = TopRight - TopLeft
	AlongLeft = BottomLeft - TopLeft

	OffsetScale, UIScale = 2, 2

	Center = Tag.center + (AlongLeft*OffsetScale)
	WidthVec = AlongTop*UIScale
	HeightVec = AlongLeft*UIScale

	# Step back half a width and half a height from the centre to reach the origin corner.
	Origin = Center-(WidthVec/2)-(HeightVec/2)

	Quad = [
		Origin,
		Origin+WidthVec,
		Origin+WidthVec+HeightVec,
		Origin+HeightVec,
	]
	return np.array(Quad, dtype=np.float32)

def Touch(frame,Matrix,WFrame,HFrame,UIBG):
	"""Map the index fingertip onto the UI and highlight a pinched box.

	Runs the module-level hand tracker on ``frame``. When a hand is found and
	``Matrix`` is available, the hand skeleton is drawn on ``frame``, the index
	fingertip is mapped through the inverse of ``Matrix`` into UI coordinates,
	a red dot is drawn there on ``UIBG``, and the box under the fingertip (if
	any) is looked up in ``BoxDims``. If the thumb and index tips are closer
	than 0.05 (in the landmark's own coordinate units) while over a box, the
	selection is printed and that box is filled green on ``UIBG``.

	Args:
		frame: BGR camera image; modified in place when landmarks are drawn.
		Matrix: 3x3 homography from UI coordinates to frame coordinates, or None.
		WFrame: frame width, used to scale the landmark x coordinate to pixels.
		HFrame: frame height, used to scale the landmark y coordinate to pixels.
		UIBG: UI image to annotate; modified in place.

	Returns:
		tuple: ``(UIBG, index)`` where ``index`` is the 0-based box under the
		fingertip, or -1 when there is none (pinching is not required).
	"""
	Detection = hand.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

	# Without a hand or a homography there is nothing to map.
	if not Detection.multi_hand_landmarks or Matrix is None:
		return UIBG,-1

	Landmarks = Detection.multi_hand_landmarks[0]
	mpdrawing.draw_landmarks(frame, Landmarks, mphands.HAND_CONNECTIONS)

	Index = Landmarks.landmark[mphands.HandLandmark.INDEX_FINGER_TIP]
	Thumb = Landmarks.landmark[mphands.HandLandmark.THUMB_TIP]
	IndexPixel = (int(Index.x * WFrame),int(Index.y * HFrame))

	Hovered = -1
	try:
		# Matrix goes UI -> frame, so its inverse takes the fingertip back into UI space.
		FrameToUI = np.linalg.inv(Matrix)
		Mapped = cv2.perspectiveTransform(np.array([[IndexPixel]], dtype=np.float32), FrameToUI)
		fx = int(Mapped[0][0][0])
		fy = int(Mapped[0][0][1])
		cv2.circle(UIBG, (fx, fy), 15, (0, 0, 255), -1)

		# First box whose interior (strictly inside the edges) contains the fingertip.
		Hovered = next(
			(n for n,(bx,by,bw,bh) in enumerate(BoxDims) if bx<fx<bx+bw and by<fy<by+bh),
			-1,
		)
	except np.linalg.LinAlgError:
		print("LinAlgError!")

	PinchGap = math.hypot(Thumb.x - Index.x , Thumb.y-Index.y)
	if Hovered != -1 and PinchGap < 0.05:
		print(f"User selected box {Hovered+1}")
		left,top,width,height = BoxDims[Hovered]
		# Filled green rectangle marks the selected box (drawn over the fingertip dot).
		cv2.rectangle(UIBG, (left,top), (left+width,top+height), (0,255,0), -1)

	return UIBG,Hovered

In [15]:
#Setup
# Builds every piece of module-level state the main loop relies on:
# the UI image, the AprilTag detector, the hand tracker, one Kalman filter
# per UI corner, the calibration flags, and the camera handle.

BoxDims = []	# filled by CreateUI() with (x, y, width, height) per box
UIBGOriginal = CreateUI()
h,w,c = UIBGOriginal.shape
options = apriltag.DetectorOptions(families="tag16h5")
detector = apriltag.Detector(options)
Angle = 0
mphands = mp.solutions.hands
mpdrawing = mp.solutions.drawing_utils
hand = mphands.Hands(min_detection_confidence = 0.7,max_num_hands = 1)

# One filter per UI corner. State is (x, y, vx, vy); only (x, y) is measured.
KalmanFilters = [cv2.KalmanFilter(4,2) for _ in range(4)]
for kf in KalmanFilters:
	# Constant-velocity model: position advances by velocity each step.
	kf.transitionMatrix = np.array([[1, 0, 1, 0], [0, 1, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1]], np.float32)
	kf.measurementMatrix = np.array([[1, 0, 0, 0], [0, 1, 0, 0]], np.float32)
	kf.processNoiseCov = np.eye(4, dtype=np.float32) * 1e-3
	kf.measurementNoiseCov = np.eye(2, dtype=np.float32) * 1e-2

calibrated = False
FramesSinceDetection = 0

cam = cv2.VideoCapture(0,cv2.CAP_V4L2)
if not cam.isOpened():
	# Raise instead of calling exit(): this stops the cell with a clear error
	# and releases the handle, rather than tearing down the interpreter session.
	cam.release()
	raise RuntimeError("Cannot Open Camera")

# NOTE(review): width=1080 / height=1900 looks swapped relative to the
# 1900x1080 UI canvas built by CreateUI() — confirm the intended capture mode.
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1080)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 1900)

I0000 00:00:1757946449.620699  398612 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1757946449.723279  434797 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 580.65.06), renderer: NVIDIA GeForce RTX 5070 Laptop GPU/PCIe/SSE2
W0000 00:00:1757946449.766146  434773 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1757946449.793176  434776 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [16]:
# Main loop: detect the marker, track the UI quad, overlay the UI on the
# camera frame and handle touch input. 'c' calibrates, 'q' quits.

MinMarkerAreaRatio = 0.008	# marker must cover more than this fraction of the frame
MaxMissedFrames = 3	# keep drawing the UI for this many frames without a valid marker

# Corners of the UI image in its own pixel space, ordered to match
# the output of CalculateUICorners (TL, TR, BR, BL).
SourcePoints = np.array([[0,0],[w,0],[w,h],[0,h]], dtype=np.float32)

def FindValidMarker(Detections, FrameArea):
	"""Return the first detection with tag_id 0 that is large enough, else None."""
	for r in Detections:
		MarkerArea = cv2.contourArea(np.array(r.corners, dtype=np.int32))
		if r.tag_id == 0 and MarkerArea > (FrameArea*MinMarkerAreaRatio):
			return r
	return None

try:
	while True:

		UIBG = UIBGOriginal.copy()
		success,frame = cam.read()
		if not success or frame is None:
			# A failed grab yields no image; stop instead of crashing in cvtColor.
			print("Failed to read frame from camera")
			break
		gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

		HFrame,WFrame,_ = frame.shape
		FrameArea = HFrame*WFrame

		ApriltagResults = detector.detect(gray)
		# Use the detection that actually passed the id/size check,
		# not simply the first entry of the result list.
		Marker = FindValidMarker(ApriltagResults, FrameArea)

		Matrix = None

		key = cv2.waitKey(1) & 0xFF

		# Calibrate only when a valid marker is in view; otherwise the filters
		# would start tracking from an all-zero state.
		if not calibrated and Marker is not None and key == ord('c'):
			DestinationPoints = CalculateUICorners(Marker)

			for i,corner in enumerate(DestinationPoints):
				KalmanFilters[i].statePost = np.array([corner[0], corner[1], 0, 0], dtype=np.float32)

			FramesSinceDetection = 0
			calibrated = True
			print("Calibrated!")

		if calibrated:
			# Advance every filter one step before applying this frame's measurement.
			for kf in KalmanFilters:
				kf.predict()

			if Marker is not None:
				DestinationPoints = CalculateUICorners(Marker)
				FramesSinceDetection = 0
				for i,corner in enumerate(DestinationPoints):
					KalmanFilters[i].correct(corner)
			else:
				# Count every frame without a usable marker, including frames
				# where only other or too-small tags were detected.
				FramesSinceDetection += 1

			SmoothPoints = np.array([kf.statePost[:2].flatten() for kf in KalmanFilters], dtype=np.float32)

			if FramesSinceDetection <= MaxMissedFrames:
				Matrix,_ = cv2.findHomography(SourcePoints,SmoothPoints)

				UIBG,FingerOption = Touch(frame,Matrix,WFrame,HFrame,UIBG)

				if Matrix is not None:
					WarpedUI = cv2.warpPerspective(UIBG, Matrix, (WFrame,HFrame))

					# Copy only non-black warped pixels so the camera image
					# stays visible everywhere else.
					mask = np.sum(WarpedUI, axis=2) > 0
					frame[mask] = WarpedUI[mask]
		else:
			cv2.putText(frame, "Show marker and press 'c' to calibrate", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

		cv2.imshow("VideoFeed",frame)

		if key == ord('q'):
			break
finally:
	# Always free the camera and close windows, even if an iteration raised.
	cam.release()
	cv2.destroyAllWindows()

Calibrated!
User selected box 2
User selected box 2
User selected box 2
User selected box 2
User selected box 1
User selected box 1
User selected box 1
User selected box 2
User selected box 2
User selected box 3
