# In [None]:
import json
import logging
import cv2
import numpy as np
import speech_recognition as sr
from deep_translator import GoogleTranslator
import spacy
from ultralytics import YOLO
import matplotlib.pyplot as plt

# Suppress warnings
# (only messages of level ERROR and above from the "transformers.modeling_utils" logger are kept)
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# Load spaCy model
# Shared English pipeline used by extract_object_and_location().
nlp = spacy.load("en_core_web_sm")

# Load YOLOv8-seg model
# Shared segmentation model used by detect_segment_object().
model_yolo = YOLO("yolov8x-seg.pt")

# List of valid location keywords.
# NOTE(review): extract_object_and_location() compares these against single
# spaCy tokens, so the two-word entry "upper left" presumably never matches as
# a whole -- confirm. detect_segment_object() only has sorting branches for
# left/right/middle/center/top/above/bottom/below; the other keywords are
# accepted here but do not change the ordering there.
valid_locations = {"left", "right", "top", "bottom", "front", "back", "center", "middle", "upper left", "below", "above"}

# -------------------------------------------------------------------------------------------------------------------------
def extract_object_and_location(user_input):
    """Extract the target object name and a location keyword from a sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``.
    The location is the last token whose lowercase text is in
    ``valid_locations``. The object is the first non-pronoun noun chunk,
    with a leading article and any location words removed, so that
    "the left cup" yields "cup" and a chunk such as "the left" is skipped
    instead of being reported as the object.

    Args:
        user_input: English sentence to analyse.

    Returns:
        A tuple ``(object_name, target_location)``. Either element is None
        when it could not be found; a message is printed in that case.
    """
    doc = nlp(user_input)

    # Single-token match; when several location words occur the last one wins.
    target_location = None
    for token in doc:
        word = token.text.lower()
        if word in valid_locations:
            target_location = word

    # Individual words of every location phrase ("upper left" -> "upper",
    # "left"), used to keep location words out of the object name.
    location_words = {part for phrase in valid_locations for part in phrase.split()}
    articles = {"a", "an", "the"}

    object_name = None
    for chunk in doc.noun_chunks:
        if chunk.root.pos_ == "PRON":
            continue
        words = chunk.text.split()
        if words and words[0].lower() in articles:
            words = words[1:]
        words = [w for w in words if w.lower() not in location_words]
        if words:
            # First chunk that still names something after clean-up.
            object_name = " ".join(words)
            break

    if not object_name:
        print("Không tìm thấy đối tượng trong câu.")
    if not target_location:
        print("Không tìm thấy vị trí trong câu.")

    return object_name, target_location

# -------------------------------------------------------------------------------------------------------------------------
def detect_segment_object(image_path, target_object, target_location):
    """Segment ``target_object`` in an image and display the chosen instance.

    Runs the module-level YOLO segmentation model on the image, keeps the
    detections whose class label equals ``target_object`` (case-insensitive),
    orders them according to ``target_location`` and shows the first one with
    its mask filled, its bounding box drawn and its mask centroid marked.

    Args:
        image_path: Path of the image file to analyse.
        target_object: Class label to look for. A falsy value aborts early.
        target_location: "left", "right", "middle"/"center", "top"/"above"
            or "bottom"/"below". Any other value (including None) leaves the
            matching detections in the order the model returned them.

    Returns:
        None. Results are printed and displayed with matplotlib.
    """
    if not target_object:
        print("No object detected from text input.")
        return

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing or unreadable file by returning None.
        print(f"Could not read image: {image_path}")
        return

    results = model_yolo(image, conf=0.5)

    wanted_label = target_object.lower()
    object_positions = []
    for result in results:
        # No masks are available when nothing was detected in this result.
        if result.masks is None:
            continue
        for box, mask in zip(result.boxes, result.masks.xy):
            label = result.names[int(box.cls[0].item())]
            if label.lower() != wanted_label:
                continue
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            object_positions.append((x1, y1, x2, y2, label, mask))

    if not object_positions:
        print(f"Object '{target_object}' not found in the image.")
        return

    if target_location == "right":
        object_positions.sort(key=lambda pos: pos[0], reverse=True)
    elif target_location == "left":
        object_positions.sort(key=lambda pos: pos[0])
    elif target_location in {"middle", "center"}:
        # "Middle" is relative to the matching objects: the reference point is
        # the mean of their horizontal box centres, not the image centre.
        center_x = sum((pos[0] + pos[2]) / 2 for pos in object_positions) / len(object_positions)
        object_positions.sort(key=lambda pos: abs((pos[0] + pos[2]) / 2 - center_x))
    elif target_location in {"top", "above"}:
        object_positions.sort(key=lambda pos: pos[1])
    elif target_location in {"bottom", "below"}:
        object_positions.sort(key=lambda pos: pos[3], reverse=True)

    print("Object positions (sorted by location):")
    for position in object_positions:
        print(f"Object: {position[4]}, x1: {position[0]}, y1: {position[1]}, x2: {position[2]}, y2: {position[3]}")

    # Only the best-ranked instance is visualised.
    x1, y1, x2, y2, label, mask = object_positions[0]
    print(f"Found {label} at x1: {x1}, x2: {x2}, y1: {y1}, y2: {y2}")

    # Rasterise the polygon mask into a binary image.
    mask_bin = np.zeros(image.shape[:2], dtype=np.uint8)
    polygon = np.array(mask, dtype=np.int32)
    if len(polygon):
        # An empty polygon is skipped; the centroid branch below then reports
        # a zero area.
        cv2.fillPoly(mask_bin, [polygon], 255)

    # Paint the segmented pixels (255, 0, 0); the image is in OpenCV's BGR order.
    highlighted = image.copy()
    highlighted[mask_bin == 255] = (255, 0, 0)
    cv2.rectangle(highlighted, (x1, y1), (x2, y2), (0, 255, 0), 2)

    # Compute the centroid of the mask from its image moments.
    M = cv2.moments(mask_bin)
    if M["m00"] != 0:
        cx = int(M["m10"] / M["m00"])
        cy = int(M["m01"] / M["m00"])
        print(f"Trọng tâm của {label}: (cx={cx}, cy={cy})")
        cv2.circle(highlighted, (cx, cy), 5, (0, 0, 255), -1)
    else:
        print("Không tính được trọng tâm (diện tích bằng 0).")

    plt.figure(figsize=(10, 6))
    plt.imshow(cv2.cvtColor(highlighted, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.title(f"Segmented Object: {target_object}")
    plt.show()

# -------------------------------------------------------------------------------------------------------------------------
def speech_to_text():
    """Record one Vietnamese utterance and return its English translation.

    Listens on the default microphone, transcribes the audio with Google
    Speech Recognition (``vi-VN``) and translates the transcript from
    Vietnamese to English with ``GoogleTranslator``.

    Returns:
        The translated text, or None when recognition or translation failed
        (the error is printed instead of being raised).
    """
    listener = sr.Recognizer()
    vi_to_en = GoogleTranslator(source='vi', target='en')

    english_text = None
    with sr.Microphone() as mic:
        print("Hãy nói điều gì đó...")
        listener.adjust_for_ambient_noise(mic, duration=1)
        print("Bắt đầu ghi âm...")

        try:
            recording = listener.listen(mic, timeout=5)
            print("Đang xử lý...")

            spoken_vi = listener.recognize_google(recording, language='vi-VN')
            print("Bạn vừa nói (tiếng Việt):", spoken_vi)

            candidate = vi_to_en.translate(spoken_vi)
            print("Dịch sang tiếng Anh:", candidate)
        except sr.UnknownValueError:
            print("Xin lỗi, tôi không hiểu được.")
        except sr.RequestError as err:
            print("Lỗi kết nối đến dịch vụ Google Speech Recognition:", err)
        except Exception as err:
            print("Đã xảy ra lỗi:", err)
        else:
            # Only reached when every step above succeeded.
            english_text = candidate

    return english_text

# -------------------------------------------------------------------------------------------------------------------------
# Main
# Capture a spoken command, extract the object and location from its English
# translation, then segment that object in the image below.
user_input = speech_to_text()
if user_input:  # speech_to_text() returns None when recognition/translation failed
    object_name, target_location = extract_object_and_location(user_input)
    print("Extracted object:", object_name)
    print("Target location:", target_location)

    # NOTE(review): hard-coded absolute Windows path; this only works on a
    # machine where this exact file exists.
    image_path = r"C:\FPTU\OJT\project_robot_mic\Project\Project\images.jpg"
    detect_segment_object(image_path, object_name, target_location)
