In [None]:
!pip install facenet_pytorch
!pip install -U torch torchvision
!pip install torch torchvision torchaudio
!pip install opencv-python
!pip install face_recognition
!pip install pyttsx3
!pip install numpy
!pip install ultralytics 
!pip install timm
!pip install mediapipe

In [None]:
import cv2
import os
import face_recognition
from IPython.display import display, clear_output
from PIL import Image
import numpy as np

# Step 1: Create output folder
# NOTE(review): the folder is named "sorna" while the saved files are prefixed
# "siva_" and later cells test for the label "Siva" — confirm which class name
# is intended, since the training cell derives labels from folder names.
output_folder = r"C:\Users\HP\Documents\face\face_data\train\sorna"
os.makedirs(output_folder, exist_ok=True)

# Step 2: Start webcam
cap = cv2.VideoCapture(0)

count = 0
total_images = 10  # keep small for notebook
print("📸 Face capture started...")

try:
    while count < total_images:
        ret, frame = cap.read()
        if not ret:
            print("❌ Failed to read from webcam")
            break

        # Detect on a quarter-size copy for speed. cv2.cvtColor returns a new
        # contiguous RGB array; the previous `[:, :, ::-1]` slice was a
        # negative-stride view, which recent dlib builds refuse to accept.
        small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
        rgb_small = cv2.cvtColor(small_frame, cv2.COLOR_BGR2RGB)

        # Detect faces
        face_locations = face_recognition.face_locations(rgb_small)

        for top, right, bottom, left in face_locations:
            # Several faces in one frame must not push us past the target.
            if count >= total_images:
                break

            # Scale the box back up to full-resolution coordinates (1 / 0.25).
            top, right, bottom, left = top * 4, right * 4, bottom * 4, left * 4

            # Crop the face
            face_img = frame[top:bottom, left:right]
            if face_img.size == 0:
                continue

            filename = os.path.join(output_folder, f"siva_{count:03d}.jpg")
            # cv2.imwrite reports failure through its return value instead of
            # raising; surface it rather than counting an image never written.
            if not cv2.imwrite(filename, face_img):
                raise OSError(f"Could not write {filename}")
            count += 1

            # Show preview in notebook
            clear_output(wait=True)
            pil_img = Image.fromarray(cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB))
            display(pil_img)
            print(f"✅ Saved {filename}")

except KeyboardInterrupt:
    print("⏹️ Interrupted manually.")

finally:
    # Step 3: Release the webcam, even when an unexpected error ends the loop,
    # so the device is not left locked for the next cell.
    cap.release()

print("🎉 Face capture completed.")

In [None]:
import os
import torch
import timm
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# --- Locations: labelled face crops in, checkpoint out ---
dataset_path = r"C:\Users\HP\Documents\face\face_data\train"  # <-- Your dataset path
output_path = "C:/Users/HP/Documents/face/swin_fewshot_model.pt"

# --- Preprocessing applied to every image before it reaches the encoder ---
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.5] * 3, [0.5] * 3),
    ]
)

# --- One sub-folder per identity; images are visited one at a time, in order ---
dataset = ImageFolder(dataset_path, transform=transform)
loader = DataLoader(dataset, batch_size=1, shuffle=False)
idx_to_class = {index: class_name for class_name, index in dataset.class_to_idx.items()}

# --- Pretrained Swin encoder with its classification head swapped for Identity ---
model = timm.create_model("swin_tiny_patch4_window7_224", pretrained=True)
model.head = torch.nn.Identity()
model.eval()

# --- Collect the encoder output of every image, grouped by class name ---
prototypes = {}
with torch.no_grad():
    for img, label in loader:
        name = idx_to_class[label.item()]
        emb = model(img).squeeze(0)
        prototypes.setdefault(name, []).append(emb)

# --- A class prototype is the element-wise mean of that class's embeddings ---
prototypes = {
    name: torch.stack(class_embeddings).mean(dim=0)
    for name, class_embeddings in prototypes.items()
}

# --- Persist the encoder weights together with the prototypes ---
checkpoint_payload = {
    'model_state_dict': model.state_dict(),
    'prototypes': prototypes,
}
torch.save(checkpoint_payload, output_path)

print("✅ Few-shot face model trained and saved.")


In [None]:
import torch
import cv2
from PIL import Image
import timm
from facenet_pytorch import MTCNN
from ultralytics import YOLO
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from IPython.display import clear_output
import numpy as np

# Checkpoint written by the prototype-building cell
model_path = "C:/Users/HP/Documents/face/swin_fewshot_model.pt"

# Preprocessing for face crops (same steps as used when building prototypes)
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.5] * 3, [0.5] * 3),
    ]
)

# Swin encoder: rebuild the architecture, then restore the saved weights
checkpoint = torch.load(model_path, map_location=torch.device("cpu"))
model = timm.create_model("swin_tiny_patch4_window7_224", pretrained=False)
model.head = torch.nn.Identity()
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()

# Per-identity mean embeddings stored next to the weights
prototypes = checkpoint["prototypes"]

# Face detector (returns every face in the frame, runs on CPU)
mtcnn = MTCNN(keep_all=True, device="cpu")

# YOLOv8 object detector
yolo = YOLO("yolov8n.pt")  # Replace with custom model if needed

# Face recognizer
def identify_face(face_img_pil, threshold=None):
    """Return the name of the stored prototype closest to a face image.

    The image is preprocessed with the module-level ``transform``, embedded
    with the module-level ``model`` (gradients disabled), and compared against
    every entry of the module-level ``prototypes`` using the L2 norm of the
    difference.

    Args:
        face_img_pil: PIL image of a cropped face. It is converted to RGB so
            that non-3-channel inputs do not break the 3-channel normalisation.
        threshold: Optional maximum accepted distance. When given and the best
            match is farther away than this, ``"Unknown"`` is returned instead
            of forcing the face onto the nearest known identity. ``None``
            (default) keeps the original always-pick-nearest behaviour.

    Returns:
        The prototype name with the smallest distance, or ``"Unknown"`` when
        ``threshold`` rejects the best match.

    Raises:
        ValueError: If ``prototypes`` is empty.
    """
    if not prototypes:
        raise ValueError("No face prototypes are loaded; cannot identify a face.")

    face_tensor = transform(face_img_pil.convert("RGB")).unsqueeze(0)
    with torch.no_grad():
        emb = model(face_tensor).squeeze(0)

    dists = {name: torch.norm(emb - proto).item() for name, proto in prototypes.items()}
    best_name = min(dists, key=dists.get)

    if threshold is not None and dists[best_name] > threshold:
        return "Unknown"
    return best_name

# Start webcam
cap = cv2.VideoCapture(0)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_h, frame_w = frame.shape[:2]
        # MTCNN and the face encoder work on RGB; OpenCV delivers BGR.
        rgb_input = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        face_names = []
        face_boxes = []  # (name, x1, y1, x2, y2), drawn after all detection is done

        # Detect faces
        boxes, _ = mtcnn.detect(rgb_input)
        if boxes is not None:
            for box in boxes:
                x1, y1, x2, y2 = map(int, box)
                # Boxes can extend past the image; a negative start index would
                # wrap around in NumPy slicing and crop the wrong region.
                x1, y1 = max(x1, 0), max(y1, 0)
                x2, y2 = min(x2, frame_w), min(y2, frame_h)
                face_crop = rgb_input[y1:y2, x1:x2]
                if face_crop.size == 0:
                    continue
                name = identify_face(Image.fromarray(face_crop))
                face_names.append(name)
                face_boxes.append((name, x1, y1, x2, y2))

        # Detect objects on the still-unannotated frame so our own rectangles
        # and text cannot influence the detector.
        results = yolo(frame, verbose=False)
        if results and results[0].boxes.cls.numel() > 0:
            class_ids = results[0].boxes.cls.cpu().numpy().astype(int)
            # sorted() keeps the on-screen label order stable between frames.
            object_names = sorted({yolo.names[cid] for cid in class_ids})
        else:
            object_names = []

        # Draw face annotations
        for name, x1, y1, x2, y2 in face_boxes:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0,255,0), 2)
            cv2.putText(frame, name, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255,255,255), 2)

        # Smart Label Logic
        # A recognised face already implies a person, so "person" is dropped
        # from the object list that follows the names. Filtering first avoids
        # the dangling "<name> with " label when "person" was the only object.
        other_objects = [obj for obj in object_names if obj != "person"]
        if face_names:
            label_text = " & ".join(face_names)
            if other_objects:
                label_text += f" with {', '.join(other_objects)}"
        elif object_names:
            label_text = ", ".join(object_names)
        else:
            label_text = "No faces or objects detected"

        cv2.putText(frame, label_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 0), 2)

        # Convert BGR to RGB and display using matplotlib
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        clear_output(wait=True)
        plt.imshow(rgb_frame)
        plt.title("Siva with Objects")
        plt.axis("off")
        plt.show()

except KeyboardInterrupt:
    print("Stopped by user.")

finally:
    cap.release()


In [None]:
import cv2
import mediapipe as mp

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils

hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.7
)

# Start webcam
cap = cv2.VideoCapture(0)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # 🪞 Mirror the camera
        frame = cv2.flip(frame, 1)

        # Convert BGR to RGB
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb_frame)

        hand_labels = []

        if results.multi_hand_landmarks:
            for hand_landmarks, handedness in zip(results.multi_hand_landmarks, results.multi_handedness):
                label = handedness.classification[0].label  # "Left" or "Right"
                hand_labels.append(f"{label} Hand")
                mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # Add label to screen
        if hand_labels:
            label_text = " | ".join(hand_labels)
            cv2.putText(frame, label_text, (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)

        # Display the result
        cv2.imshow("🖐️ Hand Detection (Mirror)", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

except KeyboardInterrupt:
    # Interrupting the kernel is the usual way to stop a notebook loop.
    print("Stopped by user.")

finally:
    # Clean up — runs on 'q', on interrupt and on unexpected errors, so the
    # webcam and the preview window are never left open.
    cap.release()
    cv2.destroyAllWindows()
    hands.close()


In [None]:
import cv2
import torch
from PIL import Image
import timm
from facenet_pytorch import MTCNN
from ultralytics import YOLO
import torchvision.transforms as transforms
import mediapipe as mp
import matplotlib.pyplot as plt
from IPython.display import clear_output
import numpy as np

# ========== Face preprocessing ==========
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.5] * 3, [0.5] * 3),
    ]
)

# ========== Face recognition model ==========
# Rebuild the Swin encoder without its classification head, then restore the
# weights and the per-identity prototypes from the saved checkpoint.
model_path = "C:/Users/HP/Documents/face/swin_fewshot_model.pt"
checkpoint = torch.load(model_path, map_location=torch.device("cpu"))

model = timm.create_model("swin_tiny_patch4_window7_224", pretrained=False)
model.head = torch.nn.Identity()
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()

prototypes = checkpoint["prototypes"]

# ========== Detectors ==========
mtcnn = MTCNN(keep_all=True, device="cpu")  # faces
yolo = YOLO("yolov8n.pt")                   # objects

def identify_face(face_img_pil, threshold=None):
    """Return the name of the stored prototype closest to a face image.

    The image is preprocessed with the module-level ``transform``, embedded
    with the module-level ``model`` (gradients disabled), and compared against
    every entry of the module-level ``prototypes`` using the L2 norm of the
    difference.

    Args:
        face_img_pil: PIL image of a cropped face. It is converted to RGB so
            that non-3-channel inputs do not break the 3-channel normalisation.
        threshold: Optional maximum accepted distance. When given and the best
            match is farther away than this, ``"Unknown"`` is returned instead
            of forcing the face onto the nearest known identity. ``None``
            (default) keeps the original always-pick-nearest behaviour.

    Returns:
        The prototype name with the smallest distance, or ``"Unknown"`` when
        ``threshold`` rejects the best match.

    Raises:
        ValueError: If ``prototypes`` is empty.
    """
    if not prototypes:
        raise ValueError("No face prototypes are loaded; cannot identify a face.")

    face_tensor = transform(face_img_pil.convert("RGB")).unsqueeze(0)
    with torch.no_grad():
        emb = model(face_tensor).squeeze(0)

    dists = {name: torch.norm(emb - proto).item() for name, proto in prototypes.items()}
    best_name = min(dists, key=dists.get)

    if threshold is not None and dists[best_name] > threshold:
        return "Unknown"
    return best_name

# ========== MediaPipe Hand Detection Setup ==========
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.7)

# ========== Start Webcam ==========
cap = cv2.VideoCapture(0)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.flip(frame, 1)  # Mirror view
        frame_h, frame_w = frame.shape[:2]
        # One RGB copy of the clean frame feeds both MTCNN and MediaPipe, so
        # neither detector ever sees the annotations drawn further down.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        face_names = []
        face_boxes = []  # (name, x1, y1, x2, y2), drawn after all detection is done
        hand_labels = []

        # Face detection
        boxes, _ = mtcnn.detect(rgb_frame)
        if boxes is not None:
            for box in boxes:
                x1, y1, x2, y2 = map(int, box)
                # Boxes can extend past the image; a negative start index would
                # wrap around in NumPy slicing and crop the wrong region.
                x1, y1 = max(x1, 0), max(y1, 0)
                x2, y2 = min(x2, frame_w), min(y2, frame_h)
                face_crop = rgb_frame[y1:y2, x1:x2]
                if face_crop.size == 0:
                    continue
                name = identify_face(Image.fromarray(face_crop))
                face_names.append(name)
                face_boxes.append((name, x1, y1, x2, y2))

        # Object detection (on the unannotated BGR frame)
        results = yolo(frame, verbose=False)
        if results and results[0].boxes.cls.numel() > 0:
            class_ids = results[0].boxes.cls.cpu().numpy().astype(int)
            # sorted() keeps the on-screen label order stable between frames.
            object_names = sorted({yolo.names[cid] for cid in class_ids})
        else:
            object_names = []

        # Hand detection
        hand_results = hands.process(rgb_frame)

        # ----- All detection is finished; drawing starts here -----
        for name, x1, y1, x2, y2 in face_boxes:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0,255,0), 2)
            cv2.putText(frame, name, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255,255,255), 2)

        if hand_results.multi_hand_landmarks:
            for hand_landmarks, handedness in zip(hand_results.multi_hand_landmarks, hand_results.multi_handedness):
                label = handedness.classification[0].label  # "Left" or "Right"
                hand_labels.append(f"{label} Hand")
                mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # Smart Label Logic
        # A recognised face already implies a person, so "person" is dropped
        # from the object list that follows the names. Filtering first avoids
        # the dangling "<name> with " label when "person" was the only object.
        other_objects = [obj for obj in object_names if obj != "person"]
        if face_names:
            label_text = " & ".join(face_names)
            if other_objects:
                label_text += f" with {', '.join(other_objects)}"
        elif object_names:
            label_text = ", ".join(object_names)
        else:
            label_text = "No faces or objects detected"

        if hand_labels:
            label_text += f" + {', '.join(hand_labels)}"

        # Show label
        cv2.putText(frame, label_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 0), 2)

        # Display using matplotlib
        rgb_show = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        clear_output(wait=True)
        plt.imshow(rgb_show)
        plt.title("Siva with Objects and Hands")
        plt.axis("off")
        plt.show()

except KeyboardInterrupt:
    print("Stopped by user.")

finally:
    cap.release()
    cv2.destroyAllWindows()
    hands.close()


In [None]:
import cv2
import torch
import numpy as np
from PIL import Image
import timm
import torchvision.transforms as transforms
from facenet_pytorch import MTCNN
from ultralytics import YOLO
import mediapipe as mp

# --- Detectors ---
yolo = YOLO("yolov8n.pt")  # You can replace with your custom trained model
mtcnn = MTCNN(keep_all=True, device="cpu")

# --- Face recognition encoder (Swin Transformer) ---
# The architecture is rebuilt without its classification head and then filled
# with the weights stored in the checkpoint; the per-identity prototypes are
# read from the same file.
model_path = "C:/Users/HP/Documents/face/swin_fewshot_model.pt"
checkpoint = torch.load(model_path, map_location="cpu")

model = timm.create_model("swin_tiny_patch4_window7_224", pretrained=False)
model.head = torch.nn.Identity()
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()

prototypes = checkpoint["prototypes"]

# --- Preprocessing applied to each face crop before embedding ---
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.5] * 3, [0.5] * 3),
    ]
)

def identify_face(face_img_pil, threshold=None):
    """Return the name of the stored prototype closest to a face image.

    The image is preprocessed with the module-level ``transform``, embedded
    with the module-level ``model`` (gradients disabled), and compared against
    every entry of the module-level ``prototypes`` using the L2 norm of the
    difference.

    Args:
        face_img_pil: PIL image of a cropped face. It is converted to RGB so
            that non-3-channel inputs do not break the 3-channel normalisation.
        threshold: Optional maximum accepted distance. When given and the best
            match is farther away than this, ``"Unknown"`` is returned instead
            of forcing the face onto the nearest known identity. ``None``
            (default) keeps the original always-pick-nearest behaviour.

    Returns:
        The prototype name with the smallest distance, or ``"Unknown"`` when
        ``threshold`` rejects the best match.

    Raises:
        ValueError: If ``prototypes`` is empty.
    """
    if not prototypes:
        raise ValueError("No face prototypes are loaded; cannot identify a face.")

    face_tensor = transform(face_img_pil.convert("RGB")).unsqueeze(0)
    with torch.no_grad():
        emb = model(face_tensor).squeeze(0)

    dists = {name: torch.norm(emb - proto).item() for name, proto in prototypes.items()}
    best_name = min(dists, key=dists.get)

    if threshold is not None and dists[best_name] > threshold:
        return "Unknown"
    return best_name

# MediaPipe Hands
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.7)

# Start webcam
cap = cv2.VideoCapture(0)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.flip(frame, 1)
        frame_h, frame_w = frame.shape[:2]
        # RGB copy of the clean frame; used by MTCNN, the face encoder and MediaPipe.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        face_names, object_names, hand_labels = [], [], []
        face_boxes = []  # (name, x1, y1, x2, y2), drawn after all detection is done

        # --- Face Detection & Recognition ---
        # MTCNN is given the RGB image rather than OpenCV's BGR frame.
        boxes, _ = mtcnn.detect(rgb)
        if boxes is not None:
            for box in boxes:
                x1, y1, x2, y2 = map(int, box)
                # Boxes can extend past the image; a negative start index would
                # wrap around in NumPy slicing and crop the wrong region.
                x1, y1 = max(x1, 0), max(y1, 0)
                x2, y2 = min(x2, frame_w), min(y2, frame_h)
                face_crop = rgb[y1:y2, x1:x2]
                if face_crop.size == 0:
                    continue
                name = identify_face(Image.fromarray(face_crop))
                face_names.append(name)
                face_boxes.append((name, x1, y1, x2, y2))

        # --- Object Detection (on the unannotated BGR frame) ---
        results = yolo(frame, verbose=False)
        if results and results[0].boxes.cls.numel() > 0:
            class_ids = results[0].boxes.cls.cpu().numpy().astype(int)
            # sorted() keeps the on-screen label order stable between frames.
            object_names = sorted({yolo.names[cid] for cid in class_ids})

        # --- Hand Detection ---
        hand_result = hands.process(rgb)

        # --- Drawing (only after every detector has seen the clean frame) ---
        for name, x1, y1, x2, y2 in face_boxes:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0,255,0), 2)
            cv2.putText(frame, name, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255,255,255), 2)

        if hand_result.multi_hand_landmarks:
            for hand_landmarks, hand_info in zip(hand_result.multi_hand_landmarks, hand_result.multi_handedness):
                label = hand_info.classification[0].label  # "Left" or "Right"
                hand_labels.append(f"{label} Hand")
                mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # --- Smart Label Logic ---
        # Remove 'person' from object list for clarity
        clean_objects = [obj for obj in object_names if obj.lower() != "person"]

        # When "Siva" is among the recognised faces the label shows only "Siva";
        # otherwise all recognised names are joined (empty string if none).
        if "Siva" in face_names:
            label_text = "Siva"
        else:
            label_text = " & ".join(face_names)

        if clean_objects:
            if label_text:
                label_text += f" with {', '.join(clean_objects)}"
            else:
                label_text = ", ".join(clean_objects)

        # Add hand info if any
        if hand_labels:
            if label_text:
                label_text += f" showing {', '.join(hand_labels)}"
            else:
                label_text = ", ".join(hand_labels)

        # Fallback if no detections
        if not label_text:
            label_text = "No faces, hands, or objects detected"

        # --- Display ---
        cv2.putText(frame, label_text, (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 0), 2)
        cv2.imshow("Siva Intelligent Vision", frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

except KeyboardInterrupt:
    # Interrupting the kernel is the usual way to stop a notebook loop.
    print("Stopped by user.")

finally:
    # Runs on 'q', on interrupt and on unexpected errors, so the webcam and the
    # preview window are never left open.
    cap.release()
    cv2.destroyAllWindows()
    hands.close()
