In [1]:
# Importing necessary libraries
import cv2
import numpy as np
from ultralytics import YOLO
import random
import time

In [2]:
# Camera index handed to cv2.VideoCapture: 1 = Mac camera, 0 = iPhone camera
cam_idx = 0

# Using YOLO 11n for person detection
model = YOLO("yolo11n.pt")

# NOTE: the tracking state (known_appearances, person_names, used_names,
# unique_person_count, ...) is initialized in one place only: the state cell
# that runs right before the capture loop. It used to be duplicated here.

# Similarity threshold: a detection whose similarity to a known appearance is
# ABOVE this value is treated as that already-known person.
# Lower  = detections merge into known people more easily (fewer duplicates,
#          but distinct people are more likely to be confused with each other).
# Higher = stricter matching (fewer missed newcomers, but the same person is
#          more likely to be counted twice).
SIMILARITY_THRESHOLD = 0.6

In [3]:
# Creating a function to grab the color histogram of clothes for people on camera
def extract_appearance_features(person_img):
    """Build a color-histogram feature vector for a cropped person image.

    The crop is converted from BGR to HSV, one histogram is computed per HSV
    channel, each histogram is normalized in place with cv2.normalize's
    default settings, and the three are concatenated.

    Args:
        person_img: BGR image crop of a detected person.

    Returns:
        1-D array of 170 values: 50 hue bins, then 60 saturation bins,
        then 60 value bins.
    """
    hsv_img = cv2.cvtColor(person_img, cv2.COLOR_BGR2HSV)

    # (channel index, number of bins, upper end of the value range) for H, S, V.
    # Hue is binned over [0, 180]; saturation and value over [0, 256].
    channel_specs = ((0, 50, 180), (1, 60, 256), (2, 60, 256))

    flattened_hists = []
    for channel, bin_count, range_end in channel_specs:
        # No mask (None): every pixel of the crop contributes.
        hist = cv2.calcHist([hsv_img], [channel], None, [bin_count], [0, range_end])
        cv2.normalize(hist, hist)
        flattened_hists.append(hist.flatten())

    # One feature vector: H bins, then S bins, then V bins.
    return np.concatenate(flattened_hists)

In [4]:
# Creating function to compare histograms so that we don't count duplicates
def compare_appearances(features1, features2):
    """Return the cosine similarity between two appearance feature vectors.

    Args:
        features1: First feature vector (array-like, flattened before use).
        features2: Second feature vector, same number of elements.

    Returns:
        float: dot(f1, f2) / (|f1| * |f2|). If either vector has zero norm
        the similarity is undefined; 0.0 ("no match") is returned instead of
        NaN so threshold comparisons in callers stay well-behaved.
    """
    vec1 = np.asarray(features1).ravel()
    vec2 = np.asarray(features2).ravel()

    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)

    # Guard against division by zero (all-zero feature vector).
    if denominator == 0:
        return 0.0

    return float(np.dot(vec1, vec2) / denominator)

In [5]:
# In this block let's create what we need to assign funny identifiers to anyone captured
# Note: at 30 adjectives and 30 animal names, our max count is 900 individual identifiers.
# Add more adjectives and names if you need to go higher
# Or even better, add a third list type (color, etc) to multiply the number of options.

# List of adjectives 
adjectives = [
    "Fuzzy", "Pink", "Hyperactive", "Sleepy", "Grumpy", "Happy", "Bouncy", 
    "Sneaky", "Wise", "Mighty", "Tiny", "Giant", "Swift", "Lazy", "Brave",
    "Silly", "Clever", "Dancing", "Flying", "Swimming", "Quiet", "Loud",
    "Sparkly", "Shiny", "Fluffy", "Smooth", "Spiky", "Gentle", "Wild", "Calm"
]

# List of animal names
animals = [
    "Panda", "Monkey", "Turtle", "Eagle", "Dolphin", "Lion", "Tiger", "Bear",
    "Fox", "Wolf", "Elephant", "Giraffe", "Penguin", "Koala", "Kangaroo",
    "Octopus", "Owl", "Rabbit", "Squirrel", "Hedgehog", "Raccoon", "Otter",
    "Seal", "Kraken", "Shark", "Parrot", "Flamingo", "Zebra", "Axolotl", "Rhino"
]

# And here's the function to generate adjective / name identifiers
def generate_unique_name(used_names):
    """Return an "<Adjective> <Animal>" name not yet in used_names.

    The chosen name is added to used_names before it is returned.

    Args:
        used_names: Mutable set of names already handed out; updated in place.

    Returns:
        str: A name that was not in used_names when the function was called.
        Once every adjective/animal combination has been used, a numeric
        suffix is appended (e.g. "Fuzzy Panda 2") instead of retrying forever.
    """
    # Pick only among combinations that are still free, so the call always
    # terminates (random retries would spin forever once the pool is empty).
    available = [
        f"{adjective} {animal}"
        for adjective in adjectives
        for animal in animals
        if f"{adjective} {animal}" not in used_names
    ]

    if available:
        name = random.choice(available)
    else:
        # Pool exhausted: reuse a random base name with the first free suffix.
        base = f"{random.choice(adjectives)} {random.choice(animals)}"
        suffix = 2
        while f"{base} {suffix}" in used_names:
            suffix += 1
        name = f"{base} {suffix}"

    used_names.add(name)
    return name

In [6]:
# Fresh tracking state for a capture session.
# The three lists are index-aligned: entry i of each describes the same person.
unique_person_count: int = 0    # Running total of distinct people seen
used_names: set = set()         # Names already handed out (avoids duplicates)
last_frame_ids: set = set()     # Indices of the people present in the previous frame
known_appearances: list = []    # Feature vector per known person
person_names: list = []         # Display name per known person
person_has_left: list = []      # Whether that person has ever left the frame

In [7]:
# Camera, Open Sesame!
# cv2.CAP_AVFOUNDATION requests the AVFoundation (macOS) capture backend.
cap = cv2.VideoCapture(cam_idx, cv2.CAP_AVFOUNDATION)

# Yay error handling, just in case
if not cap.isOpened():
    raise RuntimeError(f"Could not open camera {cam_idx}")

WINDOW_TITLE = "People counter (press q to quit)"

# try/finally guarantees the camera and the preview window are released even
# if the loop raises or the kernel is interrupted part-way through.
try:
    # Creating a capture loop to grab people
    while True:
        ok, frame = cap.read()

        # Yay error handling
        if not ok:
            print("Failed to read the frame, try again!")
            break

        # Detecting people with YOLO (class 0 only, confidence >= 0.5)
        results = model(
            frame,
            imgsz=640,
            classes=[0],
            verbose=False,
            conf=0.5
        )[0]

        current_frame_count = 0    # People counted in this frame
        current_frame_ids = set()  # Track who's in this frame

        # Iterating directly: an empty detection list simply skips the loop
        for box in results.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            person_img = frame[y1:y2, x1:x2]

            # Let's not count any people who are lilliputians (or too far away)
            if person_img.shape[0] < 50 or person_img.shape[1] < 30:
                continue

            # Now we grab our captured person and extract their appearance features
            features = extract_appearance_features(person_img)

            # Comparing to every known appearance and keeping the BEST match
            # above the threshold (not merely the first one that clears it),
            # so a detection is assigned to the most similar known person.
            person_idx = None
            best_similarity = SIMILARITY_THRESHOLD
            for idx, known_features in enumerate(known_appearances):
                similarity = compare_appearances(features, known_features)
                if similarity > best_similarity:
                    best_similarity = similarity
                    person_idx = idx

            if person_idx is None:
                # No match: register a new person and create a new name
                known_appearances.append(features)
                person_name = generate_unique_name(used_names)
                person_names.append(person_name)
                person_has_left.append(False)  # First time seeing this person
                person_idx = len(person_names) - 1
                unique_person_count += 1
                print(f"New person detected: {person_name} | Total unique: {unique_person_count}")
            else:
                person_name = person_names[person_idx]  # Name of the matched person

            current_frame_ids.add(person_idx)

            # Incrementing the frame count
            current_frame_count += 1

            # Drawing the bounding box (yellow if never left, green if has left before)
            color = (0, 255, 255) if not person_has_left[person_idx] else (0, 255, 0)
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

            # Displaying the person's name above the bounding box; clamped so the
            # label stays on screen when the box touches the top of the frame
            label_y = max(y1 - 10, 20)
            cv2.putText(frame, person_name,
                        (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)

        # Checking who left the frame (was in last frame but not in current frame)
        for idx in last_frame_ids - current_frame_ids:
            person_has_left[idx] = True  # Marking that they've left at least once

        # Updating last frame tracking
        last_frame_ids = current_frame_ids.copy()

        # Showing how many people are in frame, and how many unique visitors we've seen total
        cv2.putText(frame, f"Current: {current_frame_count}  Total unique: {unique_person_count}",
                    (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        # Showing which visitors are new, and which are known
        cv2.putText(frame, "Yellow=First visit  Green=Return visit",
                    (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)

        # Showing the frame and polling the keyboard exactly once per frame.
        # The 0xFF mask keeps only the low byte of the key code before comparing.
        cv2.imshow(WINDOW_TITLE, frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            print("Quit signal received")
            break
finally:
    # Closing the camera and the OpenCV window showing people tracked
    cap.release()
    cv2.destroyAllWindows()
    time.sleep(0.2)  # allow OS event loop flush
    cv2.destroyAllWindows()  # second call to enforce close
    print("Camera released, OpenCV windows closed.")

# Printing out total number of individual people detected
print(f"Total unique people detected: {unique_person_count}")

New person detected: Bouncy Seal | Total unique: 1
New person detected: Sneaky Owl | Total unique: 2
Camera released, OpenCV windows closed.
Total unique people detected: 2
