In [1]:
import cv2
import numpy as np
import pickle

# Unpickle the object stored in 'knn_model.pkl' into the module-level name
# `knn_model` (presumably a fitted KNN classifier, judging by the name —
# TODO confirm against the training notebook).
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only open pickle files that come from a trusted source.
with open('knn_model.pkl', 'rb') as f:
    knn_model = pickle.load(f)

def detect_keypoints_and_descriptors(image):
    """Run ORB on ``image`` and return ``(keypoints, descriptors, annotated)``.

    When the detector yields ``None`` for either the keypoints or the
    descriptors, ``(None, None, image)`` is returned and the input image is
    passed back untouched. Otherwise the third element is the output of
    ``cv2.drawKeypoints`` called with the rich-keypoints drawing flag.
    """
    detector = cv2.ORB_create()
    found_points, found_descriptors = detector.detectAndCompute(image, None)

    has_features = found_points is not None and found_descriptors is not None
    if has_features:
        annotated = cv2.drawKeypoints(
            image,
            found_points,
            None,
            flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS,
        )
        return found_points, found_descriptors, annotated

    # Nothing usable was detected: hand the original image back unchanged.
    return None, None, image

def classify_image_with_knn(descriptors, knn_model):
    """Classify one image from its descriptors using ``knn_model``.

    All descriptors are laid out as a single row and passed to
    ``knn_model.predict``.

    Args:
        descriptors: Array-like exposing ``reshape`` (e.g. the descriptor
            matrix from ``detectAndCompute``). May be ``None`` or empty when
            no features were detected.
        knn_model: Object exposing ``predict``.

    Returns:
        Whatever ``knn_model.predict`` returns for the single-row input, or
        ``None`` when there are no descriptors to classify.
    """
    # The detector in this cell reports "nothing found" as None; calling
    # .flatten() on that used to raise AttributeError.
    if descriptors is None or len(descriptors) == 0:
        return None

    # NOTE(review): the row length is (number of descriptors x descriptor
    # width), so it varies with how many keypoints were detected. Confirm that
    # the model was fitted on rows built the same way.
    single_row = descriptors.reshape(1, -1)
    return knn_model.predict(single_row)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [17]:
import pickle
import cv2
import numpy as np
from gtts import gTTS
from IPython.display import Audio
from collections import Counter

# Load the classifier from a pickle file
def load_classifier(pickle_file_path):
    """Unpickle and return the object stored at ``pickle_file_path``.

    A success or failure message is printed. Any exception raised while
    opening or unpickling the file is caught and ``None`` is returned instead.

    SECURITY: unpickling can execute arbitrary code, so only pass paths to
    files from a trusted source.
    """
    loaded_model = None
    try:
        with open(pickle_file_path, 'rb') as model_file:
            loaded_model = pickle.load(model_file)
        print(f"Classifier loaded successfully from {pickle_file_path}")
    except Exception as load_error:
        print(f"Error loading classifier: {load_error}")
        loaded_model = None
    return loaded_model

# Function to preprocess the image (you can define this based on your model needs)
def preprocess_image(image):
    """Convert an OpenCV frame to a single-channel grayscale image.

    OpenCV delivers colour frames (``cv2.VideoCapture``, ``cv2.imread``) in
    BGR channel order, so the conversion uses ``COLOR_BGR2GRAY``. The previous
    ``COLOR_RGB2GRAY`` swapped the red and blue weights.

    NOTE(review): if the classifier was trained on features extracted with a
    different conversion, keep the training and inference pipelines in sync.

    Args:
        image: Image array exposing ``ndim``; either a colour frame or an
            already-grayscale 2-D image.

    Returns:
        The grayscale image. A 2-D input is returned unchanged.
    """
    # A 2-D array is already single-channel; cvtColor would reject it.
    if image.ndim == 2:
        return image
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Function to detect keypoints and descriptors (using ORB in this case)
def detect_keypoints_and_descriptors(image):
    """Detect ORB features in ``image``.

    Returns the ``(keypoints, descriptors)`` pair produced by a freshly
    created ORB detector's ``detectAndCompute`` call (no mask).
    """
    detector = cv2.ORB_create()
    found_keypoints, found_descriptors = detector.detectAndCompute(image, None)
    return found_keypoints, found_descriptors

# Function to recognize the top 3 labels in an image and accumulate them
def recognize_top_labels_in_frame(frame, classifier, VOC_CLASSES, feature_size=None, top_k=3):
    """Return the ``top_k`` most probable labels for a single frame.

    The frame is converted with ``preprocess_image``, descriptors are
    extracted with ``detect_keypoints_and_descriptors``, and their column-wise
    mean is used as a one-row feature vector for ``classifier.predict_proba``.

    Args:
        frame: Image passed to ``preprocess_image``.
        classifier: Object exposing ``predict_proba``.
        VOC_CLASSES: Sequence of label names, indexed by probability column.
        feature_size: Number of feature columns the classifier expects. When
            ``None`` (default) the classifier's ``n_features_in_`` attribute is
            used if present, otherwise 128 (the previously hard-coded value).
        top_k: How many labels to return (default 3).

    Returns:
        A list of ``(label, probability)`` tuples ordered by decreasing
        probability, or ``None`` when no descriptors were found or the
        prediction raised.
    """
    if feature_size is None:
        # Prefer the width the classifier reports; this avoids zero-padding
        # descriptors to 128 columns for a model fitted on a different width.
        feature_size = getattr(classifier, "n_features_in_", 128)

    preprocessed_image = preprocess_image(frame)
    _, descriptors = detect_keypoints_and_descriptors(preprocessed_image)

    if descriptors is None or len(descriptors) == 0:
        print("No descriptors detected")
        return None

    # Bring the descriptor width in line with what the classifier expects:
    # zero-pad on the right when too narrow, truncate when too wide.
    descriptor_width = descriptors.shape[1]
    if descriptor_width < feature_size:
        descriptors = np.pad(
            descriptors,
            ((0, 0), (0, feature_size - descriptor_width)),
            mode='constant',
        )
    elif descriptor_width > feature_size:
        descriptors = descriptors[:, :feature_size]

    # One row per frame: the mean of all descriptors.
    feature_vector = np.mean(descriptors, axis=0).reshape(1, -1)

    try:
        probabilities = classifier.predict_proba(feature_vector)[0]
        ranked_indices = np.argsort(probabilities)[::-1]  # highest probability first

        # NOTE(review): this assumes probability column i corresponds to
        # VOC_CLASSES[i]; verify against the classifier's class ordering.
        return [(VOC_CLASSES[i], probabilities[i]) for i in ranked_indices[:top_k]]
    except Exception as e:
        print(f"Error during prediction: {e}")
        return None

# Real-time webcam processing to accumulate 100 frames and return the top predictions
def start_webcam_recognition(pickle_file_path, VOC_CLASSES, camera_index=1, frames_per_batch=100):
    """Run live recognition on a camera feed and summarise labels per batch.

    Each frame is passed to ``recognize_top_labels_in_frame``. After every
    ``frames_per_batch`` frames the three most frequent labels are printed and
    kept as an on-screen overlay until the next summary. The loop ends when a
    frame cannot be read or the user presses 'q' in the preview window.

    Args:
        pickle_file_path: Path handed to ``load_classifier``.
        VOC_CLASSES: Label names forwarded to the recognition function.
        camera_index: Index passed to ``cv2.VideoCapture`` (default 1, the
            previously hard-coded value; 0 is the default webcam).
        frames_per_batch: Number of frames accumulated before each summary
            (default 100).

    Returns:
        None.
    """
    classifier = load_classifier(pickle_file_path)
    if classifier is None:
        # load_classifier has already printed the reason. Without a model
        # every frame would only produce a prediction error.
        return

    cap = cv2.VideoCapture(camera_index)
    if not cap.isOpened():
        print(f"Could not open camera {camera_index}")
        cap.release()
        return

    frame_count = 0
    predictions = []
    label_text = ""  # most recent batch summary, redrawn on every frame

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Failed to grab frame")
                break

            top_predictions = recognize_top_labels_in_frame(frame, classifier, VOC_CLASSES)
            if top_predictions:
                predictions.extend(label for label, _ in top_predictions)

            frame_count += 1

            if frame_count >= frames_per_batch:
                if predictions:
                    top_labels = Counter(predictions).most_common(3)
                    label_text = ", ".join(f"{label}: {count}" for label, count in top_labels)
                    print(f"Top 3 labels after {frames_per_batch} frames: {top_labels}")

                # Start a fresh batch.
                frame_count = 0
                predictions = []

            # Draw the latest summary on every frame, not only on the single
            # frame that closes a batch, so it stays readable.
            if label_text:
                cv2.putText(frame, label_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

            cv2.imshow("Real-Time Image Recognition", frame)

            # Exit the loop if the user presses the 'q' key.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Runs on errors and on KeyboardInterrupt (kernel interrupt) as well,
        # so the camera is always freed and the preview window closed.
        cap.release()
        cv2.destroyAllWindows()

# Example usage: 
# Ensure you have a valid path to your classifier pickle file and VOC_CLASSES
# Label names handed to the recognition functions, in the order used to index
# the classifier's probability columns.
VOC_CLASSES = ['chair', 'diningtable', 'person', 'car', 'motorbike', 'bottle']

# Location of the pickled classifier; update with your pickle file path.
pickle_file_path = 'knn_model.pkl'

# Launch the webcam loop (blocks until 'q' is pressed or a frame cannot be read).
start_webcam_recognition(pickle_file_path, VOC_CLASSES)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Classifier loaded successfully from knn_model.pkl
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('bottle', 2)]
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('bottle', 2)]
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('bottle', 2)]
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('bottle', 2)]
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('bottle', 2)]
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('bottle', 2)]
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('bottle', 2)]
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('bottle', 2)]
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('bottle', 2)]
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('bottle', 2)]
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('bottle', 2)]
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('bottle', 2)]
Top 3 labels after 100 frames: [('car', 2), ('person', 2), ('b

In [26]:
import pickle
import cv2
import numpy as np
from gtts import gTTS
from IPython.display import Audio
from collections import defaultdict

# Load the classifier from a pickle file
def load_classifier(pickle_file_path):
    """Read a pickled classifier from ``pickle_file_path``.

    Prints a confirmation and returns the unpickled object. If anything goes
    wrong while opening or unpickling, the error is printed and ``None`` is
    returned.

    SECURITY: ``pickle.load`` can run arbitrary code contained in the file;
    use it only on files you trust.
    """
    try:
        with open(pickle_file_path, 'rb') as handle:
            restored = pickle.load(handle)
        print(f"Classifier loaded successfully from {pickle_file_path}")
    except Exception as problem:
        print(f"Error loading classifier: {problem}")
        return None
    return restored

# Function to preprocess the image (convert to grayscale)
def preprocess_image(image):
    """Return a grayscale version of an OpenCV image.

    Colour frames from OpenCV (``cv2.VideoCapture``, ``cv2.imread``) use BGR
    channel order, so ``COLOR_BGR2GRAY`` is the matching conversion. The
    earlier ``COLOR_RGB2GRAY`` applied the red weight to the blue channel and
    vice versa.

    NOTE(review): if the classifier was trained on features extracted with a
    different conversion, keep the training and inference pipelines in sync.

    Args:
        image: Image array exposing ``ndim``; a colour frame or an image that
            is already 2-D grayscale.

    Returns:
        The grayscale image. A 2-D input is passed through unchanged.
    """
    if image.ndim == 2:
        # Already single-channel; cvtColor would reject this input.
        return image
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray_image

# Function to detect keypoints and descriptors (using ORB in this case)
def detect_keypoints_and_descriptors(image):
    """Extract ORB keypoints and descriptors from ``image``.

    A new ORB detector is created on each call; the result of its
    ``detectAndCompute(image, None)`` is returned as
    ``(keypoints, descriptors)``.
    """
    orb_output = cv2.ORB_create().detectAndCompute(image, None)
    points, point_descriptors = orb_output
    return points, point_descriptors

# Function to recognize top labels in a frame and accumulate probabilities
def recognize_labels_with_probabilities(frame, classifier, VOC_CLASSES, feature_size=32):
    """Return a ``{label: probability}`` mapping for a single frame.

    The frame is converted with ``preprocess_image``, descriptors are
    extracted with ``detect_keypoints_and_descriptors``, and their column-wise
    mean is used as a one-row feature vector for ``classifier.predict_proba``.

    Args:
        frame: Image passed to ``preprocess_image``.
        classifier: Object exposing ``predict_proba``.
        VOC_CLASSES: Sequence of label names, indexed by probability column.
        feature_size: Number of feature columns the classifier expects
            (default 32, the width the original check and truncation used).

    Returns:
        A dict mapping every entry of ``VOC_CLASSES`` to its probability, or
        an empty dict when no descriptors were found or the prediction raised.
    """
    preprocessed_image = preprocess_image(frame)
    _, descriptors = detect_keypoints_and_descriptors(preprocessed_image)

    if descriptors is None or len(descriptors) == 0:
        print("No descriptors detected")
        return {}

    # Pad and truncate to the SAME width. The old padding branch padded to 128
    # columns while the check and the truncation used 32.
    descriptor_width = descriptors.shape[1]
    if descriptor_width < feature_size:
        descriptors = np.pad(
            descriptors,
            ((0, 0), (0, feature_size - descriptor_width)),
            mode='constant',
        )
    elif descriptor_width > feature_size:
        descriptors = descriptors[:, :feature_size]

    # One row per frame: the mean of all descriptors.
    feature_vector = np.mean(descriptors, axis=0).reshape(1, -1)

    try:
        probabilities = classifier.predict_proba(feature_vector)[0]
        # NOTE(review): this assumes probability column i corresponds to
        # VOC_CLASSES[i]; verify against the classifier's class ordering.
        return {VOC_CLASSES[i]: probabilities[i] for i in range(len(VOC_CLASSES))}
    except Exception as e:
        print(f"Error during prediction: {e}")
        return {}

# Real-time webcam processing to accumulate and summarize predictions every 100 frames
def _announce_batch_summary(accumulated_probabilities, VOC_CLASSES, frames_per_batch):
    """Print per-label average probabilities and speak the detected labels."""
    print(f"\n--- Summary after {frames_per_batch} frames ---")
    top_predictions = []
    for label in VOC_CLASSES:
        samples = accumulated_probabilities.get(label)
        if samples:
            avg_prob = np.mean(samples)
            top_predictions.append((label, avg_prob))
            print(f"{label}: {avg_prob:.4f}")
        else:
            print(f"{label}: No detections")

    if not top_predictions:
        # Nothing to say; presumably gTTS rejects empty text, so skip the
        # speech step instead of aborting the capture loop.
        return

    # Highest average probability first.
    top_predictions.sort(key=lambda item: item[1], reverse=True)

    # Separate the labels so they are spoken as distinct words; previously
    # they were concatenated with no separator ("chaircarperson").
    tts_output = ", ".join(label for label, _ in top_predictions)

    tts = gTTS(tts_output, lang='en')
    audio_file_path = "output.mp3"
    tts.save(audio_file_path)

    # Play the generated audio file in the notebook.
    display(Audio(audio_file_path, autoplay=True))


def start_webcam_recognition(pickle_file_path, VOC_CLASSES, camera_index=1, frames_per_batch=100):
    """Run live recognition on a camera feed with a spoken summary per batch.

    Each frame is passed to ``recognize_labels_with_probabilities``; non-zero
    probabilities are collected per label. After every ``frames_per_batch``
    frames the per-label averages are printed and the detected labels are
    converted to speech with gTTS and played back. The loop ends when a frame
    cannot be read or the user presses 'q' in the preview window.

    Args:
        pickle_file_path: Path handed to ``load_classifier``.
        VOC_CLASSES: Label names forwarded to the recognition function.
        camera_index: Index passed to ``cv2.VideoCapture`` (default 1, the
            previously hard-coded value; 0 is the default webcam).
        frames_per_batch: Number of frames accumulated before each summary
            (default 100).

    Returns:
        None.
    """
    classifier = load_classifier(pickle_file_path)
    if classifier is None:
        # load_classifier has already printed the reason. Without a model
        # every frame would only produce a prediction error.
        return

    cap = cv2.VideoCapture(camera_index)
    if not cap.isOpened():
        print(f"Could not open camera {camera_index}")
        cap.release()
        return

    frame_count = 0
    accumulated_probabilities = defaultdict(list)

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Failed to grab frame")
                break

            label_probabilities = recognize_labels_with_probabilities(frame, classifier, VOC_CLASSES)

            # Only non-zero probabilities are accumulated, so each average is
            # taken over the frames in which the label scored at all.
            for label, prob in label_probabilities.items():
                if prob > 0:
                    accumulated_probabilities[label].append(prob)

            frame_count += 1

            cv2.putText(frame, f"Frame: {frame_count}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.imshow("Real-Time Object Recognition", frame)

            if frame_count >= frames_per_batch:
                _announce_batch_summary(accumulated_probabilities, VOC_CLASSES, frames_per_batch)

                # Start a fresh batch.
                frame_count = 0
                accumulated_probabilities = defaultdict(list)

            # Exit if the 'q' key is pressed.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Runs on errors and on KeyboardInterrupt (kernel interrupt) as well,
        # so the camera is always freed and the preview window closed.
        cap.release()
        cv2.destroyAllWindows()

# Example usage
# Pickled classifier to load; update with your pickle file path.
pickle_file_path = 'knn_model_orb.pkl'

# Label names handed to the recognition functions, in the order used to index
# the classifier's probability columns.
VOC_CLASSES = [
    'chair',
    'diningtable',
    'person',
    'car',
    'motorbike',
    'bottle',
]

# Launch the webcam loop (blocks until 'q' is pressed or a frame cannot be read).
start_webcam_recognition(pickle_file_path, VOC_CLASSES)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Classifier loaded successfully from knn_model_orb.pkl

--- Summary after 100 frames ---
chair: 0.9825
diningtable: No detections
person: 0.3333
car: 0.9785
motorbike: 0.3333
bottle: No detections



--- Summary after 100 frames ---
chair: No detections
diningtable: No detections
person: 0.3333
car: 0.8600
motorbike: 0.3333
bottle: No detections



--- Summary after 100 frames ---
chair: No detections
diningtable: No detections
person: 0.3333
car: 0.9800
motorbike: 0.3333
bottle: No detections



--- Summary after 100 frames ---
chair: No detections
diningtable: No detections
person: 0.3333
car: 0.7933
motorbike: 0.3333
bottle: No detections



--- Summary after 100 frames ---
chair: No detections
diningtable: No detections
person: 0.3333
car: 0.9133
motorbike: 0.3333
bottle: No detections



--- Summary after 100 frames ---
chair: No detections
diningtable: No detections
person: 0.3333
car: 0.9000
motorbike: 0.3333
bottle: No detections


KeyboardInterrupt: 