<a href="https://colab.research.google.com/github/sukhadak11/Object_Detection/blob/main/Object_detection_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install tensorflow opencv-python lxml bs4 scikit-learn

# Import required packages
import tensorflow as tf
import numpy as np
import cv2
import os
from sklearn.preprocessing import LabelEncoder
from bs4 import BeautifulSoup
from tensorflow.keras.utils import Sequence
import pickle
import xml.etree.ElementTree as ET



In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module); the dataset
# directories configured in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Dataset locations on the mounted Drive, grouped per split as
# (images directory, annotations directory).
train_images_dir, train_annotations_dir = (
    '/content/drive/MyDrive/database/database/train/images',
    '/content/drive/MyDrive/database/database/train/annotations',
)
valid_images_dir, valid_annotations_dir = (
    '/content/drive/MyDrive/database/database/valid/images',
    '/content/drive/MyDrive/database/database/valid/annotations',
)
# The test split only has an images directory configured here.
test_images_dir = '/content/drive/MyDrive/database/database/Test/Images'


It's option 2.


In [None]:
# Step 1: Parse Pascal VOC Annotations
def parse_voc_annotation(annotation_file):
    """Read the labelled objects from a Pascal VOC XML annotation file.

    Args:
        annotation_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A list with one dict per ``<object>`` element, in document order:
        ``{'name': <label text, stripped>, 'bbox': [xmin, ymin, xmax, ymax]}``
        with integer coordinates. The list is empty when the file contains
        no ``<object>`` elements.

    Raises:
        AttributeError: If an ``<object>`` lacks ``<name>``, ``<bndbox>`` or
            one of the four coordinate tags.
        ValueError: If a coordinate's text is not numeric.
    """
    def _coord(bndbox, tag):
        # Some annotation tools write coordinates as floats (e.g. "48.0").
        # int("48.0") raises ValueError, so parse as float first and round
        # to the nearest whole pixel. Integer strings are unaffected.
        return int(round(float(bndbox.find(tag).text)))

    root = ET.parse(annotation_file).getroot()
    objects = []
    for obj in root.findall('object'):
        bndbox = obj.find('bndbox')
        objects.append({
            'name': obj.find('name').text.strip(),
            'bbox': [_coord(bndbox, tag) for tag in ('xmin', 'ymin', 'xmax', 'ymax')],
        })
    return objects

In [None]:
# Step 2: Check all unique labels in dataset
def check_all_labels(annotation_paths):
    """Collect the distinct object labels found across annotation files.

    Args:
        annotation_paths: Iterable of annotation file paths, each parsed with
            ``parse_voc_annotation``.

    Returns:
        A ``set`` of every ``'name'`` value seen; empty if no path yields an
        object.
    """
    return {
        entry['name']
        for xml_path in annotation_paths
        for entry in parse_voc_annotation(xml_path)
    }

In [None]:
# Step 3: Load Image and Annotation Paths
def load_image_annotation_paths(image_dir, annotation_dir):
    """Pair every image in ``image_dir`` with its expected annotation path.

    A file counts as an image when its extension is ``.jpg`` or ``.png``
    (compared case-insensitively). The annotation path is
    ``annotation_dir/<image name without its extension>.xml``; whether that
    file exists is not checked here.

    Args:
        image_dir: Directory to scan (non-recursive).
        annotation_dir: Directory in which the XML annotations are expected.

    Returns:
        ``(image_paths, annotation_paths)`` — two lists of equal length,
        aligned index-by-index and ordered by image file name.
    """
    image_paths, annotation_paths = [], []
    # os.listdir returns entries in arbitrary order; sort so the sample order
    # (and therefore batch composition) is the same on every run.
    for image_file in sorted(os.listdir(image_dir)):
        # splitext only strips the final extension, so a name such as
        # "shot.png.jpg" maps to "shot.png.xml" instead of having every
        # ".jpg"/".png" occurrence rewritten.
        stem, ext = os.path.splitext(image_file)
        if ext.lower() not in ('.jpg', '.png'):
            continue
        image_paths.append(os.path.join(image_dir, image_file))
        annotation_paths.append(os.path.join(annotation_dir, stem + '.xml'))
    return image_paths, annotation_paths


In [None]:
# Step 4: Custom Data Generator using Keras Sequence
class ObjectDetectionDataGenerator(Sequence):
    """Keras ``Sequence`` yielding image batches with one class and one box per image.

    Each sample uses only the first object listed in the image's annotation.
    Images whose annotation lists no objects are skipped, so a batch may hold
    fewer than ``batch_size`` samples.

    Args:
        image_paths: List of image file paths.
        annotation_paths: List of annotation paths aligned with ``image_paths``.
        label_encoder: Fitted encoder exposing ``classes_`` and ``transform``.
        batch_size: Maximum number of images considered per batch.
        img_size: Target size passed to ``cv2.resize``.
        **kwargs: Forwarded to the ``Sequence`` base constructor.
    """

    def __init__(self, image_paths, annotation_paths, label_encoder, batch_size=8, img_size=(224, 224), **kwargs):
        # Keras warns ("_warn_if_super_not_called") when a Sequence subclass
        # skips the base constructor, so always chain up.
        super().__init__(**kwargs)
        self.image_paths = image_paths
        self.annotation_paths = annotation_paths
        self.label_encoder = label_encoder
        self.batch_size = batch_size
        self.img_size = img_size
        self.num_classes = len(label_encoder.classes_)

    def __len__(self):
        """Number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.image_paths) / self.batch_size))

    def __getitem__(self, idx):
        """Build batch ``idx``.

        Returns:
            ``(images, targets)`` where ``images`` is a float32 array of RGB
            pixels scaled to [0, 1] and ``targets`` is a dict with
            ``'class_output'`` (encoded class ids) and ``'bbox_output'``
            (``[x_min, y_min, x_max, y_max]`` as fractions of the original
            image width/height).

        Raises:
            FileNotFoundError: If an image file cannot be read.
        """
        start = idx * self.batch_size
        stop = start + self.batch_size

        batch_images = []
        batch_labels = []
        batch_bboxes = []

        for img_path, annot_path in zip(self.image_paths[start:stop], self.annotation_paths[start:stop]):
            objects = parse_voc_annotation(annot_path)
            if not objects:
                # No annotated object: this image contributes no sample.
                continue

            image = cv2.imread(img_path)
            if image is None:
                # cv2.imread signals failure by returning None rather than raising.
                raise FileNotFoundError(f"Could not read image: {img_path}")

            # Remember the source size before resizing: annotation boxes are in
            # source-image pixels.
            orig_height, orig_width = image.shape[:2]

            image = cv2.resize(cv2.cvtColor(image, cv2.COLOR_BGR2RGB), self.img_size)
            image = image.astype(np.float32) / 255.0

            obj = objects[0]  # For simplicity, only the first object is used
            class_id = self.label_encoder.transform([obj['name']])[0]

            # Normalise by the ORIGINAL image size (not the resized img_size) so
            # the targets are fractions in [0, 1]: that is the range a sigmoid
            # bbox head can output, and it matches inference code that scales
            # predictions back by the frame width/height.
            x_min, y_min, x_max, y_max = obj['bbox']
            bbox = np.clip(
                [x_min / orig_width, y_min / orig_height,
                 x_max / orig_width, y_max / orig_height],
                0.0, 1.0,
            )

            batch_images.append(image)
            batch_labels.append(class_id)
            batch_bboxes.append(bbox)

        # Explicit reshapes keep the array ranks stable even when every image
        # in the slice was skipped. cv2.resize takes (width, height), so the
        # resulting arrays are (height, width, 3).
        images = np.array(batch_images, dtype=np.float32).reshape(
            -1, self.img_size[1], self.img_size[0], 3)
        labels = np.array(batch_labels, dtype=np.int64)
        bboxes = np.array(batch_bboxes, dtype=np.float32).reshape(-1, 4)
        return images, {'class_output': labels, 'bbox_output': bboxes}

In [None]:
# Step 5: Initialize Label Encoder
def initialize_label_encoder(train_annotation_paths, valid_annotation_paths):
    """Fit a ``LabelEncoder`` on every label found in the train and validation annotations.

    Args:
        train_annotation_paths: Annotation paths of the training split.
        valid_annotation_paths: Annotation paths of the validation split.

    Returns:
        ``(label_encoder, classes)`` where ``classes`` is the sorted list of
        distinct labels from both splits.
    """
    train_labels = check_all_labels(train_annotation_paths)
    valid_labels = check_all_labels(valid_annotation_paths)
    # Set iteration order is arbitrary and can differ between runs. Sorting
    # makes the returned list deterministic and — since LabelEncoder keeps its
    # classes_ sorted — lines up classes[i] with encoded id i.
    classes = sorted(train_labels | valid_labels)
    label_encoder = LabelEncoder()
    label_encoder.fit(classes)
    return label_encoder, classes

In [None]:
# Step 6: Define the Object Detection Model
def create_model(input_shape, num_classes):
    """Build a two-headed detector on top of a MobileNetV2 backbone.

    Args:
        input_shape: Shape passed to ``MobileNetV2`` as ``input_shape``.
        num_classes: Width of the softmax classification head.

    Returns:
        A ``tf.keras.Model`` whose outputs are, in order, ``class_output``
        (softmax over ``num_classes``) and ``bbox_output`` (4 sigmoid units).
    """
    # NOTE(review): callers in this notebook feed pixels scaled to [0, 1];
    # confirm that matches the input scaling the backbone weights expect.
    layers = tf.keras.layers
    backbone = tf.keras.applications.MobileNetV2(input_shape=input_shape, include_top=False)

    # Shared trunk: pooled backbone features followed by one dense layer.
    features = layers.GlobalAveragePooling2D()(backbone.output)
    features = layers.Dense(256, activation='relu')(features)

    heads = [
        layers.Dense(num_classes, activation='softmax', name='class_output')(features),
        layers.Dense(4, activation='sigmoid', name='bbox_output')(features),
    ]
    return tf.keras.Model(inputs=backbone.input, outputs=heads)

In [None]:
# Step 7: Custom Loss Function
!pip install tensorflow
import tensorflow as tf
def bbox_loss(y_true, y_pred):
    """Mean-squared-error loss for the bounding-box head.

    Delegates to ``tf.keras.losses.MSE`` and returns its result unchanged.
    """
    squared_error = tf.keras.losses.MSE(y_true, y_pred)
    return squared_error



In [None]:
# Step 8: Train the Model
def train_model(train_generator, valid_generator, input_shape, num_classes, epochs=10):
    """Create, compile and fit the detector; return the fitted model.

    Args:
        train_generator: Training data passed to ``fit``.
        valid_generator: Data passed to ``fit`` as ``validation_data``.
        input_shape: Forwarded to ``create_model``.
        num_classes: Forwarded to ``create_model``.
        epochs: Number of training epochs.

    Returns:
        The model returned by ``create_model`` after ``fit`` has run.
    """
    detector = create_model(input_shape, num_classes)

    # Per-output settings, keyed by the output layer names.
    losses = {
        'class_output': 'sparse_categorical_crossentropy',
        'bbox_output': bbox_loss,
    }
    weights = {'class_output': 1.0, 'bbox_output': 1.0}
    tracked_metrics = {'class_output': 'accuracy'}

    detector.compile(optimizer='adam', loss=losses, loss_weights=weights, metrics=tracked_metrics)
    detector.fit(train_generator, validation_data=valid_generator, epochs=epochs)
    return detector

In [None]:
# Step 9: Save the Trained Model
def save_model(model, filename='object_detection_model.h5'):
    """Write ``model`` to ``filename`` via ``model.save`` and print a confirmation.

    Args:
        model: Object exposing ``save(path)``.
        filename: Destination path handed to ``model.save``.
    """
    model.save(filename)
    confirmation = "Model saved as '{}'".format(filename)
    print(confirmation)


In [None]:
# Step 10: Perform Object Detection on Video
def perform_video_object_detection(model, label_encoder, video_path, output_path, img_size=(224, 224), confidence_threshold=0.5):
    """Run the detector on every frame of a video and write an annotated copy.

    For each frame the model predicts one class distribution and one box.
    When the top class probability exceeds ``confidence_threshold`` the box
    and a ``"<label>: <prob>"`` caption are drawn on the frame. Every frame
    (annotated or not) is written to the output video.

    Args:
        model: Model whose ``predict`` returns ``(class_pred, bbox_pred)``.
        label_encoder: Fitted encoder used to map the class id back to a label.
        video_path: Input video path.
        output_path: Output video path (XVID-encoded).
        img_size: Size passed to ``cv2.resize`` before prediction.
        confidence_threshold: Minimum top-class probability for drawing.

    Raises:
        IOError: If the input video cannot be opened or the output writer
            cannot be created.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        # Without this check the loop below is skipped and the function would
        # report success without having processed anything.
        cap.release()
        raise IOError(f"Could not open input video: {video_path}")

    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Keep the source frame rate so playback speed is preserved; fall back to
    # 20 fps when the container does not report a usable value.
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        fps = 20

    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'XVID'), fps, (frame_width, frame_height))
    if not out.isOpened():
        cap.release()
        out.release()
        raise IOError(f"Could not open output video for writing: {output_path}")

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Preprocess the frame the same way the training generator does:
            # OpenCV decodes to BGR, the model was trained on RGB in [0, 1].
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img_resized = cv2.resize(rgb_frame, img_size)
            img_normalized = img_resized.astype(np.float32) / 255.0
            img_expanded = np.expand_dims(img_normalized, axis=0)

            # verbose=0 avoids printing a progress bar for every single frame.
            class_pred, bbox_pred = model.predict(img_expanded, verbose=0)

            class_id = np.argmax(class_pred[0])
            class_prob = np.max(class_pred[0])
            if class_prob > confidence_threshold:
                class_label = label_encoder.inverse_transform([class_id])[0]
                x_min, y_min, x_max, y_max = bbox_pred[0]

                # Denormalize bbox coordinates to frame pixels
                x_min, x_max = int(x_min * frame_width), int(x_max * frame_width)
                y_min, y_max = int(y_min * frame_height), int(y_max * frame_height)

                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                # Keep the caption inside the frame when the box touches the top edge.
                label_y = max(y_min - 10, 20)
                cv2.putText(frame, f"{class_label}: {class_prob:.2f}", (x_min, label_y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

            # The drawing is done on the original BGR frame, which is what gets saved.
            out.write(frame)
    finally:
        # Release the handles even if prediction or drawing raises, so the
        # output file is finalised and the input is not left locked.
        cap.release()
        out.release()

    print(f"Processed video saved at {output_path}")

In [None]:
# Example Usage
import xml.etree.ElementTree as ET
# End-to-end run: collect paths, fit the label encoder, train, then annotate a video.
if __name__ == "__main__":
    # Set up paths
    train_images_dir = '/content/drive/MyDrive/database/database/train/images'
    train_annotations_dir = '/content/drive/MyDrive/database/database/train/annotations'
    valid_images_dir = '/content/drive/MyDrive/database/database/valid/images'
    valid_annotations_dir = '/content/drive/MyDrive/database/database/valid/annotations'

    # Load paths
    train_image_paths, train_annotation_paths = load_image_annotation_paths(train_images_dir, train_annotations_dir)
    valid_image_paths, valid_annotation_paths = load_image_annotation_paths(valid_images_dir, valid_annotations_dir)

    # Initialize label encoder on the labels of both splits
    label_encoder, classes = initialize_label_encoder(train_annotation_paths, valid_annotation_paths)

    # Create data generators
    batch_size = 32
    img_size = (224, 224)
    train_generator = ObjectDetectionDataGenerator(train_image_paths, train_annotation_paths, label_encoder, batch_size, img_size)
    valid_generator = ObjectDetectionDataGenerator(valid_image_paths, valid_annotation_paths, label_encoder, batch_size, img_size)

    # Train the model
    input_shape = (*img_size, 3)
    num_classes = len(classes)
    model = train_model(train_generator, valid_generator, input_shape, num_classes, epochs=20)

    # Perform object detection on a video
    # NOTE(review): video_path has no file extension — confirm it names the actual video file.
    video_path = '/content/drive/MyDrive/video'
    # OpenCV picks the output container from the file extension; '.avi' matches
    # the XVID codec used by perform_video_object_detection.
    output_path = '/content/drive/MyDrive/Video_output.avi'
    perform_video_object_detection(model, label_encoder, video_path, output_path)

Epoch 1/20


  self._warn_if_super_not_called()


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - class_output_accuracy: 0.1169 - loss: 5.5476

  self._warn_if_super_not_called()


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 6s/step - class_output_accuracy: 0.1363 - loss: 5.4665 - val_class_output_accuracy: 0.5000 - val_loss: 3.6713
Epoch 2/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 8s/step - class_output_accuracy: 0.7326 - loss: 2.0221 - val_class_output_accuracy: 0.4444 - val_loss: 4.0201
Epoch 3/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4s/step - class_output_accuracy: 0.8891 - loss: 1.5910 - val_class_output_accuracy: 0.4444 - val_loss: 4.8585
Epoch 4/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 5s/step - class_output_accuracy: 0.9448 - loss: 1.3268 - val_class_output_accuracy: 0.4444 - val_loss: 5.6151
Epoch 5/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 5s/step - class_output_accuracy: 0.9783 - loss: 1.1507 - val_class_output_accuracy: 0.3333 - val_loss: 6.2542
Epoch 6/