<a href="https://colab.research.google.com/github/sneha2003er/project/blob/main/oversampling_of_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the dataset paths configured below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import cv2
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
import random

In [None]:
# Dataset root on the mounted Drive, with one sub-folder for raw videos and one for images.
base_directory = '/content/drive/MyDrive/data/dataset'
video_directory, img_directory = (
    os.path.join(base_directory, subfolder) for subfolder in ('video_data', 'img_data')
)

In [None]:
def extract_frames_from_video(video_path, output_dir, rate=1, prefix=None):
    """Save roughly ``rate`` frames per second of a video as JPEG files.

    Args:
        video_path: Path of the video file to read.
        output_dir: Directory that receives the frames; created if it does not exist.
        rate: Target number of saved frames per second of video. Must be positive.
        prefix: File-name prefix of the saved frames. When omitted it is derived
            from the video file name (``frame_from_video_<video stem>``) so that
            several videos written to the same ``output_dir`` do not overwrite
            each other's frames. Pass ``"frame_from_video"`` to reproduce the
            previous ``frame_from_video_<n>.jpg`` naming.

    Raises:
        ValueError: If ``rate`` is zero or negative.
    """
    # Validate before touching the filesystem or OpenCV.
    if rate <= 0:
        raise ValueError(f"rate must be positive, got {rate!r}")

    if prefix is None:
        video_stem = os.path.splitext(os.path.basename(video_path))[0]
        prefix = f"frame_from_video_{video_stem}"

    os.makedirs(output_dir, exist_ok=True)

    vidcap = cv2.VideoCapture(video_path)
    try:
        if not vidcap.isOpened():
            print(f"Could not open video: {video_path}")
            return

        fps = int(vidcap.get(cv2.CAP_PROP_FPS))
        # Keep every `step`-th frame. Clamp to 1 so the modulo below never
        # divides by zero when fps is 0 or rate is larger than fps.
        step = max(1, int(fps // rate))

        count = 0        # index of the frame currently held in `image`
        frame_count = 0  # number of frames written so far
        success, image = vidcap.read()
        while success:
            if count % step == 0:
                frame_name = os.path.join(output_dir, f"{prefix}_{frame_count}.jpg")
                cv2.imwrite(frame_name, image)
                frame_count += 1

            success, image = vidcap.read()
            count += 1
    finally:
        # Release the capture even if reading or writing fails part-way.
        vidcap.release()



In [None]:
# Extract frames for each split; the frames of '<split>_videos' are written to
# img_data/<split>/extracted_from_videos so they sit next to the other image classes.
for data_type in ['train_videos', 'test_videos']:
    current_video_dir = os.path.join(video_directory, data_type)
    current_img_output_dir = os.path.join(img_directory, data_type.replace('_videos', ''), 'extracted_from_videos')

    if not os.path.isdir(current_video_dir):
        print(f"Directory not found: {current_video_dir}")
        continue

    # Sorted so the processing order is the same on every run.
    for video_file in sorted(os.listdir(current_video_dir)):
        video_path = os.path.join(current_video_dir, video_file)
        # Skip sub-folders and other non-file entries instead of handing them to OpenCV.
        if not os.path.isfile(video_path):
            continue
        extract_frames_from_video(video_path, current_img_output_dir)

In [None]:
# Count the entries of every training class folder; the largest count becomes the oversampling target.
train_root = os.path.join(img_directory, 'train')
class_counts = {}
for class_name in ['default', 'extracted_from_videos', 'fire', 'smoke']:
    class_counts[class_name] = len(os.listdir(os.path.join(train_root, class_name)))
max_samples = max(class_counts.values())
print("Class counts before oversampling:", class_counts)

Class counts before oversampling: {'default': 161, 'extracted_from_videos': 437, 'fire': 274, 'smoke': 258}


In [None]:
def oversample_class(data_dir, class_name, target_samples, datagen):
    """Grow a class folder to ``target_samples`` entries using augmented copies.

    Source images are drawn at random from the files that are in the folder
    when the function is called. Each augmented copy is written into the same
    folder by ``datagen.flow`` with the ``aug`` prefix.

    Args:
        data_dir: Directory that contains one sub-folder per class.
        class_name: Name of the class sub-folder to fill up.
        target_samples: Number of directory entries the class folder should reach.
        datagen: Generator whose ``flow`` method produces and saves the augmented image.

    Raises:
        ValueError: If more samples are needed but the folder contains no
            readable image to augment from.
    """
    class_dir = os.path.join(data_dir, class_name)
    # Only regular files can be source images; sorted so a seeded `random`
    # picks the same files on every run.
    images = [
        path
        for path in (os.path.join(class_dir, name) for name in sorted(os.listdir(class_dir)))
        if os.path.isfile(path)
    ]

    # Re-list the folder each round: the number of files actually on disk is
    # what has to reach the target.
    while len(os.listdir(class_dir)) < target_samples:
        if not images:
            raise ValueError(f"No readable images to augment from in: {class_dir}")

        # Randomly choose an image from the class directory
        chosen_image = random.choice(images)
        try:
            img = load_img(chosen_image)
        except OSError:
            # Not a readable image: drop it from the pool and pick again.
            print(f"Skipping unreadable file: {chosen_image}")
            images.remove(chosen_image)
            continue

        x = img_to_array(img)
        x = x.reshape((1,) + x.shape)  # add a leading batch axis of size 1

        # Pull exactly one batch; producing it is what saves the augmented image to class_dir.
        next(datagen.flow(x, batch_size=1, save_to_dir=class_dir, save_prefix='aug', save_format='jpg'))

# Random augmentations used to synthesise extra samples for the smaller classes.
augmentation_params = {
    "rotation_range": 20,       # random rotation range, in degrees
    "zoom_range": 0.15,         # random zoom range
    "width_shift_range": 0.2,   # random horizontal shift range
    "height_shift_range": 0.2,  # random vertical shift range
    "shear_range": 0.15,        # random shear intensity
    "horizontal_flip": True,    # randomly mirror images left/right
    "fill_mode": "nearest",     # how pixels exposed by a transform are filled
}
datagen = ImageDataGenerator(**augmentation_params)

# Bring every training class that is smaller than the largest one up to max_samples.
train_data_dir = os.path.join(img_directory, 'train')
for cls in ('default', 'extracted_from_videos', 'fire', 'smoke'):
    existing_samples = len(os.listdir(os.path.join(train_data_dir, cls)))
    if existing_samples >= max_samples:
        continue  # already at the target size
    oversample_class(train_data_dir, cls, max_samples, datagen)

# Recount every training class folder and print the resulting distribution.
class_counts_after = {}
for class_name in ('default', 'extracted_from_videos', 'fire', 'smoke'):
    class_folder = os.path.join(img_directory, 'train', class_name)
    class_counts_after[class_name] = len(os.listdir(class_folder))
print("Class counts after oversampling:", class_counts_after)


Class counts after oversampling: {'default': 437, 'extracted_from_videos': 437, 'fire': 437, 'smoke': 437}


In [None]:
# Count the entries of every test class folder; the largest count becomes the oversampling target.
test_root = os.path.join(img_directory, 'test')
class_counts = {}
for class_name in ['default', 'extracted_from_videos', 'fire', 'smoke']:
    class_counts[class_name] = len(os.listdir(os.path.join(test_root, class_name)))
max_samples = max(class_counts.values())
print("Class counts before oversampling:", class_counts)

Class counts before oversampling: {'default': 84, 'extracted_from_videos': 65, 'fire': 57, 'smoke': 30}


In [None]:
# Bring every test class that is smaller than the largest one up to max_samples.
# NOTE(review): the variable keeps the name train_data_dir although it points at the 'test' split here.
train_data_dir = os.path.join(img_directory, 'test')
for cls in ('default', 'extracted_from_videos', 'fire', 'smoke'):
    existing_samples = len(os.listdir(os.path.join(train_data_dir, cls)))
    if existing_samples >= max_samples:
        continue  # already at the target size
    oversample_class(train_data_dir, cls, max_samples, datagen)

In [None]:
# Recount every test class folder and print the resulting distribution.
class_counts_after = {}
for class_name in ('default', 'extracted_from_videos', 'fire', 'smoke'):
    class_folder = os.path.join(img_directory, 'test', class_name)
    class_counts_after[class_name] = len(os.listdir(class_folder))
print("Class counts after oversampling:", class_counts_after)

Class counts after oversampling: {'default': 84, 'extracted_from_videos': 84, 'fire': 84, 'smoke': 84}
