# Preprocess

The available GeoImageNet annotations are stored as `image_path, x_min, y_min, x_max, y_max, class_label` rows, with every image listed in one shared text file. YOLOv8 instead expects one text file per image, where each line has the form `class_index x_center y_center box_width box_height` (space-separated, with coordinates normalized by the image width and height).

In [2]:
#code to remove prefix path for image in text file
def remove_prefix(file_path, prefix):
    """Strip a leading ``prefix`` from every line of a text file, in place.

    Only an occurrence at the very start of a line is removed. Text that
    merely contains ``prefix`` somewhere later in the line (for example in
    another field) is left untouched, so the rewrite cannot corrupt the rest
    of an annotation row.

    Args:
        file_path: Path of the text file to rewrite.
        prefix: Leading text to drop from each line that starts with it.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    prefix_length = len(prefix)
    # The whole file is read before it is reopened for writing, because
    # opening with 'w' truncates it.
    with open(file_path, 'w') as file:
        for line in lines:
            if line.startswith(prefix):
                line = line[prefix_length:]
            file.write(line)


# Annotation file whose lines are rewritten by the call below.
text_file_path = '/Users/rajdipingale/Downloads/Hydrocarbon/gin.test (1).txt'
# Directory text to strip from the image paths in that file.
prefix_to_remove = 'data/geoimagenet/'
# NOTE: remove_prefix rewrites the file at text_file_path in place.
remove_prefix(text_file_path, prefix_to_remove)
print("Prefix removed from the text file.")


Prefix removed from the text file.


In [6]:
import os
import shutil


In [8]:
# All images are available in a single folder. Split them into train and validation (test) datasets.
def read_image_names(file_path):
    """Return the image name (first comma-separated field) of each annotation row.

    Blank lines and rows whose first field is empty are skipped. An empty
    name joined onto a folder path resolves to the folder itself, which a
    caller could then move or overwrite by mistake.

    Args:
        file_path: Path of the comma-separated annotation file.

    Returns:
        A list of image names in file order, one per non-empty row. Names
        repeat when an image has several annotation rows.
    """
    image_names = []
    with open(file_path, 'r') as file:
        for line in file:
            name = line.split(',')[0].strip()
            if name:
                image_names.append(name)
    return image_names


def split_dataset(image_folder, image_names, train_folder, test_folder, train_names=None):
    """Move the listed images out of ``image_folder`` into the train or test folder.

    An image goes to ``train_folder`` when its name is one of the training
    names, otherwise to ``test_folder``. Names that do not correspond to a
    file in ``image_folder`` are reported with a printed message.

    Args:
        image_folder: Folder that currently holds the images.
        image_names: Iterable of image file names to move.
        train_folder: Destination folder for training images.
        test_folder: Destination folder for all other images.
        train_names: Optional iterable of names that belong to the training
            set. When omitted, the module-level ``train_image_names`` is used,
            which is what this function always did before the parameter existed.
    """
    if train_names is None:
        # Backward-compatible fallback to the notebook-level variable.
        train_names = train_image_names
    # A set makes each membership test O(1) instead of scanning a list.
    train_lookup = set(train_names)

    # shutil.move fails if the destination directory does not exist yet.
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    handled = set()
    for image_name in image_names:
        # Skip empty names (joining '' onto the folder yields the folder
        # itself) and names already processed in this call (an image with
        # several annotation rows appears several times).
        if not image_name or image_name in handled:
            continue
        handled.add(image_name)

        source_path = os.path.join(image_folder, image_name)
        # isfile rather than exists: never move a directory by accident.
        if os.path.isfile(source_path):
            destination_folder = train_folder if image_name in train_lookup else test_folder
            shutil.move(source_path, os.path.join(destination_folder, image_name))
        else:
            print(f"Image {image_name} not found in {image_folder}")


# Folder that holds every image before the split, the training annotation
# list, and the destination folder for training images.
train_image_folder = '/Users/rajdipingale/Downloads/Hydrocarbon/Geo_Aug'
train_file = '/Users/rajdipingale/Downloads/Hydrocarbon/gin.train.aug.txt'
train_folder = '/Users/rajdipingale/Downloads/Hydrocarbon/train'


# Test annotation list and the destination folder for test images.
test_file = '/Users/rajdipingale/Downloads/Hydrocarbon/gin.test (1).txt'  
test_folder = '/Users/rajdipingale/Downloads/Hydrocarbon/test'   


# Image names are the first comma-separated field of each annotation row.
train_image_names = read_image_names(train_file)
test_image_names = read_image_names(test_file)

# Save images to individual train and test folders.
# Both calls read from the same source folder; the second call handles the
# names listed in the test annotation file.
split_dataset(train_image_folder, train_image_names, train_folder, test_folder)
split_dataset(train_image_folder, test_image_names, train_folder, test_folder)

print("Dataset split completed.")


Dataset split completed.


In [10]:
#code to save size of each images in a single text file

import os
from PIL import Image

def save_image_sizes_to_file(folder_path, output_file):
    """Write the pixel dimensions of every image in a folder to a text file.

    Each image produces one line of the form ``<filename>: <width> x <height>``.
    Files are processed in sorted name order. A file counts as an image when
    its extension, compared case-insensitively, is one of .jpg, .jpeg, .png,
    .bmp or .gif, so names such as ``PHOTO.JPG`` are included.

    Args:
        folder_path: Folder to scan (not recursive).
        output_file: Path of the text file to create or overwrite.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')
    # List the folder before opening the output, so a missing folder raises
    # without leaving an empty output file behind.
    filenames = sorted(os.listdir(folder_path))

    with open(output_file, 'w') as f:
        for filename in filenames:
            # Lower-case only for the comparison; the name is written as-is.
            if not filename.lower().endswith(image_extensions):
                continue
            file_path = os.path.join(folder_path, filename)
            with Image.open(file_path) as img:
                width, height = img.size
            f.write(f"{filename}: {width} x {height}\n")

#for train dataset
Tfolder_path = '/Users/rajdipingale/Downloads/Hydrocarbon/train'
Toutput_file = '/Users/rajdipingale/Downloads/Hydrocarbon/train_size1.txt'

save_image_sizes_to_file(Tfolder_path, Toutput_file)
#for validation dataset
# NOTE(review): the split step above moves images into a 'test' folder, while
# this scan reads a 'val' folder - confirm the folder was renamed or adjust the path.
Tfolder_path = '/Users/rajdipingale/Downloads/Hydrocarbon/val'
Toutput_file = '/Users/rajdipingale/Downloads/Hydrocarbon/val_size1.txt'

# The validation paths were previously assigned without running the scan, so
# val_size1.txt (read by the validation conversion step) was never produced.
save_image_sizes_to_file(Tfolder_path, Toutput_file)

In [11]:
#converting image_path, x_min, y_min, x_max, y_max, class_label format to class_index,x_center, y_center, box_width, box_height format

#function to read image sizes (width x height) from the size text file
def get_image_sizes_from_file(image_size_file):
    """Load image dimensions from a ``<filename>: <width> x <height>`` listing.

    Lines that do not split into exactly two parts on ``': '`` are ignored.

    Args:
        image_size_file: Path of the size listing to read.

    Returns:
        Dict mapping each filename to a ``(width, height)`` tuple of ints.
    """
    sizes_by_name = {}
    with open(image_size_file, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(': ')
            if len(fields) == 2:
                name, dimensions = fields
                width, height = (int(value) for value in dimensions.split(' x '))
                sizes_by_name[name.strip()] = (width, height)
    return sizes_by_name

#function for conversion
def convert_geoimagenet_to_yolo(input_file, output_dir, class_mapping, image_sizes, overwrite=True):
    """Convert GeoImageNet box annotations into per-image YOLO label files.

    Each input row has the form
    ``image_path,x_min,y_min,x_max,y_max,class_label``. For every valid row a
    line ``class_index x_center y_center box_width box_height`` is written to
    ``<output_dir>/<image stem>.txt``, with the four box values divided by the
    image width or height.

    Rows are skipped when they do not have exactly six fields, when a
    normalized value falls outside [0, 1] (a warning is printed), or when the
    class label is missing from ``class_mapping`` (a warning is printed).

    Args:
        input_file: Path of the comma-separated annotation file.
        output_dir: Folder for the label files; created if missing.
        class_mapping: Dict mapping class label strings to integer indices.
        image_sizes: Dict mapping image file names to ``(width, height)``.
        overwrite: When True (default), a label file is truncated the first
            time this call writes to it, so re-running the conversion does not
            duplicate labels. Pass False to append to existing label files.

    Returns:
        List of image file names whose coordinates could not be parsed or
        whose size was not found in ``image_sizes`` (one entry per bad row).
    """
    os.makedirs(output_dir, exist_ok=True)

    error_files = []
    # Label files already opened by this call; later rows for the same image
    # must append rather than truncate.
    started_files = set()

    with open(input_file, 'r') as infile:
        for line in infile:
            # Strip each field so rows written as "a.jpg, 1, 2, 3, 4, lakes"
            # still match the size and class dictionaries.
            parts = [field.strip() for field in line.strip().split(',')]
            if len(parts) != 6:
                continue
            image_path, x_min, y_min, x_max, y_max, class_label = parts
            image_name = os.path.basename(image_path)

            try:
                image_width, image_height = image_sizes[image_name]
                left, top = float(x_min), float(y_min)
                right, bottom = float(x_max), float(y_max)
            except (ValueError, KeyError):
                # Skip the line if conversion fails or image size not found
                error_files.append(image_name)
                continue

            x_center = (left + right) / (2.0 * image_width)
            y_center = (top + bottom) / (2.0 * image_height)
            box_width = (right - left) / image_width
            box_height = (bottom - top) / image_height

            # Check if coordinates are within the valid range [0, 1]
            if any(coord < 0 or coord > 1 for coord in [x_center, y_center, box_width, box_height]):
                print(f"WARNING: Ignoring corrupt image/label: non-normalized or out of bounds coordinates {parts[1:]}")
                continue

            # Get class index from the class_mapping dictionary
            class_index = class_mapping.get(class_label, -1)
            if class_index == -1:
                print(f"WARNING: Class label '{class_label}' not found in class_mapping for file '{image_name}'")
                continue  # Skip if class label is not found in class_mapping

            # Write the YOLO-format annotation line to this image's label file
            output_filename = os.path.splitext(image_name)[0] + '.txt'
            if overwrite and output_filename not in started_files:
                mode = 'w'
            else:
                mode = 'a'
            started_files.add(output_filename)
            with open(os.path.join(output_dir, output_filename), mode) as outfile:
                outfile.write(f"{class_index} {x_center:.6f} {y_center:.6f} {box_width:.6f} {box_height:.6f}\n")

    return error_files

#for training data

#directory for single input text file and output folder to save individual text file for each sample image
input_file = '/Users/rajdipingale/Downloads/Hydrocarbon/gin.train.aug.txt'  
output_dir = '/Users/rajdipingale/Downloads/Hydrocarbon/traintxt'  
image_size_file = '/Users/rajdipingale/Downloads/Hydrocarbon/train_size1.txt'  
# Singular and plural spellings of a label share one class index.
# 'Ridge' must map to 4 like 'ridges'; it was previously 2, the 'islands' index,
# which labelled ridge boxes as islands.
class_mapping = {'basins': 0, 'bays': 1, 'islands': 2, 'lakes': 3, 'ridges': 4, 'Valley': 5, 'Island': 2, 'Lake': 3, 'Ridge': 4, 'Bay': 1, 'Basin': 0}


# Get image sizes from the size file
image_sizes = get_image_sizes_from_file(image_size_file)

# Perform conversion and get error files
error_files = convert_geoimagenet_to_yolo(input_file, output_dir, class_mapping, image_sizes)

print("Conversion completed.")
print("Files with conversion errors:")
for file in error_files:
    print(file)


Conversion completed.
Files with conversion errors:


In [15]:
#For validation data

#directory for single input text file and output folder to save individual text file for each sample image
input_file = '/Users/rajdipingale/Downloads/Hydrocarbon/gin.test (1).txt'  
output_dir = '/Users/rajdipingale/Downloads/Hydrocarbon/valtxt'  
image_size_file = '/Users/rajdipingale/Downloads/Hydrocarbon/val_size1.txt'  
# Singular and plural spellings of a label share one class index.
# 'Ridge' must map to 4 like 'ridges'; it was previously 2, the 'islands' index,
# which labelled ridge boxes as islands.
class_mapping = {'basins': 0, 'bays': 1, 'islands': 2, 'lakes': 3, 'ridges': 4, 'Valley': 5, 'Island': 2, 'Lake': 3, 'Ridge': 4, 'Bay': 1, 'Basin': 0}


# Get image sizes from the size file
image_sizes = get_image_sizes_from_file(image_size_file)

# Perform conversion and get error files
error_files = convert_geoimagenet_to_yolo(input_file, output_dir, class_mapping, image_sizes)

print("Conversion completed.")
print("Files with conversion errors:")
for file in error_files:
    print(file)

Conversion completed.
Files with conversion errors:
