In [1]:
#Converting .tif to .jpg images
import os
from PIL import Image

# Source folders passed to the .tif -> .jpg conversion
hd_dir = "Dataset/image_chips_hd"
native_dir = "Dataset/image_chips_native"

# Destination folders that receive the converted .jpg files
hd_output_dir = "Dataset/image_chips_hd_jpg"
native_output_dir = "Dataset/image_chips_native_jpg"

# Create both destinations up front; exist_ok keeps re-running this cell harmless
for _jpg_dir in (hd_output_dir, native_output_dir):
    os.makedirs(_jpg_dir, exist_ok=True)

# Function to convert .tif to .jpg
def convert_images(image_dir, output_dir):
    """Convert every TIFF image in `image_dir` to an RGB JPEG in `output_dir`.

    Entries whose extension is `.tif` or `.tiff` (any letter case) are
    converted; everything else in `image_dir` is ignored. The scan is not
    recursive. Each output keeps the source's base name and gets a `.jpg`
    extension. This function does not create `output_dir`.

    Args:
        image_dir: Directory containing the source TIFF files.
        output_dir: Directory that receives the `.jpg` files.
    """
    for file in os.listdir(image_dir):
        stem, ext = os.path.splitext(file)
        if ext.lower() not in (".tif", ".tiff"):
            continue

        # Swap only the final extension. A substring replace would also rewrite
        # ".tif" occurring earlier in the name ("site.tiffany.tif").
        new_filename = stem + ".jpg"

        # The context manager closes the source file even if convert/save raises.
        with Image.open(os.path.join(image_dir, file)) as img:
            img.convert("RGB").save(os.path.join(output_dir, new_filename), "JPEG")



In [3]:

# Run the .tif -> .jpg conversion for each (source, destination) pair: HD first, then native
for _tif_dir, _jpg_dir in ((hd_dir, hd_output_dir), (native_dir, native_output_dir)):
    convert_images(_tif_dir, _jpg_dir)

In [4]:
# Finding images that have no label file, and label files that have no matching image, so the extras can be cleaned up

hd_label_dir = "Labels/labels_hd"
native_label_dir = "Labels/labels_native"


# Base names (final extension stripped) of the .jpg images and .txt labels.
# Filtering on the extension keeps stray directory entries (e.g. .DS_Store or
# .ipynb_checkpoints) out of the comparison, and os.path.splitext removes only
# the trailing extension rather than every ".jpg"/".txt" substring in the name.
hd_images = {os.path.splitext(f)[0] for f in os.listdir(hd_output_dir) if f.endswith(".jpg")}
native_images = {os.path.splitext(f)[0] for f in os.listdir(native_output_dir) if f.endswith(".jpg")}
hd_labels = {os.path.splitext(f)[0] for f in os.listdir(hd_label_dir) if f.endswith(".txt")}
native_labels = {os.path.splitext(f)[0] for f in os.listdir(native_label_dir) if f.endswith(".txt")}

# Images without a label file (missing_*) and label files without an image (extra_*)
missing_hd_labels = hd_images - hd_labels
missing_native_labels = native_images - native_labels
extra_hd_labels = hd_labels - hd_images
extra_native_labels = native_labels - native_images

if extra_hd_labels:
    print(list(extra_hd_labels))  # These are the extra hd labels not present in the hd dataset


In [5]:

# Label base names (no .txt extension) to delete from the HD label folder
extra_hd_labels = [
    "solarpanels_hd_1__x0_9203_y0_23477_dxdy_832(1)",
    "solarpanels_hd_1__x0_9203_y0_23322_dxdy_832(1)",
    "solarpanels_hd_1__x0_9107_y0_23305_dxdy_832(1)",
    "solarpanels_hd_1__x0_8971_y0_13024_dxdy_832(1)",
    "solarpanels_hd_1__x0_9076_y0_26335_dxdy_832(1)",
    "solarpanels_hd_1__x0_9182_y0_23369_dxdy_832(1)",
    "solarpanels_hd_1__x0_8985_y0_23459_dxdy_832(1)",
    "solarpanels_hd_1__x0_9018_y0_25843_dxdy_832(1)",
    "solarpanels_hd_1__x0_9056_y0_21439_dxdy_832(1)",
    "solarpanels_hd_1__x0_914_y0_14724_dxdy_832(1)"
]

# Remove each listed label file; report the ones that are no longer on disk
for lbl in extra_hd_labels:
    label_path = os.path.join(hd_label_dir, f"{lbl}.txt")
    if not os.path.exists(label_path):
        print("Not found")  # Expected on re-runs, once the files have already been cleaned
        continue
    os.remove(label_path)

print("All Removed!")


Not found
Not found
Not found
Not found
Not found
Not found
Not found
Not found
Not found
Not found
All Removed!


In [6]:

# Re-read both folders from disk so the count reflects the state after the cleanup.
# Only .jpg / .txt entries are considered, and only the final extension is stripped,
# so stray directory entries cannot show up as phantom images or labels.
hd_images = {os.path.splitext(f)[0] for f in os.listdir(hd_output_dir) if f.endswith(".jpg")}
hd_labels = {os.path.splitext(f)[0] for f in os.listdir(hd_label_dir) if f.endswith(".txt")}

# Label base names that still have no matching image
extra_hd_labels = hd_labels - hd_images

print(f"Extra HD labels remaining: {len(extra_hd_labels)}")



Extra HD labels remaining: 0


In [7]:
# YOLO expects each label line to be a 5-tuple (class_id, x_center, y_center, width, height), so check every label file for consistency

def check_label_format(label_dir):
    """Scan the `.txt` label files in `label_dir` and report malformed lines.

    A line is reported when it does not split into exactly five
    whitespace-separated fields, when any field is not parseable as a float,
    or when x, y, w or h falls outside the closed range [0, 1]. Blank lines
    are reported as "Invalid column count: 0". The class id only has to be
    numeric; its value is not range-checked.

    Args:
        label_dir: Directory holding the label files (not scanned recursively).

    Returns:
        A list of `(filename, message)` tuples, one per offending line, ordered
        by filename and then by line position. Empty when nothing is wrong.
    """
    errors = []

    # os.listdir order is arbitrary; sorting makes the report reproducible.
    for file in sorted(os.listdir(label_dir)):
        file_path = os.path.join(label_dir, file)

        # Only regular .txt files are labels. Subdirectories (for example
        # Jupyter's .ipynb_checkpoints) cannot be opened as text, and other
        # files would be misreported as broken labels.
        if not (file.endswith(".txt") and os.path.isfile(file_path)):
            continue

        with open(file_path, "r") as f:
            for line in f:
                values = line.strip().split()

                # Check if there are exactly 5 columns
                if len(values) != 5:
                    errors.append((file, f"Invalid column count: {len(values)}"))
                    continue

                try:
                    _class_id, x, y, w, h = map(float, values)
                except ValueError:
                    errors.append((file, "Non-numeric values found"))
                    continue

                # Check if values are in valid range (0 to 1)
                if not (0 <= x <= 1 and 0 <= y <= 1 and 0 <= w <= 1 and 0 <= h <= 1):
                    errors.append((file, f"Invalid bbox values: {x, y, w, h}"))

    return errors

# Validate the HD label folder first, then the native one
invalid_hd, invalid_native = (
    check_label_format(folder) for folder in (hd_label_dir, native_label_dir)
)

# Each count is the number of error entries returned for that folder
print(f"HD labels with errors: {len(invalid_hd)}")
print(f"Native labels with errors: {len(invalid_native)}")



HD labels with errors: 0
Native labels with errors: 0


In [8]:
# Rewriting label files that failed the check, keeping only well-formed lines whose bbox values are between 0 and 1
def _is_valid_label_line(line):
    """Return True when `line` has five numeric fields and x, y, w, h all lie in [0, 1]."""
    values = line.strip().split()
    if len(values) != 5:
        return False
    try:
        _class_id, x, y, w, h = map(float, values)
    except ValueError:
        return False
    return 0 <= x <= 1 and 0 <= y <= 1 and 0 <= w <= 1 and 0 <= h <= 1


def fix_label_format(label_dir):
    """Rewrite the label files in `label_dir` that contain invalid lines.

    Every file reported by `check_label_format(label_dir)` is read and
    overwritten in place with only its valid lines, in their original order.
    Files that are not reported are left untouched.

    Args:
        label_dir: Directory holding the label files.
    """
    # check_label_format yields one entry per bad line, so a file with several
    # bad lines appears several times. Collapse to unique names so each file is
    # read and rewritten exactly once.
    files_to_fix = sorted({file for file, _error in check_label_format(label_dir)})

    for file in files_to_fix:
        file_path = os.path.join(label_dir, file)
        with open(file_path, "r") as f:
            lines = f.readlines()

        # Keeping only valid lines
        fixed_lines = [line for line in lines if _is_valid_label_line(line)]

        with open(file_path, "w") as f:
            f.writelines(fixed_lines)

    print(f"Fixed label format in: {label_dir}")

# Clean the HD label folder first, then the native one
for _label_folder in (hd_label_dir, native_label_dir):
    fix_label_format(_label_folder)


Fixed label format in: Labels/labels_hd
Fixed label format in: Labels/labels_native


In [9]:
#Resizing images to 416x416 
from PIL import Image

resized_native_dir = "Dataset/native_resized"


os.makedirs(resized_native_dir, exist_ok=True)

# Resize every .jpg in the native folder to 416x416 and save it under the same name
for file in os.listdir(native_output_dir):
    if file.endswith(".jpg"):
        # The context manager closes the source file even if resizing fails;
        # resize() returns a new image, so it is still usable after the source is closed.
        with Image.open(os.path.join(native_output_dir, file)) as source_img:
            img = source_img.resize((416, 416), Image.Resampling.LANCZOS)  # Resize with high-quality resampling
        img.save(os.path.join(resized_native_dir, file))

print("All native images resized to 416×416.")


All native images resized to 416×416.


In [10]:

resized_hd_dir = "Dataset/hd_resized"

os.makedirs(resized_hd_dir, exist_ok=True)

# Resize every .jpg in the HD folder to 416x416 and save it under the same name
for file in os.listdir(hd_output_dir):
    if file.endswith(".jpg"):
        # The context manager closes the source file even if resizing fails;
        # resize() returns a new image, so it is still usable after the source is closed.
        with Image.open(os.path.join(hd_output_dir, file)) as source_img:
            img = source_img.resize((416, 416), Image.Resampling.LANCZOS)  # Resize with high-quality resampling
        img.save(os.path.join(resized_hd_dir, file))

print("All hd images resized to 416×416.")

All hd images resized to 416×416.
