In [None]:
# Install the MinIO Python client that later cells import to download the dataset
# archive and upload the exported model. %pip targets the running kernel's environment.
%pip install minio

In [None]:
import os

# Connection settings for the MinIO (S3-compatible) server, taken from the environment.
# The endpoint is wrapped in str(), so an unset AWS_S3_ENDPOINT yields the text 'None'.
my_minio_endpoint = str(os.environ.get('AWS_S3_ENDPOINT'))

# Credentials and bucket name; each one is None when its variable is not set.
my_access_key, my_secret_key, my_bucket_name = (
    os.environ.get(variable_name)
    for variable_name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_S3_BUCKET')
)

# Name of the dataset archive in the bucket, and a save location for it
object_key = 'gdotdata.zip'
download_path = 'trafficCounter/model/gdotdata.zip'

# Local folder that holds the dataset archive and its extracted contents
dataset_dir = "datasets"



# Pull training data
Retrieve the training data from the MinIO bucket. Make sure the bucket exists and that `gdotdata.zip` has been uploaded to it before running this cell.

In [None]:
from minio import Minio
import urllib3
from minio.error import S3Error
from urllib3 import PoolManager
import ssl
import os

# Create the local dataset directory (no-op when it already exists)
os.makedirs(dataset_dir, exist_ok=True)

# Suppress the warning urllib3 emits for every unverified HTTPS request
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# HTTP client that skips certificate verification so a MinIO server with a
# self-signed certificate is accepted. urllib3 builds its own client-side TLS
# context from cert_reqs; a context created with ssl.Purpose.CLIENT_AUTH is meant
# for server sockets and must not be handed to a client connection pool.
http_client = PoolManager(cert_reqs='CERT_NONE')

# The Minio client expects a bare "host[:port]" plus a separate `secure` flag,
# so split the URL scheme off the configured endpoint. Endpoints that start with
# "http://" or carry no scheme keep using plain HTTP.
use_tls = my_minio_endpoint.startswith("https://")
if use_tls:
    minio_host = my_minio_endpoint[len("https://"):]
else:
    minio_host = my_minio_endpoint.replace("http://", "")

# Initialize the MinIO client with the custom HTTP client
minio_client = Minio(
    minio_host,
    access_key=my_access_key,
    secret_key=my_secret_key,
    secure=use_tls,  # HTTPS only when the endpoint URL asked for it
    http_client=http_client,
)

# Bucket, object and local target; the object name comes from the configuration cell
bucket_name = my_bucket_name
object_name = object_key
file_path = os.path.join(dataset_dir, object_name)  # Local path where the file will be saved

try:
    # Download the object from the bucket
    minio_client.fget_object(bucket_name, object_name, file_path)
    print(f"Object '{object_name}' downloaded successfully to '{file_path}'")
except S3Error as e:
    print(f"Error occurred: {e}")
    # Every later cell needs the archive, so stop here instead of continuing
    # with a missing or stale file.
    raise


# Unzip data
Unzip the training data into the `datasets` directory. After unzipping, it should contain:

classes.txt
images
labels

In [None]:
!unzip datasets/gdotdata.zip -d datasets

# Distribute training data
This Python script is used to prepare a dataset for model training by organizing images and their corresponding labels into three subsets: training, testing, and validation. The script divides the dataset according to specified proportions (e.g., 70% for training, 15% for testing, and 15% for validation). It ensures that the data is randomly shuffled and then distributed into the appropriate directories, which are crucial for training, evaluating, and validating machine learning models. After organizing the files, it provides a summary of how the data has been split, giving insights into the distribution of the dataset across the training, testing, and validation sets. This organization is essential for training robust and reliable models.

In [None]:
import os
import random
import shutil

# Folders holding the original images and labels
image_dir = f"{dataset_dir}/images"
label_dir = f"{dataset_dir}/labels"

# Root folder that receives the train/test/valid subsets
training_dir = f"{dataset_dir}/training"

# Share of the images assigned to the train, test and valid subsets
train_split, test_split, valid_split = 0.7, 0.15, 0.15

def count_files(directory):
    """Return how many files exist under ``directory``, subdirectories included.

    A path that does not exist yields 0, because ``os.walk`` produces nothing for it.
    """
    return sum(len(filenames) for _root, _subdirs, filenames in os.walk(directory))

# Seed for the shuffle below. A fixed seed, together with sorting the directory
# listing, makes the split reproducible: re-running this cell copies every image
# into the same subset instead of leaving copies of one image in several subsets.
SPLIT_SEED = 42


def _copy_subset(filenames, image_dest_dir, label_dest_dir):
    """Copy images and their matching label files into one subset.

    Args:
        filenames: Image filenames (relative to ``image_dir``) to copy.
        image_dest_dir: Directory that receives the images.
        label_dest_dir: Directory that receives the ``.txt`` label files.

    Returns:
        The filenames that had no label file in ``label_dir``; those images
        are still copied, just without a label.
    """
    unlabeled = []
    for filename in filenames:
        shutil.copy(os.path.join(image_dir, filename), os.path.join(image_dest_dir, filename))

        # The label shares the image's base name and uses a ".txt" extension
        label_filename = os.path.splitext(filename)[0] + ".txt"
        src_label_path = os.path.join(label_dir, label_filename)
        if os.path.isfile(src_label_path):
            shutil.copy(src_label_path, os.path.join(label_dest_dir, label_filename))
        else:
            unlabeled.append(filename)
    return unlabeled


# Create the training directory
os.makedirs(training_dir, exist_ok=True)

# Subset directories within the training directory
train_dir = os.path.join(training_dir, "train")
test_dir = os.path.join(training_dir, "test")
valid_dir = os.path.join(training_dir, "valid")

# Image and label directories within each subset
train_image_dir = os.path.join(train_dir, "images")
train_label_dir = os.path.join(train_dir, "labels")
test_image_dir = os.path.join(test_dir, "images")
test_label_dir = os.path.join(test_dir, "labels")
valid_image_dir = os.path.join(valid_dir, "images")
valid_label_dir = os.path.join(valid_dir, "labels")

# makedirs also creates the parent subset directories
for subset_dir in (
    train_image_dir, train_label_dir,
    test_image_dir, test_label_dir,
    valid_image_dir, valid_label_dir,
):
    os.makedirs(subset_dir, exist_ok=True)

# Retrieve the image filenames. os.listdir returns them in arbitrary order, so
# sort first to give the seeded shuffle a stable starting point; entries that
# are not regular files are ignored.
image_filenames = sorted(
    entry for entry in os.listdir(image_dir)
    if os.path.isfile(os.path.join(image_dir, entry))
)

# Shuffle with a private generator so the global random state is left untouched
random.Random(SPLIT_SEED).shuffle(image_filenames)

# Calculate the number of images for each set; the valid set takes the remainder
total_images = len(image_filenames)
train_count = int(total_images * train_split)
test_count = int(total_images * test_split)
valid_count = total_images - train_count - test_count

# Copy images and labels into the train, test and valid directories
unlabeled_images = []
unlabeled_images += _copy_subset(
    image_filenames[:train_count], train_image_dir, train_label_dir
)
unlabeled_images += _copy_subset(
    image_filenames[train_count : train_count + test_count], test_image_dir, test_label_dir
)
unlabeled_images += _copy_subset(
    image_filenames[train_count + test_count :], valid_image_dir, valid_label_dir
)

print("Dataset files distributed to folders: test, train, and valid")
if unlabeled_images:
    print(
        f"Warning: {len(unlabeled_images)} image(s) had no label file in "
        f"'{label_dir}' and were copied without one"
    )

# Count what is actually on disk in each subset
num_test = count_files(test_image_dir)
num_train = count_files(train_image_dir)
num_valid = count_files(valid_image_dir)

num_total = num_test + num_train + num_valid

if num_total == 0:
    # Avoid dividing by zero when the images directory was empty
    print("No image files were found in '" + image_dir + "'; nothing was distributed.")
else:
    print("File count - Test:       [" + format(num_test/num_total, ".0%") + "] " + str(num_test))
    print("File count - Train:      [" + format(num_train/num_total, ".0%") + "] " + str(num_train))
    print("File count - Validation: [" + format(num_valid/num_total, ".0%") + "] " + str(num_valid))

# Create the class file
This Python script generates a YAML configuration file that is essential for training a machine learning model, particularly in tasks like object detection. It reads class names from a text file and organizes them along with paths to the training, validation, and testing datasets. The resulting YAML file provides a structured configuration that the model training framework uses to locate the dataset, understand the number of classes, and apply the correct labels during training, validation, and testing phases. This setup is crucial for ensuring the model is trained on the correct data and knows how to interpret the classes it will learn to recognize.

In [None]:
import yaml

# Read the class names, one per line. Blank lines are skipped and surrounding
# whitespace is stripped so that a trailing empty line or stray spaces do not
# become class names or inflate the class count.
with open(dataset_dir + '/classes.txt', 'r') as file:
    lines = [line.strip() for line in file if line.strip()]

# Prepare the data for the YAML file: dataset root, the image folder of each
# subset (relative to 'path'), the number of classes and their names
yaml_data = {
    'path': 'training',
    'train': 'train/images',
    'val': 'valid/images',
    'test': 'test/images',
    'nc': len(lines),
    'names': lines
}

# Write to a YAML file, keeping the key order defined above
with open(dataset_dir + '/gdot.yaml', 'w') as yaml_file:
    yaml.dump(yaml_data, yaml_file, default_flow_style=False, sort_keys=False)

print("YAML file 'gdot.yaml' created successfully.")



# Set ultralytics config
This step sets the Ultralytics defaults so that training tasks pick up the correct dataset location.

In [None]:
import os
import yaml

import uuid

# Path to the settings.yaml file and the datasets directory to record in it
config_file = "/opt/app-root/src/.config/Ultralytics/settings.yaml"
custom_datasets_dir = "/opt/app-root/src/trafficCounter/model/datasets"

if not os.path.exists(config_file):
    # No settings file yet: create its directory and start from a full set of values
    os.makedirs(os.path.dirname(config_file), exist_ok=True)
    settings = {
        'settings_version': '0.0.4',
        'datasets_dir': custom_datasets_dir,
        'weights_dir': 'weights',
        'runs_dir': 'runs',
        # Generated in-process: the external `uuidgen` binary may be missing,
        # which would leave this value as an empty string.
        'uuid': str(uuid.uuid4()),
        'sync': True,
        'api_key': '',
        'openai_api_key': '',
        'clearml': True,
        'comet': True,
        'dvc': True,
        'hub': True,
        'mlflow': True,
        'neptune': True,
        'raytune': True,
        'tensorboard': True,
        'wandb': True,
    }
else:
    # Load existing settings.yaml; an empty file parses to None, so fall back to
    # an empty mapping before assigning into it
    with open(config_file, 'r') as file:
        settings = yaml.safe_load(file) or {}

    # Update the datasets_dir
    settings['datasets_dir'] = custom_datasets_dir

# Save the updated settings.yaml
with open(config_file, 'w') as file:
    yaml.dump(settings, file)

print(f"datasets_dir set to {custom_datasets_dir}")


# Train the model
## Command Overview

This command initiates the training of a YOLOv8 model with the following parameters:

- **`source=datasets/training`**: Specifies the directory containing the training dataset.
- **`model=yolov8n.pt`**: Uses the pre-trained YOLOv8n model as the starting point for training.
- **`epochs=100`**: Trains the model for 100 epochs (iterations over the entire dataset).
- **`batch=16`**: Processes the data in batches of 16 images at a time during training.
- **`data=datasets/gdot.yaml`**: Points to a YAML file that defines the dataset structure, including paths to the train, validation, and test sets, and the class names.
- **`project=runs`**: Saves the training results, including model weights and logs, in a directory named `runs`.


In [None]:
!yolo train source=datasets/training model=yolov8n.pt epochs=100 batch=16 data=datasets/gdot.yaml project=runs

# Show the results

In [None]:
import os
from PIL import Image
from IPython.display import display

# Open the results plot stored under runs/train
# (presumably written by the training cell, which uses project=runs — verify after training)
img = Image.open('runs/train/results.png')

# Display the image in the notebook
display(img)


# Export the new model
## Command Overview

This command exports a trained YOLOv8 model to a different format with the following parameters:

- **`model="runs/train/weights/best.pt"`**: Specifies the path to the trained YOLOv8 model file (`best.pt`) that you want to export.
- **`format=onnx`**: Converts the model to the ONNX format, which is commonly used for deploying models across different platforms.
- **`imgsz=224,128`**: Specifies the input image size for the exported model. Ultralytics reads a two-value `imgsz` as (height, width), so this sets a height of 224 pixels and a width of 128 pixels.


In [None]:
# Export the best checkpoint of the training run to ONNX. The upload cell below
# reads the result from runs/train/weights/best.onnx.
!yolo export model="runs/train/weights/best.pt" format=onnx imgsz=224,128

# Copy model to bucket
Upload the new model to a Minio bucket for consumption by other services/serving components

In [None]:
from minio import Minio
from minio.error import S3Error
import urllib3

# Imported here so this cell does not depend on the imports of the download cell
from urllib3 import PoolManager

# Suppress the warning urllib3 emits for every unverified HTTPS request
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# HTTP client that skips certificate verification so a MinIO server with a
# self-signed certificate is accepted. urllib3 builds its own client-side TLS
# context from cert_reqs; a context created with ssl.Purpose.CLIENT_AUTH is meant
# for server sockets and must not be handed to a client connection pool.
http_client = PoolManager(cert_reqs='CERT_NONE')

# The Minio client expects a bare "host[:port]" plus a separate `secure` flag,
# so split the URL scheme off the configured endpoint. Endpoints that start with
# "http://" or carry no scheme keep using plain HTTP.
use_tls = my_minio_endpoint.startswith("https://")
if use_tls:
    minio_host = my_minio_endpoint[len("https://"):]
else:
    minio_host = my_minio_endpoint.replace("http://", "")

# Initialize the MinIO client with the custom HTTP client
minio_client = Minio(
    minio_host,
    access_key=my_access_key,
    secret_key=my_secret_key,
    secure=use_tls,  # HTTPS only when the endpoint URL asked for it
    http_client=http_client,
)


# Set file and bucket details
file_path = "runs/train/weights/best.onnx"  # Local file path
bucket_name = my_bucket_name
object_name = "best.onnx"  # Name to save in the bucket

try:
    # Upload the file to the MinIO bucket
    minio_client.fput_object(
        bucket_name, object_name, file_path
    )
    print(f"File '{file_path}' uploaded to bucket '{bucket_name}' as '{object_name}'.")

except S3Error as e:
    print(f"Error occurred: {e}")
    # Surface the failure so a run of the whole notebook does not look
    # successful when the model never reached the bucket.
    raise
