# data preprocessing

In [78]:
import os
import shutil
import json
import yaml
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm
import random

## download the dataset

In [4]:
# Source dataset on Roboflow Universe:
# https://universe.roboflow.com/corrosion-a1pkl/screw_detection_iphone/dataset/14/images?split=train
#
# Requires the `roboflow` package (install it once with `%pip install roboflow`).
# The import lives in this cell because the name `Roboflow` is used below and
# would otherwise be undefined on a fresh kernel.
from roboflow import Roboflow

# Read the API key from the environment so it is never stored in the notebook.
# Set ROBOFLOW_API_KEY before running this cell; a missing key raises KeyError.
rf = Roboflow(api_key=os.environ["ROBOFLOW_API_KEY"])
project = rf.workspace("corrosion-a1pkl").project("screw-detection-classification")
version = project.version(1)
# Download the "yolov5" export of the selected project version.
# NOTE(review): the recorded output shows extraction into
# "Screw-detection-&-classification-1", while the data yaml cell reads from
# "screw_detection_iphone.v14i.yolov5pytorch" - confirm which export is intended.
dataset = version.download("yolov5")


loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in Screw-detection-&-classification-1 to yolov5pytorch:: 100%|██████████| 31433/31433 [00:01<00:00, 16192.69it/s]





Extracting Dataset Version Zip to Screw-detection-&-classification-1 in yolov5pytorch:: 100%|██████████| 954/954 [00:00<00:00, 7408.46it/s]


## data yaml

In [48]:
# Folder holding the exported YOLOv5 dataset; the preprocessing cells build
# their input paths from it.
DATA_ROOT = "screw_detection_iphone.v14i.yolov5pytorch"
data_yaml_path = DATA_ROOT + "/data.yaml"

# Parse the dataset description file (split locations, class count, class names).
with open(data_yaml_path, mode="r") as yaml_file:
    data_yaml = yaml.safe_load(yaml_file)

data_yaml

{'train': '../train/images',
 'val': '../valid/images',
 'test': '../test/images',
 'nc': 6,
 'names': ['Flat Head Screw',
  'Hex Washer Screw',
  'Hexagonal Bolt',
  'Philips Screw',
  'Pozidriv Screw',
  'Torx Screw'],
 'roboflow': {'workspace': 'corrosion-a1pkl',
  'project': 'screw_detection_iphone',
  'version': 14,
  'license': 'CC BY 4.0',
  'url': 'https://universe.roboflow.com/corrosion-a1pkl/screw_detection_iphone/dataset/14'}}

In [49]:
# Lookup from the class index, kept as a string exactly as it appears in the
# label files, to the class name with spaces replaced by underscores so the
# name can double as a folder name.
id_dict = dict(
    (str(class_index), class_label.replace(" ", "_"))
    for class_index, class_label in enumerate(data_yaml["names"])
)
id_dict

{'0': 'Flat_Head_Screw',
 '1': 'Hex_Washer_Screw',
 '2': 'Hexagonal_Bolt',
 '3': 'Philips_Screw',
 '4': 'Pozidriv_Screw',
 '5': 'Torx_Screw'}

## prepare classification data

In [50]:
# Root folder that receives the cropped, per-class images.
classification_root = "./screw-classification"
# exist_ok keeps the cell re-runnable: a plain os.mkdir raises FileExistsError
# as soon as the folder exists from an earlier run.
os.makedirs(classification_root, exist_ok=True)

In [52]:
def _yolo_box_to_pixels(center_x, center_y, width, height, img_width, img_height):
    """Convert a normalized YOLO box to pixel corners clamped to the image.

    Args:
        center_x, center_y: Box center as fractions of the image width / height.
        width, height: Box size as fractions of the image width / height.
        img_width, img_height: Image size in pixels.

    Returns:
        Tuple ``(x_min, y_min, x_max, y_max)`` in pixels, limited to the image bounds.
    """
    x_min = int((center_x - (width / 2)) * img_width)
    y_min = int((center_y - (height / 2)) * img_height)
    x_max = int((center_x + (width / 2)) * img_width)
    y_max = int((center_y + (height / 2)) * img_height)
    # A box poking past the image edge gives a negative start index, which
    # array slicing would read as "count from the end" and crop the wrong region.
    return max(x_min, 0), max(y_min, 0), min(x_max, img_width), min(y_max, img_height)


# Crop every labelled object of the source "train" split into
# <classification_root>/<class name>/<box index>_<image name>.
split = "train"
split_root = f"{DATA_ROOT}/{split}"

for image_name in os.listdir(f"{split_root}/images"):
    # splitext swaps only the real file extension, whatever it is.
    label_path = f"{split_root}/labels/" + os.path.splitext(image_name)[0] + ".txt"

    img = cv2.imread(f"{split_root}/images/{image_name}")
    if img is None:
        # cv2.imread signals an undecodable file by returning None, not by raising.
        print(f"skipping unreadable image: {image_name}")
        continue
    img_height, img_width = img.shape[:2]

    with open(label_path) as f:
        data = f.readlines()

    for i, line in enumerate(data):
        labels = line.strip().split()
        if not labels:
            continue  # blank line in the label file

        class_id = labels[0]
        # normalized center coordinates and width-height of the bbox
        center_x, center_y, width, height = map(float, labels[1:])
        x_min, y_min, x_max, y_max = _yolo_box_to_pixels(
            center_x, center_y, width, height, img_width, img_height
        )
        if x_max <= x_min or y_max <= y_min:
            continue  # degenerate box: there is nothing to write for an empty crop

        class_name = id_dict[class_id]
        class_dir = f"{classification_root}/{class_name}"
        # Cheaper and race-free compared with listing the root folder for every box.
        os.makedirs(class_dir, exist_ok=True)

        cv2.imwrite(f"{class_dir}/{i}_{image_name}", img[y_min:y_max, x_min:x_max])

In [53]:
# Class folders produced by the cropping step.
# - sorted(): os.listdir returns entries in an OS-dependent order.
# - the train/test/val folders created by the split step live in the same root;
#   excluding them keeps this cell correct when it is re-run after the split.
cropped_labels = sorted(
    entry
    for entry in os.listdir("./screw-classification")
    if entry not in ("train", "test", "val")
    and os.path.isdir(f"./screw-classification/{entry}")
)
cropped_labels

['Flat_Head_Screw',
 'Hexagonal_Bolt',
 'Hex_Washer_Screw',
 'Philips_Screw',
 'Pozidriv_Screw',
 'Torx_Screw']

In [54]:
# Number of cropped images collected in each class folder.
crop_counts = {}
for class_dir in cropped_labels:
    crop_counts[class_dir] = len(os.listdir(f"./screw-classification/{class_dir}"))
crop_counts

{'Flat_Head_Screw': 40,
 'Hexagonal_Bolt': 490,
 'Hex_Washer_Screw': 574,
 'Philips_Screw': 658,
 'Pozidriv_Screw': 650,
 'Torx_Screw': 1122}

### split train - test - val

In [68]:
from sklearn.model_selection import train_test_split

In [70]:
# Move the cropped images of every class into train/test/val sub-folders.
train_ratio, test_ratio, val_ratio = 0.7, 0.15, 0.15
SPLIT_SEED = 42  # fixed seed so the same files land in the same split on every run

# exist_ok makes folder creation safe to repeat (os.mkdir raises on re-run).
for split in ["train", "test", "val"]:
    for label in cropped_labels:
        os.makedirs(f"./screw-classification/{split}/{label}", exist_ok=True)

for label in cropped_labels:
    # Sort first: os.listdir order is OS-dependent, and the seed only gives a
    # reproducible split when the input order is stable.
    label_files = sorted(os.listdir(f"./screw-classification/{label}"))
    if not label_files:
        # Already moved by an earlier run; train_test_split rejects an empty input.
        continue

    # First cut: train vs. (test + val).
    train_files, test_val_files = train_test_split(
        label_files, test_size=test_ratio + val_ratio, random_state=SPLIT_SEED
    )
    # Second cut: divide the held-out part between test and val.
    test_files, val_files = train_test_split(
        test_val_files,
        test_size=val_ratio / (test_ratio + val_ratio),
        random_state=SPLIT_SEED,
    )

    for target_split, files in (("test", test_files), ("val", val_files), ("train", train_files)):
        for file in files:
            shutil.move(
                f"./screw-classification/{label}/{file}",
                f"./screw-classification/{target_split}/{label}/{file}",
            )

In [74]:
# Image count per class inside each of the train / test / val folders.
split_counts = {}
for split_name in ["train", "test", "val"]:
    per_class = {}
    for class_dir in cropped_labels:
        per_class[class_dir] = len(os.listdir(f"./screw-classification/{split_name}/{class_dir}"))
    split_counts[split_name] = per_class
split_counts

{'train': {'Flat_Head_Screw': 28,
  'Hexagonal_Bolt': 343,
  'Hex_Washer_Screw': 401,
  'Philips_Screw': 460,
  'Pozidriv_Screw': 455,
  'Torx_Screw': 785},
 'test': {'Flat_Head_Screw': 6,
  'Hexagonal_Bolt': 73,
  'Hex_Washer_Screw': 86,
  'Philips_Screw': 99,
  'Pozidriv_Screw': 97,
  'Torx_Screw': 168},
 'val': {'Flat_Head_Screw': 6,
  'Hexagonal_Bolt': 74,
  'Hex_Washer_Screw': 87,
  'Philips_Screw': 99,
  'Pozidriv_Screw': 98,
  'Torx_Screw': 169}}

## prepare detection data

In [89]:
# Root folder that receives the two-class (screw / bolt) detection dataset.
detection_root = "./screw-bolt-detection"
# exist_ok keeps the cell re-runnable: a plain os.mkdir raises FileExistsError
# as soon as the folder exists from an earlier run.
os.makedirs(detection_root, exist_ok=True)

In [90]:
# Create <detection_root>/train/{images,labels}.
split = "train"
for folder in ["images", "labels"]:
    # makedirs creates the intermediate split folder as well, and exist_ok
    # makes the cell safe to run more than once.
    os.makedirs(f"{detection_root}/{split}/{folder}", exist_ok=True)

In [91]:
# Collapse the six fine-grained classes into the two detection classes:
# only the hexagonal bolt is a "bolt", every other class is a "screw".
detection_mapping = {
    fine_class: ("bolt" if fine_class == "Hexagonal_Bolt" else "screw")
    for fine_class in (
        "Flat_Head_Screw",
        "Hexagonal_Bolt",
        "Hex_Washer_Screw",
        "Philips_Screw",
        "Pozidriv_Screw",
        "Torx_Screw",
    )
}

In [92]:
# Class index, as the string written into the label files, for each detection class.
detection_ids = dict(screw="0", bolt="1")

In [93]:
# Build the detection "train" split: rewrite every label file with the
# two-class ids and copy the matching image next to it.
split = "train"
split_root = f"{DATA_ROOT}/{split}"

for label_name in os.listdir(f"{split_root}/labels"):
    # splitext swaps only the real file extension.
    img_name = os.path.splitext(label_name)[0] + ".jpg"
    img_path = f"{split_root}/images/{img_name}"
    if not os.path.isfile(img_path):
        print(f"skipping label without image: {label_name}")
        continue

    with open(f"{split_root}/labels/{label_name}") as f:
        data = f.readlines()

    edited_data_lines = []
    for line in data:
        labels = line.strip().split()
        if not labels:
            continue  # blank line in the label file

        org_class_id = labels[0]
        org_class_name = id_dict[org_class_id]

        # Replace only the class id; the box coordinates are carried over verbatim.
        edited_data_lines.append(
            detection_ids[detection_mapping[org_class_name]] + " " + " ".join(labels[1:]) + "\n"
        )

    with open(f"{detection_root}/{split}/labels/{label_name}", "w") as f:
        f.writelines(edited_data_lines)

    # Copy the file as-is: decoding and re-encoding it through OpenCV would
    # recompress the JPEG (quality loss) for no benefit, since the pixels are unchanged.
    shutil.copyfile(img_path, f"{detection_root}/{split}/images/{img_name}")

### split train - test - val

In [94]:
# Move a random 15% / 15% of the detection train images (and their label
# files) into test/ and val/.
# NOTE: running this cell again re-splits whatever is still in train/ and so
# shrinks it further; rebuild the train folder before re-running.
train_ratio, test_ratio, val_ratio = 0.7, 0.15, 0.15
SPLIT_SEED = 42  # fixed seed so the split is reproducible

test_dir = os.path.join(detection_root, "test")
val_dir = os.path.join(detection_root, "val")

# makedirs with exist_ok creates the split folder and its sub-folder in one
# call and does not fail when the folders are already there.
for folder in ["images", "labels"]:
    os.makedirs(f"{test_dir}/{folder}", exist_ok=True)
    os.makedirs(f"{val_dir}/{folder}", exist_ok=True)

# Get the list of image files in the dataset (sorted: os.listdir order is
# OS-dependent, and the seeded shuffle needs a stable starting order).
image_files = sorted(os.listdir(f"{detection_root}/train/images/"))

# Shuffle with a private, seeded generator so the global random state is untouched.
random.Random(SPLIT_SEED).shuffle(image_files)

# Calculate the number of images for each split
num_images = len(image_files)
num_train = int(num_images * train_ratio)
num_test = int(num_images * test_ratio)
num_val = num_images - num_train - num_test  # val receives the rounding remainder

# Split the image files into train, test, and val sets
train_files = image_files[:num_train]
test_files = image_files[num_train:num_train + num_test]
val_files = image_files[num_train + num_test:]

# Train files stay where they are; only test and val files are moved.
for target_split, files in (("test", test_files), ("val", val_files)):
    for file in files:
        # splitext swaps only the real file extension.
        label_file = os.path.splitext(file)[0] + ".txt"
        shutil.move(
            f"{detection_root}/train/images/{file}",
            f"{detection_root}/{target_split}/images/{file}",
        )
        shutil.move(
            f"{detection_root}/train/labels/{label_file}",
            f"{detection_root}/{target_split}/labels/{label_file}",
        )