In [11]:
import os
import argparse
import sys
import subprocess
import glob
import shutil
# import dvc.api

from collections import Counter
# from git.repo.base import Repo

from scripts.utils import extract_archive
from sklearn.model_selection import train_test_split
from pathlib import Path

# Project root that every dataset path below is derived from.
# Defaults to the original SageMaker location; set CAPSTONE_ML_ROOT to run the
# notebook from a different checkout without editing this cell.
ml_root = Path(
    os.environ.get(
        "CAPSTONE_ML_ROOT",
        "/home/ec2-user/SageMaker/sagemaker-pipeline/capstone/",
    )
)

In [14]:
def write_dataset(image_paths, output_dir):
    """Copy images into ``output_dir``, one sub-directory per class.

    The class of an image is the name of the directory that contains it, so
    ``.../forest/123.jpg`` is copied to ``<output_dir>/forest/123.jpg``.

    Args:
        image_paths: Iterable of image file paths (``str`` or ``Path``).
        output_dir: Destination root (``str`` or ``Path``). Class
            sub-directories (and missing parents) are created on demand, so
            nothing is created when ``image_paths`` is empty.

    Note:
        The destination is keyed by class and file name only; two inputs that
        share both overwrite each other.
    """
    output_dir = Path(output_dir)
    created_dirs = set()
    for img_path in map(Path, image_paths):
        # Use `.name`, not `.stem`: `.stem` drops everything after the last
        # dot, which would truncate a class directory such as "set.v1".
        class_dir = output_dir / img_path.parent.name
        # Create each class directory once instead of once per image.
        if class_dir not in created_dirs:
            class_dir.mkdir(parents=True, exist_ok=True)
            created_dirs.add(class_dir)
        shutil.copyfile(img_path, class_dir / img_path.name)


In [12]:
# Fixed seed so the train/test partition is identical on every run.
SPLIT_SEED = 42

dataset_extracted = ml_root / "tmp"
dataset_extracted.mkdir(parents=True, exist_ok=True)

# Unpack the raw archive into the scratch directory.
dataset_zip = ml_root / "datasets/intel_s12.zip"
print(f":: Extracting Zip {dataset_zip} to {dataset_extracted}")
extract_archive(
    from_path=dataset_zip,
    to_path=dataset_extracted
)

# Collect every image that sits three directory levels below the extraction
# root; the innermost directory name is the class label.
# glob() order depends on the filesystem, so sort to keep the split input stable.
dataset_full = sorted(dataset_extracted.glob("*/*/*/*.jpg"))
if not dataset_full:
    raise FileNotFoundError(
        f"No images matching */*/*/*.jpg found under {dataset_extracted}"
    )
labels = [x.parent.name for x in dataset_full]

print(":: Dataset Class Counts: ", Counter(labels))

# Pool all images and draw a fresh stratified split. The per-folder
# seg_train/seg_test listings that were previously assigned to d_train/d_test
# here were overwritten by this call before ever being used, so they are gone.
d_train, d_test = train_test_split(
    dataset_full, stratify=labels, random_state=SPLIT_SEED
)

print("\t:: Train Dataset Class Counts: ", Counter(x.parent.name for x in d_train))
print("\t:: Test Dataset Class Counts: ", Counter(x.parent.name for x in d_test))

:: Extracting Zip /home/ec2-user/SageMaker/sagemaker-pipeline/capstone/datasets/intel_s12.zip to /home/ec2-user/SageMaker/sagemaker-pipeline/capstone/tmp
:: Dataset Class Counts:  Counter({'mountain': 3037, 'glacier': 2957, 'street': 2883, 'sea': 2784, 'forest': 2745, 'buildings': 2628})
	:: Train Dataset Class Counts:  Counter({'mountain': 2277, 'glacier': 2218, 'street': 2162, 'sea': 2088, 'forest': 2059, 'buildings': 1971})
	:: Test Dataset Class Counts:  Counter({'mountain': 760, 'glacier': 739, 'street': 721, 'sea': 696, 'forest': 686, 'buildings': 657})


In [15]:
# Destination directory for each split, built once and reused below.
split_dirs = {name: ml_root / "datasets" / name for name in ['train', 'test']}

for output_dir in split_dirs.values():
    # write_dataset only adds files, so leftovers from an earlier run with a
    # different split would leave the same image in both train and test.
    # Start each run from an empty directory instead.
    if output_dir.exists():
        shutil.rmtree(output_dir)
    print(f"\t:: Creating Directory {output_dir}")
    output_dir.mkdir(parents=True, exist_ok=True)

print(":: Writing Datasets")
write_dataset(d_train, split_dirs["train"])
write_dataset(d_test, split_dirs["test"])

	:: Creating Directory /home/ec2-user/SageMaker/sagemaker-pipeline/capstone/datasets/train
	:: Creating Directory /home/ec2-user/SageMaker/sagemaker-pipeline/capstone/datasets/test
:: Writing Datasets
