In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
import os
import zipfile
import shutil
import random

NUM_IMAGES = 500

ARCHIVE_DIR = '/content/drive/MyDrive/data/victims/archives'
ARCHIVES = [os.path.join(ARCHIVE_DIR, f) for f in os.listdir(ARCHIVE_DIR)]

CSV_DIR = '/content/drive/MyDrive/data/victims/csv'
CSV_FILES = [os.path.join(CSV_DIR, f) for f in os.listdir(CSV_DIR)]

print(f"Archives: {ARCHIVES}")
print(f"CSV: {CSV_FILES}")

ACCEPTED_LABELS = ['dead', 'living']

IMAGES_DIR = '/content/images'
os.makedirs(IMAGES_DIR, exist_ok=True)

OUT_IMAGES_DIR = '/content/out_images'
if os.path.exists(OUT_IMAGES_DIR):
  shutil.rmtree(OUT_IMAGES_DIR)

os.makedirs(OUT_IMAGES_DIR, exist_ok=True)
OUT_CSV_PATH = '/content/data.csv'

# Unzip images
for archive in ARCHIVES:
  with zipfile.ZipFile(archive) as zip_file:
    for member in zip_file.namelist():
      filename = os.path.basename(member)

      if not filename:
        continue
      source = zip_file.open(member)
      try:
        target = open(os.path.join(IMAGES_DIR, filename), "wb")
        with source, target:
          shutil.copyfileobj(source, target)
      except:
        print("EXCEPTION!")

# Object dict to be filled with image_filename : [(label, xmin, ymin, xmax, ymax), ...]
objects = {}

# Open and parse CSV files
for csv_file in CSV_FILES:
  with open(csv_file, 'r') as f:
    for row in f.readlines():
      cols = row.split(',')

      filename = cols[0].replace('"', '').strip()
      if not os.path.isfile(os.path.join(IMAGES_DIR, filename)):
        #print(f"{filename} not found in images.")
        continue

      label = cols[5].replace('"', '').strip()
      if not label in ACCEPTED_LABELS:
        continue

      xmin = int(float(cols[1]))
      ymin = int(float(cols[2]))
      xmax = int(float(cols[3]))
      ymax = int(float(cols[4]))

      obj = (label, xmin, ymin, xmax, ymax)
      if not filename in objects.keys():
        objects[filename] = [obj]
      else:
        objects[filename].append(obj)

filenames = list(objects.keys())
random.shuffle(filenames)

# Copy used images to output folder and write csv file
with open(OUT_CSV_PATH, 'w') as f:
  #f.write('file, xmin, ymin, xmax, ymax, label\n')
  for i in range(NUM_IMAGES):
      for o in objects[filenames[i]]:
        f.write(f'{filenames[i]}, {o[1]}, {o[2]}, {o[3]}, {o[4]}, {o[0]}\n')

      shutil.copy2(os.path.join(IMAGES_DIR, filenames[i]), OUT_IMAGES_DIR)

Archives: ['/content/drive/MyDrive/data/victims/archives/victims3.zip', '/content/drive/MyDrive/data/victims/archives/Kopie von daten_bordeaux.zip', '/content/drive/MyDrive/data/victims/archives/Kopie von daten_bordeaux_2.zip', '/content/drive/MyDrive/data/victims/archives/victims_bordeaux_vor_ort.zip', '/content/drive/MyDrive/data/victims/archives/victims_bordeaux_2.zip', '/content/drive/MyDrive/data/victims/archives/victims_bordeaux_007.zip']
CSV: ['/content/drive/MyDrive/data/victims/csv/victims2-export.csv', '/content/drive/MyDrive/data/victims/csv/Kopie von daten_bordeaux.csv', '/content/drive/MyDrive/data/victims/csv/Kopie von daten_bordeaux_2.csv', '/content/drive/MyDrive/data/victims/csv/victims_bordeuax_vor_ort.csv', '/content/drive/MyDrive/data/victims/csv/victims_bordeaux_007.csv', '/content/drive/MyDrive/data/victims/csv/spaß_mit_sven--export.csv', '/content/drive/MyDrive/data/victims/csv/no_victims.csv']


In [17]:
# Zip images
OUT_ARCHIVE = '/content/data.zip'

with zipfile.ZipFile(OUT_ARCHIVE, 'w') as archive:
  for filename in os.listdir(OUT_IMAGES_DIR):
    archive.write(os.path.join(OUT_IMAGES_DIR, filename), filename)

In [18]:
# Copy CSV and archive to drive
DRIVE_ARCHIVE_PATH = '/content/drive/MyDrive/data/new_victims/data.zip'
DRIVE_CSV_PATH = '/content/drive/MyDrive/data/new_victims/data.csv'

shutil.copy2(OUT_ARCHIVE, DRIVE_ARCHIVE_PATH)
shutil.copy2(OUT_CSV_PATH, DRIVE_CSV_PATH)

'/content/drive/MyDrive/data/new_victims/data.csv'