In [None]:
# Mount Google Drive so that the notebook can read and write below /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
! pip uninstall opencv-python
! pip install opencv-python

Found existing installation: opencv-python 4.1.2.30
Uninstalling opencv-python-4.1.2.30:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/cv2/*
    /usr/local/lib/python3.7/dist-packages/opencv_python-4.1.2.30.dist-info/*
Proceed (y/n)? y
  Successfully uninstalled opencv-python-4.1.2.30
Collecting opencv-python
  Downloading opencv_python-4.5.5.64-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (60.5 MB)
[K     |████████████████████████████████| 60.5 MB 129 kB/s 
Installing collected packages: opencv-python
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
Successfully installed opencv-python-4.5.5.64


In [None]:
# Install the extra packages this notebook relies on: geopandas and fiftyone are
# imported by later cells; rtree is presumably needed by geopandas for spatial
# indexing -- TODO confirm.
!pip install geopandas
!pip install rtree
!pip install fiftyone

Collecting geopandas
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 15.2 MB/s 
[?25hCollecting pyproj>=2.2.0
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 48.1 MB/s 
Collecting fiona>=1.8
  Downloading Fiona-1.8.21-cp37-cp37m-manylinux2014_x86_64.whl (16.7 MB)
[K     |████████████████████████████████| 16.7 MB 77.4 MB/s 
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: munch, cligj, click-plugins, pyproj, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.8.21 geopandas-0.10.2 munch-2.5.0 pyproj-3.2.1
Collecting rtree
  Downloading Rtree-0.9.7-cp37-cp37m-manylinux2010_x86_64.whl (994 kB)
[K     |████

### Check datasets

In [None]:
import fiftyone as fo

# Root folder of the COCO-style validation set; the trailing slash is relied on below.
base = '/content/drive/MyDrive/PyPSA_Africa_images/datasets/duke_512_val/'

# The images are read from <base>data and the annotations from <base>labels.json.
coco_import_kwargs = dict(
    dataset_type=fo.types.COCODetectionDataset,
    data_path=f'{base}data',
    labels_path=f'{base}labels.json',
)
dataset = fo.Dataset.from_dir(**coco_import_kwargs)



In [None]:
# Launch the FiftyOne app for the dataset loaded in the previous cell
session = fo.launch_app(dataset)

In [None]:
import os

# Switch to the duke data folder so that the following cells can use paths relative to it.
os.chdir("/content/drive/My Drive/PyPSA_Africa_images/duke")

In [None]:
# Show the current working directory to confirm the chdir above took effect.
os.getcwd()

'/content/drive/My Drive/PyPSA_Africa_images/duke'

In [None]:
# List the contents of the current working directory.
!ls

arizona  clyde	   hartford  palmertson   sudan
brazil	 dunedin   kansas    rotorua	  tauranga
china	 gisborne  mexico    south_sudan  wilmington


### Create dataset

In [28]:
import os
import json
import pandas as pd
import geopandas as gpd
import numpy as np
from osgeo import gdal
from itertools import product
from PIL import Image
import fiftyone as fo
from shapely.geometry import Polygon



def extract_duke_dataset(dirs, 
                         prefixes, 
                         imgs_per_tower=2, 
                         width=512, 
                         height=512, 
                         base_path="", 
                         train_ratio=0.8,
                         out_train='train', 
                         out_val='val', 
                         ):
    """
    Extracts training images and bounding boxes from region-zips provided in
    'https://figshare.com/articles/dataset/Electric_Transmission_and_
    Distribution_Infrastructure_Imagery_Dataset/6931088'
    
    Iterates over a list of directories and creates examples if it encounters the
    following structure in these directories:
        [dir_name]/raw/[.tif, .csv, .geojson etc. files]
    The raw files are processed in order of their running number. The first files
    (up to a share of train_ratio) feed the training set, the remaining ones the
    validation set. Images are stored as
        [dir_name]/[out_train or out_val]/[prefix]_[random number].png
    and each set is exported in COCO detection format to
        [base_path]/[dir_name]/[out_train or out_val]

    For every tower a window of size width x height is drawn at a random position
    that fully contains the tower. Other towers that lie in the same window with
    more than half of their area are added as additional detections, clipped to
    the window. Bounding boxes are stored as [x, y, w, h] relative to the window
    size (x and w relative to width, y and h relative to height).

    The random positions and file names come from np.random, which is not seeded here.
    
    Data must be unzipped!
    Be sure that all directories in [dirs] are in os.getcwd()!
    
    ----------
    Arguments:
        dirs : (list of str)
            list with names of directories which satisfy the outlined structure
        prefixes : (list of str)
            list with respective prefixes of the resulting .png files
        imgs_per_tower : (int)
            number of examples created for every tower found in geojson files
        width : (int)
            width of resulting example-images
        height : (int)
            height of resulting example-images
        base_path : (str)
            path to directories from which all dirs are accessible
        train_ratio : (float)
            share of raw files whose examples become part of the training set
            (rest is val set)
        out_train : (str)
            name of the folder (inside every directory) receiving the training set
        out_val : (str)
            name of the folder (inside every directory) receiving the validation set
    ----------
    Returns:
        -
    """

    print('Starting dataset extraction')
    print('Note currently all towers are labelled as tower')

    for country, prefix in zip(dirs, prefixes):

        print("Extracting images from {}...".format(country))

        # all raw files are addressed relative to [country]/raw
        start_dir = os.getcwd()
        os.chdir(os.path.join(start_dir, country, "raw"))
        try:
            _extract_country(country, prefix, imgs_per_tower, width, height,
                             base_path, train_ratio, out_train, out_val)
        finally:
            # restore the working directory even after an error, otherwise the
            # relative paths of the next country (or of a re-run) would be off
            os.chdir(start_dir)


def _fresh_dataset(name):
    """
    Returns a new, empty, non-persistent fiftyone dataset called name.
    A dataset that already exists under this name is deleted first.
    """
    try:
        dataset = fo.Dataset(name=name)
    except Exception:
        # creation failed, the name is presumably taken -> replace the old dataset
        fo.load_dataset(name).delete()
        dataset = fo.Dataset(name=name)
    dataset.persistent = False
    return dataset


def _export_coco(dataset, export_dir, label_field):
    """
    Exports dataset in COCO detection format to export_dir.
    A ValueError raised by the export is reported but not propagated.
    """
    try:
        dataset.export(
                export_dir=export_dir,
                dataset_type=fo.types.COCODetectionDataset,
                label_field=label_field,
                )
    except ValueError:
        print('Could not export: ', export_dir)
        print('Continuing...')


def _to_pixels(geom):
    '''
    receives pixel coordinates as string and returns columns 
    upper left, lower right and geometry as Polygon (rectangular) 
    all coordinates are relative to the tif file the assets is in
    '''
    geom = geom.split(" ")
    geom.remove('[')
    geom.remove(']')

    # transform to Polygon with rectangular bbox
    geom = [entry for entry in geom if not '[' in entry and not ']' in entry]
    geom = [int(float(entry.replace(",", ""))) for entry in geom]
    x, y = geom[::2], geom[1::2] 
    geom = Polygon([[max(x), max(y)], [max(x), min(y)], [min(x), min(y)], [min(x), max(y)]])
    return np.array([min(x), min(y)]), np.array([max(x), max(y)]), geom


def _extract_country(country, prefix, imgs_per_tower, width, height,
                     base_path, train_ratio, out_train, out_val):
    """
    Creates and exports the train and val examples of a single directory.
    Expects the current working directory to be [country]/raw.
    """
    label_field = "ground_truth"

    # setup directories for resulting images (siblings of raw/)
    for out_dir in (out_train, out_val):
        os.makedirs(os.path.join("..", out_dir), exist_ok=True)

    # set up datasets for current country
    dataset_train = _fresh_dataset(country + '_' + out_train)
    dataset_val = _fresh_dataset(country + '_' + out_val)

    # Starting with adding examples to the training set
    curr_path = out_train
    curr_dataset = dataset_train
    switched_already = False

    # create list of relevant files
    csv_files = [fn for fn in os.listdir() if fn.endswith('.csv')]
    if not csv_files:
        print("No csv files found for {}. Continuing...".format(country))
        return

    # file names are built as [file_prefix][running number].[extension], where
    # file_prefix is everything up to (and including) the last underscore
    file_prefix = csv_files[0][:csv_files[0].rindex("_") + 1]
    num_files = len(csv_files)

    tif_files = [file_prefix + str(k + 1) + '.tif' for k in range(num_files)]
    geojson_files = [file_prefix + str(k + 1) + '.geojson' for k in range(num_files)]

    # x values are normalised by the image width, y values by the image height
    scale = np.array([width, height], dtype=float)

    # iterate over files
    for file_idx, (tif, geojson) in enumerate(zip(tif_files, geojson_files)):

        if (file_idx + 1) / num_files > train_ratio and not switched_already:
            print('Switching to mode val after {} of {} files due to train ratio'.format(
                  file_idx, num_files))

            print(base_path, country, curr_path)

            # Export training dataset
            _export_coco(curr_dataset,
                         os.path.join(base_path, country, curr_path),
                         label_field)

            curr_path = out_val
            curr_dataset = dataset_val
            switched_already = True

        print("Opening geojson file: ", geojson)
        # open files and get bands
        try:
            annots = gpd.read_file(geojson)
        except Exception:
            print("Unable to read annotation file {}".format(geojson))
            print("Continuing to the next file...")
            continue

        # make sure geojson contains information
        if len(annots.columns) == 1:
            print('Bad geojson detected! Continuing...')
            continue

        ds = gdal.Open(tif)
        if ds is None:
            # gdal.Open signals failure by returning None unless exceptions are enabled
            print("Unable to open image file {}".format(tif))
            print("Continuing to the next file...")
            continue
        bands = [ds.GetRasterBand(band) for band in range(1, 4)]
        info = gdal.Info(tif, format="json")

        pd.set_option('display.max_columns', None)

        # remove all assets except towers
        annots = annots[~annots["label"].isin(["DL", "TL", "OL", "SS"])]

        # make sure the dataframe contains only towers
        # (copy, because new columns are assigned to the filtered frame below)
        annots = annots[annots['geometry'].apply(lambda x: isinstance(x, Polygon))].copy()
        if annots.empty:
            continue

        annots["ul"], annots["lr"], annots['geometry'] = zip(*annots['pixel_coordinates'].map(_to_pixels))

        tif_width, tif_height = info['size'][0], info['size'][1]

        for (curr, tower), _ in product(annots.iterrows(), range(imgs_per_tower)):

            if not isinstance(tower.geometry, Polygon):
                continue

            # if tower['label'] == "DT" or tower['label'] == 'OT': label = 'distribution'
            # elif tower['label'] == 'TT': label = 'transmission'
            label = 'tower'

            example_name = prefix + '_' + str(np.random.randint(10**10, 10**11)) + '.png'

            # define the bounds of random offset
            bb_ul, bb_lr = tower['ul'], tower['lr']
            min_x, max_x = max(0, bb_lr[0] - width), min(bb_ul[0], tif_width - width)
            min_y, max_y = max(0, bb_lr[1] - height), min(bb_ul[1], tif_height - height)

            # randomly draw corner of image (this can fail if towers are close to the frame -> skip tower)
            try:
                img_ul_x = np.random.randint(min_x, max_x)
                img_ul_y = np.random.randint(min_y, max_y)
            except ValueError:
                continue

            img_corner = np.array([img_ul_x, img_ul_y])

            # determine bounding box relative to new image
            # (new arrays: the rows of annots are reused for every further image
            # of this tower and must keep their tif coordinates)
            rel_ul = bb_ul - img_corner
            rel_lr = bb_lr - img_corner

            # set up image
            new_img = np.zeros((height, width, 3), dtype=np.uint8)

            # transfer pixel data
            try:
                for channel in range(3):
                    new_img[:, :, channel] = bands[channel].ReadAsArray(img_ul_x, img_ul_y, width, height)
            except Exception:
                continue

            # transform array to image
            img = Image.fromarray(new_img, 'RGB')
            img.save(os.path.join('./../', curr_path, example_name), quality=100)

            # add to dataset
            sample = fo.Sample(filepath=os.path.join(
                               base_path, country, curr_path, example_name)
                               )

            # add main tower in image as [x, y, w, h] relative to the image size
            outer_bbox = np.concatenate([rel_ul / scale, (rel_lr - rel_ul) / scale]).tolist()
            detections = [fo.Detection(label=label, bounding_box=outer_bbox)]

            # create Polygon of created image
            img_polygon = Polygon([
                                img_corner,
                                img_corner + np.array([width, 0]),
                                img_corner + np.array([width, height]),
                                img_corner + np.array([0, height])
                                ])

            # add secondary towers that happen to be in the same image
            for j, other in annots.iterrows():

                # if other['label'] == "DT" or other['label'] == 'OT': other_label = 'distribution'
                # elif other['label'] == 'TT': other_label = 'transmission'
                other_label = 'tower'

                # the main tower is part of the detections already
                if j == curr:
                    continue

                if not img_polygon.intersects(other["geometry"]):
                    continue

                ul_pixels = np.min(other['geometry'].exterior.xy, axis=1)
                lr_pixels = np.max(other['geometry'].exterior.xy, axis=1)

                ul = (ul_pixels - img_corner) / scale
                lr = (lr_pixels - img_corner) / scale

                if img_polygon.contains(other['geometry']):
                    shared_fraction = 1
                else:
                    in_part = other['geometry'].intersection(img_polygon)
                    shared_fraction = in_part.area / other['geometry'].area

                    # clip both corners to the image, so that width and height
                    # shrink together with a corner that is moved to the border
                    ul = np.clip(ul, 0.0, 1.0)
                    lr = np.clip(lr, 0.0, 1.0)

                bbox = np.concatenate([ul, lr - ul]).tolist()

                # a box identical to the main tower is a duplicate annotation
                if not bbox == outer_bbox and shared_fraction > 0.5:
                    detections.append(fo.Detection(label=other_label, bounding_box=bbox))

            sample[label_field] = fo.Detections(detections=detections)

            curr_dataset.add_sample(sample)

    # Export the dataset that was filled last (val after a switch, train otherwise)
    _export_coco(curr_dataset,
                 os.path.join(base_path, country, curr_path),
                 label_field)

    # the val labels only exist if the val set was exported successfully
    val_labels = os.path.join(base_path, country, out_val, 'labels.json')
    if os.path.isfile(val_labels):
        fix_filenames(val_labels)
    else:
        print('No validation labels to fix at: ', val_labels)



def fix_annots(file):
    '''
    adds information on the 'iscrowd' property 
    for training with detectron2 to an annotation json file
    made by fiftyone

    Every entry of 'annotations' gets 'iscrowd' set to 0 and the file is
    rewritten in place.

    Arguments:
        file : (str)
            path to the annotation json file
    '''
    # read with a context manager, so that the handle is closed before
    # the same file is opened again for writing
    with open(file) as f:
        dictionary = json.load(f)

    for annot in dictionary['annotations']:
        annot['iscrowd'] = 0
    
    with open(file, "w") as f:
        json.dump(dictionary, f)


def fix_filenames(file):
    '''
    removes buggy -2 attached by fiftyone to filenames

    Only a '-2' directly in front of the file extension is removed; a '-2'
    anywhere else is part of the real name and stays untouched.
    The file is rewritten in place.

    Arguments:
        file : (str)
            path to the annotation json file
    '''
    with open(file) as f:
        dictionary = json.load(f)
    
    for img in dictionary['images']:
        stem, ext = os.path.splitext(img['file_name'])
        if stem.endswith('-2'):
            img['file_name'] = stem[:-2] + ext
    
    with open(file, "w") as f:
        json.dump(dictionary, f)


if __name__ == "__main__":
    # All region folders are addressed relative to the duke root, so work from there.
    base_path = "/content/drive/MyDrive/PyPSA_Africa_images/duke"
    os.chdir(base_path)

    # Regions processed in this run. Currently not included:
    #   hartford (APPEARS TO HAVE CORRUPTED GEOJSON FILES), china, dunedin, gisborne,
    #   palmertson, rotorua, tauranga, wilmington, arizona, clyde, sudan, mexico, brazil
    dirs = ['kansas']

    # File name prefix of a region: the first two letters of its name in upper case.
    prefixes = list(map(lambda region: region[:2].upper(), dirs))

    extract_duke_dataset(
        dirs=dirs,
        prefixes=prefixes,
        imgs_per_tower=1,
        width=512,
        height=512,
        base_path=base_path,
        out_train='train_512_test',
        out_val='val_512_test',
    )


Starting dataset extraction
Note currently all towers are labelled as tower
Extracting images from kansas...
Opening geojson file:  USA_KS_Colwich:Maize_1.geojson
Skipping field image_geocoordinates_upper_left: invalid type 3
Skipping field image_geocoordinates_lower_left: invalid type 3
Skipping field image_geocoordinates_upper_right: invalid type 3
Skipping field image_geocoordinates_lower_right: invalid type 3
(outer):  [0.123046875, 0.482421875, 0.03125, 0.044921875]
(outer):  [0.9609375, 0.478515625, 0.03125, 0.05078125]
(outer):  [0.16796875, 0.63671875, 0.068359375, 0.044921875]
(inner):  [0.544921875, 0.611328125, 0.109375, 0.056640625] . shared fraction:  1
(outer):  [0.3125, 0.283203125, 0.109375, 0.056640625]
(outer):  [0.759765625, 0.185546875, 0.060546875, 0.05078125]
(inner):  [0.966796875, 0.181640625, 0.033203125, 0.046875] . shared fraction:  0.5151515151515151
(outer):  [0.193359375, 0.875, 0.064453125, 0.046875]
(inner):  [0, 0.87890625, 0.060546875, 0.05078125] . sh

In [None]:
from shapely.geometry import Polygon, Point
import numpy as np

# Toy quadrilateral to check how the per-axis maxima can be read off a polygon exterior.
a = Polygon([[0, 0], [0, 1], [1.5, 1.3], [1.2, 0]])

# exterior.xy holds the x values and the y values; stack them and reduce each row
exterior_coords = np.asarray(a.exterior.xy)
max_x, max_y = exterior_coords.max(axis=1)

print(max_x, max_y)
print(a)

1.5 1.3
POLYGON ((0 0, 0 1, 1.5 1.3, 1.2 0, 0 0))


In [None]:
def fix_filenames(file):
    '''
    removes buggy -2 attached by fiftyone to filenames

    Only a '-2' directly in front of the file extension is removed; a '-2'
    anywhere else is part of the real name and stays untouched.
    The file is rewritten in place.

    Arguments:
        file : (str)
            path to the annotation json file
    '''
    with open(file) as f:
        dictionary = json.load(f)
    
    for img in dictionary['images']:
        stem, ext = os.path.splitext(img['file_name'])
        if stem.endswith('-2'):
            img['file_name'] = stem[:-2] + ext
    
    with open(file, "w") as f:
        json.dump(dictionary, f)


In [None]:
import os
import shutil

def delete_all_examples(base_path, countries, subdirs=('train_512', 'val_512')):
    """
    Deletes the generated example folders of the given countries.

    For every country each folder [base_path]/[country]/[subdir] is removed
    together with its content. A message is printed for every folder that was
    actually deleted; folders that do not exist are skipped silently.

    Arguments:
        base_path : (str)
            path to the directory holding the country directories
        countries : (list of str)
            names of the country directories to clean up
        subdirs : (iterable of str)
            names of the folders to delete inside every country directory
    """
    for country in countries:
        for subdir in subdirs:

            to_delete = os.path.join(base_path, country, subdir)
            if os.path.isdir(to_delete):
                shutil.rmtree(to_delete)
                print('Deleted {}'.format(to_delete))

if __name__ == '__main__':
    base_path = "/content/drive/MyDrive/PyPSA_Africa_images/duke/"

    # Regions whose generated examples are removed. Currently not included:
    #   hartford (APPEARS TO HAVE CORRUPTED GEOJSON FILES), china, dunedin,
    #   gisborne, palmertson, rotorua
    countries = "kansas tauranga wilmington arizona brazil clyde sudan mexico".split()

    delete_all_examples(base_path=base_path, countries=countries)


Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/china/val_512
Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/kansas/val_512
Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/dunedin/val_512
Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/gisborne/val_512
Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/palmertson/val_512
Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/rotorua/val_512
Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/tauranga/val_512
Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/wilmington/val_512
Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/arizona/val_512
Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/brazil/val_512
Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/clyde/val_512
Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/sudan/val_512
Deleted /content/drive/MyDrive/PyPSA_Africa_images/duke/mexico/val_512


# New section

In [None]:
import glob
import os

# Delete every regular file directly inside a folder.
# '/YOUR/PATH/' is a placeholder and has to be replaced before this cell does anything.
files = glob.glob('/YOUR/PATH/*')
for f in files:
    # the pattern matches sub-directories as well, which os.remove cannot delete
    if os.path.isfile(f):
        os.remove(f)

def delete_all_examples(to_delete):
    """
    Dry run: prints the content of the 'examples' folder of every given
    directory as removal candidates. Nothing is deleted.

    NOTE: an earlier cell of this notebook defines a function of the same name
    with the signature (base_path, countries); running this cell replaces it.

    Arguments:
        to_delete : (list of str)
            directories that may contain an 'examples' folder
    """
    for directory in to_delete:

        examples_dir = os.path.join(directory, 'examples')
        if not os.path.isdir(examples_dir):
            continue

        # '*' is needed to list the entries of the folder instead of the folder itself
        files = glob.glob(os.path.join(examples_dir, '*'))
        print(files)

        for f in files:
            print('Remove this: {}?'.format(f))


### To inspect the images of another region, repeat the next cell with name = "china", name = "mexico" or name = "palmertson" (in accordance with the cell above)

In [29]:
import fiftyone as fo
import os

# Regions for which examples can be inspected ('hartford' is left out).
dirs = ["mexico", 'brazil', 'arizona', 'kansas', 'dunedin', 'gisborne',
        'palmertson', 'rotorua', 'tauranga', 'wilmington']

# Pick one region by hand (alternatively: loop `for name in dirs:`).
name = 'kansas'

# Folder of the region's training examples and the COCO labels file stored in it.
data_path = f'/content/drive/My Drive/PyPSA_Africa_images/duke/{name}/train_512_test/'
labels_path = f'{data_path}labels.json'

# Import the examples as a FiftyOne dataset.
coco_source = dict(
    dataset_type=fo.types.COCODetectionDataset,
    data_path=data_path,
    labels_path=labels_path,
)
dataset = fo.Dataset.from_dir(**coco_source)

num_examples = len(dataset)
print('location {} with num examples {}.'.format(name, num_examples))

# Launch the FiftyOne app for the imported dataset.
session = fo.launch_app(dataset)

# session 

 100% |███████████████| 1002/1002 [2.2s elapsed, 0s remaining, 483.4 samples/s]      
location kansas with num examples 1002.


In [None]:

# Regions whose validation sets are merged. Not included: 'brazil' (this is valid
# for training data tho!) and 'hartford'.
dirs = ["mexico", 'arizona', 'kansas', 'dunedin', 'gisborne',
        'palmertson', 'rotorua', 'tauranga', 'wilmington']

duke_root = '/content/drive/MyDrive/PyPSA_Africa_images/duke'

# Load the validation set of every region as a FiftyOne dataset of its own.
datasets = []
for country in dirs:
    labels_path = f'{duke_root}/{country}/val_512/labels.json'
    imgs_path = f'{duke_root}/{country}/val_512/data'

    region_dataset = fo.Dataset.from_dir(
        dataset_type=fo.types.COCODetectionDataset,
        data_path=imgs_path,
        labels_path=labels_path,
    )
    datasets.append(region_dataset)

# The first dataset takes up the samples of all others and becomes the merged set.
dataset = datasets[0]
for data in datasets[1:]:
    dataset.merge_samples(data)

 100% |███████████████| 1108/1108 [4.1s elapsed, 0s remaining, 280.3 samples/s]      
 100% |█████████████████| 230/230 [547.5ms elapsed, 0s remaining, 420.1 samples/s]      
 100% |█████████████████| 377/377 [1.1s elapsed, 0s remaining, 340.1 samples/s]         
 100% |███████████████████| 25/25 [67.1ms elapsed, 0s remaining, 372.6 samples/s]     
 100% |█████████████████| 190/190 [470.8ms elapsed, 0s remaining, 408.2 samples/s]      
 100% |█████████████████| 197/197 [674.7ms elapsed, 0s remaining, 293.8 samples/s]      
 100% |███████████████████| 61/61 [167.2ms elapsed, 0s remaining, 378.0 samples/s]    
 100% |█████████████████| 135/135 [424.4ms elapsed, 0s remaining, 318.1 samples/s]      
 100% |███████████████████| 91/91 [233.4ms elapsed, 0s remaining, 389.8 samples/s]     


In [None]:
def fix_annots(file):
    '''
    adds information on the 'iscrowd' property 
    for training with detectron2 to an annotation json file
    made by fiftyone

    Every entry of 'annotations' gets 'iscrowd' set to 0 and the file is
    rewritten in place.

    Arguments:
        file : (str)
            path to the annotation json file
    '''
    # read with a context manager, so that the handle is closed before
    # the same file is opened again for writing
    with open(file) as f:
        dictionary = json.load(f)

    for annot in dictionary['annotations']:
        annot['iscrowd'] = 0
    
    with open(file, "w") as f:
        json.dump(dictionary, f)

name = 'duke_512_val'

dataset_path = '/content/drive/My Drive/PyPSA_Africa_images/datasets/'+name+'/'

# Write the merged dataset to Drive in COCO detection format.
dataset.export(
            export_dir=dataset_path,
            dataset_type=fo.types.COCODetectionDataset,
            label_field='ground_truth',
            )

# Add the 'iscrowd' property to the labels that were just exported. The path is
# derived from dataset_path, so it follows along whenever `name` is changed.
labels_val = dataset_path + 'labels.json'
fix_annots(labels_val)

 100% |███████████████| 2414/2414 [43.2s elapsed, 0s remaining, 68.2 samples/s]      


In [None]:
# Launch the FiftyOne app for the current `dataset`.
session = fo.launch_app(dataset)

In [None]:
import fiftyone as fo
import os
from shutil import copy


def split_set(origin_path, 
              name_1, name_2, 
              destpath_1, destpath_2,
              len_1, len_2):
    """
    Splits a COCO dataset into two consecutive, non-overlapping parts.

    The dataset is loaded from origin_path (images and 'labels.json' in the
    same folder). Its first len_1 samples form the first part, the following
    len_2 samples the second part. For both parts the images are copied to the
    respective destination folder and the part is exported there in COCO
    detection format. Nothing is written if the origin has fewer than
    len_1 + len_2 samples.

    ----------
    Arguments:
        origin_path : (str)
            folder of the origin dataset, must end with a path separator
        name_1, name_2 : (str)
            fiftyone names of the two resulting datasets; existing datasets
            with these names are replaced
        destpath_1, destpath_2 : (str)
            destination folders of the two parts
        len_1, len_2 : (int)
            number of samples in the first and second part
    ----------
    Returns:
        -
    """

    origin = fo.Dataset.from_dir(
        dataset_type=fo.types.COCODetectionDataset,
        data_path=origin_path,
        labels_path=origin_path + 'labels.json',
        )

    print("Loaded dataset: \n")
    print(origin)

    if len(origin) < len_1 + len_2:
        print('Origin set of size {} is too small for this split'.format(len(origin)))
        return

    # the sizes come from the arguments, not from variables of the notebook
    _export_split_part(origin[:len_1], name_1, destpath_1, 1)
    _export_split_part(origin[len_1:len_1 + len_2], name_2, destpath_2, 2)

    return 


def _export_split_part(samples, name, destpath, number):
    """
    Copies the images of samples to destpath and exports the samples there as
    COCO dataset called name; number only labels the progress messages.
    """
    try: 
        part = fo.Dataset(name=name)
    except Exception:
        # creation failed, the name is presumably taken -> replace the old dataset
        fo.load_dataset(name).delete()
        part = fo.Dataset(name=name)
    print("Created dataset {}!".format(number))

    os.makedirs(destpath, exist_ok=True)

    for sample in samples:
        source = sample["filepath"]
        copy(source, os.path.join(destpath, os.path.basename(source)))

        part.add_sample(sample)

    part.export(
              export_dir=destpath,
              dataset_type=fo.types.COCODetectionDataset,
              label_field='ground_truth',
              )
    print("Exported dataset {} to {}!".format(number, destpath))
    print("It has length {}!".format(len(part)))


# Sizes of the two parts of the split.
num_train = 200
num_test = 54

# Where the origin examples are read from and below which folder the parts are written.
base_path = '/content/drive/My Drive/PyPSA_Africa_images/datasets/'
origin_path = '/content/drive/My Drive/PyPSA_Africa_images/rotorua/examples/'

# Names of the two parts; they double as names of their destination folders.
name_train, name_val = 'duke_train_512', 'duke_val_512'

split_set(
    origin_path=origin_path,
    name_1=name_train,
    name_2=name_val,
    destpath_1=f'{base_path}{name_train}',
    destpath_2=f'{base_path}{name_val}',
    len_1=num_train,
    len_2=num_test,
)




In [None]:
from shutil import copy
import os
import fiftyone as fo


def merge_datasets(base_path, origins, name, val_percentage, seed=1):
    """
    Merges all datasets from origins to two datasets with train and validation

    Every origin is expected to hold a COCO dataset in
        [base_path]/[origin]/examples/   (images and 'labels.json')
    All samples are collected, shuffled with the given seed and split: the
    first share of (1 - val_percentage) forms the training set, the rest the
    validation set. Images are copied to and the sets are exported to
        [base_path]/datasets/[name]_train/ and [base_path]/datasets/[name]_val/

    Changes the current working directory to base_path.

    ----------
    Arguments:
        base_path : (str)
            path to the directory holding the origin directories
        origins : (list of str)
            names of the directories to merge
        name : (str)
            base name of the resulting datasets; existing fiftyone datasets
            called [name], [name]_train or [name]_val are replaced
        val_percentage : (float)
            share of the samples that goes to the validation set
        seed : (int)
            seed of the shuffling
    ----------
    Returns:
        -
    """
    os.chdir(base_path)

    large_ds = _recreate_dataset(name)
    print("Created base dataset!")

    for origin in origins:

        origin_path = origin + '/examples/'
        curr_ds = fo.Dataset.from_dir(
            dataset_type=fo.types.COCODetectionDataset,
            data_path=origin_path,
            labels_path=origin_path + 'labels.json',
            ) 

        print("Copying examples from {}!".format(origin))
        for sample in curr_ds:
            large_ds.add_sample(sample)

    print("Copied all samples to the large dataset!")

    shuffled = large_ds.shuffle(seed=seed)

    print("Shuffled Dataset!")

    num_train = int(len(shuffled) * (1. - val_percentage))

    print("Instantiating training and dataset!") 

    _export_merged_part(shuffled[:num_train], name + '_train', base_path, 'training')
    _export_merged_part(shuffled[num_train:], name + '_val', base_path, 'validations')


def _recreate_dataset(name):
    """Returns a new fiftyone dataset called name, replacing an existing one of that name."""
    try: 
        dataset = fo.Dataset(name=name)
    except Exception:
        # creation failed, the name is presumably taken -> replace the old dataset
        fo.load_dataset(name).delete()
        dataset = fo.Dataset(name=name)
    return dataset


def _export_merged_part(samples, part_name, base_path, kind):
    """
    Copies the images of samples to [base_path]/datasets/[part_name]/ and exports
    the samples there as COCO dataset; kind only labels the final message.
    """
    part_ds = _recreate_dataset(part_name)

    # the trailing '' keeps a path separator at the end of the printed folder
    destpath = os.path.join(base_path, 'datasets', part_name, '')
    os.makedirs(destpath, exist_ok=True)

    for sample in samples:
        part_ds.add_sample(sample)

        source = sample["filepath"]
        target = os.path.join(destpath, os.path.basename(source))

        print("Copying from {} to {}".format(source, target))
        copy(source, target)

    part_ds.export(
              export_dir=destpath,
              dataset_type=fo.types.COCODetectionDataset,
              label_field='ground_truth',
              )
    print("Exported {} examples to {} set to dir: \n {}".format(
            len(part_ds), kind, destpath))


if __name__ == '__main__':

    base_path = '/content/drive/My Drive/PyPSA_Africa_images/'

    # Only brazil is merged in this run. Currently not included:
    #   mexico, arizona, hartford, kansas, dunedin, gisborne,
    #   palmertson, rotorua, tauranga, wilmington
    dirs = ['brazil']

    merge_datasets(base_path=base_path, origins=dirs, name='duke', val_percentage=0.2)






In [None]:
import json

origin = '/content/drive/My Drive/PyPSA_Africa_images/datasets'
origin_path = origin + '/duke_train/'

# Read the raw COCO labels; the context manager closes the file handle again.
with open(origin_path + 'labels.json') as f:
    labels = json.load(f)
print(labels)


curr_ds = fo.Dataset.from_dir(
        dataset_type=fo.types.COCODetectionDataset,
        data_path=origin_path,
        labels_path=origin_path + 'labels.json',
        ) 

# Show file path and id of the first sample.
for sample in curr_ds[:1]:
    print(sample["filepath"])
    # not stored as `id`: that name would hide the builtin id() in all later cells
    sample_id = sample.id
    print(sample_id)
