In [None]:
!ls

In [None]:
!rm -rf chip_dataset

In [None]:
!ls

# Creating Different Sized Chips for our Satellite Images
This notebook helps you create chips from your satellite images. Chips are smaller crops of an image that can be stitched back together to recreate the larger image. To get a better idea, take a look at the image below.





In [None]:
# Display the illustration of how a large image is divided into chips.
from PIL import Image
Image.open('../input/mark-down-images/chips.png')

The helper classes below read the CSV files created in the earlier notebooks and use them together with the SpaceNet 7 dataset to create image chips of the dimensions you choose.

The output directory is structured as follows:
 chip_dataset<br>
&nbsp;&nbsp;&nbsp;&nbsp;└── change_detection<br>
    &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└── fname<br>
        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├── chips<br>
        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│   └── year1_month1_year2_month2<br>
        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;       └── global_monthly_year1_month1_year2_month2_chip_x###_y###_fname.tif<br>
        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└── masks<br>
            &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└── year1_month1_year2_month2<br>
                &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└── global_monthly_year1_month1_year2_month2_chip_x###_y###_fname_blank.tif<br>


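The `_blank` suffix in a file name marks a chip whose pixels are all zero. It is mainly useful for the masks, where it flags chips that contain no change, but an all-zero image chip receives the same suffix.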

In [None]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import rasterio as rio
from rasterio import features
from pathlib import Path
import pathlib
import geopandas as gpd
from descartes import PolygonPatch
from PIL import Image
import itertools
import re
from tqdm.notebook import tqdm

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils


# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

%matplotlib inline

In [None]:
# Input locations: the SpaceNet 7 sample imagery and the metadata CSV produced by an earlier notebook.
root_dir = Path('../input/spacenet-7-multitemporal-urban-development/SN7_buildings_train_sample/sample')
csv_file = Path('../input/spacenet-7-directory-metadata-extraction/output_csvs/df_sample_untidy.csv')

In [None]:
# Preview the metadata table. MultiTemporalSatelliteDataset reads the CSV itself,
# so `df` is only used here for inspection.
df = pd.read_csv(csv_file)
df.head()

In [None]:
class MultiTemporalSatelliteDataset():
    """SpaceNet 7 Multi-Temporal Satellite Imagery Dataset Creator.

    Every item is a pair of images of the same location (same
    `image_dir_name`) taken at two different dates, together with a mask of
    the building footprints that are present in only one of the two dates.
    """

    def __init__(self,csv_file, root_dir, no_udm=True, transform=None):
        """
        Args:
            csv_file (Path): Path to the csv file with annotations
            root_dir (Path): Parent directory containing all other directories.
            no_udm (bool): If True (default), rows whose `has_udm` column is True
                are left out of the image pairs. If False, every row is used.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.annotations = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.no_udm = no_udm
        self.transform = transform

        # list of (idx1, idx2) row-index pairs, one per dataset item
        self.idx_combinations = self.__get_all_idx_combinations()
        self.max = self.__len__()
        # for debugging purposes
        self.empty = []

    def __len__(self):
        """Number of image pairs in the dataset."""
        return len(self.idx_combinations)

    def __getitem__(self,raster_idx):
        """Load the image pair at position `raster_idx`.

        Returns:
            dict with keys
                'raster1', 'raster2': first three bands of each image,
                'raster_diff': both rasters concatenated along the band axis,
                'raster_bounds', 'rio_transform': georeferencing of the first image,
                'date1', 'date2': (month, year) tuples,
                'mask_diff': (1, H, W) mask with 255 on the differing footprints,
                'fname': stem of the grandparent directory of the first image.
            If a `transform` was given, its return value is returned instead.

        Raises:
            IndexError: if `raster_idx` is out of range (this is also what ends
                a plain `for sample in dataset` loop).
        """
        if torch.is_tensor(raster_idx):
            raster_idx = raster_idx.tolist()

        # get the indices of the 2 images
        idx1,idx2 = self.idx_combinations[raster_idx]
        # paths where the images are stored
        img1_path = self.root_dir/self.annotations.loc[idx1,'images_masked']
        img2_path = self.root_dir/self.annotations.loc[idx2,'images_masked']
        # paths where the corresponding true building footprints are stored
        labels1_path = self.root_dir/self.annotations.loc[idx1,'labels_match_pix']
        labels2_path = self.root_dir/self.annotations.loc[idx2,'labels_match_pix']
        # read rasters using imported rasterio library
        with rio.open(img1_path) as r1, rio.open(img2_path) as r2:
            raster1 = r1.read()[0:3]
            raster2 = r2.read()[0:3]
            raster_bounds = r1.bounds
            rio_transform = r1.transform
        # get the concatenated array of the 2 images that will be fed into the neural_net
        raster_diff = np.concatenate((raster1,raster2),axis=0)
        # get the dates for the images
        date1 = tuple(self.annotations.loc[idx1,['month','year']])
        date2 = tuple(self.annotations.loc[idx2,['month','year']])
        # get the change between the 2 satellite images by comparing their polygons
        # (the geojson files are read inside __geo_difference, so they are not
        # loaded a second time here)
        gdf_diff = self.__geo_difference(labels1_path,labels2_path)
        # get the corresponding rasterized image of the geodataframe
        mask_diff = self.__rasterize_gdf(gdf_diff,out_shape=raster1.shape[1:3])

        sample = {'raster1':raster1,'raster2':raster2,'raster_diff':raster_diff,'raster_bounds':raster_bounds,'rio_transform':rio_transform,
                  'date1':date1,'date2':date2,'mask_diff':mask_diff,'fname':img1_path.parent.parent.stem}

        if self.transform:
            sample = self.transform(sample)

        return sample

    def __get_all_idx_combinations(self):
        """Return every pair of annotation row indices that share a location."""
        all_combinations = []
        # group by satellite image location
        for _, loc_frame in self.annotations.groupby('image_dir_name'):
            if self.no_udm:
                # leave out the images that come with an unusable-data mask
                loc_frame = loc_frame[loc_frame['has_udm'] == False]
            # all the different combinations of 2 row indices within the location
            row_indices = list(loc_frame.index)
            all_combinations.extend(itertools.combinations(row_indices,2))
        return all_combinations

    def __geo_difference(self,geojson1,geojson2):
        """Return the footprints that only the longer of the two label files has.

        Both files are indexed and sorted by 'Id'; the rows of the longer file
        that lie beyond the length of the shorter one are returned. Two files of
        equal length give an empty frame.
        NOTE(review): this assumes the longer file holds every footprint of the
        shorter one plus the additional ones at higher Ids - confirm.
        """
        # read geojson into geodataframes
        gdf1 = gpd.read_file(geojson1).set_index('Id').sort_index()
        gdf2 = gpd.read_file(geojson2).set_index('Id').sort_index()

        # pick the longer frame and skip as many rows as the shorter one has
        if len(gdf2) > len(gdf1):
            longer, n_common = gdf2, len(gdf1)
        else:
            longer, n_common = gdf1, len(gdf2)

        return longer.iloc[n_common:].copy().reset_index(drop=True)

    def __rasterize_gdf(self,gdf,out_shape):
        """Burn the geometries of `gdf` into a (1, H, W) mask with value 255."""
        # if geodataframe is empty return empty mask
        if len(gdf)==0:
            return np.zeros((1,*out_shape))

        mask = features.rasterize(((polygon, 255) for polygon in gdf['geometry']),out_shape=out_shape)

        return np.expand_dims(mask,axis=0)

In [None]:
class ChipGenerator():
    """Cut a channel-first image into chips of `chip_dimension` x `chip_dimension` pixels.

    Calling an instance with an image returns a dict of four parallel lists:
        'chip':  the array slices, each shaped (bands, rows, cols),
        'x':     row offset of each chip inside the image,
        'y':     column offset of each chip inside the image,
        'blank': '_blank' for chips that contain only zeros, '' otherwise.
    Chips on the bottom/right border are smaller than `chip_dimension` when
    the image size is not an exact multiple of it.
    """
    def __init__(self, chip_dimension=256):
        self.chip_dimension = chip_dimension
        # Kept for backward compatibility: __call__ builds and returns a fresh
        # dict on every call and never fills this one.
        self.chip_dict = {'chip':[],'x':[],'y':[], 'blank':[]}

    def __call__(self,image):
        """Segment `image` (path or numpy array) and return the chip dict."""
        np_array = self.__read_image(image)
        size = self.chip_dimension
        n_pixel_rows = np_array.shape[1]
        n_pixel_cols = np_array.shape[2]

        # segment image into chips and return dict of chips and metadata
        chip_dict = {'chip':[],'x':[],'y':[], 'blank':[]}
        # walk over the top-left corner of every chip, row by row
        for start_r_idx in range(0, n_pixel_rows, size):
            for start_c_idx in range(0, n_pixel_cols, size):
                chip = np_array[:,start_r_idx:start_r_idx+size,start_c_idx:start_c_idx+size]

                chip_dict['chip'].append(chip)
                chip_dict['x'].append(start_r_idx)
                chip_dict['y'].append(start_c_idx)
                # A chip is blank only if every value is zero. Testing the sum
                # would also flag signed/float chips whose values cancel out.
                chip_dict['blank'].append('' if chip.any() else '_blank')

        return chip_dict

    def __read_image(self,image):
        """Return `image` as an array shaped (bands, rows, cols).

        Args:
            image: path (str or pathlib path) of an image file, or a numpy array.
                A 3-D array is taken to be channel-first already; a 2-D array
                gets a leading band axis.

        Raises:
            ValueError: if `image` is neither a path nor a numpy array.
        """
        # check whether image is a path or array
        if isinstance(image,(pathlib.PurePath,str)):
            with Image.open(image) as img:
                # convert image into np array
                np_array = np.array(img)
            # PIL gives (rows, cols, bands); move the bands to the front so the
            # file is chipped along its spatial axes
            if np_array.ndim == 3:
                np_array = np_array.transpose(2,0,1)
        elif isinstance(image,np.ndarray):
            np_array = image
        else:
            raise ValueError(f"Expected Path or Numpy array received: {type(image)}")

        # single-band image without a band axis
        if np_array.ndim == 2:
            np_array = np_array[np.newaxis]
        return np_array

In [None]:
class DatasetCreator():
    """Cut every sample of a dataset into chips and save them as GeoTIFF files.

    For each sample the 'raster_diff' image and the 'mask_diff' mask are
    chipped with ChipGenerator and written to
    chip_dataset/change_detection/<fname>/<chips|masks>/<year1_month1_year2_month2>/.
    Every chip gets its own affine transform so that it keeps its position
    on the ground.
    """
    def __init__(self,chip_dimension=256):
        self.chip_dimension = chip_dimension

    def __call__(self,dataset):
        """Chip and save every sample of `dataset`.

        Each sample must be a dict with the keys 'raster_diff', 'mask_diff',
        'fname', 'date1', 'date2' and 'rio_transform'.
        """
        for d in tqdm(dataset):
            # per-sample state shared by the two __save_chips calls below
            self.__fname = d['fname']
            self.__date1 = d['date1']
            self.__date2 = d['date2']
            self.__transform = d['rio_transform']

            self.__save_chips(image=d['raster_diff'],subdir_name='chips')
            self.__save_chips(image=d['mask_diff'],subdir_name='masks')

    def __save_chips(self,image,subdir_name='chips'):
        """Chip `image` and write the chips below the `subdir_name` directory."""
        month1,year1 = self.__date1
        month2,year2 = self.__date2

        chip_generator = ChipGenerator(chip_dimension=self.chip_dimension)
        chip_dict = chip_generator(image)

        im_dir = Path('chip_dataset/change_detection')/Path(self.__fname)/Path(subdir_name)/Path(f'{year1}_{month1}_{year2}_{month2}')
        im_dir.mkdir(parents=True, exist_ok=True)

        for chip,x,y,blank in zip(chip_dict['chip'],chip_dict['x'],chip_dict['y'],chip_dict['blank']):
            im_name = f'global_monthly_{year1}_{month1}_{year2}_{month2}_chip_x{x}_y{y}_{self.__fname}{blank}.tif'

            # Take the band count and size from the chip itself: chips on the
            # bottom/right border are smaller than chip_dimension when the
            # raster size is not a multiple of it.
            count,height,width = chip.shape

            profile = {'driver':'GTiff', 'width':width,'height':height,'crs':'EPSG:3857','count':count,'dtype':rio.uint8, 'compress':'zip','transform': self.__get_geo_transform(x,y)}

            with rio.open(im_dir/im_name, 'w',**profile) as dst:
                dst.write(chip.astype(rio.uint8))

    def __get_geo_transform(self,x,y):
        """Return the affine transform of the chip whose top-left pixel is (x, y).

        Args:
            x: row offset of the chip inside the full raster (ChipGenerator
                stores the row offset under 'x').
            y: column offset of the chip inside the full raster.

        The chip keeps the pixel size and rotation of the full raster; only
        the origin moves to the world coordinates of the chip's top-left pixel.
        """
        # affine coefficients of the full raster:
        #   world_x = a*col + b*row + c
        #   world_y = d*col + e*row + f
        a,b,c,d,e,f = (self.__transform[i] for i in range(6))
        row_off,col_off = x,y

        chip_left = c + a*col_off + b*row_off
        chip_top = f + d*col_off + e*row_off

        return rio.Affine(a, b, chip_left, d, e, chip_top)

To generate your chips, replace the chip dimension with the dimension of your choice and run the notebook. Make sure that the output does not exceed Kaggle's storage limit of 20 GB. If you need more than that, download the notebook along with the dataset and run it locally instead.

In [None]:
# Build the dataset of image pairs from the metadata CSV and the imagery directory.
dataset = MultiTemporalSatelliteDataset(root_dir=root_dir,csv_file=csv_file)

In [None]:
# chip_dimension is the side length of a chip in pixels; change 64 to the size you need.
dataset_creator = DatasetCreator(chip_dimension=64)

In [None]:
# Iterate over every image pair and write its chips and masks under chip_dataset/change_detection/.
dataset_creator(dataset=dataset)

In the above example we read our data from the sample satellite imagery directory and then obtained, for each location, all pairwise combinations of its images from the different timestamps. We then looped through these pairs, segmented each one, and saved the corresponding chips and masks in the specified directories.

This code can be used to generate satellite imagery chips of any kind, and the best part is that the geolocation for each chip is conserved. 