In [1]:
# default_exp raster_to_df

In [2]:
# hide
# no_test
! [ -e /content ] && pip install -Uqq geowrangler

In [3]:
# hide
# no_test
!mkdir -p ../data
![ -e /content ] && ln -s ../data .

In [4]:
# hide
# no_test
!mkdir -p ../data
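# TODO: download the sample raster files (expected under ../data/raster_to_df_sample/) from the repo; no download command is present in this cell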

In [5]:
# hide
# no_test
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [6]:
# hide
import warnings

from shapely.errors import ShapelyDeprecationWarning

# Keep the rendered notebook output clean: ignore UserWarnings issued from
# geopandas modules and Shapely deprecation warnings issued from pandas modules.
warnings.filterwarnings("ignore", category=UserWarning, module="geopandas")
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning, module="pandas")

# Raster to Dataframe

> Generate a dataframe where each row is an individual pixel and each column is a band of one of the input images, plus a `label` column read from the mask.

In [7]:
# export
import pandas as pd
import rasterio as rio
import rasterio.mask

In [9]:
# export
def read_bands(image_list: list, mask: str) -> pd.DataFrame:
    """
    Flatten the bands of every image into the columns of a single dataframe.

    Each row is one pixel (every band is flattened with ``ravel``). Each band
    becomes a column named ``B<band>_<idx>``, where ``<band>`` is the 1-based
    band number and ``<idx>`` is the 0-based position of the image in
    ``image_list`` (not the image file name). Band 1 of ``mask`` is flattened
    the same way and stored in the ``label`` column.

    Args:
        image_list: rasters to read, each given as anything ``rasterio.open``
            accepts (e.g. a file path).
        mask: raster whose first band provides the per-pixel label.

    Returns:
        A dataframe with one column per band per image, missing band values
        replaced by 0, followed by the ``label`` column.

    Raises:
        ValueError: if ``image_list`` is empty. pandas also raises
            ``ValueError`` when the number of mask pixels does not match the
            number of rows built from the images.
    """
    # Use context managers so each dataset is closed as soon as its pixels
    # have been copied into memory, instead of leaking open file handles.
    with rio.open(mask) as label_src:
        label = label_src.read(1).ravel()

    frames = []

    # Iterate over each image; ``idx`` becomes the column-name suffix
    for idx, image_file in enumerate(image_list):
        with rio.open(image_file) as raster:
            # rasterio band indexes are 1-based
            bands = {
                "B{}_{}".format(band, idx): raster.read(band).ravel()
                for band in range(1, raster.count + 1)
            }

        frames.append(pd.DataFrame(bands).fillna(0))

    if not frames:
        # Same exception type pandas would raise from ``concat``, but explicit
        raise ValueError("image_list must contain at least one image")

    data = pd.concat(frames, axis=1)
    data["label"] = label

    return data

## Test data

### Converting an image to dataframe with labels

In [10]:
# Sample inputs for `read_bands`: the list of rasters whose bands become columns,
# and the mask raster whose first band becomes the `label` column.
# Note the differing extensions (`.tif` for the image, `.tiff` for the mask).
tiff_files = ["../data/raster_to_df_sample/cabanglasan.tif"]
mask_file = "../data/raster_to_df_sample/cabanglasan_mask.tiff"

In [11]:
# Flatten every band of the sample raster into one column each and attach the mask as `label`
data = read_bands(tiff_files, mask_file)

  s = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)


In [12]:
# Display the result: one row per pixel, one column per band, plus `label`
data

Unnamed: 0,B1_0,B2_0,B3_0,B4_0,B5_0,B6_0,B7_0,B8_0,B9_0,B10_0,B11_0,B12_0,label
0,514530987,413800618,400693218,299766238,458038093,869741527,1069432766,1010383929,1207977984,161941927,855782146,430381479,0
1,514530987,411441286,397088683,304091680,458038093,869741527,1069432766,1027554623,1207977984,161941927,855782146,430381479,0
2,493034851,404559901,406329400,303632921,445848211,875967542,1064320880,994261827,1166755211,165349851,783429298,387389207,0
3,493034851,394729351,380114600,270995495,445848211,875967542,1064320880,1000749990,1166755211,165349851,783429298,387389207,0
4,493034851,401610736,390010687,276172918,453778188,894645587,1059143457,1039613431,1159218456,165349851,775237173,383981283,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
775824,0,0,0,0,0,0,0,0,0,0,0,0,0
775825,0,0,0,0,0,0,0,0,0,0,0,0,0
775826,0,0,0,0,0,0,0,0,0,0,0,0,0
775827,0,0,0,0,0,0,0,0,0,0,0,0,0
