In [3]:
import numpy as np
import pandas as pd
import rasterio
import rasterstats
import pathlib

from dl_toolbox.datasources import *
# Per-band accumulators for the dataset-wide running mean (4 bands):
# `mean` holds the current mean, `count` the number of pixels seen so far.
mean = np.zeros(4, dtype=np.float64)
count = np.zeros(4, dtype=np.float64)

# Rasters are read at one tenth of their native height and width.
scale_factor = 1/10

# (city name, footprint polygon) pairs processed by the cells below.
# The polygon is looked up by name on DigiPolygons; entries that are
# commented out are cities currently excluded from the run.
cities = [
    (name, getattr(DigiPolygons, name).value)
    for name in (
        'toulouse',
        'arcachon',
        'biarritz',
        #('brisbane', brisbane),
        #('buenos-aires', buenosaires),
        #('can-tho', cantho),
        #('helsinki', helsinki),
        #('lagos', lagos),
        #('le-caire', cairo),
        #('maros', maros),
        'montpellier',
        #('munich', munich),
        'strasbourg',
        'nantes',
        #('new-york', newyork),
        'paris',
        #('port-elisabeth',portelisabeth),
        #('rio-janeiro', rio),
        #('san-francisco', sanfrancisco),
        #('shanghai', shanghai),
        #('tianjin', tianjin)
    )
]

# Root folder of the dataset; every path built below is relative to it.
digitanie = pathlib.Path('/work/AI4GEO/data/DATA/DATASETS/DIGITANIE_v3')
# Alternative location kept for reference:
#digitanie = Path('/data/DIGITANIE')

# One row per city, filled by the statistics loop.
stats_table = list()


# For every city: read the full-city raster at reduced resolution, compute
# per-band zonal statistics inside the city polygon, update the dataset-wide
# running mean and append one row of statistics to `stats_table`.
for city, poly in cities:

    # Folder and file names use the upper-cased city name.
    city = city.upper()

    print(f'processing {city}')

    with rasterio.open(digitanie/f"{city}/{city}.tif") as dataset:

        print(f'data shape {dataset.shape}')

        # Read all bands, resampled (bilinear) to `scale_factor` times the
        # native height and width.
        data = dataset.read(
            out_shape=(
                dataset.count,
                int(dataset.height * scale_factor),
                int(dataset.width * scale_factor)
            ),
            resampling=rasterio.enums.Resampling.bilinear,
            out_dtype=np.float32
        )

        # Scale the affine transform by the actual width/height ratios so it
        # maps the resampled pixel grid to the same coordinates.
        transform = dataset.transform * dataset.transform.scale(
            (dataset.width / data.shape[-1]),
            (dataset.height / data.shape[-2])
        )

    stat_vals = [city]

    for band in range(1, 5):

        print(f"computing running mean over dataset and stats per city for band {band}")
        # The raster is passed as an in-memory array, so the band is selected
        # by indexing `data`; the deprecated `band_num` keyword (which only
        # applies to file-based rasters) is no longer passed.
        # NOTE(review): no `nodata` value is given, so fill pixels lying
        # inside the polygon (if any) would enter the statistics - confirm.
        stat = rasterstats.zonal_stats(
            poly,
            data[band-1],
            stats="count mean min max percentile_0.5 percentile_1 percentile_2 percentile_99.5 percentile_99 percentile_98",
            affine=transform
        )[0]

        if stat['count']:
            # Weighted incremental mean:
            # new_mean = old_mean + n_city * (mean_city - old_mean) / n_total
            count[band-1] += stat['count']
            mean[band-1] += stat['count']*(stat['mean']- mean[band-1]) / count[band-1]
        else:
            # With no valid pixel under the polygon the mean is not a number;
            # skip the update instead of failing on it.
            print(f'warning: no valid pixel for {city} band {band}, running mean not updated')

        # Order must match the column names built after the loop:
        # min, max, then the percentiles 0.5, 1, 2, 99.5, 99, 98.
        stat_vals += [
            stat['min'],
            stat['max'],
            stat['percentile_0.5'],
            stat['percentile_1'],
            stat['percentile_2'],
            stat['percentile_99.5'],
            stat['percentile_99'],
            stat['percentile_98']
        ]

    stats_table.append(stat_vals)
              
print(f'dataset mean: {mean}')

# Column names follow the order in which values were appended per band:
# min (p0), max (p100), then the percentiles 0.5, 1, 2, 99.5, 99 and 98,
# each suffixed with the 1-based band number.
stat_prefixes = ('p0', 'p100', 'p0.5', 'p1', 'p2', 'p99.5', 'p99', 'p98')
cols = ['city'] + [
    f'{prefix}_{band}'
    for band in range(1, 5)
    for prefix in stat_prefixes
]

df_stats = pd.DataFrame(stats_table, columns = cols)

# Render the table as HTML without the index, floats shown with 4 decimals.
from IPython.display import HTML
pd.options.display.float_format = '{:,.4f}'.format
HTML(df_stats.to_html(index=False))

processing TOULOUSE
data shape (54062, 55600)
computing running mean over dataset and stats per city for band 1
computing running mean over dataset and stats per city for band 2
computing running mean over dataset and stats per city for band 3
computing running mean over dataset and stats per city for band 4
processing ARCACHON
data shape (45609, 45980)
computing running mean over dataset and stats per city for band 1
computing running mean over dataset and stats per city for band 2
computing running mean over dataset and stats per city for band 3
computing running mean over dataset and stats per city for band 4
processing BIARRITZ
data shape (52359, 46818)
computing running mean over dataset and stats per city for band 1
computing running mean over dataset and stats per city for band 2
computing running mean over dataset and stats per city for band 3
computing running mean over dataset and stats per city for band 4
processing MONTPELLIER
data shape (69683, 40416)
computing running mea

city,p0_1,p100_1,p0.5_1,p1_1,p2_1,p99.5_1,p99_1,p98_1,p0_2,p100_2,p0.5_2,p1_2,p2_2,p99.5_2,p99_2,p98_2,p0_3,p100_3,p0.5_3,p1_3,p2_3,p99.5_3,p99_3,p98_3,p0_4,p100_4,p0.5_4,p1_4,p2_4,p99.5_4,p99_4,p98_4
TOULOUSE,0.0277,1.0329,0.0363,0.0383,0.0407,0.3343,0.2865,0.2464,0.048,0.9887,0.0578,0.0599,0.0626,0.3027,0.2531,0.212,0.0704,0.942,0.0789,0.0804,0.0822,0.2909,0.2447,0.2055,0.0194,1.0318,0.0246,0.0299,0.0556,0.409,0.381,0.3524
ARCACHON,0.0235,0.9582,0.0272,0.0279,0.0287,0.3115,0.2684,0.2139,0.0446,0.8634,0.0522,0.0528,0.0537,0.2606,0.2283,0.1843,0.0643,0.8096,0.0721,0.0733,0.0747,0.2225,0.2011,0.1705,0.0112,0.958,0.0137,0.0139,0.0141,0.3704,0.3497,0.3203
BIARRITZ,0.0235,0.9998,0.028,0.0287,0.0299,0.2796,0.2412,0.2041,0.0408,0.9111,0.0484,0.05,0.0521,0.2514,0.2018,0.1679,0.0618,0.8795,0.0674,0.0685,0.07,0.2433,0.19,0.1608,0.0123,1.026,0.0152,0.0159,0.0168,0.4711,0.4512,0.4286
MONTPELLIER,0.0,1.084,0.0416,0.0438,0.0463,0.3209,0.285,0.2524,0.0,1.066,0.0626,0.0656,0.0693,0.2847,0.2508,0.222,0.0,1.0456,0.0759,0.0785,0.0817,0.2666,0.2353,0.2098,0.0,1.1669,0.0177,0.0183,0.0192,0.4118,0.382,0.352
STRASBOURG,0.0211,0.9323,0.027,0.0275,0.0284,0.2815,0.2342,0.195,0.0362,0.8796,0.0483,0.0498,0.0514,0.2604,0.2144,0.1803,0.0458,0.8543,0.0632,0.0655,0.0683,0.2555,0.2104,0.1795,0.0176,0.9514,0.0193,0.0199,0.0218,0.4984,0.4725,0.4556
NANTES,0.0444,1.9954,0.0495,0.0507,0.052,0.2564,0.2166,0.1785,0.0684,1.8037,0.0758,0.0768,0.0782,0.2401,0.2058,0.1771,0.1022,1.7459,0.1126,0.1135,0.1146,0.253,0.2217,0.1961,0.0305,2.2352,0.0351,0.0363,0.0397,0.4353,0.4133,0.3898
PARIS,0.0406,1.6277,0.0589,0.0613,0.0643,0.3266,0.2888,0.2545,0.0596,1.5346,0.0812,0.0831,0.0855,0.3063,0.2727,0.243,0.0871,1.4651,0.1073,0.1092,0.1113,0.2978,0.2691,0.2439,0.0278,1.6678,0.0418,0.0434,0.05,0.4249,0.3924,0.3592


In [4]:
# Repeat every per-city statistics row 10 times (copies stay adjacent), then
# renumber the rows from 0.
# NOTE(review): 10 presumably matches the number of tiles per city - confirm.
repeated_rows = df_stats.index.repeat(10)
df_stats_duplic = df_stats.loc[repeated_rows].reset_index(drop=True)

In [12]:
# One row per mask tile: [relative mask path, pixel count of class 0, class 1, ...].
# Filled by the per-city loop further down in this cell.
cls_distrib_table = []

def class_distrib_tile(msk_path, num_class):
    """Count how many pixels of a label mask belong to each class.

    Args:
        msk_path: path of the mask raster; it is opened with rasterio and
            read as uint8.
        num_class: number of classes; labels 0 .. num_class-1 are counted,
            any other label value is ignored.

    Returns:
        A ``(counts, bins)`` tuple: ``counts[k]`` is the number of pixels
        labelled ``k`` and ``bins`` holds the integer edges
        ``0, 1, ..., num_class``.
    """
    with rasterio.open(msk_path) as file:
        labels = file.read(out_dtype=np.uint8)

    # np.histogram closes its last bin on the right, so a label equal to
    # num_class would be merged into class num_class-1. bincount counts each
    # exact value; the slice drops labels >= num_class.
    counts = np.bincount(labels.ravel(), minlength=num_class)[:num_class]
    bins = np.arange(num_class + 1)

    return counts, bins

NUM_CLASS = 10

# Collect the per-class pixel counts of every mask tile of every city.
for name, _ in cities:

    # Folder names use the upper-cased city name.
    city = name.upper()
    print(f'processing {city}')

    # Mask tiles of the city, ordered by the integer found in the second
    # '_'-separated field of the file stem, then made relative to the root.
    found_masks = list((digitanie/city/'COS9').glob('*_mask.tif'))
    found_masks.sort(key=lambda p: int(p.stem.split('_')[1]))
    msk_paths = [p.relative_to(digitanie) for p in found_masks]

    for msk in msk_paths:
        counts, bins = class_distrib_tile(digitanie/msk, NUM_CLASS)
        # Row layout: relative mask path followed by one count per class.
        cls_distrib_table.append([str(msk), *counts])

cols = ['mask', *(f'class {i}' for i in range(NUM_CLASS))]
df_cls = pd.DataFrame(cls_distrib_table, columns = cols)
# Uncomment to display the table:
#HTML(df_cls.to_html(index=False))

processing TOULOUSE
processing ARCACHON
processing BIARRITZ
processing MONTPELLIER
processing STRASBOURG
processing NANTES
processing PARIS


In [13]:
# Total pixel count per class over all tiles. Only the class columns are
# summed: the 'mask' column holds path strings and is dropped by name rather
# than being reduced and then discarded by position.
class_totals = df_cls.drop(columns=['mask']).sum().to_numpy()
total = list(class_totals)
print(total)

# Cross-entropy weights: count of the most frequent class divided by the
# count of each class, rounded to one decimal.
coeffs_ce = list(np.round(class_totals.max() / class_totals, 1))
print(coeffs_ce)

# Binary cross-entropy weights: pixels of all other classes divided by the
# pixels of the class, rounded to one decimal.
coeffs_bce = list(np.round((class_totals.sum() - class_totals) / class_totals, 1))
print(coeffs_bce)

[53595582, 11710278, 55743888, 12429850, 61576043, 86512821, 11410248, 39022780, 3117383, 425447]
[1.6, 7.4, 1.6, 7.0, 1.4, 1.0, 7.6, 2.2, 27.8, 203.3]
[5.3, 27.7, 5.0, 26.0, 4.4, 2.9, 28.4, 7.6, 106.6, 787.7]


In [14]:
# One row per image tile: [relative path, col_off, row_off, width, height, split].
split_table = []

for name, _ in cities:

    # Folder names use the upper-cased city name.
    city = name.upper()
    print(f'processing {city}')

    # Image tiles whose stem ends with '_<digit>', ordered by the integer
    # after the last '_', then made relative to the dataset root.
    found_tiles = list((digitanie/city).glob('*_[0-9].tif'))
    found_tiles.sort(key=lambda p: int(p.stem.split('_')[-1]))
    img_paths = [p.relative_to(digitanie) for p in found_tiles]

    for i, img in enumerate(img_paths):

        # First 7 tiles -> split 0, the 8th -> split 1, the rest -> split 2
        # (presumably train / val / test - confirm against the consumer).
        split = 0 if i < 7 else (1 if i < 8 else 2)

        with rasterio.open(digitanie/img) as f:
            height, width = f.shape

        # The window covers the whole tile, hence the zero offsets.
        split_table.append([img, 0, 0, width, height, split])

cols = ['img', 'col_off', 'row_off', 'width', 'height', 'split']
df_split = pd.DataFrame(split_table, columns = cols)
# Uncomment to display the table:
#HTML(df_split.to_html(index=False))

processing TOULOUSE
processing ARCACHON
processing BIARRITZ
processing MONTPELLIER
processing STRASBOURG
processing NANTES
processing PARIS


In [5]:
# Write the duplicated per-city statistics to CSV, row index included.
# The split and class-distribution exports are currently disabled.
#df_split.to_csv('../datamodules/splits/digitanie_france/split.csv', index=True)
stats_csv = '../datamodules/splits/digitanie_france/stats.csv'
df_stats_duplic.to_csv(stats_csv, index=True)
#df_cls.to_csv('../datamodules/splits/digitanie_france/cls.csv', index=True)