### SemCity Toulouse

In [1]:
import numpy as np
import pandas as pd
import rasterio
import rasterstats
import pathlib

# Per-band statistics (mean, std, selected percentiles) over the 15 SemCity tiles,
# computed on pixels read at a quarter of the native resolution.
num_img = [f'TLS_BDSD_M_{i:02}.tif' for i in range(1, 16)]
semcity = pathlib.Path('/data/SemCity-Toulouse-bench/img_multispec_05/TLS_BDSD_M')
scale_factor = 1/4
imgs = []
for num in num_img:
    tile_path = semcity/num
    with rasterio.open(tile_path) as ds:
        # Requesting a smaller out_shape makes rasterio resample while reading.
        img = ds.read(
            out_shape=(
                ds.count,
                int(ds.height * scale_factor),
                int(ds.width * scale_factor)
            ),
            resampling=rasterio.enums.Resampling.bilinear,
            out_dtype=np.float32
        )
        # Flatten the spatial dimensions: one row per band, one column per pixel.
        imgs.append(np.reshape(img, (ds.count,-1)))
# Pool the pixels of every tile: shape (bands, total_pixels).
data = np.concatenate(imgs, axis=1)
mean = np.mean(data, axis=1)[np.newaxis,:]
std = np.std(data, axis=1)[np.newaxis,:]
percentiles_vals = np.array([0, 0.5, 1, 2, 98, 99, 99.5, 100])
percentiles = np.percentile(data, q=percentiles_vals, axis=1)
# Rows are statistics, columns are bands; transposed below to one row per band.
tile_stats = np.concatenate([mean, std, percentiles], axis=0)
tile_df = pd.DataFrame(
    tile_stats.transpose(),
    # Derive the band labels from the data instead of assuming 8 bands.
    index=[f'band_{i}' for i in range(1, data.shape[0] + 1)],
    columns = ['mean', 'std'] + [f'p{str(i)}' for i in percentiles_vals]
)
print(tile_df['mean'].loc['band_1'])
tile_df.to_csv('./semcity_stats.csv')
# Read the file back to check that the saved value round-trips.
df = pd.read_csv('./semcity_stats.csv', index_col=0)
print(df['mean'].loc['band_1'])

322.777099609375
322.777099609375


### DIGITANIE

In [7]:
import numpy as np
import pandas as pd
import rasterio
import rasterstats
import pathlib
import shapely

# Per-band statistics of the DIGITANIE Toulouse 2013 image, restricted to the
# pixels inside `city_poly` and computed on a 10x-downsampled read.
digitanie = pathlib.Path('/work/AI4GEO/data/DATA/PROCESSED/PHR/COG/16bits')
# Sort the matches so the selected image does not depend on directory listing
# order, and fail with an explicit message when nothing matches.
city_candidates = sorted(digitanie.glob('TOULOUSE_2013*.tif'))
if not city_candidates:
    raise FileNotFoundError(f'no TOULOUSE_2013*.tif image found in {digitanie}')
city = city_candidates[0]
scale_factor = 1/10
# Defined in this cell so the column names below do not rely on a variable
# left behind by another cell. 0 and 100 correspond to the min and max columns.
percentiles_vals = np.array([0, 0.5, 1, 2, 98, 99, 99.5, 100])
# Statistics requested from rasterstats, in the same order as the DataFrame columns.
stat_names = [
    'mean',
    'std',
    'min',
    'percentile_0.5',
    'percentile_1',
    'percentile_2',
    'percentile_98',
    'percentile_99',
    'percentile_99.5',
    'max',
]
# Zone over which the statistics are computed.
# NOTE(review): coordinates are presumably in the raster's CRS - confirm.
city_poly = shapely.Polygon(
    [
        [359326, 4833160],
        [376735, 4842547],
        [385238, 4826271],
        [367914, 4816946],
        [359326, 4833160],
    ]
)
with rasterio.open(city) as ds:
    nbands = ds.count
    # NOTE(review): int16 is signed; confirm the source values never exceed 32767.
    data = ds.read(
        out_shape=(
            ds.count,
            int(ds.height * scale_factor),
            int(ds.width * scale_factor)
        ),
        resampling=rasterio.enums.Resampling.bilinear,
        out_dtype=np.int16
    )
    # Affine transform of the downsampled grid: the original transform with the
    # pixel size scaled by the ratio between original and read dimensions.
    tf = ds.transform * ds.transform.scale(
        (ds.width / data.shape[-1]),
        (ds.height / data.shape[-2])
    )
stats = []
for b in range(1, nbands+1):
    # The raster is passed as a single-band array, so no band number is given.
    stat = rasterstats.zonal_stats(
        city_poly,
        data[b-1],
        stats=' '.join(stat_names),
        affine=tf
    )[0]
    stats.append([stat[name] for name in stat_names])
tile_stats = np.array(stats)
tile_df = pd.DataFrame(
    tile_stats,
    index=[f'band_{i}' for i in range(1,nbands+1)],
    columns = ['mean', 'std'] + [f'p{str(i)}' for i in percentiles_vals]
)
print(tile_df)
print(tile_df['mean'].loc['band_1'])
tile_df.to_csv('./digitanie_toulouse_stats.csv')
# Read the file back to check that the saved value round-trips.
df = pd.read_csv('./digitanie_toulouse_stats.csv', index_col=0)
print(df['mean'].loc['band_1'])

In [76]:
# Accumulates one row per mask tile: the mask path followed by its per-class pixel counts.
cls_distrib_table = []

def class_distrib_tile(msk_path, num_class):
    """Count the pixels of each class in a label mask.

    Args:
        msk_path: path of the mask raster, opened with rasterio.
        num_class: number of classes; labels 0 .. num_class-1 are counted.

    Returns:
        counts: array of length `num_class`, counts[i] being the number of
            pixels whose label equals i. Labels >= num_class are ignored.
        bins: the integer bin edges 0 .. num_class.
    """
    with rasterio.open(msk_path) as file:
        labels = file.read(out_dtype=np.uint8)

    # np.histogram closes its last bin on the right, so pixels labelled exactly
    # `num_class` would be added to the last class. bincount counts exact values.
    counts = np.bincount(labels.ravel(), minlength=num_class)[:num_class]
    bins = np.arange(num_class + 1)

    return counts, bins

NUM_CLASS = 10


def _mask_number(mask_path):
    """Numeric sort key: the second '_'-separated field of the file stem."""
    return int(mask_path.stem.split('_')[1])


# NOTE(review): `cities` is not defined in the visible cells (a fresh run stops
# here with a NameError), and `digitanie` must point at a root that contains
# one <CITY>/COS9 folder per city - confirm where both are set.
for city, _ in cities:
    city = city.upper()
    print(f'processing {city}')

    # Mask paths relative to the dataset root, ordered by tile number.
    found_masks = (digitanie/city/'COS9').glob('*_mask.tif')
    msk_paths = sorted(
        (found.relative_to(digitanie) for found in found_masks),
        key=_mask_number
    )

    for msk in msk_paths:
        counts, bins = class_distrib_tile(digitanie/msk, NUM_CLASS)
        # One row per mask: relative path, then the pixel count of each class.
        tile_distrib = [str(msk), *counts]
        cls_distrib_table.append(tile_distrib)

cols = ['mask']
cols.extend(f'class {i}' for i in range(NUM_CLASS))
df_cls = pd.DataFrame(cls_distrib_table, columns=cols)

NameError: name 'cities' is not defined

In [13]:
# Per-class pixel totals over all masks. The first column (the mask path) is
# dropped before summing so that only the count columns are reduced.
class_totals = df_cls.iloc[:, 1:].sum()
total = class_totals.tolist()
print(total)
# Weight of each class relative to the most frequent one: max(total) / count.
coeffs_ce = np.round(class_totals.max() / class_totals, 1).tolist()
print(coeffs_ce)
# Ratio of the pixels outside the class to the pixels of the class.
coeffs_bce = np.round((class_totals.sum() - class_totals) / class_totals, 1).tolist()
print(coeffs_bce)

[53595582, 11710278, 55743888, 12429850, 61576043, 86512821, 11410248, 39022780, 3117383, 425447]
[1.6, 7.4, 1.6, 7.0, 1.4, 1.0, 7.6, 2.2, 27.8, 203.3]
[5.3, 27.7, 5.0, 26.0, 4.4, 2.9, 28.4, 7.6, 106.6, 787.7]


In [14]:
# One row per image tile: relative path, crop window (full image) and split id.
split_table = []

# NOTE(review): `cities` is not defined in the visible cells - confirm where it is set.
for city, poly in cities:
    city = city.upper()
    print(f'processing {city}')

    # The pattern only matches names ending in '_<single digit>.tif'.
    # NOTE(review): tiles numbered 10 and above would be skipped - confirm this is intended.
    found_imgs = (digitanie/city).glob('*_[0-9].tif')
    img_paths = sorted(
        (found.relative_to(digitanie) for found in found_imgs),
        key=lambda rel: int(rel.stem.split('_')[-1])
    )

    for i, img in enumerate(img_paths):
        # First 7 tiles -> 0, the 8th -> 1, the rest -> 2
        # (presumably train / val / test - confirm).
        split = 0 if i < 7 else (1 if i < 8 else 2)

        with rasterio.open(digitanie/img) as f:
            height, width = f.shape

        split_table.append([img, 0, 0, width, height, split])

cols = ['img', 'col_off', 'row_off', 'width', 'height', 'split']
df_split = pd.DataFrame(split_table, columns=cols)

processing TOULOUSE
processing ARCACHON
processing BIARRITZ
processing MONTPELLIER
processing STRASBOURG
processing NANTES
processing PARIS


In [5]:
# Export of the computed tables to the datamodule split folder. Only the stats
# export is active; the split and class-distribution exports are commented out.
# NOTE(review): `df_stats_duplic` is not defined in any visible cell - confirm
# which cell builds it before running this on a fresh kernel.
#df_split.to_csv('../datamodules/splits/digitanie_france/split.csv', index=True)
df_stats_duplic.to_csv('../datamodules/splits/digitanie_france/stats.csv', index=True)
#df_cls.to_csv('../datamodules/splits/digitanie_france/cls.csv', index=True)