### SemCity-Toulouse: per-band statistics

In [1]:
import numpy as np
import pandas as pd
import rasterio
import rasterstats
import pathlib

# Compute per-band statistics (mean, std and a set of percentiles) over the 15
# SemCity tiles, each read at 1/4 resolution with bilinear resampling, then
# write them to './semcity_stats.csv' and read the file back as a sanity check.
# NOTE(review): in the recorded output p0 is 0.0 for every band, which may mean
# nodata pixels are included in these statistics - confirm whether they should
# be masked out.
num_img = [f'TLS_BDSD_M_{i:02}.tif' for i in range(1, 16)]
semcity = pathlib.Path('/data/SemCity-Toulouse-bench/img_multispec_05/TLS_BDSD_M')
semcity_dfs = {}
scale_factor = 1/4
imgs = []
for num in num_img:
    tile_path = semcity/num
    with rasterio.open(tile_path) as ds:
        img = ds.read(
            out_shape=(
                ds.count,
                int(ds.height * scale_factor),
                int(ds.width * scale_factor)
            ),
            resampling=rasterio.enums.Resampling.bilinear,
            out_dtype=np.float32
        )
    # Flatten the spatial dimensions while keeping one row per band. The band
    # count comes from the array itself rather than a hard-coded 8, so a tile
    # with a different number of bands cannot be silently scrambled.
    imgs.append(img.reshape(img.shape[0], -1))

# (bands, total number of pixels over all tiles)
data = np.concatenate(imgs, axis=1)
num_bands = data.shape[0]

mean = np.mean(data, axis=1)[np.newaxis, :]
std = np.std(data, axis=1)[np.newaxis, :]
percentiles_vals = np.array([0, 0.5, 1, 2, 98, 99, 99.5, 100])
percentiles = np.percentile(data, q=percentiles_vals, axis=1)

# Rows are statistics and columns are bands here; transposed below so that
# each band becomes one row of the table.
tile_stats = np.concatenate([mean, std, percentiles], axis=0)
tile_df = pd.DataFrame(
    tile_stats.transpose(),
    index=[f'band_{i}' for i in range(1, num_bands + 1)],
    columns=['mean', 'std'] + [f'p{str(i)}' for i in percentiles_vals]
)
print(tile_df['mean'].loc['band_1'])

# Round-trip through CSV and print the same value to check it is preserved.
tile_df.to_csv('./semcity_stats.csv')
df = pd.read_csv('./semcity_stats.csv', index_col=0)
print(df['mean'].loc['band_1'])

322.777099609375
322.777099609375


In [2]:
# Display the statistics table as reloaded from './semcity_stats.csv'.
df

Unnamed: 0,mean,std,p0.0,p0.5,p1.0,p2.0,p98.0,p99.0,p99.5,p100.0
band_1,322.7771,59.326756,0.0,258.191406,262.050781,265.962891,475.78125,551.963496,652.297852,2492.530273
band_2,273.763428,82.755508,0.0,184.3125,189.280273,194.268555,487.035762,593.495537,732.981074,3218.647461
band_3,345.823303,130.571365,0.0,184.500977,189.282227,195.941406,688.098867,854.931455,1078.33022,3321.768555
band_4,345.391846,163.533218,0.0,140.9375,147.586914,155.711914,760.711914,913.821104,1131.410063,3901.088867
band_5,215.733887,123.387817,0.0,67.041992,72.379883,78.053711,531.698242,625.225029,756.831938,3117.879883
band_6,432.866486,184.068466,0.0,102.390625,107.286133,115.136719,838.165645,944.909971,1089.32064,3784.349609
band_7,411.423279,218.723602,0.0,55.307617,59.609375,67.114258,971.815059,1054.635557,1125.375,4373.316406
band_8,390.854584,208.512939,0.0,38.797852,42.194336,49.271484,914.756465,986.087705,1046.629883,3120.116211


In [76]:
# One row per mask file: the mask path followed by the pixel count of each class.
cls_distrib_table = []

def class_distrib_tile(msk_path, num_class):
    """Count how many pixels of a label mask belong to each class.

    Args:
        msk_path: Path of the mask raster, opened with ``rasterio.open``.
        num_class: Number of classes; labels ``0 .. num_class - 1`` are counted.

    Returns:
        A tuple ``(counts, bins)`` where ``counts[k]`` is the number of pixels
        whose label equals ``k`` (length ``num_class``) and ``bins`` holds the
        integer edges ``0 .. num_class``.
    """
    with rasterio.open(msk_path) as file:
        labels = file.read(out_dtype=np.uint8)

    # np.histogram with edges 0..num_class closes its last bin on the right, so
    # a label equal to num_class (e.g. an "ignore" value) would be added to
    # class num_class - 1. bincount counts exact label values instead; any
    # label >= num_class is dropped by the slice.
    counts = np.bincount(labels.ravel(), minlength=num_class)[:num_class]
    bins = np.arange(num_class + 1)

    return counts, bins

# Build a table of per-class pixel counts for every mask of every city.
# NOTE(review): `cities` and `digitanie` are not defined in this cell; the
# recorded run failed with NameError, so the cell that defines them must be
# executed first.
NUM_CLASS = 10
for city, _ in cities:

    # City names are upper-cased before being used as directory names.
    city = city.upper()
    print(f'processing {city}')

    # Mask paths relative to the dataset root, ordered by the integer found in
    # the second '_'-separated field of the file stem.
    msk_paths = [
        path.relative_to(digitanie)
        for path in pathlib.Path(digitanie/city/'COS9').glob('*_mask.tif')
    ]
    msk_paths.sort(key=lambda x: int(x.stem.split('_')[1]))

    for msk in msk_paths:
        counts, bins = class_distrib_tile(digitanie/msk, NUM_CLASS)
        # Row layout: mask path, then one pixel count per class.
        tile_distrib = [str(msk), *counts]
        cls_distrib_table.append(tile_distrib)

cols = ['mask'] + [f'class {i}' for i in range(NUM_CLASS)]
df_cls = pd.DataFrame(cls_distrib_table, columns=cols)

NameError: name 'cities' is not defined

In [13]:
# Total pixel count per class over all masks. The non-numeric 'mask' column is
# dropped by name: summing it would concatenate every path string, and removing
# it by position afterwards would silently drop class 0 if the reduction ever
# omitted non-numeric columns.
class_totals = df_cls.drop(columns=['mask']).sum()
total = class_totals.tolist()
print(total)

# A class without any pixel has no finite weight; fail with a clear message
# instead of producing inf or a bare ZeroDivisionError.
empty_classes = [name for name, count in zip(class_totals.index, total) if count == 0]
if empty_classes:
    raise ValueError(f'no pixels found for {empty_classes}; cannot derive class weights')

# Ratio of the most frequent class count to each class count, rounded to 0.1
# (presumably cross-entropy class weights - confirm against the training code).
coeffs_ce = [np.round(max(total)/c,1) for c in total]
print(coeffs_ce)
# Ratio of all other pixels to the pixels of each class, rounded to 0.1
# (presumably positive-class weights for a binary cross-entropy - confirm).
coeffs_bce = [np.round((sum(total) - c)/c,1) for c in total]
print(coeffs_bce)

[53595582, 11710278, 55743888, 12429850, 61576043, 86512821, 11410248, 39022780, 3117383, 425447]
[1.6, 7.4, 1.6, 7.0, 1.4, 1.0, 7.6, 2.2, 27.8, 203.3]
[5.3, 27.7, 5.0, 26.0, 4.4, 2.9, 28.4, 7.6, 106.6, 787.7]


In [14]:
# Build the split table: one row per image with a full-image window
# (col_off=0, row_off=0, width, height) and a split id.
split_table = []

for city, poly in cities:

    # City names are upper-cased before being used as directory names.
    city = city.upper()
    print(f'processing {city}')

    # Image paths relative to the dataset root, ordered by the integer in the
    # last '_'-separated field of the file stem.
    # NOTE(review): the pattern '*_[0-9].tif' only matches a single digit before
    # the extension, so a tile numbered 10 or above would be skipped - confirm
    # this is intended.
    img_paths = [
        path.relative_to(digitanie)
        for path in pathlib.Path(digitanie/city).glob('*_[0-9].tif')
    ]
    img_paths.sort(key=lambda x: int(x.stem.split('_')[-1]))

    for i, img in enumerate(img_paths):

        # First 7 images -> 0, the 8th -> 1, all remaining -> 2
        # (presumably train / val / test - confirm against the datamodule).
        split = 0 if i < 7 else (1 if i < 8 else 2)

        # Only the raster dimensions are needed; pixel data is not read.
        with rasterio.open(digitanie/img) as f:
            height, width = f.shape

        row = [img, 0, 0, width, height, split]
        split_table.append(row)

cols = ['img', 'col_off', 'row_off', 'width', 'height', 'split']
df_split = pd.DataFrame(split_table, columns=cols)

processing TOULOUSE
processing ARCACHON
processing BIARRITZ
processing MONTPELLIER
processing STRASBOURG
processing NANTES
processing PARIS


In [5]:
# Export tables as CSV into '../datamodules/splits/digitanie_france/'. Only the
# stats export is active; the split and class-distribution exports are
# commented out.
# NOTE(review): `df_stats_duplic` is not defined in any cell visible here -
# confirm it exists before running this on a fresh kernel.
#df_split.to_csv('../datamodules/splits/digitanie_france/split.csv', index=True)
df_stats_duplic.to_csv('../datamodules/splits/digitanie_france/stats.csv', index=True)
#df_cls.to_csv('../datamodules/splits/digitanie_france/cls.csv', index=True)