In [None]:
from glob import glob
import os 
import json
import requests
from pyproj import Transformer

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import HTML

from datetime import datetime
from pandas import concat, DataFrame, IndexSlice, read_csv, MultiIndex
from numpy import sqrt, argsort, argmin, array, vstack
import scipy.stats as stats
from math import cos, radians
from scipy.ndimage import gaussian_filter1d

sns.set_style('white')

ToDo
- convert RD to latitude and longitude before export [DONE]
- interpolate square voxel for smooth shapes

# Settings

In [2]:
# Nominatim (OpenStreetMap) search endpoint; queried further down to geocode the user-supplied address
url_openstreetmap = "https://nominatim.openstreetmap.org/search"

In [3]:
# folder that is globbed for the GeoTOP *.csv files
dir_geotop = 'data/GeoTOP_v01r6s1_csv_bestanden/'
# folder the JSON exports are written to; file names are appended by plain string
# concatenation, so the trailing slash is required
dir_export = 'output/'

In [4]:
# projections for different coordinate systems
# NOTE(review): neither name is referenced by the cells visible here;
# convert_rd_into_geocoordinates spells out the same EPSG codes itself
projection_rd_amersfoort = 'epsg:28992'
projection_geocoordinates = 'epsg:4326'

In [5]:
# NOTE(review): this flag is not read by any cell visible here; the export cells
# write their files regardless of its value
save = False

In [6]:
# Numeric lithoclass code -> material name.
# Code 9 (shells) is included so that every material that has a colour in
# material_color_mapping and a `kans_9_schelpen` probability column can also be
# resolved from its numeric code instead of raising a KeyError.
map_lithoclasses = {
    0: 'NaN',
    1: 'veen',
    2: 'klei',
    3: 'kleiig_zand',
    4: 'vervallen',
    5: 'zand_fijn',
    6: 'zand_matig_grof',
    7: 'zand_grof',
    8: 'grind',
    9: 'schelpen',
}

In [7]:
# material name -> hex colour used for every plot of that material
material_color_mapping = {
    'NaN': '#ffffff',
    'veen': '#64564c',
    'klei': '#b2a38d',
    'kleiig_zand': '#8a8783',
    'vervallen': '#ee82ee',
    'zand_fijn': '#000000',
    'zand_matig_grof': '#c5c5c5',
    'zand_grof': '#616160',
    'grind': '#ffff82',
    'schelpen': '#eb611e',
}


In [8]:
# Crop window, given in the same x/y coordinates as the GeoTOP index.
# wider Rotterdam window (currently disabled):
#x_min, x_max = 56761, 101916
#y_min, y_max = 427675, 447090

# crop to Rotterdam centre (the trailing values are an alternative, tighter window)
rotterdam_x_min, rotterdam_x_max = 83810, 98770 # 89461, 96059
rotterdam_y_min, rotterdam_y_max = 432003, 440842 # 432754, 438916

# NOTE(review): file_selection is not referenced by any cell visible here - confirm it is still needed
file_selection = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B09']

In [None]:
# initial placeholders; both names are reassigned further down (the address in
# "User Input", the frame when the GeoTOP files are loaded)
user_input_address = None
data = DataFrame()

# Utils

In [10]:
def get_data_for_coordinates(x, y, data):
    """Return the vertical profile stored at one (x, y) location.

    Parameters
    ----------
    x, y : labels of the first two index levels to select.
    data : DataFrame whose index has three levels; the third one is kept.

    Returns
    -------
    DataFrame sorted by the remaining index level in descending order, with
    that level turned into a regular column and the requested `x` and `y`
    added as constant columns.
    """
    column_profile = data.loc[x, y, :].sort_index(ascending=False)
    # assign() works on a copy, so the selection taken from `data` is never modified
    return column_profile.assign(x=x, y=y).reset_index()
    
def get_material_and_color_profile(datapoint, map_lithoclasses, material_color_mapping):
    """Add the material name and its plot colour to a profile.

    Parameters
    ----------
    datapoint : DataFrame with a `lithoklasse` column holding lithoclass codes.
    map_lithoclasses : dict, lithoclass code -> material name.
    material_color_mapping : dict, material name -> colour.

    Returns
    -------
    The same `datapoint` object, extended in place with the columns
    `lithoclass_material` and `color_lithoclass_material`.

    Raises
    ------
    KeyError if a code or a material is missing from the respective mapping.
    """
    materials = [map_lithoclasses[lithoclass] for lithoclass in datapoint['lithoklasse']]

    # Resolve colours from the material values themselves rather than through
    # index labels: a label lookup breaks as soon as the index is not unique.
    datapoint['lithoclass_material'] = materials
    datapoint['color_lithoclass_material'] = [material_color_mapping[material] for material in materials]

    return datapoint

In [11]:
def find_closest_points_to_input(data, latitude, longitude, delta_lat=0.001, delta_lon=0.001):
    """Select all rows that lie inside a lat/lon box around a location.

    Parameters
    ----------
    data : DataFrame that exposes `lat` and `lon` after `reset_index()`.
    latitude, longitude : centre of the box.
    delta_lat, delta_lon : half-width of the box in latitude / longitude
        direction; both bounds are inclusive.

    Returns
    -------
    DataFrame (index levels turned into columns) with the rows inside the box.
    It is an independent copy, so callers can add columns to it freely.

    Side effect: prints the unique lat/lon pairs that were found.
    """
    # reset_index() copies the whole frame, so do it once and reuse the result
    flat = data.reset_index()

    inside_box = (
        flat['lat'].between(latitude - delta_lat, latitude + delta_lat)
        & flat['lon'].between(longitude - delta_lon, longitude + delta_lon)
    )
    df_box = flat[inside_box].copy()

    unique_locations = df_box[['lat', 'lon']].drop_duplicates()
    print(
        f"found {len(unique_locations)} unique lat/lon pairs:\n"
        f"{unique_locations}"
    )
    return df_box

In [12]:
def plot_3D_projection_datapoints(profiles, columns_selection, figsize=(10,8), title=None):
    """Scatter several vertical profiles in one 3D matplotlib figure.

    Parameters
    ----------
    profiles : iterable of DataFrames; each must contain the columns listed in
        `columns_selection`. The frame index is used as the vertical coordinate.
    columns_selection : list of column names. It must contain `x` and `y`,
        exactly one column with 'color' in its name (marker colour) and exactly
        one column with 'material' but not 'color' in its name (legend label).
    figsize : matplotlib figure size.
    title : optional figure title; when omitted it is derived from the plotted
        x/y locations.

    Returns
    -------
    The matplotlib figure.

    Raises
    ------
    ValueError if the colour or material column cannot be identified
    unambiguously.
    """
    color_candidates = [col for col in columns_selection if 'color' in col]
    if len(color_candidates) != 1:
        # continuing with a list instead of a column name only fails later, less clearly
        raise ValueError(f"failed to identify color for material! candidates: {color_candidates}")
    color_column = color_candidates[0]

    material_candidates = [col for col in columns_selection if 'material' in col and 'color' not in col]
    if len(material_candidates) != 1:
        raise ValueError(f"failed to identify material! candidates: {material_candidates}")
    material_column = material_candidates[0]

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111, projection='3d')

    material_colors = {}
    locations = set()  # distinct (x, y) pairs, used for the default title

    for profile in profiles:
        df = profile[columns_selection].sort_index(ascending=False)

        xs = df['x'].values
        ys = df['y'].values
        # NOTE(review): the index is plotted as depth - confirm the profiles are indexed by z
        zs = df.index.values
        cs = df[color_column].values

        ax.scatter(xs, ys, zs, c=cs, s=100, depthshade=True)

        locations.update((int(px), int(py)) for px, py in zip(xs, ys))
        for mat, col in zip(df[material_column], df[color_column]):
            material_colors[mat] = col

    ax.set_xlabel("X Coordinate")
    ax.set_ylabel("Y Coordinate")
    ax.set_zlabel("Depth (m)")

    # flip the vertical axis
    ax.set_zlim(ax.get_zlim()[::-1])

    ax.xaxis.pane.fill = False
    ax.yaxis.pane.fill = False
    ax.zaxis.pane.fill = False
    ax.grid(False)

    handles = [
        plt.Line2D([0], [0], marker='o', color='w', label=mat, markerfacecolor=color, markersize=8)
        for mat, color in material_colors.items()
        ]
    ax.legend(handles=handles, title=material_column, bbox_to_anchor=(1.05, 1), loc='upper left')

    if title is None:
        # build the title from the plotted data instead of relying on global x / y
        if len(locations) == 1:
            x, y = next(iter(locations))
            title = f"3D Geological Profiles for RD coordinates {x},{y}"
        else:
            title = f"3D Geological Profiles for {len(locations)} RD coordinates"
    plt.title(title, pad=20)
    plt.tight_layout()

    return fig

In [13]:
def convert_rd_into_geocoordinates(data, source_crs="epsg:28992", target_crs="epsg:4326"):
    """Replace the (x, y, z) index of `data` by a (lon, lat, z) index.

    Parameters
    ----------
    data : DataFrame with a three-level index; levels 0 and 1 hold the
        horizontal coordinates in `source_crs`, level 2 is passed through.
    source_crs, target_crs : CRS identifiers handed to
        `Transformer.from_crs`.

    Returns
    -------
    The same `data` object. Its index is replaced IN PLACE, so calling the
    function a second time on the result would transform the already
    converted values again.
    """
    # actual coordinate values of each index level, one entry per row
    x_vals = data.index.get_level_values(0).to_numpy()
    y_vals = data.index.get_level_values(1).to_numpy()
    z_vals = data.index.get_level_values(2).to_numpy()

    # always_xy=True: input is passed as (x, y) and output is unpacked as (lon, lat)
    transformer = Transformer.from_crs(source_crs, target_crs, always_xy=True)

    # vectorized transformation of all rows at once
    lon, lat = transformer.transform(x_vals, y_vals)

    # build the new index directly from the arrays
    data.index = MultiIndex.from_arrays([lon, lat, z_vals], names=['lon', 'lat', 'z'])
    return data

# User Input

In [14]:
# free-text address sent to the geocoder; an empty value makes the next cell use its default coordinates
user_input_address = "Depot Boijmans Van Beuningen"

## Get RD coordinates from Address

### get geometric coordinates for address

In [15]:
# Fallback location, used when no address is given, the request fails,
# or the geocoder returns no match. Later cells need real numbers here.
default_latitude, default_longitude = 51.9139529, 4.4711320

if not user_input_address:
    latitude, longitude = default_latitude, default_longitude
    print(f"No user input defined; fall back to default: {latitude}, {longitude} (lat, lon)")

else:
    latitude, longitude = None, None
    try:
        geo = requests.get(
            url_openstreetmap,
            headers={"User-Agent": "CaraLogic (contact: silvia@caralogic.com)"},
            params={"q": user_input_address, "format": "json", "limit": 1},
            timeout=10,  # never let the notebook hang on an unresponsive server
            )
        geo.raise_for_status()

        matches = geo.json()
        if len(matches) == 0:
            print(f"no data found for {user_input_address}")
        else:
            location = matches[0]
            latitude, longitude = float(location['lat']), float(location['lon'])
    # network/HTTP problems, an undecodable body, or an unexpected response layout
    except (requests.RequestException, ValueError, KeyError, TypeError) as error:
        latitude, longitude = None, None
        print(f"geocoding failed for {user_input_address}: {error!r}")

    if latitude is None or longitude is None:
        latitude, longitude = default_latitude, default_longitude
        print(f"fall back to default: {latitude}, {longitude} (lat, lon)")
    else:
        print(f"Coordinates found for {user_input_address}: {latitude}, {longitude} (lat, lon)")

Coordinates found for Depot Boijmans Van Beuningen: 51.9139529, 4.471132 (lat, lon)


# Import GeoTop data

### import all data

In [16]:
# every csv file in the GeoTOP folder, in alphabetical order
ls_files = glob(dir_geotop + '*.csv')
ls_files.sort()
ls_files

['data/GeoTOP_v01r6s1_csv_bestanden/zuidholland_B01.csv',
 'data/GeoTOP_v01r6s1_csv_bestanden/zuidholland_B02.csv',
 'data/GeoTOP_v01r6s1_csv_bestanden/zuidholland_B03.csv',
 'data/GeoTOP_v01r6s1_csv_bestanden/zuidholland_B04.csv',
 'data/GeoTOP_v01r6s1_csv_bestanden/zuidholland_B05.csv',
 'data/GeoTOP_v01r6s1_csv_bestanden/zuidholland_B06.csv',
 'data/GeoTOP_v01r6s1_csv_bestanden/zuidholland_B07.csv',
 'data/GeoTOP_v01r6s1_csv_bestanden/zuidholland_B08.csv',
 'data/GeoTOP_v01r6s1_csv_bestanden/zuidholland_B09.csv']

In [17]:
# read each file with its first three columns as a three-level index
ls_data = list(map(lambda file: read_csv(file, index_col=[0,1,2]), ls_files))

# stack all files into one frame and sort it by index
data = concat(ls_data).sort_index()
data.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lithostrat,lithoklasse,kans_1_veen,kans_2_klei,kans_3_kleiig_zand,kans_4_vervallen,kans_5_zand_fijn,kans_6_zand_matig_grof,kans_7_zand_grof,kans_8_grind,kans_9_schelpen,modelonzekerheid_lithoklasse,modelonzekerheid_lithostrat
x,y,z,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
58550.0,437550.0,-49.75,5120,6,0.0,0.17,0.23,0.0,0.24,0.28,0.08,0.0,0.0,0.86,0.37
58550.0,437550.0,-49.25,5120,6,0.0,0.24,0.26,0.0,0.18,0.24,0.08,0.0,0.0,0.86,0.42
58550.0,437550.0,-48.75,5120,6,0.02,0.17,0.31,0.0,0.18,0.25,0.07,0.0,0.0,0.88,0.47
58550.0,437550.0,-48.25,5120,6,0.04,0.24,0.32,0.0,0.15,0.19,0.06,0.0,0.0,0.9,0.5
58550.0,437550.0,-47.75,5120,1,0.06,0.21,0.33,0.0,0.05,0.27,0.08,0.0,0.0,0.87,0.53
58550.0,437550.0,-47.25,5120,6,0.04,0.13,0.23,0.0,0.22,0.28,0.1,0.0,0.0,0.92,0.57
58550.0,437550.0,-46.75,5120,6,0.02,0.13,0.15,0.0,0.3,0.29,0.11,0.0,0.0,0.89,0.62
58550.0,437550.0,-46.25,5120,2,0.02,0.27,0.21,0.0,0.23,0.18,0.09,0.0,0.0,0.91,0.64
58550.0,437550.0,-45.75,5120,5,0.02,0.25,0.33,0.0,0.19,0.15,0.06,0.0,0.0,0.87,0.66
58550.0,437550.0,-45.25,5120,6,0.02,0.25,0.33,0.0,0.16,0.17,0.07,0.0,0.0,0.88,0.68


## Crop To User Defined Area

In [18]:
# label-based window on the first two index levels; the third level is kept completely
idx = IndexSlice
crop_window = idx[rotterdam_x_min:rotterdam_x_max, rotterdam_y_min:rotterdam_y_max, :]
data_cropped = data.loc[crop_window, :].sort_index()
print(f"Reduced shape to {data_cropped.shape} compared to {data.shape}")

Reduced shape to (1278945, 13) compared to (24874929, 13)


## Convert RD coordinates to geographic coordinates (latitude/longitude)

and plot the converted locations on a 2D Plotly map

In [19]:
# Convert only once: the helper rewrites the index of the frame it receives, so
# re-running this cell on an already converted frame would feed lon/lat values
# back in as if they were RD coordinates.
if 'lon' not in data_cropped.index.names:
    data_cropped = convert_rd_into_geocoordinates(data_cropped)
data_cropped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lithostrat,lithoklasse,kans_1_veen,kans_2_klei,kans_3_kleiig_zand,kans_4_vervallen,kans_5_zand_fijn,kans_6_zand_matig_grof,kans_7_zand_grof,kans_8_grind,kans_9_schelpen,modelonzekerheid_lithoklasse,modelonzekerheid_lithostrat
lon,lat,z,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
4.354032,51.872447,-49.75,5120,3,0.01,0.09,0.5,0.0,0.08,0.3,0.02,0.0,0.0,0.7,0.0
4.354032,51.872447,-49.25,5120,3,0.0,0.19,0.41,0.0,0.1,0.26,0.04,0.0,0.0,0.78,0.0
4.354032,51.872447,-48.75,5120,3,0.01,0.16,0.33,0.0,0.11,0.3,0.09,0.0,0.0,0.85,0.0
4.354032,51.872447,-48.25,5120,3,0.01,0.19,0.4,0.0,0.08,0.26,0.06,0.0,0.0,0.81,0.0
4.354032,51.872447,-47.75,5120,6,0.0,0.09,0.21,0.0,0.09,0.5,0.11,0.0,0.0,0.75,0.0


In [20]:
# one row per distinct surface location; only the index levels are needed for this
unique_pairs = data_cropped.index.to_frame(index=False)[['lat', 'lon']].drop_duplicates()

In [21]:
# all locations of the cropped area on an OpenStreetMap background
fig = px.scatter_map(unique_pairs, lat='lat', lon='lon', zoom=11, height=600)
fig.update_layout(map_style="open-street-map")

# orange star at the location requested by the user
user_marker = go.Scattermap(
    lat=[latitude],
    lon=[longitude],
    mode="markers",
    marker=dict(size=16, color="orange", symbol="star"),
    name=f"User Input {latitude}, {longitude}",
)
fig.add_trace(user_marker)

HTML(fig.to_html(include_plotlyjs='cdn'))

# Find data points in dataset from input

### get box around address 

In [88]:
print(f"finding closest point to {longitude}, {latitude}:\n")
# keep every row within +/-0.002 latitude and +/-0.004 longitude of the input location
points_around_input = find_closest_points_to_input(
    data=data_cropped, latitude=latitude, longitude=longitude, delta_lat=0.002, delta_lon=0.004
    )

finding closest point to 4.471132, 51.9139529:

found 25 unique lat/lon pairs:
              lat       lon
679173  51.912045  4.467944
679273  51.912944  4.467925
679373  51.913843  4.467907
679472  51.914741  4.467889
679570  51.915640  4.467870
687726  51.912057  4.469397
687824  51.912955  4.469378
687922  51.913854  4.469360
688020  51.914753  4.469342
688118  51.915651  4.469323
696223  51.912068  4.470850
696321  51.912967  4.470832
696419  51.913865  4.470813
696517  51.914764  4.470795
696614  51.915663  4.470777
704730  51.912079  4.472303
704828  51.912978  4.472285
704926  51.913877  4.472267
705024  51.914775  4.472248
705122  51.915674  4.472230
713235  51.912091  4.473756
713332  51.912989  4.473738
713429  51.913888  4.473720
713527  51.914787  4.473702
713625  51.915685  4.473683


In [95]:
# the locations inside the box around the input, drawn at street-level zoom
nearby_locations = points_around_input[['lat', 'lon']].drop_duplicates()
fig = px.scatter_map(nearby_locations, lat='lat', lon='lon', zoom=15, height=600)
fig.update_traces(marker=dict(size=10))
fig.update_layout(map_style="open-street-map")

# orange star at the location requested by the user
input_marker_style = dict(size=16, color="orange", symbol="star")
fig.add_trace(
    go.Scattermap(
        lat=[latitude],
        lon=[longitude],
        mode="markers",
        marker=input_marker_style,
        name=f"User Input {latitude}, {longitude}",
    )
)

HTML(fig.to_html(include_plotlyjs='cdn'))

### get profiles for data points

In [90]:
# Resolve material name and colour for every row. assign() builds a new frame, so
# nothing is written into a filtered slice (no SettingWithCopyWarning) and the cell
# gives the same result when it is re-run.
material_names = [map_lithoclasses[k] for k in points_around_input['lithoklasse']]
points_around_input = points_around_input.assign(
    lithoklasse_material=material_names,
    lithoklasse_color=[material_color_mapping[c] for c in material_names],
)

In [91]:
# one group per surface location; each group is the vertical profile at that location
grouped_by_location = points_around_input.groupby(['lat', 'lon'])

layer_columns = ['z', 'lithoklasse_material', 'lithoklasse_color']

unique_points = []   # list of {'lat', 'lon', 'layers': [row dicts sorted by z]}
profiles = dict()    # (lat, lon) -> DataFrame with the same layers
for (lat, lon), group in grouped_by_location:
    layers = group[layer_columns].sort_values('z')
    unique_points.append({
        'lat': lat,
        'lon': lon,
        'layers': layers.to_dict(orient='records')
    })
    profiles[(lat, lon)] = layers.reset_index(drop=True)

# count from the result itself; the earlier `unique_pairs` frame is left untouched
print(f"processed {len(unique_points)} unique lat/lon pairs")

profiles

processed 25 unique lat/lon pairs


{(np.float64(51.912045209441125),
  np.float64(4.467943641689354)):         z lithoklasse_material lithoklasse_color
 0  -49.75          kleiig_zand           #8a8783
 1  -49.25            zand_fijn           #000000
 2  -48.75            zand_fijn           #000000
 3  -48.25          kleiig_zand           #8a8783
 4  -47.75            zand_fijn           #000000
 ..    ...                  ...               ...
 95  -2.25          kleiig_zand           #8a8783
 96  -1.75                  NaN           #ffffff
 97  -1.25                  NaN           #ffffff
 98  -0.75                  NaN           #ffffff
 99  -0.25                  NaN           #ffffff
 
 [100 rows x 3 columns],
 (np.float64(51.9120565670889),
  np.float64(4.4693968373693025)):         z lithoklasse_material lithoklasse_color
 0  -49.75            zand_fijn           #000000
 1  -49.25            zand_fijn           #000000
 2  -48.75            zand_fijn           #000000
 3  -48.25            zand_fijn         

# Data Vis 3D Projection

In [92]:
def add_cuboid(fig, x, y, z_bottom, z_top, color, dx=0.01, dy=0.01):
    """
    Adds a closed cuboid (rectangular prism) to a Plotly 3D figure.

    fig = figure that receives one Mesh3d trace
    x, y = center coordinates
    z_bottom, z_top = vertical boundaries
    dx, dy = width in x and y directions
    color = face color
    """
    x0, x1 = x - dx/2, x + dx/2
    y0, y1 = y - dy/2, y + dy/2
    z0, z1 = z_bottom, z_top

    # corners 0-3 form the bottom rectangle, corners 4-7 the top rectangle
    vertices = array([
        [x0, y0, z0],
        [x1, y0, z0],
        [x1, y1, z0],
        [x0, y1, z0],
        [x0, y0, z1],
        [x1, y0, z1],
        [x1, y1, z1],
        [x0, y1, z1],
    ])

    # two triangles per face, six faces; every entry is a triple of corner indices
    triangles = [
        (0, 1, 2), (0, 2, 3),  # bottom (z0)
        (4, 5, 6), (4, 6, 7),  # top    (z1)
        (0, 1, 5), (0, 5, 4),  # front  (y0)
        (1, 2, 6), (1, 6, 5),  # right  (x1)
        (2, 3, 7), (2, 7, 6),  # back   (y1)
        (3, 0, 4), (3, 4, 7),  # left   (x0)
    ]
    I = [tri[0] for tri in triangles]
    J = [tri[1] for tri in triangles]
    K = [tri[2] for tri in triangles]

    fig.add_trace(go.Mesh3d(
        x=vertices[:,0],
        y=vertices[:,1],
        z=vertices[:,2],
        color=color,
        opacity=1.0,
        i=I, j=J, k=K,
        flatshading=True
    ))


In [94]:
fig = go.Figure()

# footprint of every drawn column, in the same units as lon / lat
dx, dy = 0.002, 0.002
for pt in unique_points:
    x, y = pt['lon'], pt['lat']
    layers = sorted(pt['layers'], key=lambda l: l['z'])

    # each layer reaches from its own z up to the z of the next layer
    for lower, upper in zip(layers, layers[1:]):
        add_cuboid(fig, x, y, lower['z'], upper['z'], lower['lithoklasse_color'], dx=dx, dy=dy)

    # the last layer has no successor, so it gets a fixed thickness of 0.5
    top_layer = layers[-1]
    add_cuboid(fig, x, y, top_layer['z'], top_layer['z']+0.5, top_layer['lithoklasse_color'], dx=dx, dy=dy)

# NOTE(review): the z values shown in this notebook are negative (e.g. -49.75);
# with a reversed axis the most negative layers end up at the top - confirm this is intended
scene_settings = dict(
    xaxis_title='Longitude',
    yaxis_title='Latitude',
    zaxis_title='Depth',
    zaxis=dict(autorange='reversed')
)
fig.update_layout(scene=scene_settings, width=800, height=500)

HTML(fig.to_html(include_plotlyjs='cdn'))

### Smoothing Filter 

##### Majority voting / clustering

In [None]:
from numpy import linspace, concatenate, unique, round, interp
from collections import Counter

In [105]:
# every z level that occurs in any profile around the input
all_z = unique(concatenate([
    array([l['z'] for l in pt['layers']])
    for pt in unique_points
]))

# common, coarser depth grid shared by all profiles
n_z_steps = 10
z_grid = linspace(all_z.min(), all_z.max(), n_z_steps)

smoothed_per_point = []

for pt in unique_points:
    layers = sorted(pt['layers'], key=lambda l: l['z'])
    z_orig = array([l['z'] for l in layers])

    # Nearest-neighbour resampling: for every grid level take the material of the
    # layer whose z is closest. Materials are categories, so they must not be
    # interpolated numerically - a value "between" two codes is an unrelated material.
    # Grid levels outside the profile fall back to its first / last layer.
    nearest_layer = argmin(abs(z_orig[None, :] - z_grid[:, None]), axis=1)

    smoothed_layers = [
        {'z': z, 'lithoklasse_material': layers[k]['lithoklasse_material']}
        for z, k in zip(z_grid, nearest_layer)
    ]

    smoothed_per_point.append({
        'lat': pt['lat'],
        'lon': pt['lon'],
        'layers': smoothed_layers
    })


In [113]:
# for every grid level, let all resampled profiles vote and keep the most frequent material
majority_layers = []
for level, z in enumerate(z_grid):
    votes = Counter([pt['layers'][level]['lithoklasse_material'] for pt in smoothed_per_point])
    winner = votes.most_common(1)[0][0]
    majority_layers.append({
        'z': z,
        'lithoklasse_material': winner,
        'lithoklasse_color': material_color_mapping[winner],
    })

# every location receives the same (shared) list of majority layers
final_points = [
    {'lat': pt['lat'], 'lon': pt['lon'], 'layers_smoothed': majority_layers}
    for pt in unique_points
]


In [115]:
fig = go.Figure()

# footprint of every drawn column, in the same units as lon / lat
dx, dy = 0.002, 0.002
for pt in final_points:
    x, y = pt['lon'], pt['lat']
    layers = sorted(pt['layers_smoothed'], key=lambda l: l['z'])

    # pair each layer with the one after it: the pair gives bottom and top of a cuboid
    for current_layer, next_layer in zip(layers[:-1], layers[1:]):
        add_cuboid(
            fig, x, y,
            current_layer['z'], next_layer['z'],
            current_layer['lithoklasse_color'],
            dx=dx, dy=dy,
        )

    # the last layer has no successor, so it gets a fixed thickness of 0.5
    top_layer = layers[-1]
    add_cuboid(fig, x, y, top_layer['z'], top_layer['z']+0.5, top_layer['lithoklasse_color'], dx=dx, dy=dy)

fig.update_layout(
    width=800,
    height=500,
    scene=dict(
        xaxis_title='Longitude',
        yaxis_title='Latitude',
        zaxis_title='Depth',
        zaxis=dict(autorange='reversed'),
    ),
)

HTML(fig.to_html(include_plotlyjs='cdn'))

##### Gaussian or moving average smoothing along depth

In [129]:
# sorted, so the order (and with it any tie-break) is the same on every run
all_materials = sorted({l['lithoklasse_material'] for pt in unique_points for l in pt['layers']})

smoothed_points = []

# Kernel size: sigma in Gaussian filter controls smoothness
sigma = 2  # adjust; larger = smoother

for pt in unique_points:
    layers = sorted(pt['layers'], key=lambda l: l['z'])
    z_vals = array([l['z'] for l in layers])
    materials = array([l['lithoklasse_material'] for l in layers])

    # Materials are categories without a numeric order, so the codes themselves
    # cannot be smoothed. Instead build one 0/1 indicator series per material
    # (rows = materials, columns = layers), smooth each series along depth and
    # pick, per layer, the material with the highest smoothed score.
    indicators = vstack([(materials == m).astype(float) for m in all_materials])
    smoothed_scores = gaussian_filter1d(indicators, sigma=sigma, axis=1)
    winners = smoothed_scores.argmax(axis=0)

    smoothed_materials = [all_materials[k] for k in winners]

    smoothed_layers = [
        {'z': z, 'lithoklasse_material': mat, 'lithoklasse_color': material_color_mapping[mat]}
        for z, mat in zip(z_vals, smoothed_materials)
        ]

    smoothed_points.append({
        'lat': pt['lat'],
        'lon': pt['lon'],
        'layers_smoothed': smoothed_layers
    })


In [144]:
import plotly.io as pio

pio.renderers.default = "notebook_connected"  # interactive widgets
# or "iframe" / "browser" if you want


In [146]:
fig = go.Figure()

# footprint of every drawn column, in the same units as lon / lat
dx, dy = 0.002, 0.002
for pt in smoothed_points:
    x, y = pt['lon'], pt['lat']
    layers = sorted(pt['layers_smoothed'], key=lambda l: l['z'])

    # all layers except the last: drawn from their own z up to the z of the following layer
    for position, layer in enumerate(layers[:-1]):
        z_bottom, z_top = layer['z'], layers[position + 1]['z']
        add_cuboid(fig, x, y, z_bottom, z_top, layer['lithoklasse_color'], dx=dx, dy=dy)

    # the last layer has no successor, so it gets a fixed thickness of 0.5
    top_layer = layers[-1]
    add_cuboid(fig, x, y, top_layer['z'], top_layer['z']+0.5, top_layer['lithoklasse_color'], dx=dx, dy=dy)

axis_layout = {
    'xaxis_title': 'Longitude',
    'yaxis_title': 'Latitude',
    'zaxis_title': 'Depth',
    'zaxis': {'autorange': 'reversed'},
}
fig.update_layout(scene=axis_layout, width=800, height=500)

HTML(fig.to_html(include_plotlyjs='cdn'))

# Output as JSON

### prepare for output

# Prepare for Output

In [None]:
# translate the numeric lithoclass codes into material names; Series.map leaves codes
# that are missing from map_lithoclasses as NaN instead of raising
lithoclass_material = data['lithoklasse'].map(map_lithoclasses)
# NOTE(review): the column is added to the full `data` frame in place, before cropping
data['lithoclass_material'] = lithoclass_material

In [None]:
# same window as used for the map, applied to the frame that now carries the material names
idx = IndexSlice
export_window = idx[rotterdam_x_min:rotterdam_x_max, rotterdam_y_min:rotterdam_y_max, :]
cropped = data.loc[export_window, :].sort_index()
print(f"Reduced shape to {cropped.shape} compared to {data.shape}")

In [None]:
# columns that go into the export: coordinates, class (code and name) and all likelihoods
selected_columns = [
    'x', 'y', 'z', 'lithoklasse', 'lithoclass_material',
    'kans_1_veen', 'kans_2_klei', 'kans_3_kleiig_zand', 'kans_4_vervallen', 'kans_5_zand_fijn',
    'kans_6_zand_matig_grof', 'kans_7_zand_grof', 'kans_8_grind', 'kans_9_schelpen'
]

# in the export the numeric code is called `lithoklasse_id` and the material name `lithoklasse`
export_names = {
    'lithoklasse': 'lithoklasse_id',
    'lithoclass_material': 'lithoklasse',
}
cropped = cropped.reset_index()[selected_columns].rename(columns=export_names)

cropped

In [None]:
def _profile_records(group):
    """Return the rows of one (x, y) group as dicts, ordered from highest to lowest z."""
    records = group.to_dict(orient='records')
    records.sort(key=lambda d: d['z'], reverse=True)
    return records

# one entry per (x, y) location, holding the list of its layer records
profiled = cropped.groupby(['x', 'y']).apply(_profile_records).reset_index(name='data')

list_of_lists = profiled['data'].tolist()

### export all data in one file

For every data point, store the identified lithoclass (as a material name, via the mapping) together with the likelihood of each material being present.

In [None]:
# file name: fixed prefix + crop window + today's date
base_name = "B01-to-B09_lithoclass_materials_and_likelihood"
name_file =  f"_{rotterdam_x_min}-{rotterdam_x_max}_{rotterdam_y_min}-{rotterdam_y_max}_{datetime.now().date().isoformat()}.json"

# make sure the export folder exists; on a fresh checkout open() would fail otherwise
os.makedirs(dir_export, exist_ok=True)
file_path = os.path.join(dir_export, base_name + name_file)

with open(file_path, 'w', encoding='utf-8') as f:
    json.dump(list_of_lists, f)

print(f"stored {file_path}")

### export in batches

In [None]:
# upper bound for the summed serialized size of the profiles in one batch file
MAX_BYTES = 5 * 1024 * 1024  # 5 MB

# running state of the batch writer in the next cell
batch = []  # profiles collected for the file that is currently being filled
batch_size = 0  # summed serialized size (bytes) of the profiles in `batch`
file_index = 1  # number used in the name of the next file

# date-stamped sub-folder of dir_export that receives the batch files
output_dir = dir_export + f"json_5MB_chunks_{datetime.now().date().isoformat()}/"
os.makedirs(output_dir, exist_ok=True)

In [None]:
def _write_batch(profiles_in_batch, size_in_bytes, index):
    """Write one batch of profiles as compact JSON into output_dir and report it."""
    file_name = os.path.join(output_dir, f"litho_batch_{index}.json")
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(profiles_in_batch, f, separators=(',', ':'))
    print(f"Stored {file_name} ({size_in_bytes / 1024**2:.2f} MB)")


# start from a clean state so that re-running this cell on its own never
# continues with leftovers (half-filled batch, advanced file index) of a previous run
batch, batch_size, file_index = [], 0, 1

for sublist in list_of_lists:
    # size of this profile when serialized the same way it is written
    sublist_bytes = len(json.dumps(sublist, separators=(',', ':')).encode('utf-8'))

    # flush before the limit would be exceeded; a single oversized profile still gets its own file
    if batch_size + sublist_bytes > MAX_BYTES and batch:
        _write_batch(batch, batch_size, file_index)
        file_index += 1
        batch = []
        batch_size = 0

    batch.append(sublist)
    batch_size += sublist_bytes

# write whatever is left after the last profile
if batch:
    _write_batch(batch, batch_size, file_index)

# Read JSON

In [None]:
# NOTE(review): this replaces the date-stamped output_dir of the batch export with a
# folder of one fixed date; adjust it (or drop this cell) to read back an export
# that was written on another day
output_dir = 'output/json_5MB_chunks_2025-11-24/'

In [None]:
# sorted like ls_files, so the files are read in the same order on every run
ls_files_for_read = sorted(glob(output_dir + '*.json'))

In [None]:
def _records_from_entry(entry):
    """Turn one top-level JSON entry into a list of row dicts.

    A list of dicts is used as is, a list of other values becomes one
    {"value": ...} row per element, a single dict becomes one row, and any
    other value becomes a single {"value": ...} row.
    """
    if isinstance(entry, list):
        if len(entry) > 0 and isinstance(entry[0], dict):
            return entry
        return [{"value": value} for value in entry]
    if isinstance(entry, dict):
        return [entry]
    return [{"value": entry}]


ls_df = []
for path in ls_files_for_read:
    # `payload`, not `data`: the GeoTOP frame named `data` must stay available to earlier cells
    with open(path, 'r', encoding='utf-8') as f:
        payload = json.load(f)
    print(f"Reading {path}")

    # collect all rows of the file first and build a single frame from them,
    # instead of concatenating one small frame per profile
    records = []
    for entry in payload:
        records.extend(_records_from_entry(entry))
    ls_df.append(DataFrame(records))

# concat() raises on an empty list, e.g. when output_dir contains no json files
df_import = concat(ls_df, ignore_index=True) if ls_df else DataFrame()

In [None]:
# sanity check of the export/import round trip: print the shape of the exported frame,
# of the re-imported frame, and of the re-imported frame without duplicate rows
print(
    f"comparing shapes of datasets:"
    f"\n original dataset: {cropped.shape},"
    f"\n re-imported dataset: {df_import.shape} and"
    f"\n duplicated removed: {df_import.drop_duplicates().shape}")