In [None]:
import os 
import json
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import geopandas as gpd
import plotly.io as pio

from scipy.ndimage import gaussian_filter1d
from cmcrameri import cm
from glob import glob
from datetime import datetime
from pandas import concat, DataFrame, read_csv
from numpy import arange, array, linspace
from IPython.display import HTML

import functions as ft

sns.set_style('white')

# Introduction

**ToDo**
- Describe the workflow.
- Replace lithoklasse `0` by the average of the surrounding cells, working from bottom to top; mark every replaced value with `estimated = True` (additional column) and save it with the data.
- Store the map of batch ID vs. geo-coordinates as JSON.

# User Input

In [None]:
user_input_address = "Depot Boijmans Van Beuningen"

# Settings

In [None]:
url = "https://service.pdok.nl/cbs/gebiedsindelingen/2025/wfs/v1_0?request=GetFeature&service=WFS&version=2.0.0&typeName=gemeente_gegeneraliseerd&outputFormat=json"

In [None]:
url_openstreetmap = "https://nominatim.openstreetmap.org/search"

In [None]:
dir_geotop = 'input/GeoTOP_v01r6s1_csv_bestanden/'
dir_export = 'output/'

In [None]:
save = False

In [None]:
# projections for different coordinate systems
projection_rd_amersfoort = 'epsg:28992'
projection_geocoordinates = 'epsg:4326'

In [None]:
# Lookup table: lithoklasse code -> material name.
map_lithoclasses = {
    0: 'NaN',
    1: 'veen',
    2: 'klei',
    3: 'kleiig_zand',
    4: 'vervallen',
    5: 'zand_fijn',
    6: 'zand_matig_grof',
    7: 'zand_grof',
    8: 'grind',
    9: 'schelpen',
}

In [None]:
# Lookup table: material name -> hex colour used when drawing that material.
material_color_mapping = {
    'NaN': '#ffffff',
    'veen': '#64564c',
    'klei': '#b2a38d',
    'kleiig_zand': '#8a8783',
    'vervallen': '#ee82ee',
    'zand_fijn': '#000000',
    'zand_matig_grof': '#c5c5c5',
    'zand_grof': '#616160',
    'grind': '#ffff82',
    'schelpen': '#eb611e',
}


In [None]:
data = DataFrame()

In [None]:
axes_color = '#333333'
fontsize = 10

# Get coordinates from User Input

In [None]:
# Resolve the user-supplied address to (latitude, longitude) through the
# configured search endpoint. Every failure path (no input, request error,
# empty or malformed answer) falls back to the default coordinates, so the
# cells below always receive numeric values.
default_latitude, default_longitude = 51.9139529, 4.4711320

if not user_input_address:
    latitude, longitude = default_latitude, default_longitude
    print(f"No user input defined; fall back to default: {latitude}, {longitude} (lat, lon)")

else:
    try:
        geo = requests.get(
            url_openstreetmap,
            headers={"User-Agent": "CaraLogic (contact: silvia@caralogic.com)"},
            params={"q": user_input_address, "format": "json", "limit": 1},
            timeout=10,  # do not let the notebook hang on an unresponsive server
        )
        geo.raise_for_status()

        matches = geo.json()  # parse the response body once
        if len(matches) == 0:
            raise LookupError(f"no data found for {user_input_address}")

        location = matches[0]
        latitude, longitude = float(location['lat']), float(location['lon'])
        print(f"Coordinates found for {user_input_address}: {latitude}, {longitude} (lat, lon)")

    # RequestException: transport/HTTP errors; LookupError: empty result or a
    # missing 'lat'/'lon' key; TypeError/ValueError: unparsable body or numbers.
    except (requests.RequestException, LookupError, TypeError, ValueError) as error:
        latitude, longitude = default_latitude, default_longitude
        print(
            f"Geocoding failed for {user_input_address} ({error}); "
            f"fall back to default: {latitude}, {longitude} (lat, lon)"
        )

# Get and Prepare Data

### Get GeoTop Data

In [None]:
# All CSV files in the GeoTOP input directory, in sorted path order.
geotop_pattern = dir_geotop + '*.csv'
ls_files = sorted(glob(geotop_pattern))
ls_files

In [None]:
# Read every file with its first three columns as index, then stack all files
# into one frame ordered by that index.
ls_data = []
for csv_path in ls_files:
    ls_data.append(read_csv(csv_path, index_col=[0, 1, 2], engine="pyarrow"))

data = concat(ls_data).sort_index()
data.head(10)

In [None]:
data.info()

## Focus on City of Rotterdam

#### Get Rotterdam City Boundary 

In [None]:
municipalities = gpd.read_file(url)
municipalities.head()

In [None]:
# Select the Rotterdam municipality and reproject it into both coordinate systems.
rotterdam = municipalities[municipalities["statnaam"] == "Rotterdam"]
if rotterdam.empty:
    # Fail here with a clear message; the cells below index the first feature
    # and would otherwise stop with an unrelated IndexError.
    raise ValueError('no municipality with statnaam "Rotterdam" in the downloaded boundaries')

# to_crs() accepts the 'epsg:XXXX' strings directly, so the code does not have
# to be split off and passed separately.
rotterdam_rd = rotterdam.to_crs(projection_rd_amersfoort)
rotterdam_rd_json = json.loads(rotterdam_rd.to_json())

rotterdam_geo = rotterdam.to_crs(projection_geocoordinates)
rotterdam_geo_json = json.loads(rotterdam_geo.to_json())

In [None]:
# Coordinates of the first ring of the first polygon of the first feature.
# NOTE(review): the [0][0] indexing assumes MultiPolygon nesting - confirm.
first_feature = rotterdam_rd_json['features'][0]
first_ring = first_feature['geometry']['coordinates'][0][0]
rotterdam_rd_border = DataFrame(first_ring, columns=['x', 'y'])
rotterdam_rd_border

#### Crop To Rotterdam City

In [None]:
# Attach a point geometry (x, y) to every row of the data.
# reset_index() copies the whole frame, so do it once and reuse the result
# instead of once per coordinate.
data_flat = data.reset_index()

gdf_points = gpd.GeoDataFrame(
    data,
    geometry=gpd.points_from_xy(data_flat.x, data_flat.y),
    crs=projection_rd_amersfoort
)

In [None]:
# Keep only the points that lie within the merged municipality geometry.
rotterdam_outline = rotterdam_rd.union_all()
inside_rotterdam = gdf_points.geometry.within(rotterdam_outline)
points_in_rotterdam = gdf_points[inside_rotterdam]
points_in_rotterdam.head()

In [None]:
print(f"Data points available within Rotterdam {points_in_rotterdam.shape}")

## Convert RD-Coordinates to Geo-Coordinates 

and plot the data locations on a 2D Plotly map

In [None]:
points_in_rotterdam = ft.convert_rd_into_geocoordinates(points_in_rotterdam)
points_in_rotterdam.head()

In [None]:
# One row per distinct (lat, lon) location: the first occurrence of each pair.
points_flat = points_in_rotterdam.reset_index()
unique_pairs = points_flat[['lat', 'lon']].drop_duplicates()

df_unique_pairs = points_flat.loc[unique_pairs.index]
df_unique_pairs.head()

In [None]:
# Map of all unique data locations with the municipal boundary drawn on top.
center_lat, center_lon = unique_pairs.median()

fig = px.scatter_map(
    df_unique_pairs,
    lat="lat", lon="lon", center={"lat": center_lat, "lon": center_lon},
    zoom=9, height=600, map_style="carto-positron"
    )

for feature in rotterdam_geo_json["features"]:
    geometry = feature["geometry"]
    # MultiPolygon: outer ring of every polygon; otherwise: every ring as given.
    if geometry["type"] == "MultiPolygon":
        rings = [polygon[0] for polygon in geometry["coordinates"]]
    else:
        rings = geometry["coordinates"]

    boundary_lat, boundary_lon = [], []
    for ring in rings:
        boundary_lon.extend(coord[0] for coord in ring)
        boundary_lat.extend(coord[1] for coord in ring)
        # A None entry breaks the line, so separate rings are not joined by a
        # stray segment.
        boundary_lon.append(None)
        boundary_lat.append(None)

    # px.scatter_map builds a MapLibre ("map") figure; the boundary has to be a
    # Scattermap trace to share that subplot. Scattermapbox belongs to the
    # separate "mapbox" subplot type.
    fig.add_trace(go.Scattermap(
        lat=boundary_lat,
        lon=boundary_lon,
        mode="lines",
        line=dict(color="red", width=3),
        name="Rotterdam boundary"
    ))

fig.show()

if save:
    print('exporting figure rotterdam_datapoints.html... ')
    pio.write_html(fig, "output/rotterdam_datapoints.html", full_html=True, include_plotlyjs='cdn')


# Find Data Points in Data Set from Input

### Get Box Around Address 

In [None]:
print(f"finding closest point to {longitude}, {latitude}:\n")
points_around_input = ft.find_closest_points_to_input(
    data=points_in_rotterdam, latitude=latitude, longitude=longitude, delta_lat=0.002, delta_lon=0.004
    )

In [None]:
# Data locations inside the search box, plus a star marker at the input position.
nearby_locations = points_around_input[['lat', 'lon']].drop_duplicates()

fig = px.scatter_map(nearby_locations, lat='lat', lon='lon', zoom=15, height=600)
fig.update_traces(marker=dict(size=10))
fig.update_layout(map_style="carto-positron")

# Added after update_traces so the marker size above applies to the data points only.
input_marker = go.Scattermap(
    lat=[latitude],
    lon=[longitude],
    mode="markers",
    marker=dict(size=16, color="orange", symbol="star"),
    name=f"User Input {latitude}, {longitude}",
)
fig.add_trace(input_marker)

HTML(fig.to_html(include_plotlyjs='cdn'))

### Get Profiles for Data Points

In [None]:
# Translate each lithoklasse code into its material name, then into its colour.
material_names = []
for litho_code in points_around_input.lithoklasse:
    material_names.append(map_lithoclasses[litho_code])

points_around_input['lithoklasse_material'] = material_names
points_around_input['lithoklasse_color'] = [material_color_mapping[name] for name in material_names]

In [None]:
profiles, unique_points = ft.get_unique_points(points_around_input)

# Separate Data Exploration

### Class Distribution within Rotterdam

In [None]:
points_in_rotterdam.head()

In [None]:
# Absolute count and percentage share per lithoklasse, side by side.
litho_counts = points_in_rotterdam.lithoklasse.value_counts()
litho_percent = points_in_rotterdam.lithoklasse.value_counts(normalize=True) * 100

lithoclass_overview = concat([litho_counts, litho_percent], axis=1)

lithoclass_overview

In [None]:
# Percentage share of every lithoklasse per z level: one row per z, one column
# per lithoklasse, 0 where a class does not occur at that level.
litho_by_depth = points_in_rotterdam.groupby(level='z')['lithoklasse']
share_by_depth = litho_by_depth.value_counts(normalize=True).mul(100)

depth_dist = DataFrame(share_by_depth.unstack(fill_value=0))
depth_dist

### Visualizing Distribution

In [None]:
df_plot = depth_dist.sort_index()
colors = cm.vik(linspace(0, 1, len(df_plot.columns)))

In [None]:
# Stacked-area view of the lithoklasse shares along depth.
fig, ax = plt.subplots(figsize=(5, 7))
df_plot.plot.area(ax=ax, cmap=cm.vik)

ax.invert_xaxis()

legend_frame = dict(edgecolor=axes_color, borderpad=.65, fontsize=fontsize*0.75)
leg = ax.legend(title="Lithoklasse", loc='upper left', ncols=8, **legend_frame)
leg.get_frame().set_linewidth(.5)

# Hide the default frame ...
for side in ax.spines:
    ax.spines[side].set_visible(False)

# ... and draw two axis lines at the first y-limit and the first x-limit instead.
axis_line = dict(color=axes_color, linewidth=1.2, zorder=10)
ax.axhline(y=ax.get_ylim()[0], **axis_line)
ax.axvline(x=ax.get_xlim()[0], **axis_line)

ax.tick_params(axis='both', colors=axes_color)

ax.grid(False)
ax.set_xlabel("Depth $z$, m", fontsize=fontsize)
ax.set_ylabel("Percentage, %", fontsize=fontsize)
ax.set_title("Vertical Distribution of Lithology Classes", loc='left', fontsize=fontsize*1.25)

fig.tight_layout()

plt.show()

In [None]:
# One kernel-density curve per lithoklasse along depth, weighted by that
# class's percentage share at each level.
fig, ax = plt.subplots(figsize=(4, 6))
for i, col in enumerate(df_plot.columns):
    sns.kdeplot(y=df_plot.index, weights=df_plot[col], bw_adjust=.2, lw=1.5, label=col, ax=ax, color=colors[i])

leg = ax.legend(
    title="Lithoklasse", loc=0, ncols=8,
    edgecolor=axes_color, borderpad=.65, fontsize=fontsize*0.75
    )
leg.get_frame().set_linewidth(.5)

for spine in ax.spines.values():
    spine.set_visible(False)

ax.axhline(y=ax.get_ylim()[0], color=axes_color, linewidth=1.2, zorder=10)
ax.axvline(x=ax.get_xlim()[0], color=axes_color, linewidth=1.2, zorder=10)

ax.tick_params(axis='x', colors=axes_color)
ax.tick_params(axis='y', colors=axes_color)

ax.grid(False)
ax.set_ylabel("Depth $z$, m", fontsize=fontsize)
# kdeplot draws a density estimate, not the percentage itself, so label it as such.
ax.set_xlabel("Weighted density", fontsize=fontsize)
ax.set_title("Distribution of Lithology Classes", loc='left', fontsize=fontsize*1.25)

plt.tight_layout()

# Finish the cell like the other plotting cells (no stray object repr as output).
plt.show()

In [None]:
# Heatmap of the lithoklasse share (columns) per depth level (rows).
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df_plot, cmap=cm.batlowW_r, cbar=True, cbar_kws={'label': 'Percentage, %'}, ax=ax)

ax.invert_yaxis()
# Heatmap cell i spans [i, i + 1] on the x axis; ticks belong at the cell
# centres (i + 0.5). Integer positions put every label on a cell edge.
ax.set_xticks(arange(len(df_plot.columns)) + 0.5)
ax.set_xticklabels(df_plot.columns)

ax.set_xlabel("Lithoklasse")
ax.set_ylabel("Depth $z$, m")
ax.set_title("Lithology Distribution per Depth", loc='left')
fig.tight_layout()
plt.show()

# Data Vis 3D Projection

## Original Voxel

In [None]:
# Voxel view of the unsmoothed profiles around the input location.
# save_name now matches the name the rotation cell uses for this dataset; it
# previously carried the smoothed dataset's name (copy-paste slip). The call
# itself passes save=False.
fig = ft.plot_voxel(
    unique_points, elev=30, azim=45,
    layer_label='layers',  dx=0.002, dy=0.002,
    save_name='output/original_dataset',
    save=False, display_plot=True, figsize=(18,6)
    )

if save:
    print('exporting original voxel...')
    fig.savefig(f"output/original_voxel_around_{longitude}-{latitude}.png")

In [None]:
# One view every 30 degrees of azimuth. endpoint=False leaves out 360, which is
# the same viewing direction as 0 and would only add a duplicate frame (and
# turned the step into 360/11 degrees).
dic_original_figs = dict()
for azim in linspace(0, 360, 12, endpoint=False):
    dic_original_figs[azim] = ft.plot_voxel(
        unique_points, elev=30, azim=azim,
        save_name='output/original_dataset',
        layer_label='layers',  dx=0.002, dy=0.002,
        save=True, display_plot=False, figsize=(18,6)
        )

## Smoothing 

### Gaussian Smoothing
i.e. a Gaussian-weighted moving average of the material codes along the depth axis

In [None]:
# Encode the materials as consecutive integers so they can be filtered
# numerically. The codes follow the lithoklasse order of map_lithoclasses.
# Plain set() iteration order depends on string hashing, which differs between
# kernel sessions, so the smoothed result was not reproducible.
litho_rank = {material: code for code, material in map_lithoclasses.items()}
all_materials = sorted(
    {l['lithoklasse_material'] for pt in unique_points for l in pt['layers']},
    # materials missing from map_lithoclasses sort last, alphabetically
    key=lambda material: (litho_rank.get(material, len(litho_rank)), str(material)),
)
material_to_num = {m: i for i, m in enumerate(all_materials)}
num_to_material = {i: m for m, i in material_to_num.items()}

smoothed_points = []

# Kernel size: sigma in Gaussian filter controls smoothness
sigma = 2  # adjust; larger = smoother

for pt in unique_points:
    # Filter along depth, so order the layers by z first.
    layers = sorted(pt['layers'], key=lambda l: l['z'])
    z_vals = array([l['z'] for l in layers])
    mat_nums = array([material_to_num[l['lithoklasse_material']] for l in layers])

    smoothed_nums = gaussian_filter1d(mat_nums.astype(float), sigma=sigma)

    # Round back to the nearest code. Where two materials with non-adjacent
    # codes meet, this can yield a material that lies between them in the
    # ordering above.
    smoothed_materials = [num_to_material[int(round(n))] for n in smoothed_nums]

    smoothed_layers = [
        {'z': z, 'lithoklasse_material': mat, 'lithoklasse_color': material_color_mapping[mat]}
        for z, mat in zip(z_vals, smoothed_materials)
        ]

    smoothed_points.append({
        'lat': pt['lat'],
        'lon': pt['lon'],
        'layers_smoothed': smoothed_layers
    })


In [None]:
fig = ft.plot_voxel(
    smoothed_points, elev=30, azim=45, 
    layer_label='layers_smoothed',  dx=0.002, dy=0.002, 
    save_name='output/smoothed_dataset',
    save=False, display_plot=True, figsize=(18,6)
    )

if save:
    print('exporting Gaussian smoothed voxel...')
    fig.savefig(f"output/20251211_Gauss-smoothed_voxel_around_{longitude}-{latitude}.png")

In [None]:
# One view every 30 degrees of azimuth. endpoint=False leaves out 360, which is
# the same viewing direction as 0 and would only add a duplicate frame (and
# turned the step into 360/11 degrees).
dic_smoothed_figs = dict()
for azim in linspace(0, 360, 12, endpoint=False):
    dic_smoothed_figs[azim] = ft.plot_voxel(
        smoothed_points, layer_label='layers_smoothed', elev=30, azim=azim, save=True,
        save_name='output/smoothed_dataset', display_plot=False, dx=0.002, dy=0.002, figsize=(18,6)
        )

### Majority Voting / clustering of adjacent same class layers (chosen)
Start a new box whenever lat, lon or lithoklasse changes from one row to the next, <br>
creating a box start and a box end for every run of identical rows
→ i.e. `z_start` – `z_end`.

In [None]:
data_grouped = points_in_rotterdam.reset_index()

print(data_grouped.shape)
data_grouped.head()

In [None]:
# A row opens a new group when its lithoklasse, lon or lat differs from the
# row directly above it; the running sum of those flags is the group id.
litho_changed = data_grouped['lithoklasse'].ne(data_grouped['lithoklasse'].shift())
lon_changed = data_grouped['lon'].ne(data_grouped['lon'].shift())
lat_changed = data_grouped['lat'].ne(data_grouped['lat'].shift())

data_grouped['group'] = (litho_changed | lon_changed | lat_changed).cumsum()

print(data_grouped.shape)
data_grouped.head()

### 3D filling in the vertical space for lithoklasse 0

In [None]:
from collections import Counter
from typing import Literal
from numpy import isnan, sort, nan, full, zeros, zeros_like, round, mean, argsort, array_split, where
from joblib import Parallel, delayed  

In [None]:
def fill_lithoklasse_3d_parallel_safe(
    df: DataFrame,
    lon_col: str = "lon",
    lat_col: str = "lat",
    z_col: str = "z",
    litho_col: str = "lithoklasse",
    method: str = "mean_round",
    round_decimals: int = 6,
    n_jobs: int = 4
) -> DataFrame:
    """
    Fill cells whose lithoklasse is 0 from their non-zero neighbours, bottom to top.

    The rows are placed on a 3D grid whose axes are the sorted unique (rounded)
    values of ``lon_col``, ``lat_col`` and ``z_col``. Levels are visited from the
    second-lowest z upwards; the lowest level is never modified. The candidates
    for a zero cell are the 3x3 block one level below plus the eight cells around
    it on its own level. Values filled on one level are seen by the levels above,
    but not by other cells of the same level (all cells of a level are evaluated
    before any of them is written).

    Parameters
    ----------
    df : DataFrame
        Input rows; not modified. Any index is accepted, rows are addressed by
        position.
    lon_col, lat_col, z_col : str
        Columns holding the three grid coordinates.
    litho_col : str
        Column holding the lithoklasse code; 0 marks a cell to be filled.
    method : str
        ``"mean_round"``: rounded mean of the neighbour codes. Any other value:
        most frequent neighbour code, ties resolved to the smallest code.
    round_decimals : int
        Decimals the coordinates are rounded to before the grid is built.
    n_jobs : int
        Worker count handed to joblib. ``1`` evaluates the cells in a plain loop
        and does not import joblib.

    Returns
    -------
    DataFrame
        Copy of ``df`` with ``litho_col`` updated and a boolean ``estimated``
        column that is True for every filled row.

    Notes
    -----
    * Neighbourhood is defined on the rank of the unique coordinate values and
      the dense grid has ``n_lon * n_lat * n_z`` cells.
      NOTE(review): this matches spatial adjacency (and stays small) only when
      the two horizontal columns lie on a regular, axis-aligned grid - confirm
      this holds for the columns passed in.
    * ``"mean_round"`` averages class codes, so the result can be a code that
      none of the neighbours carries.
    * If several rows share one grid cell, a single one of them represents it.
    """
    # Local imports: the function no longer depends on notebook-level bindings
    # (in particular a numpy `round` that shadows the builtin).
    from collections import Counter
    import numpy as np

    df = df.copy()
    df["estimated"] = False
    if df.empty:
        return df

    def _axis_index(column):
        """Return (number of unique rounded values, per-row rank on that axis)."""
        rounded = np.round(df[column].to_numpy(dtype=float), round_decimals)
        unique_vals, inverse = np.unique(rounded, return_inverse=True)
        return len(unique_vals), inverse.reshape(-1)

    nx, ix_rows = _axis_index(lon_col)
    ny, iy_rows = _axis_index(lat_col)
    nz, iz_rows = _axis_index(z_col)

    # row_pos maps a grid cell to the position of its row in df (-1 = no row).
    row_pos = np.full((nx, ny, nz), -1, dtype=np.intp)
    row_pos[ix_rows, iy_rows, iz_rows] = np.arange(len(df))
    present = row_pos >= 0

    # Derive the values from row_pos so both arrays describe the same row even
    # when several rows fall into one cell.
    arr = np.full((nx, ny, nz), np.nan, dtype=float)
    arr[present] = df[litho_col].to_numpy(dtype=float)[row_pos[present]]

    offsets = [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 0), (0, 1), (1, -1), (1, 0), (1, 1)]
    offsets_same = [o for o in offsets if o != (0, 0)]

    def _estimate(cx, cy, level):
        """Return the fill value for one zero cell, or None without usable neighbours."""
        neighbors = []
        # level below: full 3x3 block; own level: the 8 surrounding cells
        for k, cell_offsets in ((level - 1, offsets), (level, offsets_same)):
            for dx, dy in cell_offsets:
                x2, y2 = cx + dx, cy + dy
                if 0 <= x2 < nx and 0 <= y2 < ny and present[x2, y2, k]:
                    v = arr[x2, y2, k]
                    if not np.isnan(v) and int(v) != 0:
                        neighbors.append(int(np.round(v)))

        if len(neighbors) == 0:
            return None

        if method == "mean_round":
            return int(np.round(np.mean(neighbors)))

        counts = Counter(neighbors)
        max_count = max(counts.values())
        return int(min(val for val, c in counts.items() if c == max_count))

    if n_jobs != 1:
        from joblib import Parallel, delayed

    zero_mask = (arr == 0) & present
    filled_rows, filled_vals = [], []

    # Levels are processed sequentially (each depends on the one below); only
    # the cells within one level are independent of each other.
    for level in range(1, nz):  # skip bottom layer
        xs, ys = np.nonzero(zero_mask[:, :, level])
        if len(xs) == 0:
            continue
        cells = list(zip(xs.tolist(), ys.tolist()))

        if n_jobs == 1:
            values = [_estimate(cx, cy, level) for cx, cy in cells]
        else:
            values = Parallel(n_jobs=n_jobs)(
                delayed(_estimate)(cx, cy, level) for cx, cy in cells
            )

        # Write back only after the whole level has been evaluated.
        for (cx, cy), val in zip(cells, values):
            if val is None:
                continue
            arr[cx, cy, level] = val
            zero_mask[cx, cy, level] = False  # mark as filled
            filled_rows.append(int(row_pos[cx, cy, level]))
            filled_vals.append(val)

    if filled_rows:
        # positional assignment: independent of the index labels of df
        df.iloc[filled_rows, df.columns.get_loc(litho_col)] = filled_vals
        df.iloc[filled_rows, df.columns.get_loc("estimated")] = True

    return df


In [None]:
# Fill lithoklasse == 0 cells of the Rotterdam data from their neighbours,
# using the rounded mean of the neighbour codes and 8 joblib workers.
# The default coordinate columns ("lon", "lat", "z") are used.
# NOTE(review): `filled` is not referenced again in the visible part of the
# notebook - the grouping and export below still start from `data_grouped`;
# confirm whether they should consume `filled` instead.
# NOTE(review): the grid is built from the unique lon/lat values - confirm
# these lie on a regular grid, otherwise consider passing the x/y columns.
filled = fill_lithoklasse_3d_parallel_safe(
    data_grouped,
    method="mean_round",
    n_jobs=8 
)

In [None]:
likelihood_cols = [col for col in data_grouped.columns if col.startswith('kans_')]
likelihood_cols

When grouping, aggregate the columns lon, lat, z, lithology and likelihood as follows (`agg_dict`):
- lon / lat → keep the single value, or a list of the unique values if there is more than one
- z → min and max
- lithology → take the first value
- likelihood columns → take the mean

In [None]:
# Aggregation rules per column, applied to every 'group' further below.
# The lambdas are deliberately left anonymous: their '<lambda>' name becomes
# part of the flattened column names that a later cell renames.
agg_dict = {
    # single value if the group has one coordinate, otherwise list of the unique values
    'lon': lambda x: x.iloc[0] if x.nunique() == 1 else list(x.unique()),
    'lat': lambda x: x.iloc[0] if x.nunique() == 1 else list(x.unique()),

    # vertical extent of the group
    'z': ['min', 'max'],

    # first value of the group
    'lithostrat': lambda x: x.iloc[0],
    'lithoklasse': lambda x: x.iloc[0],
}

In [None]:
# Likelihood columns are averaged within each group.
agg_dict.update({likelihood_col: 'mean' for likelihood_col in likelihood_cols})

df_grouped = data_grouped.groupby('group').agg(agg_dict)

# Flatten two-level column labels into '<column>_<aggregation>' names.
flat_names = []
for label in df_grouped.columns:
    flat_names.append(f'{label[0]}_{label[1]}' if isinstance(label, tuple) else label)
df_grouped.columns = flat_names

In [None]:
# Strip the '_<lambda>' suffix from the columns aggregated with a lambda.
lambda_columns = ('lon', 'lat', 'lithoklasse', 'lithostrat')
plain_names = {f'{name}_<lambda>': name for name in lambda_columns}
df_grouped = df_grouped.rename(columns=plain_names)

print(df_grouped.shape)
df_grouped = df_grouped.reset_index(drop=True)
df_grouped.head()

In [None]:
for grp  in df_grouped[df_grouped.lithoklasse == 0].groupby('lon'):
    print(grp)

Verify that the likelihood (`kans_*`) columns add up to 100% per row

In [None]:
data_kans = data_grouped.filter(like='kans_').sum(axis=1)
data_kans

In [None]:
data_kans.plot(lw=0, marker='o', figsize=(13, 3.5))

In [None]:
# Translate each lithoklasse code into its material name, then into its colour.
grouped_materials = []
for litho_code in df_grouped.lithoklasse:
    grouped_materials.append(map_lithoclasses[litho_code])

df_grouped['lithoklasse_material'] = grouped_materials
df_grouped['lithoklasse_color'] = [material_color_mapping[name] for name in grouped_materials]

In [None]:
# Keep the grouped rows whose (lon, lat) pair occurs around the input location.
# Two independent isin() filters would also accept a lon taken from one point
# combined with a lat taken from another.
selected_pairs = set(zip(points_around_input.lon, points_around_input.lat))
is_selected = [pair in selected_pairs for pair in zip(df_grouped['lon'], df_grouped['lat'])]

filtered = df_grouped[is_selected]
filtered

In [None]:
profiles_grouped, unique_points_grouped = ft.get_unique_points(points_around_input=filtered)

In [None]:
fig = ft.plot_voxel(
    unique_points_grouped, elev=30, azim=45, 
    layer_label='layers',  dx=0.002, dy=0.002, 
    save_name='output/smoothed_dataset',
    save=False, display_plot=True, figsize=(18,6)
    )

if save:
    fig.savefig(f"output/20251211_Content-smoothed_voxel_around_{longitude}-{latitude}.png")

# Output as JSON

### Prepare for Output

In [None]:
columns_kans = df_grouped.filter(like='kans_').columns
df_grouped[columns_kans] = df_grouped[columns_kans].round(4)

In [None]:
df_grouped.rename(columns={'z_min': 'z_bottom', 'z_max': 'z_top'}, inplace=True)

In [None]:
lithoclass_material = df_grouped['lithoklasse'].map(map_lithoclasses)
df_grouped['lithoclass_material'] = lithoclass_material

In [None]:
# Columns that go into the export, in output order.
selected_columns = [
    'lon', 'lat', 'z_top', 'z_bottom', 'lithoklasse', 'lithoclass_material',
    'kans_1_veen_mean', 'kans_2_klei_mean', 'kans_3_kleiig_zand_mean',
    'kans_4_vervallen_mean', 'kans_5_zand_fijn_mean', 'kans_6_zand_matig_grof_mean',
    'kans_7_zand_grof_mean', 'kans_8_grind_mean', 'kans_9_schelpen_mean'
]

# In the export the numeric code is called 'lithoklasse_id' and the material
# name takes over the 'lithoklasse' key.
export_names = {
    'lithoklasse': 'lithoklasse_id',
    'lithoclass_material': 'lithoklasse',
}
cropped = df_grouped.reset_index()[selected_columns].rename(columns=export_names)

cropped

In [None]:
def _records_top_down(location_rows):
    """Return the rows of one location as dicts, highest z_top first."""
    records = location_rows.to_dict(orient='records')
    records.sort(key=lambda record: record['z_top'], reverse=True)
    return records


# One row per (lon, lat) location; 'data' holds that location's list of records.
profiled = cropped.groupby(['lon', 'lat']).apply(_records_top_down).reset_index(name='data')

list_of_lists = profiled['data'].tolist()

In [None]:
map_coordinates_batch = profiled[['lon', 'lat']]
map_coordinates_batch

In [None]:
lat_search = 51.924303
lon_search = 4.480202

In [None]:
closest_coordinates = ft.find_closest_points_to_input(
    map_coordinates_batch, lat_search, lon_search, delta_lat=0.001, delta_lon=0.001
    )

### Export all data in one file

For all data points, describe the identified lithoclass (through the mapping) and include the likelihood of each material being present.

In [None]:
# Write all profiles into a single JSON file named after today's date.
base_name = "B01-to-B09_lithoclass_materials_and_likelihood_geocoordinates"
name_file = f"_Rotterdam_city_{datetime.now().date().isoformat()}.json"
file_path = dir_export + base_name + name_file

# Create the export directory if needed so the write works on a fresh checkout.
os.makedirs(os.path.dirname(file_path) or '.', exist_ok=True)

with open(file_path, 'w') as f:
    json.dump(list_of_lists, f)

print(f"stored {file_path}")

### Export in batches

In [None]:
MAX_BYTES = 5 * 1024 * 1024  # 5 MB

batch = []
batch_size = 0
file_index = 1

output_dir = dir_export + f"json_5MB_chunks_{datetime.now().date().isoformat()}/"
os.makedirs(output_dir, exist_ok=True)

In [None]:
print(
    f'Total number of data points: {len(list_of_lists)} '
    f'\nAmount of entries in first set: {len(list_of_lists[0])}'
    )

In [None]:
def _write_batch(records, index, size_bytes):
    """Write one batch of profiles to litho_batch_<index>.json in output_dir."""
    file_name = os.path.join(output_dir, f"litho_batch_{index}.json")
    with open(file_name, 'w') as f:
        json.dump(records, f, separators=(',', ':'))
    print(f"Stored {file_name} ({size_bytes / 1024**2:.2f} MB)")


# Reset the accumulators here, so re-running this cell never appends to a batch
# left over from a previous run.
batch = []
batch_size = 0
file_index = 0
batch_ids = []  # batch number of every profile, in list_of_lists order

for sublist in list_of_lists:
    sublist_bytes = len(json.dumps(sublist, separators=(',', ':')).encode('utf-8'))
    # Close the current batch before it would exceed the size limit; a profile
    # that is larger than the limit on its own still gets its own batch.
    if batch_size + sublist_bytes > MAX_BYTES and batch:
        _write_batch(batch, file_index, batch_size)
        file_index += 1
        batch = []
        batch_size = 0

    batch.append(sublist)
    batch_size += sublist_bytes
    batch_ids.append(file_index)

if batch:
    _write_batch(batch, file_index, batch_size)

# list_of_lists holds one entry per row of profiled, in row order, so the ids
# align by position. Assigning the complete list keeps the column integer; the
# previous cell initialised an unused 'file_index' column and filled 'batchID'
# cell by cell, which produced a float column.
profiled['batchID'] = batch_ids

## Store map of batch to coordinates

In [None]:
profiled[['lon', 'lat', 'batchID']].to_csv(output_dir + 'map_coordinates2batch.txt', index_label=False)

In [None]:
# Batch files containing the locations closest to the search position.
# isin() also copes with an empty selection, where concat() over an empty list
# raised; the ids now come back in the row order of `profiled`.
profiled.loc[profiled.lon.isin(closest_coordinates.lon), 'batchID'].unique()

# Read JSON

In [None]:
# Directory of the batch export to read back.
# NOTE(review): this hard-coded, dated path replaces the output_dir set in the
# export section, so on any other day the comparison below runs against an
# older export - confirm this is intended.
output_dir = 'output/json_5MB_chunks_2025-12-04/'

In [None]:
# Every JSON batch file in the export directory, in the order glob returns them.
ls_files_for_read = list(glob(output_dir + '*.json'))

In [None]:
def _entry_to_frame(entry):
    """Convert one top-level entry of a batch file into a DataFrame."""
    if isinstance(entry, list):
        if len(entry) > 0 and isinstance(entry[0], dict):
            return DataFrame(entry)  # list of records -> one row per record
        return DataFrame({"value": entry})
    if isinstance(entry, dict):
        return DataFrame([entry])
    return DataFrame({"value": [entry]})


ls_df = []
for en, path in enumerate(ls_files_for_read):
    # Use a separate name for the parsed file content: `data` is the GeoTOP
    # frame loaded at the top of the notebook and must not be overwritten.
    with open(path, 'r', encoding='utf-8') as f:
        batch_payload = json.load(f)
    print(f"Reading file {en+1} from {len(ls_files_for_read)}...")

    dfs = [_entry_to_frame(entry) for entry in batch_payload]
    if dfs:  # concat() raises on an empty list
        ls_df.append(concat(dfs, ignore_index=True))


# An empty frame when nothing was read, instead of an exception from concat().
df_import = concat(ls_df) if ls_df else DataFrame()

In [None]:
print(
    f"comparing shapes of datasets:"
    f"\n original dataset: {cropped.shape},"
    f"\n re-imported dataset: {df_import.shape} and"
    f"\n duplicated removed: {df_import.drop_duplicates().shape}")