## Set main parameters

In [10]:
# Root directory of the project; later cells build their output paths from it.
ROOT = "/home/STual/DAN-cadastre"

### Create subsets and format named entities

In [None]:
import random
from typing import List, Tuple
import glob
import os

def split_dataset(data: List[str], train_ratio: float, val_ratio: float, test_ratio: float, seed: int = None) -> Tuple[List[str], List[str], List[str]]:
    """
    Split a dataset into train, validation, and test sets based on the given ratios.

    The input list is not modified: a shuffled copy is split. The train and
    validation sizes are ``int(ratio * len(data))``; the test set receives
    whatever remains, so rounding losses end up in the test set.

    Args:
        data (List[str]): List of data samples (e.g., file paths).
        train_ratio (float): Proportion of data to use for the training set.
        val_ratio (float): Proportion of data to use for the validation set.
        test_ratio (float): Proportion of data to use for the test set.
        seed (int, optional): Random seed for reproducibility. When given, the
            global ``random`` module is re-seeded with it. Defaults to None.

    Returns:
        Tuple[List[str], List[str], List[str]]: A tuple containing the train, validation, and test sets.

    Raises:
        ValueError: If a ratio is negative or the ratios do not sum to 1.0.
    """
    import math  # local import so the function does not rely on another cell

    if seed is not None:
        random.seed(seed)

    if min(train_ratio, val_ratio, test_ratio) < 0:
        raise ValueError("Train, validation, and test ratios must be non-negative")

    # Compare with a tolerance: sums such as 0.6 + 0.3 + 0.1 are not exactly
    # 1.0 in binary floating point and would be rejected by a strict ``==``.
    if not math.isclose(train_ratio + val_ratio + test_ratio, 1.0, abs_tol=1e-9):
        raise ValueError("Train, validation, and test ratios must sum to 1.0")

    # Shuffle a copy so the caller's list keeps its original order
    shuffled = list(data)
    random.shuffle(shuffled)

    # Calculate split indices
    total = len(shuffled)
    train_end = int(train_ratio * total)
    val_end = train_end + int(val_ratio * total)

    # Split the data
    train_data = shuffled[:train_end]
    val_data = shuffled[train_end:val_end]
    test_data = shuffled[val_end:]

    return train_data, val_data, test_data


# Example usage: split the PNG images 70/10/20 with a fixed seed.
# NOTE(review): `path` is not defined in any cell visible here — presumably the
# directory holding the map images; confirm it is set before running this cell.
# glob returns matches in arbitrary order, so the list is sorted first:
# otherwise the same seed could yield different splits on different machines.
data_samples = sorted(glob.glob(os.path.join(path, '*.png')))
train_ratio = 0.7
val_ratio = 0.1
test_ratio = 0.2
seed = 42

train_data, val_data, test_data = split_dataset(data_samples, train_ratio, val_ratio, test_ratio, seed)

print("Train Data:", train_data)
print("Validation Data:", val_data)
print("Test Data:", test_data)
# Subset name -> list of image paths; reused by the cells below.
subsets = {'train': train_data, 'val': val_data, 'test': test_data}

In [None]:
# Build one tagged transcription per zone: the "texte" values of all rows that
# share an "id_zone" are joined with single spaces, each wrapped in the
# start/end tokens of its "nature":
#   parcelle              -> Ⓐ ... Ⓑ
#   rue, commune, riviere -> Ⓒ ... Ⓓ
# Rows with any other "nature" are skipped entirely, so they no longer leave a
# stray separator space in the transcription.
# NOTE(review): `df` is not created in the cells visible here — presumably the
# annotations.csv table loaded earlier; confirm.
entity_tokens = {
    'parcelle': ('Ⓐ', 'Ⓑ'),
    'rue': ('Ⓒ', 'Ⓓ'),
    'commune': ('Ⓒ', 'Ⓓ'),
    'riviere': ('Ⓒ', 'Ⓓ'),
}

# Dictionary storing the concatenated text for each id_zone
concatenated_text = {}

for _, row in df.iterrows():
    tokens = entity_tokens.get(row['nature'])
    if tokens is None:
        continue  # unhandled nature: contributes nothing
    start_token, end_token = tokens
    tagged = start_token + row['texte'] + end_token
    zone = row['id_zone']
    if zone in concatenated_text:
        concatenated_text[zone] += ' ' + tagged
    else:
        concatenated_text[zone] = tagged

In [None]:
# Display the tagged transcription built for each zone.
concatenated_text

### Create split.json

In [None]:
import re

# Content of split.json: for each subset, one entry per image keyed by image id.
splitjson = {"test": {}, "val": {}, "train": {}}

# The image id is whatever sits between the first "_" of the file name and the
# ".png" extension. Compiled once instead of on every iteration.
image_id_pattern = re.compile(r'_(.*)\.png')

for set_, dataset in subsets.items():
    for elem in dataset:
        fpath = elem.split("/")[-1]
        match = image_id_pattern.search(fpath)
        if match is None:
            # Fail with the offending file name rather than an AttributeError on None.
            raise ValueError(f"Cannot extract an image id from file name: {fpath}")
        image_id = match.group(1)
        # NOTE(review): image_id is a str; this lookup assumes the id_zone keys
        # of concatenated_text are strings as well — confirm.
        text = concatenated_text[image_id]
        splitjson[set_][image_id] = {
            "dataset_id": "synth_maps_dataset",
            "image": {
                "iiif_url": elem,
                # Closed rectangle covering (0,0)-(2000,2000).
                # NOTE(review): presumably the full image extent — confirm image size.
                "polygon": [
                    [0, 0],
                    [0, 2000],
                    [2000, 2000],
                    [2000, 0],
                    [0, 0],
                ],
            },
            "text": text,
        }

In [None]:
# Serialize splitjson to split.json as 4-space indented JSON.
import json

with open(f"{ROOT}/dataset-cartes/synth_maps_dataset/split.json", 'w') as split_file:
    split_file.write(json.dumps(splitjson, indent=4))
    

### Create labels.json

In [None]:
#using the train_data, val_data and test_data, can you create a new folder for each subset, that contains a synth_maps_dataset folder, and copy the images in this last corresponding folder
import shutil
# Copy every image into images/<subset>/synth_maps_dataset/ under its original
# file name.
for subset, dataset in subsets.items():
    target_dir = f"{ROOT}/dataset-cartes/synth_maps_dataset/images/{subset}/synth_maps_dataset"
    # makedirs also creates the intermediate images/<subset> directory, so one
    # call per subset replaces the two calls previously made for every image.
    os.makedirs(target_dir, exist_ok=True)
    for elem in dataset:
        fpath = elem.split("/")[-1]
        shutil.copy(elem, f"{target_dir}/{fpath}")

In [None]:
# Content of labels.json: for each subset, relative image path -> tagged text.
labelsjson = {"test": {}, "val": {}, "train": {}}

# The image id is whatever sits between the first "_" of the file name and the
# ".png" extension. Compiled once instead of on every iteration.
image_id_pattern = re.compile(r'_(.*)\.png')

for set_, dataset in subsets.items():
    for elem in dataset:
        fpath = elem.split("/")[-1]
        match = image_id_pattern.search(fpath)
        if match is None:
            # Fail with the offending file name rather than an AttributeError on None.
            raise ValueError(f"Cannot extract an image id from file name: {fpath}")
        image_id = match.group(1)
        # Use the real file name: the images are copied under their original
        # base name, so the key must not assume an "extrait_" prefix. For files
        # named extrait_<id>.png the key is unchanged.
        url = f"images/{set_}/synth_maps_dataset/{fpath}"
        labelsjson[set_][url] = concatenated_text[image_id]

In [None]:
# Serialize labelsjson to labels.json as 4-space indented JSON.
import json

with open(f'{ROOT}/dataset-cartes/synth_maps_dataset/labels.json', 'w') as labels_file:
    labels_file.write(json.dumps(labelsjson, indent=4))

### Create charset.pkl file

In [None]:
#Open the labels.json file and create a new file named charset.pkl that contains the characters used in the text field of the labels.json file
import pickle
import json
# Read labels.json back; its layout is {subset: {image path: text}}.
with open(f'{ROOT}/dataset-cartes/synth_maps_dataset/labels.json', 'r') as labels_file:
    data = json.load(labels_file)

# Gather the distinct characters directly into a set instead of first
# concatenating every label into one large string.
charset = set()
for subset_labels in data.values():
    for label_text in subset_labels.values():
        charset.update(label_text)

# The input file is closed by now; write the pickled set of characters.
with open(f'{ROOT}/dataset-cartes/synth_maps_dataset/charset.pkl', 'wb') as charset_file:
    pickle.dump(charset, charset_file)

### Get some statistics about the dataset with DAN (helps to set config.json)

In [None]:
!teklia-dan dataset analyze \
    --labels /home/STual/DAN-cadastre/dataset-cartes/synth_maps_dataset/labels.json \
    --tokens /home/STual/DAN-cadastre/dataset-cartes/tokens.yml \
    --output-file /home/STual/DAN-cadastre/dataset-cartes/statistics.md