### Data subsets files creation

In [7]:
##### Uncomment and run in case the dataset split has to be performed again

from sklearn.model_selection import train_test_split
import pandas as pd

expressions = pd.read_csv('complete-expressions.csv')
expressions = expressions[['input','target']]

expressions_latex = expressions.loc[expressions['input'].str.contains('generar latex')]
expressions_ner = expressions.loc[expressions['input'].str.contains('reconocer entidades nombradas')]
expressions_trees = expressions.loc[expressions['input'].str.contains('generar lista')]

In [10]:
#Latex only
train_val_data, test_data = train_test_split(expressions_latex, test_size=0.1, shuffle=True)
train_data, validation_data = train_test_split(train_val_data, test_size=0.11, shuffle=True)

train_data.to_csv('data-splits/latex/train_split.csv', index=False)
validation_data.to_csv('data-splits/latex/validation_split.csv', index=False)
test_data.to_csv('data-splits/latex/test_split.csv', index=False)

In [12]:
train_latex_df = pd.read_csv('data-splits/latex/train_split.csv')

expressions_ner_striped = expressions_ner.sample(len(train_latex_df))
expressions_trees_striped = expressions_trees.sample(len(train_latex_df))

In [13]:
#Latex + NER
latex_ner_df = pd.concat([expressions_ner_striped,expressions_latex]).sample(frac=1)
latex_trees_df = pd.concat([expressions_trees_striped,expressions_latex]).sample(frac=1)
latex_ner_latex_df = pd.concat([expressions_ner_striped,expressions_trees_striped,expressions_latex]).sample(frac=1)

latex_ner_df.to_csv('data-splits/latex-ner/train_split.csv', index=False)
latex_trees_df.to_csv('data-splits/latex-trees/train_split.csv', index=False)
latex_ner_latex_df.to_csv('data-splits/complete/train_split.csv', index=False)

In [4]:
##### Uncomment and run in case the dataset split for only trees dataset has to be performed again

""" from sklearn.model_selection import train_test_split
import pandas as pd

expressions = pd.read_csv('complete-expressions.csv')
expressions = expressions[['input','target']]

expressions = expressions.loc[expressions['input'].str.contains('generar lista')]
expressions = expressions[0:50]

train_val_data, test_data = train_test_split(expressions, test_size=0.2, shuffle=True)
train_data, validation_data = train_test_split(train_val_data, test_size=0.25, shuffle=True)

train_data.to_csv('data-splits\\only-trees-small\\train_split.csv', index=False)
validation_data.to_csv('data-splits\\only-trees-small\\validation_split.csv', index=False)
test_data.to_csv('data-splits\\only-trees-small\\test_split.csv', index=False)  """

### Huggingface DatasetDict preprocessing

In [16]:
from expressions_dataset import ExpressionsDataset
    
expressions = ExpressionsDataset(dataset='latex-trees')

In [17]:
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(expressions.train_data, split='train')
validation_dataset = Dataset.from_pandas(expressions.validation_data, split='validation')
test_dataset = Dataset.from_pandas(expressions.test_data, split='test')

dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'target'],
        num_rows: 2571
    })
    validation: Dataset({
        features: ['input', 'target'],
        num_rows: 142
    })
    test: Dataset({
        features: ['input', 'target'],
        num_rows: 143
    })
})

In [19]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("vgaraujov/t5-base-spanish")
max_input_length = 128
max_target_length = 128

def preprocess_examples(examples):
    descriptions = examples['input']
    targets = examples['target']

    # Encode the descriptions
    model_inputs = tokenizer(descriptions, max_length=max_input_length, padding="max_length", truncation=True, return_tensors='pt')
    # Encode the targets: trees or annotations
    labels = tokenizer(targets, max_length=max_input_length, padding="max_length", truncation=True, return_tensors='pt').input_ids

    labels_with_ignore_index = []
    for labels_example in labels:
      labels_example = [label if label != 0 else -100 for label in labels_example]
      labels_with_ignore_index.append(labels_example)
    
    model_inputs["labels"] = labels_with_ignore_index

    return model_inputs

dataset = dataset.map(preprocess_examples, batched=True)



Map:   0%|          | 0/2571 [00:00<?, ? examples/s]

Map:   0%|          | 0/142 [00:00<?, ? examples/s]

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

We save the preprocessed dataset to disk

In [20]:
dataset.save_to_disk('preprocessed-dataset-latex-trees')

Saving the dataset (0/1 shards):   0%|          | 0/2571 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/142 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/143 [00:00<?, ? examples/s]

In [2]:
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained("vgaraujov/t5-base-spanish")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
string = "\\begin{cases} - VC - VC.x - VC.x^{2} - VC.x^{3} - VC.x^{4} - VC.x^{5} - VC.x^{6} - VC.x^{7} - VC.x^{8} \\end{cases}"
ids = tokenizer(string).input_ids

In [4]:
tokenizer.decode(ids, skip_special_tokens=True)

'\\begin{cases} - VC - VC.x - VC.x^{2} - VC.x^{3} - VC.x^{4} - VC.x^{5} - VC.x^{6} - VC.x^{7} - VC.x^{8} \\end{cases}'

And it is possible to reload the saved dataset as follows

In [33]:
from datasets import load_from_disk
reloaded_dataset = load_from_disk('preprocessed-dataset')