In [14]:
from transformers import T5ForConditionalGeneration
from transformers import T5Tokenizer, Trainer
from datasets import load_from_disk


# Locations of the artefacts this notebook evaluates.
dataset_dir = 'preprocessed-dataset-latex'
base_checkpoint = 'vgaraujov/t5-base-spanish'

# Restore from disk the dataset that the dataset-prep notebook preprocessed and saved.
dataset = load_from_disk(dataset_dir)

# Load the tokenizer that belongs to the pretrained T5 base checkpoint.
tokenizer = T5Tokenizer.from_pretrained(base_checkpoint)

# Load the model to evaluate. This run uses the pretrained base checkpoint;
# to evaluate one of the local checkpoints under ./results instead, comment
# out the next line and enable exactly one of the alternatives below.
model = T5ForConditionalGeneration.from_pretrained(base_checkpoint)
#model = T5ForConditionalGeneration.from_pretrained('./results/latex')
#model = T5ForConditionalGeneration.from_pretrained('./results/latex-ner')
#model = T5ForConditionalGeneration.from_pretrained('./results/latex-trees')
#model = T5ForConditionalGeneration.from_pretrained('./results/complete')



In [15]:
from transformers import Trainer, DataCollatorForSeq2Seq


# Collator used by the trainer to assemble batches; it is given both the
# tokenizer and the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Create the trainer. No training arguments or datasets are passed, so the
# library defaults apply; in the visible cells it is only used for
# trainer.predict(...).
trainer = Trainer(
    model=model,                         
    tokenizer=tokenizer,                 
    data_collator=data_collator,
)

# Put the model in evaluation mode. eval() is the last expression of the
# cell, so the model it returns is what the cell displays as its output.
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [16]:
# Run the trainer's prediction loop over the test split. The result is a
# PredictionOutput (its repr is displayed by the following cell).
predictions = trainer.predict(dataset['test'])

  0%|          | 0/18 [00:00<?, ?it/s]

In [17]:
# Display the PredictionOutput returned by trainer.predict. Its `predictions`
# field holds raw float arrays (presumably per-token logits — TODO confirm),
# not decoded text.
predictions


PredictionOutput(predictions=(array([[[ 1.2773583 ,  3.8190114 ,  2.9986262 , ..., -4.46272   ,
         -3.5805202 ,  0.6499685 ],
        [-0.86647874,  1.6427323 ,  4.147688  , ..., -2.5420032 ,
         -3.117785  , -0.57773197],
        [-1.1411113 , -1.4236448 ,  5.5018215 , ..., -2.6014123 ,
         -1.9399649 , -0.5542981 ],
        ...,
        [ 1.171685  ,  3.9959712 ,  2.9309154 , ..., -4.278045  ,
         -3.4396124 ,  0.6063939 ],
        [ 1.1720676 ,  3.9956298 ,  2.9312243 , ..., -4.279063  ,
         -3.440767  ,  0.6068993 ],
        [ 1.172512  ,  3.9960682 ,  2.931536  , ..., -4.279534  ,
         -3.4414701 ,  0.6078959 ]],

       [[ 1.2749777 ,  3.805031  ,  3.0213141 , ..., -4.4991283 ,
         -3.61095   ,  0.6595073 ],
        [-1.7859082 ,  1.294733  ,  3.9292848 , ..., -2.2316995 ,
         -4.53111   ,  0.09840891],
        [-1.5912012 ,  2.468893  ,  5.9766603 , ..., -3.5110373 ,
         -3.4888043 , -1.1441418 ],
        ...,
        [ 1.1857646 ,  3

In [28]:
# Take the first 19 prompts of the test split's 'input' column.
descriptions = dataset['test']['input'][:19]

# Tokenize all prompts as a single padded batch of PyTorch tensors.
inputs = tokenizer(
    descriptions,
    padding=True,
    return_tensors='pt',
)

# Show the raw prompts that will be sent to the model.
print(descriptions)

['generar pseudolatex: Polinomio de grado 7 completo con coeficientes enteros', 'generar pseudolatex: Sistema de tres ecuaciones. Una de ellas de tres variables y las dos restantes de dos variables o incógnitas.', 'generar pseudolatex: Polinomio completo de séptimo grado con todos sus coeficientes decimales negativos. El polinomio está ordenado en orden descendente', 'generar pseudolatex: un polinomio de noveno grado incompleto con variable z', 'generar pseudolatex: Sistema de ecuaciones', 'generar pseudolatex: Sistema de ecuacion de 2 ecuaciones lineales de dos variables, con coeficientes distintos de 0', 'generar pseudolatex: Polinomio de grado 1 incompleto con 1 término', 'generar pseudolatex: polinomio de grado 4 con cinco terminos', 'generar pseudolatex: Sistema de ecuaciones homogeneo', 'generar pseudolatex: Sistema de ecuaciones de 4 incógnitas y dos ecuaciones, compatible indeterminado', 'generar pseudolatex: Polinomio con coeficientes complejo y naturales, de noveno grado e in

In [29]:
# Generate up to 200 new tokens per prompt.
# The tokenizer returns CPU tensors, but constructing the Trainer earlier may
# have moved the model to an accelerator. Moving the inputs to the model's
# device avoids a device-mismatch error on GPU machines and is a no-op on CPU.
outputs = model.generate(
    inputs=inputs['input_ids'].to(model.device),
    attention_mask=inputs['attention_mask'].to(model.device),
    max_new_tokens=200,
)

# Turn the generated token ids back into text, dropping special tokens
# such as padding and end-of-sequence markers.
decodings = tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [31]:
import pandas as pd

from pathlib import Path

# Destination of the prompt/generation pairs for this run.
filename = './test-logs/outputs-base.csv'

# to_csv does not create missing directories, so make sure the log folder
# exists; otherwise the generations are lost on a fresh checkout.
Path(filename).parent.mkdir(parents=True, exist_ok=True)

# One row per prompt: the input description and the text the model generated.
results = {'inputs': descriptions, 'outputs': decodings}

pd.DataFrame(data=results).to_csv(filename, index=False)