In [1]:
# Mount Google Drive so the data stored there is reachable from the notebook

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Define the paths where the data lives

import os

data_path: str = "/content/drive/MyDrive/Monografia/Data"

output_file = os.path.join(data_path, "bulk_data.conllu")
train_file_path = os.path.join(data_path, "train_data.conllu")
test_file_path = os.path.join(data_path, "test_data.conllu")

# Delete the files generated by a previous run so the notebook is idempotent.
# Each file is checked on its own: testing the data directory instead of the
# file itself would make os.remove raise FileNotFoundError on a first run,
# when the directory exists but the generated file does not.
for generated_file in (output_file, train_file_path, test_file_path):
    if os.path.exists(generated_file):
        os.remove(generated_file)

# Verify that the directory contains the expected treebanks

os.listdir(data_path)

['et_edt-ud-dev.conllu',
 'et_edt-ud-test.conllu',
 'et_edt-ud-train.conllu',
 'fi_tdt-ud-dev.conllu',
 'fi_tdt-ud-test.conllu',
 'fi_tdt-ud-train.conllu']

In [3]:
# Install dependencies
# NOTE(review): consider `%pip install` with a pinned version so the install
# targets the kernel's environment and the run stays reproducible.

!pip install conllu



In [4]:
import conllu

# According to UD the chosen treebanks contain 17 part-of-speech categories
# plus a null value ('_'), but it is worth verifying that against the data.
all_pos = set()
for treebank_name in os.listdir(data_path):
    # Skip anything that is not a treebank, as well as the merged bulk file
    if not treebank_name.endswith(".conllu") or treebank_name == "bulk_data.conllu":
        continue
    treebank_path = os.path.join(data_path, treebank_name)
    with open(treebank_path, "r", encoding="utf-8") as treebank:
        for token_list in conllu.parse_incr(treebank):
            # Only tokens with a non-empty UPOS value contribute a tag
            all_pos.update(tok['upos'] for tok in token_list if tok['upos'])

print(len(all_pos))
print(all_pos)

18
{'INTJ', 'PRON', 'ADV', 'CCONJ', 'SYM', 'X', 'AUX', 'ADJ', 'ADP', 'SCONJ', 'PUNCT', 'PROPN', '_', 'DET', 'NOUN', 'PART', 'NUM', 'VERB'}


In [5]:
# We want to tag every sentence with its source language as metadata before
# merging the treebanks, so a helper function is defined for that

def add_language_metadata(file_path: str, language: str) -> None:
  """
  Tag every sentence of a CoNLL-U file with its language, rewriting the file in place.

  The file is parsed completely, each sentence gets a 'language' metadata
  entry (an existing entry is overwritten), and the result is written back
  to the same path.

  Args:
    file_path: The path to the conllu file. The file is overwritten.
    language: The language code to add as metadata (e.g., 'es', 'en').

  Raises:
    OSError: If the file cannot be read or written.
  """
  with open(file_path, "r", encoding="utf-8") as f:
    sentences = conllu.parse(f.read())

  for sentence in sentences:
    # Add or overwrite the 'language' metadata key
    sentence.metadata['language'] = language

  # Serialize everything BEFORE reopening the file for writing: opening with
  # "w" truncates the file immediately, so a serialization error raised after
  # that point would destroy the original treebank.
  serialized = "".join(sentence.serialize() for sentence in sentences)

  with open(file_path, "w", encoding="utf-8") as f:
    f.write(serialized)

In [6]:
# Tag every treebank file, using the first two characters of its file name
# as the language code

for treebank_name in os.listdir(data_path):
  # Skip anything that is not a treebank, as well as the merged bulk file
  if not treebank_name.endswith(".conllu") or treebank_name == "bulk_data.conllu":
    continue
  add_language_metadata(os.path.join(data_path, treebank_name), treebank_name[:2])

In [7]:
# Merge all treebanks into a single file

# Files produced by this notebook must never be merged back into the bulk file,
# otherwise re-running this cell after the split would duplicate sentences.
generated_files = {output_file, train_file_path, test_file_path}

# Explicit UTF-8 on both sides: the treebanks contain non-ASCII characters and
# the platform default encoding is not guaranteed to be UTF-8.
with open(output_file, "w", encoding="utf-8") as output:
    # Sorted so the merged file has the same sentence order on every run
    # (os.listdir returns entries in arbitrary order).
    for file in sorted(os.listdir(data_path)):
        source_path = os.path.join(data_path, file)
        if not file.endswith(".conllu") or source_path in generated_files:
            continue
        with open(source_path, "r", encoding="utf-8") as source:
            output.write(source.read())

In [8]:
# Check that the tagging worked by printing the first sentences of every file

for file in os.listdir(data_path):
  if file.endswith(".conllu"):
    print(file)
    # Explicit UTF-8: the treebanks contain non-ASCII characters
    with open(os.path.join(data_path, file), "r", encoding="utf-8") as f:
      # zip with range(3) stops after three sentences and also ends cleanly
      # when a file holds fewer than three, instead of raising StopIteration.
      for _, sentence in zip(range(3), conllu.parse_incr(f)):
        print(sentence)
      print('\n')

et_edt-ud-dev.conllu
TokenList<Aga, mulle, tundub, ,, et, kogu, maailm, ootab, muusikamaailmalt, midagi, erutavalt, uut, minimalismi, kõrvale, ., metadata={sent_id: "aja_ee199920_1477", text: "Aga mulle tundub, et kogu maailm ootab muusikamaailmalt midagi erutavalt uut minimalismi kõrvale.", language: "et"}>
TokenList<Kust, tuli, mõte, kirjutada, ooper, ", Writing, to, Vermeer, ", ?, metadata={sent_id: "aja_ee199920_1478", text: "Kust tuli mõte kirjutada ooper "Writing to Vermeer"?", language: "et"}>
TokenList<Mind, on, Vermeeri, looming, alati, fastsineerinud, ., metadata={sent_id: "aja_ee199920_1479", text: "Mind on Vermeeri looming alati fastsineerinud.", language: "et"}>


et_edt-ud-test.conllu
TokenList<Palju, olulisi, komponente, ,, nagu, liha, ja, kala, ,, hangime, siiski, Eestist, ., metadata={sent_id: "aja_ee199920_1971", text: "Palju olulisi komponente, nagu liha ja kala, hangime siiski Eestist.", language: "et"}>
TokenList<Loomulikult, kuuluvad, meie, kohalikku, ostusedeliss

In [9]:
# Split the merged file into training and test data

import random

SPLIT_SEED = 42        # fixed seed so the split is identical on every run
TRAIN_FRACTION = 0.8   # probability that a sentence is assigned to the training set

# Dedicated generator: reproducible and independent of the global random state
split_rng = random.Random(SPLIT_SEED)

with open(output_file, "r", encoding="utf-8") as input_f, \
     open(train_file_path, "w", encoding="utf-8") as train_f, \
     open(test_file_path, "w", encoding="utf-8") as test_f:

    # Each sentence is assigned independently, so the resulting proportions
    # are approximately (not exactly) TRAIN_FRACTION / 1 - TRAIN_FRACTION.
    for sentence in conllu.parse_incr(input_f):
        target_f = train_f if split_rng.random() < TRAIN_FRACTION else test_f
        target_f.write(sentence.serialize())