In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds
import re
import os

In [12]:
os.makedirs("./data/wiki40b-txt/")

In [8]:
lang_code = "en"
r1 = "_START_ARTICLE_\n[^_]*"
r2 = "_START_PARAGRAPH_\n"
r3 = "_START_SECTION_\n[^_]*"
r4 = "_NEWLINE_"

REGEX = re.compile(f"({r1}|{r2}|{r3}|{r4})")

## Generating txt files from the Wiki-40b dataset

In [9]:
def process_tf_dataset(ds, num_tokens, output_file):
    # Turn to a numpy df so that we can easily extract text
    # numpy_items = tfds.as_numpy(ds)
    token_count = 0

    with open(output_file, "a") as f:
        for batch in ds.as_numpy_iterator():
            # text is the feature we want to extract
            for item in batch.get("text"):
                text = item.decode("UTF-8")
                text = re.sub(REGEX, " ", text)
                text = re.sub("\s+", " ", text).strip()
                f.write(text)
                f.write("\n")
                token_count += len(text.split())
                if num_tokens > 0 and token_count > num_tokens:
                    break

In [20]:
# load from validation split of the wiki-40b dataset, which has 163597 entries.
# Don't run repeatedly once you loaded.
ds = tfds.load(
    f"wiki40b/{lang_code}",
    split="train",
    shuffle_files=True,
    data_dir="./data/",
    batch_size=128,
)

In [21]:
# generating pure txt file for train split in wiki-40b
process_tf_dataset(
    ds, -1, "./data/wiki40b-txt/" + lang_code + ".train"
)

2023-07-18 23:14:18.518598: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [128]
	 [[{{node Placeholder/_0}}]]
2023-07-18 23:14:18.519921: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype string and shape [128]
	 [[{{node Placeholder/_2}}]]


In [18]:
# don't know why changing the split matters
ds = tfds.load(
    f"wiki40b/{lang_code}",
    split="test",
    shuffle_files=True,
    data_dir="./data/",
    batch_size=128,
)

In [None]:
process_tf_dataset(
    ds, -1, "./data/wiki40b-txt/" + lang_code + ".test"
)

In [None]:
ds = tfds.load(
    f"wiki40b/{lang_code}",
    split="validation",
    shuffle_files=True,
    data_dir="./data/",
    batch_size=128,
)

In [None]:
process_tf_dataset(
    ds, -1, "./data/wiki40b-txt/" + lang_code + ".validation"
)
# why this has so many entries?
# 3 and 300 have same results, but -1 a lot more.

In [22]:
dirname = os.path.dirname("wiki40b-txt/")

In [23]:
dirname

'wiki40b-txt'

In [None]:
Ichijō Fuyuyoshi (一条 冬良, July 29, 1465 – April 21, 1514), son of regent Kaneyoshi, was a kugyō or court noble of the Muromachi period (1336–1573) of Japan. He held a regent position kampaku two times from 1488 to 1493 and from 1497 to 1501. He adopted Fusamichi as son who was also his daughter's husband.

## Testing training a BPETokenizer from scratch on different languages

In [1]:
import argparse
import os

from tokenizers import AddedToken
from tokenizers.implementations import ByteLevelBPETokenizer
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
sample = {
    "en": "Hello, y'all! How are you 😁? (just testing the tokenizer)",
    "ja": "おやすみなさい",
    "it": "Stiamo cercando una gioielleria.",
    "ja_en": "おやすみなさい"
             "Hello, y'all! How are you 😁? (just testing the tokenizer)",
    "it_en": "Stiamo cercando una gioielleria."
             "Hello, y'all! How are you 😁? (just testing the tokenizer)",
}

In [4]:
def train_tokenizer():

    bpe_tokenizer = ByteLevelBPETokenizer()

    files = [f"./data/wiki40b-txt/en.small"]

    bpe_tokenizer.train(files=files, vocab_size=32000, min_frequency=2)

    tokenizer_path = f'./data/tokenizer/en'
    if not os.path.exists(tokenizer_path):
        os.makedirs(tokenizer_path)

    # save the vocab.json and merges.txt files of the trained bpe tokenizer
    bpe_tokenizer.save_model(tokenizer_path)

    model_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, tokenizer_type="gpt2")
    model_tokenizer.model_max_length = 512
    model_tokenizer.add_special_tokens({"pad_token": AddedToken("<pad>", normalized=True)})

    print(f'Tokenizer vocab size: {len(model_tokenizer)}')
    print(f'Tokenizer max sequence length: {model_tokenizer.model_max_length} \n')

    # save the full model tokenizer configuration files
    model_tokenizer.save_pretrained(tokenizer_path)

    output = model_tokenizer.encode_plus(sample["en"])
    print(output.tokens(), '\n')

In [5]:
train_tokenizer()






Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer vocab size: 32002
Tokenizer max sequence length: 512 

['H', 'ello', ',', 'Ġy', "'", 'all', '!', 'ĠHow', 'Ġare', 'Ġyou', 'Ġ', 'ð', 'Ł', 'ĺ', 'ģ', '?', 'Ġ(', 'just', 'Ġtesting', 'Ġthe', 'Ġto', 'ken', 'izer', ')'] 



## Dependency Parsing with UDPipe

In [2]:
from ufal.udpipe import Model, Pipeline, ProcessingError
import sys
import argparse
import os
from mosestokenizer import (
    MosesPunctuationNormalizer,
    MosesTokenizer,
    MosesSentenceSplitter,
)
# from indicnlp.tokenize.sentence_tokenize import sentence_split as indic_sent_tokenize
# from indicnlp.tokenize.indic_tokenize import trivial_tokenize as indic_word_tokenize
# from hazm import sent_tokenize as persian_sent_tokenize
# from hazm import word_tokenize as persian_word_tokenize
# from hazm import Normalizer as PersianNormalizer

In [3]:
UDPIPE_MODEL_LOOKUP = {
    "en": "udpipe_models/english-lines-ud-2.5-191206.udpipe"
}

In [None]:
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang", help="2-letter language code such as en, ru, vi, etc.", default="en"
    )
    parser.add_argument(
        "--udpipe_model_path", help="path to UDPipe model file for this language"
    )
    parser.add_argument(
        "--data_dir",
        help="path to data directory with original (normal-order) text",
        default="./data/wiki40b-txt/en_small.txt",
    )
    parser.add_argument(
        "--parse_dir",
        help="path to directory where CONLLU parses of sentences should be stored",
        default="./parse",
    )
    parser.add_argument(
        "--partitions",
        default="train,test,valid",
        help="comma-seprated list of partitions",
    )
    parser.add_argument("--test_run", action="store_true")
    args = parser.parse_args()

    # create output directory if it doesn't yet exist
    if not os.path.exists(args.parse_dir):
        os.system(f"mkdir -p {args.parse_dir}")

    # load UDPipe Model
    sys.stderr.write("Loading model: ")
    if args.udpipe_model_path is None:
        model = Model.load(UDPIPE_MODEL_LOOKUP[args.lang])
        sys.stderr.write(f"{model}\n")
    else:
        model = Model.load(args.udpipe_model_path)
    if not model:
        sys.stderr.write(f"Cannot load model from file '{args.udpipe_model_path}'\n")
        sys.exit(1)
    sys.stderr.write("done\n")

    # create pipeline
    pipeline = Pipeline(
        model, "horizontal", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
    )
    err = ProcessingError()

    # Make sentence tokenizer
    if args.lang == "hi":
        pass
    elif args.lang == "fa":
        persian_normalizer = PersianNormalizer()
    else:
        sent_tokenize = MosesSentenceSplitter(args.lang)
        word_tokenize = MosesTokenizer(args.lang, no_escape=True)
        normalize = MosesPunctuationNormalizer(args.lang)

    # iterate over partitions
    for partition in args.partitions.split(","):
        input_path = os.path.join(args.data_dir, f"{args.lang}.{partition}")
        if args.test_run:
            output_path = os.path.join(
                args.parse_dir, f"{args.lang}.{partition}.tiny.conllu"
            )
        else:
            output_path = os.path.join(
                args.parse_dir, f"{args.lang}.{partition}.conllu"
            )

        with open(input_path) as f_in, open(output_path, "w") as f_out:

            doc_counter = 0

            # use iterator over lines in f_in to save memory
            for document in f_in:

                # Moses tokenizer will fail if the line is blank
                if (len(document.strip())) == 0:
                    sys.stderr.write("There was a blank line in the input file\n")
                    continue

                if args.lang == "fa":
                    document = persian_normalizer.normalize(document)
                    sentences = persian_sent_tokenize(document)
                    sentences_tokenized = [persian_word_tokenize(s) for s in sentences]
                elif args.lang == "hi":
                    # split sentences
                    sentences = indic_sent_tokenize(document, lang="hi")
                    # sentences_tokenized = [word_tokenize(normalize(s)) for s in sentences]
                    sentences_tokenized = [indic_word_tokenize(s) for s in sentences]
                else:
                    # split sentences
                    sentences = sent_tokenize([document])
                    sentences_tokenized = [
                        word_tokenize(normalize(s)) for s in sentences
                    ]

                sentences = [" ".join(s) for s in sentences_tokenized]
                sentences = "\n".join(sentences)

                # Process data
                processed = pipeline.process(sentences, err)
                if err.occurred():
                    sys.stderr.write(
                        f"An error occurred in run_udpipe: {err.message}\n"
                    )
                    sys.exit(1)

                f_out.write(processed)

                doc_counter += 1

                if args.test_run and doc_counter >= 5:
                    exit()

In [14]:
# load UDPipe Model
sys.stderr.write("Loading model: ")
udpipe_model_path = 0
if udpipe_model_path == 0:
    model = Model.load(UDPIPE_MODEL_LOOKUP["en"])
    sys.stderr.write(f"{model}\n")
else:
    model = Model.load(udpipe_model_path)
if not model:
    sys.stderr.write(f"Cannot load model from file '{udpipe_model_path}'\n")
    sys.exit(1)
sys.stderr.write("done\n")

Loading model: <Swig Object of type 'model *' at 0x112a9b8b0>
done


5

In [8]:
model = Model.load(UDPIPE_MODEL_LOOKUP["en"])

In [9]:
sys.stderr.write(f"{model}\n")

None


5

In [15]:
lang = "en"
partitions = "train,test,valid"
parse_dir = "./parse/"
test_run = False

In [17]:
# create pipeline
pipeline = Pipeline(
    model, "horizontal", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
)
err = ProcessingError()

# Make sentence tokenizer
sent_tokenize = MosesSentenceSplitter(lang)
word_tokenize = MosesTokenizer(lang, no_escape=True)
normalize = MosesPunctuationNormalizer(lang)

# iterate over partitions
for partition in partitions.split(","):
    input_path = os.path.join("./data/wiki40b-txt/", f"{lang}_{partition}.txt")
    if test_run:
        output_path = os.path.join(
            parse_dir, f"{lang}_{partition}.tiny.conllu"
        )
    else:
        output_path = os.path.join(
            parse_dir, f"{lang}_{partition}.conllu"
        )

    with open(input_path) as f_in, open(output_path, "w") as f_out:

        doc_counter = 0

        # use iterator over lines in f_in to save memory
        for document in f_in:

            # Moses tokenizer will fail if the line is blank
            if (len(document.strip())) == 0:
                sys.stderr.write("There was a blank line in the input file\n")
                continue
            # split sentences
            sentences = sent_tokenize([document])
            sentences_tokenized = [
                word_tokenize(normalize(s)) for s in sentences
            ]

            sentences = [" ".join(s) for s in sentences_tokenized]
            sentences = "\n".join(sentences)

            # Process data
            processed = pipeline.process(sentences, err)
            if err.occurred():
                sys.stderr.write(
                    f"An error occurred in run_udpipe: {err.message}\n"
                )
                sys.exit(1)

            f_out.write(processed)

            doc_counter += 1

            if test_run and doc_counter >= 5:
                exit()

NameError: name 'args' is not defined