In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import re
import os

2023-08-30 04:14:22.470340: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
os.makedirs("./data/wiki40b-txt/")

In [None]:
lang_code = "en"
r1 = "_START_ARTICLE_\n[^_]*"
r2 = "_START_PARAGRAPH_\n"
r3 = "_START_SECTION_\n[^_]*"
r4 = "_NEWLINE_"

REGEX = re.compile(f"({r1}|{r2}|{r3}|{r4})")

## Generating txt files from the Wiki-40b dataset

In [None]:
def process_tf_dataset(ds, num_tokens, output_file):
    # Turn to a numpy df so that we can easily extract text
    # numpy_items = tfds.as_numpy(ds)
    token_count = 0

    with open(output_file, "a") as f:
        for batch in ds.as_numpy_iterator():
            # text is the feature we want to extract
            for item in batch.get("text"):
                text = item.decode("UTF-8")
                text = re.sub(REGEX, " ", text)
                text = re.sub("\s+", " ", text).strip()
                f.write(text)
                f.write("\n")
                token_count += len(text.split())
                if num_tokens > 0 and token_count > num_tokens:
                    break

In [None]:
# load from validation split of the wiki-40b dataset, which has 163597 entries.
# Don't run repeatedly once you loaded.
ds = tfds.load(
    f"wiki40b/{lang_code}",
    split="train",
    shuffle_files=True,
    data_dir="./data/",
    batch_size=128,
)

In [None]:
# generating pure txt file for train split in wiki-40b
process_tf_dataset(
    ds, -1, "./data/wiki40b-txt/" + lang_code + ".train"
)

2023-07-18 23:14:18.518598: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [128]
	 [[{{node Placeholder/_0}}]]
2023-07-18 23:14:18.519921: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype string and shape [128]
	 [[{{node Placeholder/_2}}]]


In [None]:
# don't know why changing the split matters
ds = tfds.load(
    f"wiki40b/{lang_code}",
    split="test",
    shuffle_files=True,
    data_dir="./data/",
    batch_size=128,
)

In [None]:
process_tf_dataset(
    ds, -1, "./data/wiki40b-txt/" + lang_code + ".test"
)

In [None]:
ds = tfds.load(
    f"wiki40b/{lang_code}",
    split="validation",
    shuffle_files=True,
    data_dir="./data/",
    batch_size=128,
)

In [None]:
process_tf_dataset(
    ds, -1, "./data/wiki40b-txt/" + lang_code + ".validation"
)
# why this has so many entries?
# 3 and 300 have same results, but -1 a lot more.

In [None]:
dirname = os.path.dirname("wiki40b-txt/")

In [None]:
dirname

'wiki40b-txt'

In [None]:
Ichijō Fuyuyoshi (一条 冬良, July 29, 1465 – April 21, 1514), son of regent Kaneyoshi, was a kugyō or court noble of the Muromachi period (1336–1573) of Japan. He held a regent position kampaku two times from 1488 to 1493 and from 1497 to 1501. He adopted Fusamichi as son who was also his daughter's husband.

## Testing training a BPETokenizer from scratch on different languages

In [None]:
import argparse
import os

from tokenizers import AddedToken
from tokenizers.implementations import ByteLevelBPETokenizer
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
sample = {
    "en": "Hello, y'all! How are you 😁? (just testing the tokenizer)",
    "ja": "おやすみなさい",
    "it": "Stiamo cercando una gioielleria.",
    "ja_en": "おやすみなさい"
             "Hello, y'all! How are you 😁? (just testing the tokenizer)",
    "it_en": "Stiamo cercando una gioielleria."
             "Hello, y'all! How are you 😁? (just testing the tokenizer)",
}

In [None]:
def train_tokenizer():

    bpe_tokenizer = ByteLevelBPETokenizer()

    files = [f"./data/wiki40b-txt/en.small"]

    bpe_tokenizer.train(files=files, vocab_size=32000, min_frequency=2)

    tokenizer_path = f'./data/tokenizer/en'
    if not os.path.exists(tokenizer_path):
        os.makedirs(tokenizer_path)

    # save the vocab.json and merges.txt files of the trained bpe tokenizer
    bpe_tokenizer.save_model(tokenizer_path)

    model_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, tokenizer_type="gpt2")
    model_tokenizer.model_max_length = 512
    model_tokenizer.add_special_tokens({"pad_token": AddedToken("<pad>", normalized=True)})

    print(f'Tokenizer vocab size: {len(model_tokenizer)}')
    print(f'Tokenizer max sequence length: {model_tokenizer.model_max_length} \n')

    # save the full model tokenizer configuration files
    model_tokenizer.save_pretrained(tokenizer_path)

    output = model_tokenizer.encode_plus(sample["en"])
    print(output.tokens(), '\n')

In [None]:
train_tokenizer()






Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer vocab size: 32002
Tokenizer max sequence length: 512 

['H', 'ello', ',', 'Ġy', "'", 'all', '!', 'ĠHow', 'Ġare', 'Ġyou', 'Ġ', 'ð', 'Ł', 'ĺ', 'ģ', '?', 'Ġ(', 'just', 'Ġtesting', 'Ġthe', 'Ġto', 'ken', 'izer', ')'] 



## Dependency Parsing with UDPipe

In [None]:
from ufal.udpipe import Model, Pipeline, ProcessingError
import sys
import argparse
import os
from mosestokenizer import (
    MosesPunctuationNormalizer,
    MosesTokenizer,
    MosesSentenceSplitter,
)
# from indicnlp.tokenize.sentence_tokenize import sentence_split as indic_sent_tokenize
# from indicnlp.tokenize.indic_tokenize import trivial_tokenize as indic_word_tokenize
# from hazm import sent_tokenize as persian_sent_tokenize
# from hazm import word_tokenize as persian_word_tokenize
# from hazm import Normalizer as PersianNormalizer

In [None]:
UDPIPE_MODEL_LOOKUP = {
    "en": "udpipe_models/english-lines-ud-2.5-191206.udpipe"
}

In [None]:
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang", help="2-letter language code such as en, ru, vi, etc.", default="en"
    )
    parser.add_argument(
        "--udpipe_model_path", help="path to UDPipe model file for this language"
    )
    parser.add_argument(
        "--data_dir",
        help="path to data directory with original (normal-order) text",
        default="./data/wiki40b-txt/en_small.txt",
    )
    parser.add_argument(
        "--parse_dir",
        help="path to directory where CONLLU parses of sentences should be stored",
        default="./parse",
    )
    parser.add_argument(
        "--partitions",
        default="train,test,valid",
        help="comma-seprated list of partitions",
    )
    parser.add_argument("--test_run", action="store_true")
    args = parser.parse_args()

    # create output directory if it doesn't yet exist
    if not os.path.exists(args.parse_dir):
        os.system(f"mkdir -p {args.parse_dir}")

    # load UDPipe Model
    sys.stderr.write("Loading model: ")
    if args.udpipe_model_path is None:
        model = Model.load(UDPIPE_MODEL_LOOKUP[args.lang])
        sys.stderr.write(f"{model}\n")
    else:
        model = Model.load(args.udpipe_model_path)
    if not model:
        sys.stderr.write(f"Cannot load model from file '{args.udpipe_model_path}'\n")
        sys.exit(1)
    sys.stderr.write("done\n")

    # create pipeline
    pipeline = Pipeline(
        model, "horizontal", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
    )
    err = ProcessingError()

    # Make sentence tokenizer
    if args.lang == "hi":
        pass
    elif args.lang == "fa":
        persian_normalizer = PersianNormalizer()
    else:
        sent_tokenize = MosesSentenceSplitter(args.lang)
        word_tokenize = MosesTokenizer(args.lang, no_escape=True)
        normalize = MosesPunctuationNormalizer(args.lang)

    # iterate over partitions
    for partition in args.partitions.split(","):
        input_path = os.path.join(args.data_dir, f"{args.lang}.{partition}")
        if args.test_run:
            output_path = os.path.join(
                args.parse_dir, f"{args.lang}.{partition}.tiny.conllu"
            )
        else:
            output_path = os.path.join(
                args.parse_dir, f"{args.lang}.{partition}.conllu"
            )

        with open(input_path) as f_in, open(output_path, "w") as f_out:

            doc_counter = 0

            # use iterator over lines in f_in to save memory
            for document in f_in:

                # Moses tokenizer will fail if the line is blank
                if (len(document.strip())) == 0:
                    sys.stderr.write("There was a blank line in the input file\n")
                    continue

                if args.lang == "fa":
                    document = persian_normalizer.normalize(document)
                    sentences = persian_sent_tokenize(document)
                    sentences_tokenized = [persian_word_tokenize(s) for s in sentences]
                elif args.lang == "hi":
                    # split sentences
                    sentences = indic_sent_tokenize(document, lang="hi")
                    # sentences_tokenized = [word_tokenize(normalize(s)) for s in sentences]
                    sentences_tokenized = [indic_word_tokenize(s) for s in sentences]
                else:
                    # split sentences
                    sentences = sent_tokenize([document])
                    sentences_tokenized = [
                        word_tokenize(normalize(s)) for s in sentences
                    ]

                sentences = [" ".join(s) for s in sentences_tokenized]
                sentences = "\n".join(sentences)

                # Process data
                processed = pipeline.process(sentences, err)
                if err.occurred():
                    sys.stderr.write(
                        f"An error occurred in run_udpipe: {err.message}\n"
                    )
                    sys.exit(1)

                f_out.write(processed)

                doc_counter += 1

                if args.test_run and doc_counter >= 5:
                    exit()

In [None]:
# load UDPipe Model
sys.stderr.write("Loading model: ")
udpipe_model_path = 0
if udpipe_model_path == 0:
    model = Model.load(UDPIPE_MODEL_LOOKUP["en"])
    sys.stderr.write(f"{model}\n")
else:
    model = Model.load(udpipe_model_path)
if not model:
    sys.stderr.write(f"Cannot load model from file '{udpipe_model_path}'\n")
    sys.exit(1)
sys.stderr.write("done\n")

Loading model: <Swig Object of type 'model *' at 0x112a9b8b0>
done


5

In [None]:
model = Model.load(UDPIPE_MODEL_LOOKUP["en"])

In [None]:
sys.stderr.write(f"{model}\n")

None


5

In [None]:
lang = "en"
partitions = "train,test,valid"
parse_dir = "./parse/"
test_run = False

In [None]:
# create pipeline
pipeline = Pipeline(
    model, "horizontal", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
)
err = ProcessingError()

# Make sentence tokenizer
sent_tokenize = MosesSentenceSplitter(lang)
word_tokenize = MosesTokenizer(lang, no_escape=True)
normalize = MosesPunctuationNormalizer(lang)

# iterate over partitions
for partition in partitions.split(","):
    input_path = os.path.join("./data/wiki40b-txt/", f"{lang}_{partition}.txt")
    if test_run:
        output_path = os.path.join(
            parse_dir, f"{lang}_{partition}.tiny.conllu"
        )
    else:
        output_path = os.path.join(
            parse_dir, f"{lang}_{partition}.conllu"
        )

    with open(input_path) as f_in, open(output_path, "w") as f_out:

        doc_counter = 0

        # use iterator over lines in f_in to save memory
        for document in f_in:

            # Moses tokenizer will fail if the line is blank
            if (len(document.strip())) == 0:
                sys.stderr.write("There was a blank line in the input file\n")
                continue
            # split sentences
            sentences = sent_tokenize([document])
            sentences_tokenized = [
                word_tokenize(normalize(s)) for s in sentences
            ]

            sentences = [" ".join(s) for s in sentences_tokenized]
            sentences = "\n".join(sentences)

            # Process data
            processed = pipeline.process(sentences, err)
            if err.occurred():
                sys.stderr.write(
                    f"An error occurred in run_udpipe: {err.message}\n"
                )
                sys.exit(1)

            f_out.write(processed)

            doc_counter += 1

            if test_run and doc_counter >= 5:
                exit()

There was a blank line in the input file


## Visualization of dependency graphs before and after reverse_content_head

In [None]:
import json

In [None]:
def reverse_content_head(sentence, validate=True):
    """Apply dependency parse convention change (deviation from vanilla UD)

    Args:
        sentence (List[Dict[str,int]]): a list of dictionaries, each corresponding to a word,
        with the UD header names as dictionary keys

    Returns:
        List[Dict[str,int]]: same format as input
    """
    CH_CONVERSION_ORDER = ["cc", "case", "cop", "mark"]
    # find paths that should be reverted
    for dep in CH_CONVERSION_ORDER:
        for i in range(len(sentence)):
            if sentence[i]["dep"] == dep or sentence[i]["dep"].startswith(dep + ":"):
                head = sentence[i]["head"] - 1
                grandp = sentence[head]["head"] - 1
                assert head > -1

                # grandp -> head -> i
                # grandp -> i -> head
                sentence[i]["head"] = grandp + 1
                sentence[head]["head"] = i + 1

                sentence[i]["dep"] = sentence[head]["dep"]
                sentence[head]["dep"] = "lifted_" + dep
                assert sentence[i]["index"] == i + 1

    # make sure none of the original dependency relations remain
    for i in range(len(sentence)):
        if sentence[i]["dep"] in CH_CONVERSION_ORDER:
            if validate:
                sys.stderr.write(json.dumps(sentence))
                sys.stderr.write("\n")
            return None

    return sentence

In [None]:
Sentence = "She eats lunch before she goes to the park."

In [None]:
import re

chinese_string = "你好，世界！"  # Replace with your Chinese string

# Remove punctuation symbols using a regular expression
chinese_string_without_punctuation = re.sub(r'[^\w\s]', '', chinese_string)

# Calculate the number of characters in the cleaned string
character_count = len(chinese_string_without_punctuation)

print("Number of characters (excluding punctuation):", character_count)

Number of characters (excluding punctuation): 4


In [None]:
English_string = "Hello, it's nice to meet you!"
character_count = len(English_string.split())
print(character_count)
print(English_string.split())

6
['Hello,', "it's", 'nice', 'to', 'meet', 'you!']


## Obtaining Counterfactual Grammars, switching verb and object

In [30]:
x = """
1	As	as	ADP	_	_	3	case	_	_
2	a	a	DET	IND-SG	Definite=Ind|PronType=Art	3	det	_	_
3	youth	youth	NOUN	SG-NOM	Number=Sing	5	obl	_	_
4	he	he	PRON	PERS-P3SG-NOM	Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs	5	nsubj	_	_
5	followed	follow	VERB	PAST	Mood=Ind|Tense=Past|VerbForm=Fin	0	root	_	_
6	a	a	DET	IND-SG	Definite=Ind|PronType=Art	8	det	_	_
7	religious	religious	ADJ	POS	Degree=Pos	8	amod	_	_
8	pilgrimage	pilgrimage	NOUN	SG-NOM	Number=Sing	5	obj	_	_
9	that	that	PRON	REL	PronType=Rel	10	nsubj	_	_
10	took	take	VERB	PAST	Mood=Ind|Tense=Past|VerbForm=Fin	8	acl:relcl	_	_
11	him	he	PRON	PERS-P3SG-ACC	Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs	10	obj	_	_
12	to	to	ADP	_	_	13	case	_	_
13	Transylvania	Transylvania	PROPN	SG-NOM	Number=Sing	10	obl	_	_
14	College	College	PROPN	SG-NOM	Number=Sing	13	flat	_	_
15	,	,	PUNCT	Comma	_	5	punct	_	_
16	a	a	DET	IND-SG	Definite=Ind|PronType=Art	17	det	_	_
17	Disciples	Disciple	NOUN	SG-NOM	Number=Sing	5	obj	_	_
18	of	of	ADP	_	_	20	case	_	_
19	Christ	Christ	ADJ	SPL	Degree=Sup	20	amod	_	_
20	school	school	NOUN	SG-NOM	Number=Sing	17	nmod	_	_
21	in	in	ADP	_	_	22	case	_	_
22	Lexington	Lexington	PROPN	SG-NOM	Number=Sing	20	nmod	_	_
23	,	,	PUNCT	Comma	_	22	punct	_	_
24	Kentucky	Kentucky	PROPN	SG-NOM	Number=Sing	22	conj	_	_
25	.	.	PUNCT	Period	_	5	punct	_	_
"""

In [31]:
split = list(map(lambda x: x.split("\t"), x.split("\n")))
sentence = split[1:26]

In [32]:
len(sentence)

25

In [33]:
sentence[0]

['1', 'As', 'as', 'ADP', '_', '_', '3', 'case', '_', '_']

"Arthur Ford was born in Titusville, Florida and grew up in Fort Pierce, Florida" 
"Arthur Ford was in Titusville, Florida born and in Fort Pierce, Florida grew up"
"Arthur Ford was born Titusville, FLorida in and grew up Fort Pierce, Forida in"
"As a youth he followed a religious pilgrimage that took him to Transylvania College, a Disciples of Christ school in Lexington, Kentucky."
"As a youth he a religious pilgrimage that him to Transylvania College, a Disciples of Christ school in Lexington, Kentucky took followed."

In [35]:
HEADER = [
    "index",
    "word",
    "lemma",
    "posUni",
    "posFine",
    "morph",
    "head",
    "dep",
    "_",
    "_",
]

In [36]:
result = []
for i in range(len(sentence)):
    if sentence[i][0].startswith("#"):
        if sentence[i][0].startswith("# newdoc"):
            newdoc = True
        continue
    if "-" in sentence[i][0]:  # if it is NUM-NUM
        continue
    if "." in sentence[i][0]:
        continue

    # sentence = list of dicts, where each key is a field name (see HEADER)
    sentence[i] = dict([(y, sentence[i][x]) for x, y in enumerate(HEADER)])
    sentence[i]["head"] = int(sentence[i]["head"])
    sentence[i]["index"] = int(sentence[i]["index"])
    sentence[i]["word"] = sentence[i]["word"].lower()

    # if self.storeMorph:
    #     sentence[i]["morph"] = sentence[i]["morph"].split("|")

    sentence[i]["dep"] = sentence[i]["dep"].lower()
    result.append(sentence[i])

In [37]:
result

[{'index': 1,
  'word': 'as',
  'lemma': 'as',
  'posUni': 'ADP',
  'posFine': '_',
  'morph': '_',
  'head': 3,
  'dep': 'case',
  '_': '_'},
 {'index': 2,
  'word': 'a',
  'lemma': 'a',
  'posUni': 'DET',
  'posFine': 'IND-SG',
  'morph': 'Definite=Ind|PronType=Art',
  'head': 3,
  'dep': 'det',
  '_': '_'},
 {'index': 3,
  'word': 'youth',
  'lemma': 'youth',
  'posUni': 'NOUN',
  'posFine': 'SG-NOM',
  'morph': 'Number=Sing',
  'head': 5,
  'dep': 'obl',
  '_': '_'},
 {'index': 4,
  'word': 'he',
  'lemma': 'he',
  'posUni': 'PRON',
  'posFine': 'PERS-P3SG-NOM',
  'morph': 'Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs',
  'head': 5,
  'dep': 'nsubj',
  '_': '_'},
 {'index': 5,
  'word': 'followed',
  'lemma': 'follow',
  'posUni': 'VERB',
  'posFine': 'PAST',
  'morph': 'Mood=Ind|Tense=Past|VerbForm=Fin',
  'head': 0,
  'dep': 'root',
  '_': '_'},
 {'index': 6,
  'word': 'a',
  'lemma': 'a',
  'posUni': 'DET',
  'posFine': 'IND-SG',
  'morph': 'Definite=Ind|PronType=Art',

In [38]:
def reverse_content_head(sentence, validate=True):
    """Apply dependency parse convention change (deviation from vanilla UD)

    Args:
        sentence (List[Dict[str,int]]): a list of dictionaries, each corresponding to a word,
        with the UD header names as dictionary keys

    Returns:
        List[Dict[str,int]]: same format as input
    """
    CH_CONVERSION_ORDER = ["cc", "case", "cop", "mark"]
    # find paths that should be reverted
    for dep in CH_CONVERSION_ORDER:
        for i in range(len(sentence)):
            if sentence[i]["dep"] == dep or sentence[i]["dep"].startswith(dep + ":"):
                head = sentence[i]["head"] - 1
                grandp = sentence[head]["head"] - 1
                assert head > -1

                # grandp -> head -> i
                # grandp -> i -> head
                sentence[i]["head"] = grandp + 1
                sentence[head]["head"] = i + 1

                sentence[i]["dep"] = sentence[head]["dep"]
                sentence[head]["dep"] = "lifted_" + dep
                assert sentence[i]["index"] == i + 1

    # make sure none of the original dependency relations remain
    for i in range(len(sentence)):
        if sentence[i]["dep"] in CH_CONVERSION_ORDER:
            if validate:
                sys.stderr.write(json.dumps(sentence))
                sys.stderr.write("\n")
            return None

    return sentence


In [39]:
sentence = reverse_content_head(result, validate=True)

In [40]:
sentence

[{'index': 1,
  'word': 'as',
  'lemma': 'as',
  'posUni': 'ADP',
  'posFine': '_',
  'morph': '_',
  'head': 5,
  'dep': 'obl',
  '_': '_'},
 {'index': 2,
  'word': 'a',
  'lemma': 'a',
  'posUni': 'DET',
  'posFine': 'IND-SG',
  'morph': 'Definite=Ind|PronType=Art',
  'head': 3,
  'dep': 'det',
  '_': '_'},
 {'index': 3,
  'word': 'youth',
  'lemma': 'youth',
  'posUni': 'NOUN',
  'posFine': 'SG-NOM',
  'morph': 'Number=Sing',
  'head': 1,
  'dep': 'lifted_case',
  '_': '_'},
 {'index': 4,
  'word': 'he',
  'lemma': 'he',
  'posUni': 'PRON',
  'posFine': 'PERS-P3SG-NOM',
  'morph': 'Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs',
  'head': 5,
  'dep': 'nsubj',
  '_': '_'},
 {'index': 5,
  'word': 'followed',
  'lemma': 'follow',
  'posUni': 'VERB',
  'posFine': 'PAST',
  'morph': 'Mood=Ind|Tense=Past|VerbForm=Fin',
  'head': 0,
  'dep': 'root',
  '_': '_'},
 {'index': 6,
  'word': 'a',
  'lemma': 'a',
  'posUni': 'DET',
  'posFine': 'IND-SG',
  'morph': 'Definite=Ind|PronTyp

In [41]:
sentence

[{'index': 1,
  'word': 'as',
  'lemma': 'as',
  'posUni': 'ADP',
  'posFine': '_',
  'morph': '_',
  'head': 5,
  'dep': 'obl',
  '_': '_'},
 {'index': 2,
  'word': 'a',
  'lemma': 'a',
  'posUni': 'DET',
  'posFine': 'IND-SG',
  'morph': 'Definite=Ind|PronType=Art',
  'head': 3,
  'dep': 'det',
  '_': '_'},
 {'index': 3,
  'word': 'youth',
  'lemma': 'youth',
  'posUni': 'NOUN',
  'posFine': 'SG-NOM',
  'morph': 'Number=Sing',
  'head': 1,
  'dep': 'lifted_case',
  '_': '_'},
 {'index': 4,
  'word': 'he',
  'lemma': 'he',
  'posUni': 'PRON',
  'posFine': 'PERS-P3SG-NOM',
  'morph': 'Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs',
  'head': 5,
  'dep': 'nsubj',
  '_': '_'},
 {'index': 5,
  'word': 'followed',
  'lemma': 'follow',
  'posUni': 'VERB',
  'posFine': 'PAST',
  'morph': 'Mood=Ind|Tense=Past|VerbForm=Fin',
  'head': 0,
  'dep': 'root',
  '_': '_'},
 {'index': 6,
  'word': 'a',
  'lemma': 'a',
  'posUni': 'DET',
  'posFine': 'IND-SG',
  'morph': 'Definite=Ind|PronTyp

### Trying Stanza instead

In [None]:
!pip install stanza
import stanza

# Load the English pipeline
stanza.download('en')  # Download the English model
nlp = stanza.Pipeline('en')  # Initialize the English pipeline

In [4]:
nlp = stanza.Pipeline('en', processors='tokenize,lemma,pos,depparse')

In [8]:
from stanza.utils.conll import CoNLL

In [25]:
doc = nlp("Bill seems honest. Bill is honest. \n I believe so.")
# dict = doc.sentences[2].to_dict()
# dict

In [26]:
# list of list of dictionaries
dicts = doc.to_dict()

In [29]:
len(dicts)

3

In [28]:
dicts[0]

[{'id': 1,
  'text': 'Bill',
  'lemma': 'Bill',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': 2,
  'deprel': 'nsubj',
  'start_char': 0,
  'end_char': 4},
 {'id': 2,
  'text': 'seems',
  'lemma': 'seem',
  'upos': 'VERB',
  'xpos': 'VBZ',
  'feats': 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
  'head': 0,
  'deprel': 'root',
  'start_char': 5,
  'end_char': 10},
 {'id': 3,
  'text': 'honest',
  'lemma': 'honest',
  'upos': 'ADJ',
  'xpos': 'JJ',
  'feats': 'Degree=Pos',
  'head': 2,
  'deprel': 'xcomp',
  'start_char': 11,
  'end_char': 17},
 {'id': 4,
  'text': '.',
  'lemma': '.',
  'upos': 'PUNCT',
  'xpos': '.',
  'head': 2,
  'deprel': 'punct',
  'start_char': 17,
  'end_char': 18}]

In [30]:
CoNLL.doc2conll(doc)

  CoNLL.doc2conll(doc)


[['# text = Bill seems honest.',
  '# sent_id = 0',
  '1\tBill\tBill\tPROPN\tNNP\tNumber=Sing\t2\tnsubj\t_\tstart_char=0|end_char=4',
  '2\tseems\tseem\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t0\troot\t_\tstart_char=5|end_char=10',
  '3\thonest\thonest\tADJ\tJJ\tDegree=Pos\t2\txcomp\t_\tstart_char=11|end_char=17',
  '4\t.\t.\tPUNCT\t.\t_\t2\tpunct\t_\tstart_char=17|end_char=18'],
 ['# text = Bill is honest.',
  '# sent_id = 1',
  '1\tBill\tBill\tPROPN\tNNP\tNumber=Sing\t3\tnsubj\t_\tstart_char=19|end_char=23',
  '2\tis\tbe\tAUX\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t3\tcop\t_\tstart_char=24|end_char=26',
  '3\thonest\thonest\tADJ\tJJ\tDegree=Pos\t0\troot\t_\tstart_char=27|end_char=33',
  '4\t.\t.\tPUNCT\t.\t_\t3\tpunct\t_\tstart_char=33|end_char=34'],
 ['# text = I believe so.',
  '# sent_id = 2',
  '1\tI\tI\tPRON\tPRP\tCase=Nom|Number=Sing|Person=1|PronType=Prs\t2\tnsubj\t_\tstart_char=37|end_char=38',
  '2\tbelieve\tbelieve\tVERB\tVBP\t

In [20]:
CoNLL.write_doc2conll(doc, 'lala.conllu')

In [21]:
doc = nlp("I'd like to work with you.")

In [22]:
CoNLL.write_doc2conll(doc, 'lala.conllu')

In [None]:
doc = CoNLL.conll2doc('lala.conllu')

In [None]:
doc.to_dict()

In [67]:
conll_u_parse_2

[{'id': 1,
  'text': 'Bill',
  'lemma': 'Bill',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': 2,
  'deprel': 'nsubj',
  'start_char': 0,
  'end_char': 4,
  'ner': 'S-PERSON',
  'multi_ner': ('S-PERSON',)},
 {'id': 2,
  'text': 'seems',
  'lemma': 'seem',
  'upos': 'VERB',
  'xpos': 'VBZ',
  'feats': 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
  'head': 0,
  'deprel': 'root',
  'start_char': 5,
  'end_char': 10,
  'ner': 'O',
  'multi_ner': ('O',)},
 {'id': 3,
  'text': 'honest',
  'lemma': 'honest',
  'upos': 'ADJ',
  'xpos': 'JJ',
  'feats': 'Degree=Pos',
  'head': 2,
  'deprel': 'xcomp',
  'start_char': 11,
  'end_char': 17,
  'ner': 'O',
  'multi_ner': ('O',)}]

In [2]:
import stanza
stanza.download('zh', verbose=False)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def read_conllu_file(path):
    """Reads all data from a CoNNL-U format file and returns a list of 
    strings, each corresponding to a sentence in the dataset. Assumes 
    properly formed CoNNL-U format data.
    NOTE: this function is deprecated and we instead use an iterator
    over the data file to save memory.
    TODO: can I delete this?

    Args:
        path (str): path to file

    Returns:
        List[str]: list of strings, where each string is a CoNNL-U record
        for a sentence (one line per word, tab-separated fields)
    """
    with open(path) as f:
        data = f.read().strip()
    data = data.split("\n\n")
    return data

In [68]:
conll_u_parse_2

[{'id': 1,
  'text': 'Bill',
  'lemma': 'Bill',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': 2,
  'deprel': 'nsubj',
  'start_char': 0,
  'end_char': 4,
  'ner': 'S-PERSON',
  'multi_ner': ('S-PERSON',)},
 {'id': 2,
  'text': 'seems',
  'lemma': 'seem',
  'upos': 'VERB',
  'xpos': 'VBZ',
  'feats': 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
  'head': 0,
  'deprel': 'root',
  'start_char': 5,
  'end_char': 10,
  'ner': 'O',
  'multi_ner': ('O',)},
 {'id': 3,
  'text': 'honest',
  'lemma': 'honest',
  'upos': 'ADJ',
  'xpos': 'JJ',
  'feats': 'Degree=Pos',
  'head': 2,
  'deprel': 'xcomp',
  'start_char': 11,
  'end_char': 17,
  'ner': 'O',
  'multi_ner': ('O',)}]

In [52]:
# Define the sentence to parse
sentence = "As a youth he followed a religious pilgrimage that took him to Transylvania College, a Disciples of Christ school in Lexington, Kentucky."

# Process the sentence
doc = nlp(sentence)

# Access CoNLL-U format parse for the first sentence
conll_u_parse = doc.sentences[0].to_dict()

# Print the CoNLL-U format parse
print(conll_u_parse)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 95.3MB/s]                    
2023-08-30 15:15:23 INFO: Downloading default packages for language: en (English) ...
2023-08-30 15:15:25 INFO: File exists: /Users/sally/stanza_resources/en/default.zip
2023-08-30 15:15:29 INFO: Finished downloading models and saved to /Users/sally/stanza_resources.
2023-08-30 15:15:29 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 85.5MB/s]                    
2023-08-30 15:15:31 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| cons

[{'id': 1, 'text': 'As', 'lemma': 'as', 'upos': 'ADP', 'xpos': 'IN', 'head': 3, 'deprel': 'case', 'start_char': 0, 'end_char': 2, 'ner': 'O', 'multi_ner': ('O',)}, {'id': 2, 'text': 'a', 'lemma': 'a', 'upos': 'DET', 'xpos': 'DT', 'feats': 'Definite=Ind|PronType=Art', 'head': 3, 'deprel': 'det', 'start_char': 3, 'end_char': 4, 'ner': 'O', 'multi_ner': ('O',)}, {'id': 3, 'text': 'youth', 'lemma': 'youth', 'upos': 'NOUN', 'xpos': 'NN', 'feats': 'Number=Sing', 'head': 5, 'deprel': 'obl', 'start_char': 5, 'end_char': 10, 'ner': 'O', 'multi_ner': ('O',)}, {'id': 4, 'text': 'he', 'lemma': 'he', 'upos': 'PRON', 'xpos': 'PRP', 'feats': 'Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs', 'head': 5, 'deprel': 'nsubj', 'start_char': 11, 'end_char': 13, 'ner': 'O', 'multi_ner': ('O',)}, {'id': 5, 'text': 'followed', 'lemma': 'follow', 'upos': 'VERB', 'xpos': 'VBD', 'feats': 'Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin', 'head': 0, 'deprel': 'root', 'start_char': 14, 'end_char': 22, 

In [54]:
conll_u_parse

[{'id': 1,
  'text': 'As',
  'lemma': 'as',
  'upos': 'ADP',
  'xpos': 'IN',
  'head': 3,
  'deprel': 'case',
  'start_char': 0,
  'end_char': 2,
  'ner': 'O',
  'multi_ner': ('O',)},
 {'id': 2,
  'text': 'a',
  'lemma': 'a',
  'upos': 'DET',
  'xpos': 'DT',
  'feats': 'Definite=Ind|PronType=Art',
  'head': 3,
  'deprel': 'det',
  'start_char': 3,
  'end_char': 4,
  'ner': 'O',
  'multi_ner': ('O',)},
 {'id': 3,
  'text': 'youth',
  'lemma': 'youth',
  'upos': 'NOUN',
  'xpos': 'NN',
  'feats': 'Number=Sing',
  'head': 5,
  'deprel': 'obl',
  'start_char': 5,
  'end_char': 10,
  'ner': 'O',
  'multi_ner': ('O',)},
 {'id': 4,
  'text': 'he',
  'lemma': 'he',
  'upos': 'PRON',
  'xpos': 'PRP',
  'feats': 'Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs',
  'head': 5,
  'deprel': 'nsubj',
  'start_char': 11,
  'end_char': 13,
  'ner': 'O',
  'multi_ner': ('O',)},
 {'id': 5,
  'text': 'followed',
  'lemma': 'follow',
  'upos': 'VERB',
  'xpos': 'VBD',
  'feats': 'Mood=Ind|Number=Sin

In [57]:
import copy

In [58]:
sent = copy.deepcopy(conll_u_parse)

In [59]:
def makeCoarse(x):
    if ":" in x:
        return x[: x.index(":")]
    return x

In [62]:
def get_all_children(sentence):
    """ Coarsify all the dependent relations, track all children """
    for line in sentence:
        # make the dependency relation label coarse (ignore stuff after colon)
        line["coarse_dep"] = makeCoarse(line["deprel"])

        # identify the root, and skip to next word
        if line["coarse_dep"] == "root":
            root = line["id"]
            continue

        if line["coarse_dep"].startswith("punct"):
            continue

        headIndex = line["head"] - 1
        sentence[headIndex]["children"] = sentence[headIndex].get("children", []) + [line["id"]]
    

In [63]:
get_all_children(sent)
sent

[{'id': 1,
  'text': 'As',
  'lemma': 'as',
  'upos': 'ADP',
  'xpos': 'IN',
  'head': 3,
  'deprel': 'case',
  'start_char': 0,
  'end_char': 2,
  'ner': 'O',
  'multi_ner': ('O',),
  'coarse_dep': 'case'},
 {'id': 2,
  'text': 'a',
  'lemma': 'a',
  'upos': 'DET',
  'xpos': 'DT',
  'feats': 'Definite=Ind|PronType=Art',
  'head': 3,
  'deprel': 'det',
  'start_char': 3,
  'end_char': 4,
  'ner': 'O',
  'multi_ner': ('O',),
  'coarse_dep': 'det'},
 {'id': 3,
  'text': 'youth',
  'lemma': 'youth',
  'upos': 'NOUN',
  'xpos': 'NN',
  'feats': 'Number=Sing',
  'head': 5,
  'deprel': 'obl',
  'start_char': 5,
  'end_char': 10,
  'ner': 'O',
  'multi_ner': ('O',),
  'children': [1, 2, 1, 2],
  'coarse_dep': 'obl'},
 {'id': 4,
  'text': 'he',
  'lemma': 'he',
  'upos': 'PRON',
  'xpos': 'PRP',
  'feats': 'Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs',
  'head': 5,
  'deprel': 'nsubj',
  'start_char': 11,
  'end_char': 13,
  'ner': 'O',
  'multi_ner': ('O',),
  'coarse_dep': 'nsubj'

In [18]:
sent

[{'index': 1,
  'word': 'as',
  'lemma': 'as',
  'posUni': 'ADP',
  'posFine': '_',
  'morph': '_',
  'head': 5,
  'dep': 'obl',
  '_': '_',
  'coarse_dep': 'obl',
  'children': [3]},
 {'index': 2,
  'word': 'a',
  'lemma': 'a',
  'posUni': 'DET',
  'posFine': 'IND-SG',
  'morph': 'Definite=Ind|PronType=Art',
  'head': 3,
  'dep': 'det',
  '_': '_',
  'coarse_dep': 'det'},
 {'index': 3,
  'word': 'youth',
  'lemma': 'youth',
  'posUni': 'NOUN',
  'posFine': 'SG-NOM',
  'morph': 'Number=Sing',
  'head': 1,
  'dep': 'lifted_case',
  '_': '_',
  'children': [2],
  'coarse_dep': 'lifted_case'},
 {'index': 4,
  'word': 'he',
  'lemma': 'he',
  'posUni': 'PRON',
  'posFine': 'PERS-P3SG-NOM',
  'morph': 'Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs',
  'head': 5,
  'dep': 'nsubj',
  '_': '_',
  'coarse_dep': 'nsubj'},
 {'index': 5,
  'word': 'followed',
  'lemma': 'follow',
  'posUni': 'VERB',
  'posFine': 'PAST',
  'morph': 'Mood=Ind|Tense=Past|VerbForm=Fin',
  'head': 0,
  'dep': 

In [64]:
def dfs_with_stack(graph, start_node):
    stack = [start_node]
    visited = set()

    while stack:
        node = stack.pop()
        if node not in visited:
            visited.add(node)
            print(node)  # Process the node

            for neighbor in graph[node]:
                if neighbor not in visited:
                    stack.append(neighbor)

In [65]:
def swap_order(verb_idx, obj_idx, sentence, result):
# Helper function for processing verb and object chunks
    # verb_list = sentence[verb_idx]['children']
    # obj_list = sentence[obj_idx]['children']
    # verb_list = verb_list - obj_list
    result[verb_idx], result[obj_idx] = sentence[obj_idx]["id"], sentence[verb_idx]["id"]

In [27]:
def swap(sentence, root):
# DFS for swaping verb and object
# TODO: edge cases: 1. multiple obj
#                   2. went to school happily -> to school went happily
    result = [i for i in range(1, len(sentence) + 1)]
    stack = [root]
    visited = set()

    while stack:
        node = stack.pop()
        if node not in visited:
            visited.add(node)
            print(node) # print out index of the node being processed

            if not sentence[node-1].get("children", None):
                continue
            for c in sentence[node-1]["children"]:
                if sentence[node-1]['posUni'] == 'VERB' and sentence[c-1]['coarse_dep'] == 'obj':
                    verb_idx, obj_idx = node - 1, c - 1
                    swap_order(verb_idx, obj_idx, sentence, result)
                if c not in visited:
                    stack.append(c)
    return result

In [28]:
res = swap(sent, 5)

5
17
18
20
21
22
24
19
16
8
10
12
13
14
11
9
7
6
4
1
3
2


In [29]:
res

[1,
 2,
 3,
 4,
 17,
 6,
 7,
 5,
 9,
 11,
 10,
 12,
 13,
 14,
 15,
 16,
 5,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25]

In [47]:
def swap(sentence, node):
    """
    sentence: list of dicts
    node: the index of the current dependency tree node
    """
    # node is the index in CoNLL-U format parses. Start with the root.
    if not sentence[node-1]["children"]:
        return
    for c in sentence[node-1]["children"]:
        swap(sentence, c)
    if sentence[node-1]['posUni'] == 'VERB':
        for c in sentence[node-1]["children"]:
            if sentence[c-1]['coarse_dep'] == 'obj':
                verb_idx, obj_idx = node-1, c-1
                swap_order(verb_idx, obj_idx, sentence, result)

In [48]:
def swap_order(verb_idx, obj_idx, sentence, result):
    verb_list = sentence[verb_idx]['children']
    obj_list = sentence[obj_idx]['children']
    # verb_list = verb_list - obj_list
    
    

SyntaxError: invalid syntax (4028664378.py, line 4)

In [None]:
def swap_order(verb_head, object_head):
	verb <- verb_head.get_constituent()
	object <- object_head.get_constituent()
	Verb, object = object, verb

In [None]:
def swap(node):
	# The node is the root during first call
    children <- node.get_all_children()
    for c in children:
        c <- swap(c)
    # Find all verb/object pairs in children
    verb <- find_verb(children)
    if verb exists:
        object <- find_object(children, verb)
        if object exists:
            node <- swap_order(verb, object)
    return node

def get_all_children():
    # This one gets all direct children

def swap_order(verb_head, object_head):
	verb <- verb_head.get_constituent()
	object <- object_head.get_constituent()
	Verb, object = object, verb

In [None]:
def swap(node):
    # Base case: If the node is a leaf, return it
    if node.is_leaf():
        return node

    # Recursively swap verb/object pairs in children
    for c in node.get_all_children():
        c = swap(c)

    # Find verb/object pairs in children
    verb = find_verb(node.get_all_children())
    if verb is not None:
        obj = find_object(node.get_all_children(), verb)
        if obj is not None:
            node = swap_order(node, verb, obj)

    return node

In [None]:
def reversePair(sentence, model):
    x, y = model[0], model[1]
    # rev = False # only reverse it once
    # moved = [None] * len(sentence)

    for line in sentence:
        key = line["coarse_dep"]
        if key == x:
            x_idx = line["index"] - 1
        elif key == y:
            y_idx = line["index"] - 1
        else:
            continue
    
    if x_idx and y_idx:
        sentence[x_idx], sentence[y_idx] = sentence[y_idx], sentence[x_idx]

        for line in sentence:
            if line["head"] == x_idx + 1:
                line["reordered_head"] = y_idx + 1
            elif line["head"] == y_idx + 1:
                line["reordered_head"] = x_idx + 1
            else:
                continue
    return sentence