<a href="https://colab.research.google.com/github/rozariwang/DS_project/blob/main/CoLi_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Computational Linguistics 23/24 Final Project** <br>
Name: Ho-Hsuan Wang <br>
Student Number: 7038925 <br>
Date: 22nd March <br>

## 1. Download and Inspect Datasets

In [2]:
pip install requests conllu


Collecting conllu
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.3


In [3]:
import requests
from conllu import parse
from io import StringIO

Link to UD_Chinese-GSD repository: https://github.com/UniversalDependencies/UD_Chinese-GSD

Description: UD_Chinese-GSD is a Traditional Chinese Universal Dependencies treebank annotated and converted by Google.

In [4]:
# Root of the raw CoNLL-U files of UD_Chinese-GSD (master branch) on GitHub.
_GSD_RAW_ROOT = "https://raw.githubusercontent.com/UniversalDependencies/UD_Chinese-GSD/master"

# Split name -> URL of that split's CoNLL-U file, in the order train, test, dev.
links_to_datasets = {
    split: f"{_GSD_RAW_ROOT}/zh_gsd-ud-{split}.conllu"
    for split in ("train", "test", "dev")
}

def download_and_parse(url, timeout=30):
    """Download a CoNLL-U file and parse it with ``conllu.parse``.

    Args:
        url: Address of the CoNLL-U file to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang the whole notebook run.

    Returns:
        Whatever ``parse`` returns for the downloaded text.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    return parse(response.text)

# Column headers and descriptions of the ten CoNLL-U fields, in file order.
# The ID text follows the CoNLL-U specification: plain integers number the
# words, ranges (e.g. 1-2) mark multiword tokens, decimals (e.g. 1.1) mark
# empty nodes.
columns_info = {
    "ID": "Token ID: integer for words, range for multiword tokens, decimal for empty nodes",
    "FORM": "Form or spelling of the word",
    "LEMMA": "Lemma or base form of the word",
    "UPOS": "Universal Part-of-Speech tag",
    "XPOS": "Language-specific part-of-speech tag",
    "FEATS": "Morphological features",
    "HEAD": "Head of the current token in a dependency parse",
    "DEPREL": "Dependency relation to the HEAD",
    "DEPS": "Enhanced dependency graph",
    "MISC": "Any other annotation"
}

# Download and parse every split once; keys are the split names.
datasets = {name: download_and_parse(url) for name, url in links_to_datasets.items()}

# Token keys in CoNLL-U column order, as used when indexing the parsed tokens.
_CONLLU_COLUMNS = ("id", "form", "lemma", "upos", "xpos",
                   "feats", "head", "deprel", "deps", "misc")


def _conllu_field(value):
    """Render one parsed token field as text, using "_" for an empty field."""
    if value is None or (isinstance(value, (dict, list)) and not value):
        return "_"
    if isinstance(value, dict):
        # FEATS / MISC: key=value pairs joined with "|".
        return "|".join(f"{k}={v}" for k, v in value.items())
    if isinstance(value, tuple):
        # NOTE(review): assumes a tuple is a split ID such as (1, "-", 2) - confirm.
        return "".join(str(part) for part in value)
    if isinstance(value, list):
        # NOTE(review): assumes DEPS is parsed into (relation, head) pairs - confirm.
        return "|".join(f"{_conllu_field(head)}:{rel}" for rel, head in value)
    return str(value)


# Print column information
print("CoNLL-U Format Columns:")
for column, description in columns_info.items():
    print(f"{column}: {description}")
print("\n---\n")

# Print dataset information: size of each split plus its first sentence,
# rendered with all ten columns taken from the parsed token (FEATS and DEPS
# are no longer hard-coded to "_", and a missing HEAD prints "_" not "None").
for name, data in datasets.items():
    print(f"Dataset: {name}, Sentences: {len(data)}")
    if data:
        first_sentence = data[0]
        # Only print the metadata lines that are actually present.
        for key in ("sent_id", "text", "translit"):
            if key in first_sentence.metadata:
                print(f"# {key} = {first_sentence.metadata[key]}")
        for token in first_sentence:
            print("\t".join(_conllu_field(token.get(col)) for col in _CONLLU_COLUMNS))
    print("\n---\n")

CoNLL-U Format Columns:
ID: Token ID, integer or decimal for multiword tokens
FORM: Form or spelling of the word
LEMMA: Lemma or base form of the word
UPOS: Universal Part-of-Speech tag
XPOS: Language-specific part-of-speech tag
FEATS: Morphological features
HEAD: Head of the current token in a dependency parse
DEPREL: Dependency relation to the HEAD
DEPS: Enhanced dependency graph
MISC: Any other annotation

---

Dataset: train, Sentences: 3997
# sent_id = train-s1
# text = 看似簡單，只是二選一做決擇，但其實他們代表的是你周遭的親朋好友，試著給你不同的意見，但追根究底，最後決定的還是自己。
# translit = kànshìjiǎndān,zhǐshì'èrxuǎnyīzuòjuézé,dànqíshítāmendàibiǎodeshìnǐzhōuzāodeqīnpénghǎoyou,shìzhegěinǐbùtóngdeyìjiàn,dànzhuīgēnjiūdǐ,zuìhòujuédìngdeháishìzìjǐ.
1	看似	看似	VERB	VV	_	5	advcl	_	SpaceAfter=No|Translit=kànshì|LTranslit=kànshì
2	簡單	簡單	ADJ	JJ	_	1	xcomp	_	SpaceAfter=No|Translit=jiǎndān|LTranslit=jiǎndān
3	，	，	PUNCT	,	_	1	punct	_	SpaceAfter=No|Translit=,|LTranslit=,
4	只	只	ADV	RB	_	5	advmod	_	SpaceAfter=No|Translit=zhǐ|LTranslit=zhǐ
5	是	是	VERB

## Neural Model

In [5]:
pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16


In [6]:
from transformers import XLMRobertaForTokenClassification, XLMRobertaTokenizerFast
from transformers import Trainer, TrainingArguments
import requests
from conllu import parse

In [24]:
# Checkpoint name used below to load the tokenizer.
model_name = "xlm-roberta-base"

# Download and parse the train dataset.
# The URL is taken from `links_to_datasets` so the treebank location is
# written down in one place only.
url = links_to_datasets["train"]

# NOTE(review): this fetches the train split a second time; `datasets["train"]`
# from the inspection cell should hold the same parse - confirm that nothing
# rebinds `datasets` before reusing it here.
dataset = download_and_parse(url)

# Initialize the tokenizer
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)

def align_tokens_and_labels(sentence, label_field='upos'):
    """Tokenize a pre-split sentence and attach a label to every sub-token.

    Uses the module-level ``tokenizer``.

    Args:
        sentence: Iterable of token mappings; each must provide ``'form'``
            and the key named by ``label_field``.
        label_field: Key of the annotation used as the label
            (default ``'upos'``).

    Returns:
        A pair ``(tokenized_input, aligned_labels)``. ``tokenized_input`` is
        the tokenizer's output. ``aligned_labels`` has one entry per
        sub-token: ``-100`` where the sub-token has no word id or is the bare
        "▁" marker, otherwise the label of the word the sub-token belongs to
        (so every remaining piece of a word carries that word's label).
    """
    words = [token['form'] for token in sentence]  # Original words from the sentence
    labels = [token[label_field] for token in sentence]  # Corresponding labels

    # Tokenize the sentence, indicating that the input is pre-tokenized
    tokenized_input = tokenizer(words, is_split_into_words=True, return_tensors="pt")
    word_ids = tokenized_input.word_ids(batch_index=0)
    # Fetch the sub-token strings once instead of on every comparison.
    pieces = tokenized_input.tokens()

    aligned_labels = []
    for piece, word_idx in zip(pieces, word_ids):
        if word_idx is None or piece == "▁":
            # No source word, or only the word-boundary marker: no label.
            aligned_labels.append(-100)
        else:
            aligned_labels.append(labels[word_idx])

    return tokenized_input, aligned_labels



# Align tokens and labels for the first sentence as an example.
# Relies on `dataset` and `align_tokens_and_labels` being defined already.
first_sentence = dataset[0]
tokenized_input, aligned_labels = align_tokens_and_labels(first_sentence)


# Show the parsed sentence, the raw tokenizer output and the label list.
print(first_sentence)
print("Tokenized input:", tokenized_input)
print("Aligned labels:", aligned_labels)

# Convert token IDs back to tokens (words/subwords); index 0 selects the
# first (only) row of the 'input_ids' entry.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'][0])

# Print each token with its aligned label; the "\n" inside the f-string
# leaves an empty line after every pair.
for token, label in zip(tokens, aligned_labels):
    print(f"{token} --> {label}\n")




TokenList<看似, 簡單, ，, 只, 是, 二, 選, 一, 做, 決擇, ，, 但, 其實, 他們, 代表, 的, 是, 你, 周遭, 的, 親朋, 好友, ，, 試, 著, 給, 你, 不同, 的, 意見, ，, 但, 追根究底, ，, 最後, 決定, 的, 還是, 自己, 。, metadata={sent_id: "train-s1", text: "看似簡單，只是二選一做決擇，但其實他們代表的是你周遭的親朋好友，試著給你不同的意見，但追根究底，最後決定的還是自己。", translit: "kànshìjiǎndān,zhǐshì'èrxuǎnyīzuòjuézé,dànqíshítāmendàibiǎodeshìnǐzhōuzāodeqīnpénghǎoyou,shìzhegěinǐbùtóngdeyìjiàn,dànzhuīgēnjiūdǐ,zuìhòujuédìngdeháishìzìjǐ."}>
Tokenized input: {'input_ids': tensor([[     0,      6, 113875,      6,  32564,      6,      4,      6,   5344,
              6,    354,  87744,      6,   6995,  45690,      6,   2213,      6,
          33808, 235211,      6,      4,  53072,      6,  16827,      6,   8056,
              6,   6959,      6,     43,      6,    354,  73675,      6,   6271,
          44162,      6,     43,      6,  11638, 182529,      6,  81070,      6,
              4,      6,  12324,      6,   3094,      6,   5862,  73675,      6,
           5714,      6,     43,      6,  31505,      6,      4, 