In [7]:
import json
import re

import nltk
import pandas as pd
import stanza
from nltk.tokenize import word_tokenize, sent_tokenize
from stanza.utils.conll import CoNLL

In [125]:
# Load the raw training records from JSON.
# The context manager closes the file handle as soon as parsing is done, and
# the explicit UTF-8 encoding avoids depending on the platform default when
# reading non-ASCII (e.g. Cyrillic) text.
with open("data/train.model-agnostic.json", encoding="utf-8") as data_file:
    data = json.load(data_file)

In [126]:
# Turn the parsed JSON records into a DataFrame and show it as the cell output.
df = pd.DataFrame(data=data)
df

Unnamed: 0,hyp,tgt,src,ref,task,model
0,"Don't worry, it's only temporary.",Don't worry. It's only temporary.,Не волнуйся. Это только временно.,either,MT,
1,Tom is never where he should be.,Tom is never where he's supposed to be.,"Тома никогда нет там, где он должен быть.",either,MT,
2,It's hard for me to work with Tom.,I have trouble working with Tom.,Мне сложно работать с Томом.,either,MT,
3,"Water, please.",I'd like some water.,"Воду, пожалуйста.",either,MT,
4,I didn't expect Tom to betray me.,I didn't think that Tom would betray me.,"Я не ожидал, что Том предаст меня.",either,MT,
...,...,...,...,...,...,...
29995,"Yeah, I'm listening.",,"Yeah, I'm listening.",src,PG,
29996,Time?,,The time?,src,PG,
29997,Plague?,,A plague?,src,PG,
29998,"Tango, Tango.",,Tango.,src,PG,


In [127]:
# Discard the "model" column; the remaining columns are kept as they are.
df = df.drop(columns="model")

In [128]:
# Prepare the frame for the later processing steps.

# Cells holding an empty string get the placeholder "Not given", so every row
# still contributes some text and the columns stay aligned later on.
df = df.replace("", "Not given")

In [129]:
# remove punctuation

def remove_punctuation(string):
    """Return ``string`` with every '.', ',', '!' and '?' character removed.

    All other characters (apostrophes, whitespace, letters of any script)
    are left untouched.
    """
    # Translation table that maps each of the four marks to "delete".
    drop_table = str.maketrans("", "", ".,!?")
    return string.translate(drop_table)

In [130]:
# Strip the punctuation marks from each of the three text columns in turn.
for column in ("hyp", "src", "tgt"):
    df[column] = df[column].apply(remove_punctuation)

In [131]:
# Display the frame after the "model" column was dropped, empty cells were
# filled and punctuation was removed from hyp/src/tgt.
df

Unnamed: 0,hyp,tgt,src,ref,task
0,Don't worry it's only temporary,Don't worry It's only temporary,Не волнуйся Это только временно,either,MT
1,Tom is never where he should be,Tom is never where he's supposed to be,Тома никогда нет там где он должен быть,either,MT
2,It's hard for me to work with Tom,I have trouble working with Tom,Мне сложно работать с Томом,either,MT
3,Water please,I'd like some water,Воду пожалуйста,either,MT
4,I didn't expect Tom to betray me,I didn't think that Tom would betray me,Я не ожидал что Том предаст меня,either,MT
...,...,...,...,...,...
29995,Yeah I'm listening,Not given,Yeah I'm listening,src,PG
29996,Time,Not given,The time,src,PG
29997,Plague,Not given,A plague,src,PG
29998,Tango Tango,Not given,Tango,src,PG


In [132]:
# English stanza pipeline with tokenization, lemmatization and POS tagging.
nlp = stanza.Pipeline(lang="en", processors="tokenize,lemma,pos")

2024-10-31 11:44:54 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-10-31 11:44:55 INFO: Downloaded file to C:\Users\User\stanza_resources\resources.json
2024-10-31 11:44:56 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-10-31 11:44:56 INFO: Using device: cpu
2024-10-31 11:44:56 INFO: Loading: tokenize
2024-10-31 11:44:56 INFO: Loading: mwt
2024-10-31 11:44:56 INFO: Loading: pos
2024-10-31 11:44:56 INFO: Loading: lemma
2024-10-31 11:44:56 INFO: Done loading processors!


In [96]:
# Annotate the "hyp" column and export it in CoNLL-U format.
hyp = list(df["hyp"])
# Use the same 100-row prefix as the tgt and src exports; with a different
# row count the three .conllu files cannot line up sentence by sentence.
# NOTE(review): rows are glued together with "." and no space; whether the
# sentence splitter yields exactly one sentence per row is not checked here.
hyp_doc = nlp(".".join(hyp[:100]))
CoNLL.write_doc2conll(hyp_doc, "data/train_hyp.conllu")

In [141]:
# Annotate the first 100 rows of the "tgt" column and export them as CoNLL-U.
tgt = df["tgt"].tolist()
tgt_text = ".".join(tgt[:100])
tgt_doc = nlp(tgt_text)
CoNLL.write_doc2conll(tgt_doc, "data/train_tgt.conllu")

In [140]:
# Annotate the first 100 rows of the "src" column and export them as CoNLL-U.
src = df["src"].tolist()
src_text = ".".join(src[:100])
src_doc = nlp(src_text)
CoNLL.write_doc2conll(src_doc, "data/train_src.conllu")

I created one CoNLL-U file per column; the files are aligned, so sentences with the same ID belong together.

Problems:

* For the translation (MT) task, lemmatization of the source input does not work, because the source sentences are in a different language than the English pipeline expects.
*