In [6]:
import pandas as pd

def load_dataset(filename):
    """Read a CSV file into a pandas DataFrame.

    Args:
        filename: Location of the CSV file; passed unchanged to
            ``pd.read_csv``.

    Returns:
        The DataFrame built by ``pd.read_csv`` with its default options.
    """
    dataset = pd.read_csv(filename)
    return dataset

# Column labels of the two languages: `src` names the column fed to the
# model, `tgt` the column holding the reference translations.
src, tgt = "eng", "ovp"

# Read the translation pairs used by the following cells
filename = "../data/random_good_translations.csv"
df = load_dataset(filename)

In [7]:
# Show the loaded DataFrame as this cell's output
df

Unnamed: 0,ovp,eng
0,tüsüga-noka uhuw̃a ui-w̃ui-ku,They wrote those weasels.
1,kwana-deika taa ai-nia-gaa-wei,You and I are going to read these smellers.
2,tsiipa-uu tübbi-neika ma-gwana-ku,That bird smelled this rock.
3,küna-ii tuunapi-noka u-dsibui-gaa-wei,This wood is going to climb that food.
4,tüwoobü-ii katü-peika a-gwana-wei,This earth will smell the one who has sat.
...,...,...
95,tuunapi-ii habi-wei,This food will lie down.
96,tübinohi-peika taa a-gwati-dü,You and I hit this one who has played.
97,hukaw̃ia-weidü-uu kwisha'i-weidoka ui-dsibui-wei,That one who will walk will climb those who wi...
98,wo'abi-ii nobi-noka ui-w̃ui-ti,This worm is writing those houses.


In [1]:
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForSeq2SeqLM, set_seed

# Checkpoint directory from which both the tokenizer and the seq2seq model
# are restored.
model_checkpoint = "/home/self/dev/kubishi-ml/train/Helsinki-NLP/opus-mt-en-mul_train-base-model-fixed-source/checkpoint-500"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Restore the model weights first, then move the model to the GPU
en_to_pt_model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
en_to_pt_model = en_to_pt_model.to("cuda")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import nltk
# Fetch the NLTK data packages named below (tokenizer and tagger data,
# presumably the ones word_tokenize / pos_tag rely on -- confirm).
nltk.download('punkt')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/self/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/self/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [4]:

def tag_words(strings):
    """Build a token -> part-of-speech tag lookup for a collection of strings.

    Every string is split with ``word_tokenize`` and the resulting tokens are
    tagged with ``pos_tag``. All (token, tag) pairs are merged into a single
    dictionary, so a token that occurs more than once keeps the tag of its
    last occurrence.

    Args:
        strings: Iterable of text strings to tokenize and tag.

    Returns:
        dict mapping each distinct token to a tag.
    """
    tag_by_token = {}

    for sentence in strings:
        # pos_tag yields (token, tag) pairs, which dict.update merges in order
        tag_by_token.update(pos_tag(word_tokenize(sentence)))

    return tag_by_token


In [8]:
# Split the frame into the model inputs (`src` column) and the reference
# translations (`tgt` column).
src_input, tgt_input = df[src], df[tgt]

In [10]:
# generate() is called with do_sample=True / top_p=0.9, i.e. tokens are drawn
# at random. Fix the RNG seed first so that re-running this cell reproduces
# the same translations instead of a new random draw each time.
GENERATION_SEED = 42
set_seed(GENERATION_SEED)

results = []
pt_outputs = []  # decoded model output, one string per entry of src_input
en_outputs = []
for input_text in src_input:
    # Tokenize a single sentence and move the resulting tensors to the GPU
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

    # Top-p sampling, capped at 512 tokens; only outputs[0] is read below
    outputs = en_to_pt_model.generate(**inputs, output_scores=True, max_length=512, min_length=1, top_p=0.9, do_sample=True)

    # Turn the generated token ids back into text, dropping special tokens
    out = tokenizer.decode(outputs[0], skip_special_tokens=True)

    pt_outputs.append(out)



In [13]:
# Print, for every example: the source sentence, the model's translation and
# the reference translation, followed by a separator line.
#
# The values are iterated directly instead of looking up `src_input[idx]`
# with idx in 0..n-1: that lookup is label-based on a pandas Series and only
# works while the Series keeps its default integer index.
assert len(pt_outputs) == len(src_input), (
    "pt_outputs does not match src_input; re-run the generation cell"
)
for source_text, predicted_text, reference_text in zip(src_input, pt_outputs, tgt_input):
    print(source_text)
    print(predicted_text)
    print(reference_text)
    print("__")

They wrote those weasels.
tüsüga-noka uhuw̃a ui-w̃ui-ku
tüsüga-noka uhuw̃a ui-w̃ui-ku 
__
You and I are going to read these smellers.
kwana-deika taa ai-nia-gaa-wei
kwana-deika taa ai-nia-gaa-wei 
__
That bird smelled this rock.
tsiipa-uu tübbi-neika ma-gwana-ku
tsiipa-uu tübbi-neika ma-gwana-ku 
__
This wood is going to climb that food.
küna-ii tuunapi-noka u-dsibui-gaa-wei
küna-ii tuunapi-noka u-dsibui-gaa-wei 
__
This earth will smell the one who has sat.
tüwoobü-ii katü-peika a-gwana-wei
tüwoobü-ii katü-peika a-gwana-wei 
__
This one who will read will drink that mosquito.
nia-weidü-ii wo'ada-noka u-hibi-wei
nia-weidü-ii wo'ada-noka u-hibi-wei 
__
This one who talks to is walking.
yadohi-dü-ii hukaw̃ia-ti
yadohi-dü-ii hukaw̃ia-ti 
__
That one who hears stood.
naka-dü-uu wünü-ku
naka-dü-uu wünü-ku 
__
This cup has hit us.
apo-ii ni-gwati-pü
apo-ii ni-gwati-pü 
__
This one who will read hears this dog.
nia-weidü-ii isha'pugu-neika ma-naka-dü
nia-weidü-ii isha'pugu-neika ma-naka-dü 
_

In [15]:
# Load the second dataset into `test_df`.
# Note: this rebinds `filename`, which earlier pointed at the first dataset.
filename = "../data/random_no_verb.csv"
test_df = load_dataset(filename)

In [16]:
# Split the test frame into the model inputs (`src` column) and the
# reference translations (`tgt` column).
src_input_test, tgt_input_test = test_df[src], test_df[tgt]

In [17]:
# generate() is called with do_sample=True / top_p=0.9, i.e. tokens are drawn
# at random. Fix the RNG seed first so that re-running this cell reproduces
# the same translations instead of a new random draw each time.
GENERATION_SEED = 42
set_seed(GENERATION_SEED)

results = []
# Rebinds pt_outputs: from here on it holds the translations of
# src_input_test, one decoded string per entry.
pt_outputs = []
en_outputs = []
for input_text in src_input_test:
    # Tokenize a single sentence and move the resulting tensors to the GPU
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

    # Top-p sampling, capped at 512 tokens; only outputs[0] is read below
    outputs = en_to_pt_model.generate(**inputs, output_scores=True, max_length=512, min_length=1, top_p=0.9, do_sample=True)

    # Turn the generated token ids back into text, dropping special tokens
    out = tokenizer.decode(outputs[0], skip_special_tokens=True)

    pt_outputs.append(out)





In [18]:
# Print, for every test example: the source sentence, the model's translation
# and the reference translation, followed by a separator line.
#
# The values are iterated directly instead of looking up
# `src_input_test[idx]` with idx in 0..n-1: that lookup is label-based on a
# pandas Series and only works while the Series keeps its default integer
# index.
assert len(pt_outputs) == len(src_input_test), (
    "pt_outputs does not match src_input_test; re-run the generation cell"
)
for source_text, predicted_text, reference_text in zip(src_input_test, pt_outputs, tgt_input_test):
    print(source_text)
    print(predicted_text)
    print(reference_text)
    print("__")

That dog will ponder.
isha'pugu-uu tünia-wei
isha'pugu-uu [ponder]-wei 
__
That cottontail is going to relinquish.
katünu-uu tübbi-gaa-wei
tabuutsi'-uu [relinquish]-gaa-wei 
__
This bird is going to stray from those bird snakes.
tsiipa-ii tsiipa-noka ui-gwana-gaa-wei
tsiipa-ii wükada-noka ui-[stray]-gaa-wei 
__
That corn is going to perch.
maishibü-uu pahabichi-gaa-wei
maishibü-uu [perch]-gaa-wei 
__
This jackrabbit hobbled these coyotes.
kamü-ii isha'pugu-neika ai-gwana-ku
kamü-ii isha'-eika ai-[hobble]-ku 
__
This apple is straying.
aaponu'-ii kwatsa'i-ti
aaponu'-ii [stray]-ti 
__
This bird has darted.
tsiipa-ii tübbi-pü
tsiipa-ii [dart]-pü 
__
That dog dawdled those horses.
isha'-uu pugu-noka ui-gwana-ku
isha'pugu-uu pugu-noka ui-[dawdle]-ku 
__
That coffee is going to retort.
kafe-uu retorti-gaa-wei
koopi'-uu [retort]-gaa-wei 
__
That river recoils.
payahuupü-uu tünia-dü
payahuupü-uu [recoil]-dü 
__
That bird snake is clutching.
wükada-uu wükada-ti
wükada-uu [clutch]-ti 
__
That wo