In [1]:
def lines(filename, encoding="utf-8"):
    """Read a tab-separated text file and split every line into its fields.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII text is decoded the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        A list with one entry per line of the file; each entry is the list
        obtained by splitting that line on tab characters. Line endings are
        kept, so the last field of a line still carries its trailing newline.
    """
    with open(filename, encoding=encoding) as f:
        # Iterate the file object directly; there is no need to load all
        # lines into an intermediate list first.
        return [line.split('\t') for line in f]
    
# Location of the tab-separated sentence-pair file.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
dataf="/pi/ai/seq2seq/jpn-eng/jpn.txt"
# One list of tab-separated fields per line of the file.
pairs=lines(dataf)

In [2]:
# Preview the first five pairs, then the total number of pairs, one per line.
print(pairs[:5], len(pairs), sep='\n')

[['Hi.', 'やっほー。\n'], ['Hi.', 'こんにちは！\n'], ['Run.', '走れ。\n'], ['Run.', '走って！\n'], ['Who?', '誰？\n']]
43009


In [4]:
# Show only the first field of each of the first five pairs.
for entry in pairs[:5]:
    first_field = entry[0]
    print(first_field)

Hi.
Hi.
Run.
Run.
Who?


In [5]:
import spacy
from spacy.symbols import nsubj, VERB

# Load the "en_core_web_sm" spaCy pipeline once; the later cells reuse `nlp` for parsing.
nlp = spacy.load("en_core_web_sm")

In [7]:
from ipywidgets import IntProgress
from IPython.display import display

# The bar is sized so that one tick corresponds to one entry of `pairs`.
max_count=len(pairs)
f = IntProgress(min=0, max=max_count) # progress bar widget ranging from 0 to max_count
display(f) # render the bar in the cell output

def step(val=1):
    """Advance the progress bar ``f`` by ``val`` (one tick by default)."""
    f.value = f.value + val

# Maps a sentence (first field of a pair) to the set of head tokens that are
# tagged VERB and have an nsubj dependent.
# NOTE(review): the values are token objects, which presumably keep their
# parsed docs alive — store token.text instead if only the text is needed.
verb_maps = {}

# nlp.pipe processes the texts as a stream in batches, avoiding the overhead of
# calling nlp() once per sentence. Docs are yielded in input order, so they can
# be zipped back with the pairs they came from.
for pair, doc in zip(pairs, nlp.pipe(p[0] for p in pairs)):
    # Keep the head of every subject token whose head is tagged as a verb.
    verbs = {
        token.head
        for token in doc
        if token.dep == nsubj and token.head.pos == VERB
    }
    if verbs:
        # A sentence that occurs more than once overwrites its earlier entry.
        verb_maps[pair[0]] = verbs
    step()

print(len(verb_maps))

32521


In [10]:
# Spot-check two entries together with the verbs recorded for their sentence.
for idx in (2000, 3000):
    sample = pairs[idx]
    print(sample, verb_maps[sample[0]])

["I'm not a kid.", 'もう子どもじゃないんだから。\n'] {'m}
["Who's with Tom?", 'トムと一緒にいるのは誰ですか？\n'] {'s}


In [12]:
import resources_pb2 as res
import protobuf_utils

def parse(pair, rs):
    """Parse ``pair[0]`` with ``nlp`` and append the packaged result to ``rs``.

    The parsed document is serialized with ``to_bytes()`` and wrapped, together
    with the entries of ``pair``, in a ``res.RsLang`` message.

    Args:
        pair: Entries of one sentence pair; ``pair[0]`` is the text that is parsed.
        rs: List that receives the new ``res.RsLang`` (mutated in place).
    """
    serialized = nlp(pair[0]).to_bytes()
    rs.append(res.RsLang(entries=pair, store=serialized))

# Package the 100 pairs at indices 2000..2099 into a single RsLangs container.
rs = []
for idx in range(2000, 2100):
    sample = pairs[idx]
    parse(sample, rs)
langs = res.RsLangs(langs=rs)

In [13]:
# Sanity check: number of entries held by the container built above.
print(len(langs.langs))

100


In [14]:
# Hand the container to the project helper together with the target path.
# NOTE(review): presumably serializes `langs` into that file — the helper is defined outside this notebook.
protobuf_utils.write_proto_to(langs, './data/langs/samples_100.data')

In [23]:
# Start from an empty container and let the project helper fill it from the file.
# NOTE(review): presumably read_proto populates `load_langs` in place — confirm against protobuf_utils.
load_langs=res.RsLangs()
protobuf_utils.read_proto(load_langs, './data/langs/samples_100.data')
# Number of entries now held by the loaded container.
print(len(load_langs.langs))

100


In [19]:
from spacy.tokens import Doc

def get_verbs(doc):
    """Collect the heads of subject tokens that are tagged as verbs.

    Args:
        doc: Iterable of tokens exposing ``dep``, ``head`` and ``head.pos``.

    Returns:
        A set holding ``token.head`` for every token whose ``dep`` equals
        ``nsubj`` and whose head's ``pos`` equals ``VERB``; empty when no
        token matches.
    """
    return {
        token.head
        for token in doc
        if token.dep == nsubj and token.head.pos == VERB
    }
        
# Rebuild each document from its stored bytes and print the verbs found in it.
for lang in langs.langs:
    doc = Doc(nlp.vocab).from_bytes(lang.store)
    sentence = lang.entries[0]
    print(sentence, get_verbs(doc))

I'm not a kid. {'m}
I'm not alone. {'m}
I'm not angry! {'m}
I'm not angry. {'m}
I'm not drunk. {'m}
I'm not lying. {lying}
I'm on a diet. {'m}
I'm on a diet. {'m}
I'm satisfied. {'m}
I'm satisfied. {'m}
I'm so hungry. {'m}
I'm surprised. {'m}
I'm too short. {'m}
I'm unmarried. {'m}
I'm very busy. {'m}
I'm very busy. {'m}
I'm very busy. {'m}
I'm voting no. {voting}
I've found it. {found}
I've got time. {got}
Is Tom around? {Is}
Is Tom asleep? {Is}
Is Tom famous? {Is}
Is Tom guilty? {Is}
Is Tom lonely? {Is}
Is Tom nearby? {Is}
Is Tom stupid? {Is}
Is he at home? {Is}
Is he correct? {Is}
Is it raining? {Is}
Is it serious? {Is}
Is it too big? {Is}
Is that Tom's? {Is}
Is that a cat? {Is}
Is that right? {Is}
Is that right? {Is}
Is that yours? {Is}
Is there more? set()
Is this Tom's? {Is}
Is this Tom's? {Is}
Is this yours? {Is}
Isn't it cool? {Is}
Isn't it cute? {Is}
Isn't it hard? {Is}
Isn't it hard? {Is}
Isn't it nice? {Is}
Isn't it true? {Is}
Isn't it true? {Is}
Isn't it true? {Is}
Isn't th

In [24]:
import sagas.nlu.spacy_helper as sr

# Parse a single sentence and list text, part-of-speech and dependency label per token.
sents = 'I am a student'
doc = nlp(sents)
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_)
# Last expression of the cell: shows what the packaged helper returns for the same doc.
sr.get_verbs(doc)

I PRON nsubj
am VERB ROOT
a DET det
student NOUN attr


{am}