In [1]:
import nltk

In [2]:
PATH_TO_FCFG_FILE = "bhw4.fcfg"

with open(PATH_TO_FCFG_FILE, "r") as f:
    print(f.read())

% start S

# Grammar productions

# S expansion

S -> VP
S -> NP[case=nom, gend=?g, num=?n] VP[gend=?g, num=?n]
S -> NP[case=dat] TV[refl=true, num=?n] NP[case=nom, num=?n]

# VP expansion

VP[tense=?t, num=?n] -> IV[tense=?t, num=?n]
VP[tense=?t, gend=?g, num=?n, refl=?r] -> TV[tense=?t, gend=?g, num=?n, refl=?r] NP[case=acc]
VP[tense=?t, num=?n] -> DV[num=?n, tense=?t] NP[case=acc] NP[case=dat]
# NP expansion

NP[case=?c, gend=?g, num=?n] -> N[case=?c, gend=?g, num=?n]
NP[case=?c, gend=?g, num=?n] -> N[case=?c, gend=?g, num=?n] NP[case=gen]

# Lexical productions

# Nouns

N[gend=fem,  case=nom, num=sg] -> 'devushka'
N[gend=fem,  case=nom, num=pl] -> 'devushki'
N[gend=fem,  case=gen, num=sg] -> 'devushki'
N[gend=fem,  case=acc, num=sg] -> 'devushku'
N[gend=fem,  case=dat, num=pl] -> 'devushkam'
N[gend=fem,  case=dat, num=sg] -> 'devushke'

N[gend=masc, case=nom, num=sg] -> 'malychik'
N[gend=masc, case=nom, num=pl] -> 'malychiki' | 'babushki'
N[gend=masc, type=anim, case=acc, num=sg] 

In [3]:
vocab = set()

In [4]:
def get_vocab(sentences):
    vocab = set()
    for s in sentences:
        tokens = s.split()
        for t in tokens:
            vocab.add(t)
    return list(sorted(list(vocab)))
def update_vocab(vocab, new_words):
    return vocab.union(set(new_words))
def delta_vocab(old_vocab, new_vocab):
    return new_vocab.difference(old_vocab)

In [5]:
parser = nltk.load_parser(PATH_TO_FCFG_FILE, trace=0)

In [6]:
tokens = "xodit".split()
for tree in parser.parse(tokens):
    print(tree)

(S[] (VP[num='sg', tense='pres'] (IV[num='sg', tense='pres'] xodit)))


In [7]:
ok = ['xodit', 'xodjat', 'devushka xodit', 'malychik xodit', 'devushki xodjat', 'malychiki xodjat']
fail = ['devushki xodit', 'devushku xodit', 'devushka xodjat', 'devushka xodit malychika']

In [8]:
def test(sentence):
    tokens = sentence.split()
    res = parser.parse(tokens)
    print(s, len(list(res)))

In [9]:
for s in ok:
    test(s)
print('----------')
for s in fail:
    test(s)

xodit 1
xodjat 1
devushka xodit 1
malychik xodit 1
devushki xodjat 1
malychiki xodjat 1
----------
devushki xodit 0
devushku xodit 0
devushka xodjat 0
devushka xodit malychika 0


In [10]:
old_vocab = vocab
vocab = update_vocab(old_vocab, get_vocab(ok+fail))
delta_vocab(old_vocab, vocab)

{'devushka',
 'devushki',
 'devushku',
 'malychik',
 'malychika',
 'malychiki',
 'xodit',
 'xodjat'}

In [11]:
tokens = "devushku xodit".split()
for tree in parser.parse(tokens):
    print(tree)

In [12]:
transitive = ['devushka vidit malychika', 'malychik vidit devushku', 'devushka vidit dom', 'malychik vidit dom', 'vidit malychika', 'vidit devushku', 'vidit dom', 'vidjat malychika', 'vidjat devushku', 'vidjat dom']
print(len(transitive))
transitive_wrong = ['devushka vidit malychik']
transitive_ignore = ['devushka vidit doma', 'devushku vidit malychika']
print(len(transitive+transitive_wrong+transitive_ignore))

10
13


In [13]:
old_vocab = vocab
vocab = update_vocab(old_vocab, get_vocab(transitive+transitive_wrong+transitive_ignore))
delta_vocab(old_vocab, vocab)

{'dom', 'doma', 'vidit', 'vidjat'}

In [14]:
transitive_2 = ['devushka vidit dom malychika', 'malychik vidit dom devushki', 'devushke nravitsja malychik', 'malychiku nravitsja devushka', 'devushkam nravitsja malychik', 'malychikam nravitsja devushka', 'devushke nravjatsja malychiki', 'malychiku nravjatsja devushki', 'devushka darit podarok malychiku', 'malychik darit podarok devushke', 'babushki devushki darjat podarok malychiku', 'darjat podarok babushkam devushki']

In [15]:
print(len(transitive_2))

12


In [16]:
print("\n".join(get_vocab(transitive+transitive_wrong+transitive_2)))

babushkam
babushki
darit
darjat
devushka
devushkam
devushke
devushki
devushku
dom
malychik
malychika
malychikam
malychiki
malychiku
nravitsja
nravjatsja
podarok
vidit
vidjat


In [17]:
for s in transitive:
    test(s)
print('----------')
for s in transitive_wrong:
    test(s)
print('----------')
transitive_genitive = transitive_2[0:2]
for s in transitive_genitive:
    test(s)
transitive_dative = transitive_2[2:8]
print('----------')
for s in transitive_dative:
    test(s)
print('----------')
transitive_ditransitive = transitive_2[8:]
for s in transitive_ditransitive:
    test(s)

devushka vidit malychika 1
malychik vidit devushku 1
devushka vidit dom 1
malychik vidit dom 1
vidit malychika 1
vidit devushku 1
vidit dom 1
vidjat malychika 1
vidjat devushku 1
vidjat dom 1
----------
devushka vidit malychik 0
----------
devushka vidit dom malychika 1
malychik vidit dom devushki 1
----------
devushke nravitsja malychik 1
malychiku nravitsja devushka 1
devushkam nravitsja malychik 1
malychikam nravitsja devushka 1
devushke nravjatsja malychiki 1
malychiku nravjatsja devushki 1
----------
devushka darit podarok malychiku 1
malychik darit podarok devushke 1
babushki devushki darjat podarok malychiku 1
darjat podarok babushkam devushki 1


In [18]:
past = ['devushka videla malychika', 'devushka videla dom', 'videla malychika', 'videla dom', 'malychik videl devushku', 'malychik videl dom', 'videl devushku', 'videl dom', 'devushki videli malychika', 'malychiki videli devushku', 'videli malychika', 'videli devushku', 'videli dom']
past_wrong = ['devushka videl malychika', 'devushka videli malychika']
print(len(past+past_wrong))
old_vocab = vocab
vocab = update_vocab(old_vocab, get_vocab(past+past_wrong))
delta_vocab(old_vocab, vocab)

15


{'videl', 'videla', 'videli'}

In [19]:
for s in past:
    test(s)
print("-"*8)
for s in past_wrong:
    test(s)

devushka videla malychika 1
devushka videla dom 1
videla malychika 1
videla dom 1
malychik videl devushku 1
malychik videl dom 1
videl devushku 1
videl dom 1
devushki videli malychika 1
malychiki videli devushku 1
videli malychika 1
videli devushku 1
videli dom 1
--------
devushka videl malychika 0
devushka videli malychika 0


In [20]:
negatives = ['v dome jestymaslo', 'v dome jestymasla', 'v dome njet masla', 'v dome njet maslo', 'v dome jestyblinchiki', 'v dome jestyblinchikov', 'v dome njet blinchikov', 'v dome njet blinchiki', 'devushka ne vidit malychika', 'devushka ne vidit malychika', 'devushka ne vidit malychikov', 'devushka ne vidit dom', 'devushka ne vidit doma', 'devushka ne vidit domov', 'devushka vidit doma', 'malychik ne vidit devushku', 'malychik ne vidit devushki', 'malychik vidit devushki'] 
wrong_indexes = [2, 4, 6, 8, 15, 18]
negatives_wrong = [negatives[i-1] for i in wrong_indexes]
negatives_correct = [negatives[i] for i in range(len(negatives)) if i-1 not in wrong_indexes]
print("\n".join(negatives_wrong))
print("-"*8)
print("\n".join(negatives_correct))
print("-"*8)
old_vocab = vocab
vocab = update_vocab(old_vocab, get_vocab(negatives))
delta_vocab(old_vocab, vocab)

v dome jestymasla
v dome njet maslo
v dome jestyblinchikov
v dome njet blinchiki
devushka vidit doma
malychik vidit devushki
--------
v dome jestymaslo
v dome jestymasla
v dome njet masla
v dome jestyblinchiki
v dome njet blinchikov
devushka ne vidit malychika
devushka ne vidit malychikov
devushka ne vidit dom
devushka ne vidit doma
devushka ne vidit domov
devushka vidit doma
malychik ne vidit devushku
malychik vidit devushki
--------


{'blinchiki',
 'blinchikov',
 'dome',
 'domov',
 'jestyblinchiki',
 'jestyblinchikov',
 'jestymasla',
 'jestymaslo',
 'malychikov',
 'masla',
 'maslo',
 'ne',
 'njet',
 'v'}