In [1]:
import numpy as np
import re
import itertools
import collections
import keras

from keras.models import Sequential
from keras.layers import Dense, Activation

Using TensorFlow backend.


In [2]:
M = np.load('word2vec.npy')
word2index = dict()
with open('word_lst.txt') as file:
    for counter, line in enumerate(file):
        word = line.strip()
        word2index[word] = counter

def word2vec(word):
    idx = word2index.get(word)
    if idx is None:
        return np.zeros(200)
    return M[idx]

        
def get_vec(word):
    try:
        retval = M[word2index[word]]
    except KeyError:
        retval = np.zeros(200)

In [3]:
def import_dataset(filename):
    
    start_tag = '<S>'
    end_tag = '</S>'
    
    sentence_lst = [] # Sentence is a list word which is list of candidate roots
    sentence_correct_lst = []
    with open(filename) as file:
        start_parsing = False
        for line in file:
            if start_parsing:
                if line.startswith(end_tag):
                    start_parsing = False
                    sentence_lst.append(sentence)
                    sentence_correct_lst.append(sentence_correct)
                else:
                    root_set = set()
                    candidate_lst = line.split()[1:]
                    for parse in candidate_lst:
                        try:
                            root_candidate = parse[:parse.index('+')]
                        except ValueError:
                            continue
                        root_set.add(root_candidate.lower())
                    if root_set:
                        sentence.append(list(root_set))
                    
                        correct = candidate_lst[0][:candidate_lst[0].index('+')]
                        sentence_correct.append(correct.lower())
                        
                
            else:
                if line.startswith(start_tag):
                    start_parsing = True
                    sentence = []
                    sentence_correct = []
                    
        return sentence_correct_lst, sentence_lst

In [4]:
train_dataset_filename = 'dataset/train.merge'
train_sentence_correct_lst, train_sentence_lst = import_dataset(train_dataset_filename)

In [11]:

model = Sequential([
    Dense(100, input_shape=(400,)),
    Activation('relu'),
    Dense(40),
    Activation('relu'),
    Dense(40),
    Activation('relu'),
    Dense(2),
    Activation('softmax'),
])
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

def generate_samples(sentences):
    for sentence in sentences:
        for w1, w2 in zip(sentence, sentence[1:]):
            itert = itertools.product(w1,w2)
            yield next(itert), 1
            for others in itert:
                yield others, 0
samples = list(generate_samples(train_sentence_lst))

In [None]:
def gen_subsamples():
    subsample_size = 10000
    for subsample in range(0, len(samples), subsample_size):
        subsamples = samples[subsample: subsample + subsample_size]
        train_data = np.array([np.append(word2vec(w1), word2vec(w2)) for (w1,w2),_ in subsamples])
        train_labels = np.array([[v==0, v==1] for _, v in subsamples])
        print(subsample, len(samples))
        yield train_data, train_labels
#train_data, train_labels = next(gen_subsamples())
#model.fit(train_data, train_labels, epochs=10, batch_size=32)
for train_data, train_labels in gen_subsamples():
    model.fit(train_data, train_labels, epochs=10, batch_size=32)

In [30]:
model.save("dense_100_40_40_2.keras")

In [31]:
model = keras.models.load_model("dense_100_40_40_2.keras")

In [32]:
model.predict(np.array([np.append(word2vec("new"), word2vec("york"))]))

array([[0., 1.]], dtype=float32)

In [27]:
dataset_filename = 'dataset/test.merge'
sentence_correct_lst, sentence_lst = import_dataset(dataset_filename)

In [28]:
class ScoreModel:
    def __init__(self, verbose=False):
        self.verbose = verbose
    
    def predict(self, sentence):
        max_score = float('-inf')
        predict_sentence = None
        self._cache = {}
        for element in itertools.product(*sentence):
            score = self.calc_sentence_score(element)
            if self.verbose:
                print(element)
                print('Score: %.2f' % score)
                print()
            if score > max_score:
                max_score = score
                predict_sentence = list(element)
        return predict_sentence
    
    def pair_score(self, word1, word2):
        if (word1, word2) in self._cache:
            return self._cache[(word1, word2)]
        try:
            vec1 = M[word2index[word1]]
            vec2 = M[word2index[word2]]
        except:
            return 0
        #return np.abs(np.dot(vec1, vec2))
        self._cache[(word1, word2)] = model.predict(np.array([np.append(vec1, vec2)]))[0][1]
        return self._cache[(word1, word2)]

    def calc_sentence_score(self, sentence):
        score = 0

        if len(sentence) <= 1:
            return score

        for i in range(len(sentence) - 1):
            word1 = sentence[i]
            word2 = sentence[i + 1]
            score += self.pair_score(word1, word2)

        return score

In [29]:
correct_count = 0
false_count = 0

mdl = ScoreModel()
for num, (sentence, sentence_correct) in enumerate(zip(sentence_lst, sentence_correct_lst)):
    predict_sentence = mdl.predict(sentence)
    for word1, word2 in zip(predict_sentence, sentence_correct):
        if word1 == word2:
            correct_count += 1
        else:
            false_count += 1

total_count = correct_count + false_count
accuracy = correct_count / total_count

print('Total number of words : %s' % total_count)
print('Correctly predicted : %s' % correct_count)
print('Accuracy : %.3f' % accuracy)

Total number of words : 861
Correctly predicted : 813
Accuracy : 0.944


In [None]:
def parse_sentence(sentence):
    sentence = '\'' + sentence + '\''
    word_lst = !./trnltk/parser.py {sentence}
    #print(word_lst)
    retval = []
    for word in word_lst:
        root_lst = word.split()
        retval.append(root_lst)
    return retval

    yüklenen yükle+Verb^DB+Verb+Pass+Pos^DB+Adj+PresPart yük+Noun+A3sg+Pnon+Nom^DB+Verb+Acquire+Pos^DB+Adj+PresPart

    Sentence Examples
    
    dolar fiyatları beş TL seviyesinde bulunurken Euro fiyatları altı TL seviyesinde hareket ediyor
    yorulunca alın damarları gözükmeye başladı
    istediğiniz kadar ürün alın 
    sözlerine çok alındı (Wrong)
    ya iyi olarak ölürsün ya da kötüye dönüşecek kadar uzun yaşarsın
    nedir amacımız bunu göndermekle uzaylılara karsı bir sinerji yaratalım dostluk olsun mu
    kafanızı kullansaydınız o taşların doğada bulunan 4 elementi simgelediğini anlardınız

In [None]:
sentence = 'istediğiniz kadar ürün alın'
parsed_sentence = parse_sentence(sentence)

combination_count = 1
for word in parsed_sentence:
  combination_count *= len(word)
print('Total number of possibilities : %s' % combination_count)

print(parsed_sentence)
print(sentence)

vmodel = ScoreModel(verbose=True)
prediction = vmodel.predict(parsed_sentence)

print('-------')
print('Predicted roots : ')
print(prediction)