In [1]:
import numpy as np
import re
import itertools

In [2]:
# Embedding matrix: the functions below read row i as the vector of the word
# that sits on line i of word_lst.txt.
M = np.load('word2vec.npy')

# Map every word (one per line, surrounding whitespace stripped) to its
# zero-based line number. If a word is repeated, the last line wins.
# NOTE(review): the file is opened with the platform default encoding --
# confirm this matches how word_lst.txt was written.
with open('word_lst.txt') as file:
    word2index = {line.strip(): counter for counter, line in enumerate(file)}

def pair_score(word1, word2):
    """Return the absolute dot product of the embeddings of two words.

    Both words are looked up in the module-level ``word2index`` mapping and
    their vectors are read from the matching rows of the module-level ``M``.

    Returns:
        ``abs(dot(vec1, vec2))``, or ``0`` when either word is not in
        ``word2index`` or its index has no row in ``M``.
    """
    try:
        vec1 = M[word2index[word1]]
        vec2 = M[word2index[word2]]
    except (KeyError, IndexError):
        # Only "this word has no embedding" is mapped to a zero score. The
        # previous bare ``except`` also swallowed KeyboardInterrupt and real
        # programming errors, which made the exhaustive search that calls
        # this function impossible to interrupt and hid genuine bugs.
        return 0
    return np.abs(np.dot(vec1, vec2))

def calc_sentence_score(sentence):
    """Score a sequence of words by summing ``pair_score`` over neighbours.

    Every word is paired with the word that directly follows it, and the
    pair scores are added up left to right. A sequence with fewer than two
    words has no neighbouring pairs, so its score is 0.
    """
    neighbours = zip(sentence, sentence[1:])
    return sum(pair_score(left, right) for left, right in neighbours)
        
def get_vec(word):
    """Return the embedding vector of ``word``.

    The word is looked up in the module-level ``word2index`` mapping and its
    vector is the matching row of the module-level matrix ``M``.

    Returns:
        The row of ``M`` for ``word``, or an all-zero vector with as many
        entries as ``M`` has columns when the word is not in ``word2index``.
    """
    try:
        retval = M[word2index[word]]
    except KeyError:
        # Size the fallback from M itself (instead of a hard-coded 200) so it
        # always has the same length as a real embedding row.
        retval = np.zeros(M.shape[1])
    # The original computed ``retval`` but never returned it, so every call
    # evaluated to None.
    return retval

In [4]:
def import_dataset(filename):
    """Read a tagged dataset file and collect candidate and gold roots.

    Only lines between a line starting with ``<S>`` and the next line
    starting with ``</S>`` are parsed. On such a line the first
    whitespace-separated token is dropped and every remaining token is
    treated as one analysis; the root of an analysis is the lower-cased text
    before its first ``+``. Analyses without a ``+`` yield no root.

    The first analysis on a line supplies the gold root. A word is skipped
    when none of its analyses yields a root, or when the first (gold)
    analysis has no ``+``; previously the latter case raised ``ValueError``.

    Args:
        filename: path of the dataset file.

    Returns:
        ``(sentence_correct_lst, sentence_lst)``:
        ``sentence_correct_lst[i][j]`` is the gold root of word ``j`` in
        sentence ``i``; ``sentence_lst[i][j]`` is the sorted list of distinct
        candidate roots of that same word. A sentence that is never closed
        by ``</S>`` is not returned.
    """
    start_tag = '<S>'
    end_tag = '</S>'

    sentence_lst = []  # per sentence: one list of candidate roots per word
    sentence_correct_lst = []  # per sentence: one gold root per word
    sentence = None  # None while outside an <S> ... </S> section
    sentence_correct = None

    with open(filename) as file:
        for line in file:
            if sentence is None:
                if line.startswith(start_tag):
                    sentence = []
                    sentence_correct = []
                continue

            if line.startswith(end_tag):
                sentence_lst.append(sentence)
                sentence_correct_lst.append(sentence_correct)
                sentence = None
                sentence_correct = None
                continue

            # One entry per analysis: its lower-cased root, or None when the
            # analysis contains no '+'.
            root_lst = []
            for parse in line.split()[1:]:
                root, plus, _ = parse.partition('+')
                root_lst.append(root.lower() if plus else None)

            if not root_lst or root_lst[0] is None:
                # No analyses at all, or no gold root to evaluate against.
                continue

            # Sort the distinct roots: set iteration order for strings varies
            # with the interpreter's hash seed, which made tie-breaking
            # between equally scored candidates differ from run to run.
            candidates = sorted({root for root in root_lst if root is not None})
            sentence.append(candidates)
            sentence_correct.append(root_lst[0])

    return sentence_correct_lst, sentence_lst

In [5]:
# Path of the dataset file handed to import_dataset.
dataset_filename = 'dataset/test.merge'
# import_dataset returns the gold roots first and the candidate roots second.
sentence_correct_lst, sentence_lst = import_dataset(dataset_filename)

In [6]:
class ScoreModel:
    """Exhaustive root selector driven by ``calc_sentence_score``.

    Every way of picking one candidate per word is scored, and the
    highest-scoring combination is the prediction.
    """

    def __init__(self, verbose=False):
        # When True, predict() prints each combination and its score.
        self.verbose = verbose

    def predict(self, sentence):
        """Return the best-scoring combination of candidates as a list.

        ``sentence`` is an iterable of candidate lists, one per word. Among
        equally scored combinations the first one produced by
        ``itertools.product`` is kept, because only a strictly greater score
        replaces the current best. Returns None when no combination exists
        (some word has no candidates).
        """
        best_combo = None
        best_score = float('-inf')
        for combo in itertools.product(*sentence):
            combo_score = calc_sentence_score(combo)
            if self.verbose:
                print(combo)
                print('Score: %.2f' % combo_score)
                print()
            if combo_score > best_score:
                best_score, best_combo = combo_score, combo
        return None if best_combo is None else list(best_combo)

In [10]:
# Word-level evaluation: predict the roots of every sentence and compare them
# position by position with the gold roots.
mdl = ScoreModel()

correct_count = 0
false_count = 0
for sentence, sentence_correct in zip(sentence_lst, sentence_correct_lst):
    predict_sentence = mdl.predict(sentence)

    # One boolean per compared word: True when prediction and gold agree.
    matches = [word1 == word2 for word1, word2 in zip(predict_sentence, sentence_correct)]
    correct_count += matches.count(True)
    false_count += matches.count(False)

total_count = correct_count + false_count
accuracy = correct_count / total_count

print('Total number of words : %s' % total_count)
print('Correctly predicted : %s' % correct_count)
print('Accuracy : %.3f' % accuracy)

Total number of words : 861
Correctly predicted : 810
Accuracy : 0.941


In [8]:
def parse_sentence(sentence):
    """Run ``./trnltk/parser.py`` on ``sentence`` and tokenise its output.

    The sentence is handed to the parser as one command-line argument. It is
    passed as an argv element rather than interpolated into a shell command,
    so apostrophes and other shell metacharacters in the text reach the
    parser unchanged; the earlier hand-written single-quote wrapping broke on
    any sentence containing ``'``.

    Args:
        sentence: the raw sentence text.

    Returns:
        One list per line of parser output, holding that line's
        whitespace-separated tokens.
    """
    # Imported locally because the notebook's import cell does not provide it.
    import subprocess

    # NOTE(review): stderr is merged into stdout to mirror what the IPython
    # ``!`` capture is documented to return -- confirm this is still wanted.
    completed = subprocess.run(
        ['./trnltk/parser.py', sentence],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    return [line.split() for line in completed.stdout.splitlines()]

    yüklenen yükle+Verb^DB+Verb+Pass+Pos^DB+Adj+PresPart yük+Noun+A3sg+Pnon+Nom^DB+Verb+Acquire+Pos^DB+Adj+PresPart

    Sentence Examples
    
    dolar fiyatları beş TL seviyesinde bulunurken Euro fiyatları altı TL seviyesinde hareket ediyor
    yorulunca alın damarları gözükmeye başladı
    istediğiniz kadar ürün alın 
    sözlerine çok alındı (Wrong)
    ya iyi olarak ölürsün ya da kötüye dönüşecek kadar uzun yaşarsın
    nedir amacımız bunu göndermekle uzaylılara karsı bir sinerji yaratalım dostluk olsun mu
    kafanızı kullansaydınız o taşların doğada bulunan 4 elementi simgelediğini anlardınız

In [9]:
# Demo: parse one sentence and show every candidate combination with its score.
sentence = 'istediğiniz kadar ürün alın'
parsed_sentence = parse_sentence(sentence)

# The search space is the product of the candidate counts of all words.
combination_count = 1
for word in parsed_sentence:
    combination_count = combination_count * len(word)
print('Total number of possibilities : %s' % combination_count)

print(parsed_sentence, sentence, sep='\n')

# verbose=True makes predict() print each combination and its score.
vmodel = ScoreModel(verbose=True)
prediction = vmodel.predict(parsed_sentence)

print('-------', 'Predicted roots : ', prediction, sep='\n')

Total number of possibilities : 4
[['iste'], ['kadar'], ['ürün', 'ürü'], ['al', 'alın']]
istediğiniz kadar ürün alın
('iste', 'kadar', 'ürün', 'al')
Score: 38.53

('iste', 'kadar', 'ürün', 'alın')
Score: 26.36

('iste', 'kadar', 'ürü', 'al')
Score: 0.00

('iste', 'kadar', 'ürü', 'alın')
Score: 0.00

-------
Predicted roots : 
['iste', 'kadar', 'ürün', 'al']
