# Shaolin-dev Notebook

In [6]:
! pip install tqdm



In [1]:
import os
import time
import pickle
from tqdm import tqdm

In [2]:
def read_words():
    begin = {}
    end = {}
    with open('rijeci.txt', 'r') as f:
        words = f.read()
        words = words.split('\n')

        for word in words:
            begin[word] = word[:2]
            end[word] = word[-2:]
            
#     print_dict(begin, 10)
#     print_dict(end, 10)
    return begin, end
# read_words()

In [3]:
def frequency(begin, end):
    """
    Returns a dict with all possible postifixes mapped to their 'fitness' 
    i.e. number of words starting with those 2 letters.
    """
    begin_values = list(begin.values())
    end_values = list(end.values())

    freq = {}
    for ev in tqdm(end_values):
        points = begin_values.count(ev)
        if ev not in freq:
            freq[ev] = points
        else:
            if points > freq[ev]:
                freq[ev] = points

    return freq

In [4]:
begin, end = read_words()
freq = frequency(begin, end)
    

100%|██████████| 159837/159837 [05:59<00:00, 444.10it/s]


In [10]:
def print_dict(dic, last=-1):
    i = 0
    for word in dic:
        print(word + ' ' * (20 - len(word)) + str(dic[word]))
        i += 1
        if i == last:
            return


def load_shaolin(file_path):
    with open(file_path, 'rb') as f:
        unpickled = pickle.load(f)
        return unpickled

In [9]:
def calc_words_fitness(begin = None, end = None, freq = None, save = False, try_to_load = True):
    
    if 'word_fitness.pkl' in os.listdir() and try_to_load:
        with open('word_fitness.pkl', 'rb') as wf:
            return pickle.load(wf)
        
    if begin is None or end is None:
        begin, end = read_words()
        freq = frequency(begin, end)
    elif freq is None:
        freq = frequency(begin, end)
    
    words = begin.keys()

    word_fitness = {}
    for word in words:
        begin_of_the_word = begin[word]
        if begin_of_the_word in freq:
            word_fitness[word] = freq[begin_of_the_word]
        else:
            word_fitness[word] = 0
    
    if 'word_fitness.pkl' not in os.listdir() or save:
        with open('word_fitness.pkl', 'wb') as wf:
            pickle.dump(word_fitness, wf)
            
    return word_fitness

In [10]:
words_fitness = calc_words_fitness(begin, end, freq, save = True, try_to_load = False)

In [11]:
def invert_dict_words(d):
    
    d_values = list(d.values())
    inv = {}
    for val in d_values:
        inv[val] = []
        
    for key in d:
        inv[d[key]].append(key)
        
    return inv

def get_prefixes_and_postfixes_dicts(word_begin, word_end):
    prefixes_words = invert_dict_words(word_begin)
    postfixes_words = invert_dict_words(word_end)
    
    return prefixes_words, postfixes_words

In [20]:
prefixes, postfixes = get_prefixes_and_postfixes_dicts(begin, end)

True


(1061, 2606)

In [24]:
def num_of_possible_next_words(word, prefixes, previous_words = []):
    ret_list = list(filter(lambda w: False if w in previous_words else True, prefixes[word[:2]]))
    return ret_list

In [26]:
# Test for function above
num_of_possible_next_words('kakiti', prefixes, ['kabac', 'kabadahija'])

['kabadahijin',
 'kabaj',
 'kabala',
 'kabalin',
 'kabalini',
 'kabalist',
 'kabalisticki',
 'kabalistov',
 'kabanica',
 'kabao',
 'kabare',
 'kabaret',
 'kabaretov',
 'kabaretski',
 'kabaretsko',
 'kabas',
 'kabasaj',
 'kabashaj',
 'kabashi',
 'kabasi',
 'kabbaj',
 'kabel',
 'kabelka',
 'kabelov',
 'kabelski',
 'kabic',
 'kabicek',
 'kabil',
 'kabiljagic',
 'kabiljo',
 'kabilski',
 'kabina',
 'kabinet',
 'kabinetov',
 'kabinetski',
 'kabinetsko',
 'kabinin',
 'kabinski',
 'kablar',
 'kablarevic',
 'kablaric',
 'kablic',
 'kablica',
 'kablicev',
 'kablinovic',
 'kabliranje',
 'kabljanac',
 'kablogram',
 'kablogramov',
 'kablovic',
 'kablovski',
 'kablovsko',
 'kabok',
 'kabotaza',
 'kabotazin',
 'kabrio',
 'kabriolet',
 'kabrioletov',
 'kabrioletski',
 'kabriov',
 'kabudaja',
 'kac',
 'kaca',
 'kacak',
 'kacakov',
 'kacan',
 'kacandol',
 'kacanic',
 'kacanski',
 'kacapor',
 'kacar',
 'kacarevic',
 'kacarik',
 'kacarovski',
 'kacarski',
 'kacavenda',
 'kacer',
 'kacera',
 'kaceska',
 'k

In [12]:
class Node(object):
    def __init__(self, word, father = None, fitness = -1):
        self.word = word
        self.fitness = fitness
        self.children = []
        self.father = father

    def add_child(self, obj):
        self.children.append(obj)
        
    def __str__(self):
        return ('(' + self.word + ', ' + str(self.fitness) + ')')
    
    def get_list_of_ancestors(self):
        ret = []
        curr = self
        while curr.father is not None:
            ret.append(curr.father)
            curr = curr.father

In [29]:
word_begin, word_end = read_words()
words = list(word_begin.keys())
pre, post = get_prefixes_and_postfixes_dicts(word_begin, word_end)