In [1]:
import json

import numpy as np
import pandas as pd

In [2]:
# Load the word-window dataset and discard the `target` column, which this
# notebook does not use.
df = (
    pd.read_csv('../data/reddit/siacoin_words_dataset_lg.csv')
    .drop(columns=['target'])
)

# Show the first two rows as a quick sanity check of the layout.
df.head(2)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,they,charge,ridiculous,amounts,to,make,small,changes,to,the
1,1,charge,ridiculous,amounts,to,make,small,changes,to,the,code


In [3]:
### Example: a 5-token sentence [1,2,3,4,5] -> sm (limit 3) sees [3,4,5]; sm_md (limit 5) sees [1,2,3,4,5]; md and lg need more tokens than that, so they make no prediction.
class TextGeneratorModel(object):
    """One next-word model bundled with its vocabulary and context length.

    Args:
        vocab: sequence of tokens; a token's position is its class index.
        model: object exposing ``predict_classes(X)`` (the notebook passes
            the result of ``load_model``).
        limit: number of trailing tokens fed to the model.
        weight: relative weight returned by ``get_weight``.
    """

    def __init__(self, vocab, model, limit, weight):
        self.vocab = vocab
        self.model = model
        self.limit = limit
        self.weight = weight

        self.vocab_size = len(self.vocab)
        # Two lookup tables: token -> index and index -> token.
        self.word_indices = {tk: i for i, tk in enumerate(self.vocab)}
        self.indices_word = {i: tk for i, tk in enumerate(self.vocab)}

    def get_weight(self):
        """Return the weight given at construction time."""
        return self.weight

    def get_line(self, sentence):
        """Translate tokens to their vocabulary indices.

        Raises:
            KeyError: if a token is not in this model's vocabulary.
        """
        return np.array([
            self.word_indices[token]
            for token
            in sentence
        ])

    def predict_next(self, sentence: list) -> tuple:
        """Predict the token that follows ``sentence``.

        Only the last ``limit`` tokens are used.

        Returns:
            ``(index, token)`` for the predicted class, or ``(-1, None)``
            when ``sentence`` holds fewer than ``limit`` tokens. The
            sentinel is a tuple so callers can always inspect element 0.
        """
        ## start: 0, end: len -> [limit: len]
        start = len(sentence) - self.limit
        if start < 0:
            return (-1, None)

        # Translate only the window that is actually fed to the model, so a
        # token outside the window that this vocabulary lacks cannot raise.
        window = self.get_line(sentence[start:])

        X_new = np.array([window])  # batch containing a single sample
        predicted_class = self.model.predict_classes(X_new).tolist()

        index = predicted_class[0]
        return (index, self.indices_word[index])
    
class TextGeneratorEnsemble(object):
    """Combines several generators by a weighted random pick.

    Args:
        generators: objects exposing ``predict_next(sentence)`` and
            ``get_weight()``.
    """

    def __init__(self, generators):
        self.generators = generators

    @staticmethod
    def _is_abstention(prediction):
        """True when a generator signalled "no prediction".

        Accepts both the bare ``-1`` sentinel and a sequence whose first
        element is ``-1``; indexing the bare integer would raise TypeError.
        """
        if isinstance(prediction, (int, np.integer)):
            return prediction == -1
        return prediction[0] == -1

    def predict_next(self, sentence):
        """Pick the next token from the generators' predictions.

        Every generator that produces a prediction contributes one
        candidate; one candidate is drawn at random with probability
        proportional to its generator's weight.

        Returns:
            ``(token, results)`` where ``results`` is the list of all
            candidate predictions, or ``-1`` when no generator produced one.
        """
        weights = []
        results = []

        for generator in self.generators:
            prediction = generator.predict_next(sentence)
            if self._is_abstention(prediction):
                continue

            results.append(prediction)
            weights.append(generator.get_weight())

        if not results:
            return -1

        # Normalise the weights into a probability vector for the draw.
        weights = np.asarray(weights, dtype=float)
        index = np.random.choice(
            len(results),
            1,
            p=weights / weights.sum()
        )[0]

        return results[index][1], results
    
class TextGenerator(object):
    """Grows a sentence one token at a time using an ensemble.

    Args:
        ensemble: object exposing ``predict_next(sentence)``.
        sentence: initial sequence of tokens.
    """

    def __init__(self, ensemble, sentence):
        self.ensemble = ensemble
        self.sentence = sentence

    def get_next_word(self):
        """Predict the next token and append it to ``self.sentence``.

        Returns:
            ``(next_token, results)`` exactly as produced by the ensemble.

        Raises:
            ValueError: if the ensemble returns its ``-1`` sentinel, i.e. it
                could not produce any prediction for the current sentence.
        """
        # Use the ensemble given to this instance, not a notebook global.
        outcome = self.ensemble.predict_next(self.sentence)
        if isinstance(outcome, (int, np.integer)) and outcome == -1:
            raise ValueError(
                'ensemble produced no prediction for the current sentence'
            )

        next_token, results = outcome
        # np.append returns a new array; the caller's sequence is not mutated.
        self.sentence = np.append(self.sentence, next_token)
        return next_token, results

In [4]:
from keras.models import load_model

Using TensorFlow backend.


In [5]:
# One entry per trained model: file-name suffix, context length in tokens,
# and the weight it carries in the ensemble's random pick.
models = [
    dict(type='_sm', limit=3, weight=1),
    dict(type='_sm_md', limit=5, weight=1),
    dict(type='_md', limit=7, weight=3),
    dict(type='_lg', limit=10, weight=1),
]

text_generators = []
for model in models:
    key = model['type']

    # Each model is stored alongside its own vocabulary file.
    with open(f'../data/reddit/models/siacoin_vocab{key}.json', 'r') as vocab_input:
        vocab = json.load(vocab_input)

    template = f'../data/reddit/models/siacoin_model{key}.h5'
    siacoin_model = load_model(template)

    siacoin_text_generator = TextGeneratorModel(
        vocab=vocab,
        model=siacoin_model,
        limit=model['limit'],
        weight=model['weight'],
    )
    text_generators.append(siacoin_text_generator)

# Wrap all per-model generators in a single weighted ensemble.
ensemble = TextGeneratorEnsemble(text_generators)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [6]:
# Seed NumPy's global RNG so the chosen row (and any later draws from the
# same RNG) repeat from run to run.
np.random.seed(899)

limit = 10

# Random row position in [0, number of rows).
seed_index = np.random.randint(0, len(df.index))

data_frame = df.iloc[seed_index]

# Skip the row's first value and keep the next `limit` values as the seed
# sentence. NOTE(review): presumably the first column is a saved index
# column -- confirm against the CSV header.
row_values = data_frame.tolist()
line = row_values[1:limit + 1]

text_generator = TextGenerator(ensemble, line)

In [7]:
# Display the seed sentence, one token per entry.
list(line)

['what',
 'are',
 'my',
 'current',
 'values',
 'for',
 'host',
 'config',
 'besides',
 'looking']

In [8]:
# Generate 25 tokens; for each one print the chosen token followed by the
# candidate token proposed by every contributing model.
for i in range(25):
    next_token, choices = text_generator.get_next_word()
    candidate_words = [word for _, word in choices]
    print(next_token, '-', candidate_words)

into - ['into', 'and', 'into', 'into']
the - ['the', 'the', 'the', 'the']
json - ['json', 'developing', 'json', 'json']
on - ['on', 'on', 'on', 'on']
disk - ['disk', 'disk', 'disk', 'value']
which - ['which', 'which', 'which', 'week']
will - ['are', 'are', 'are', 'will']
i - ['be', 'facilitate', 'not', 'i']
just - ['never', 'be', 'just', 'do']
remove - ['be', 'adjustments', 'remove', 'a']
yet - ['my', 'another', 'this', 'yet']
from - ['that', 'to', 'from', 'is']
my - ['your', 'the', 'my', 'there']
role - ['time', 'computer', 'role', 'of']
and - ['streaming', 'in', 'and', 'what']
let - ['writing', 'still', 'let', 'the']
us - ['is', 'us', 'us', 'siacoin']
talk - ['know', 'talk', 'see', 'to']
for - ['about', 'down', 'to', 'for']
proofs - ['a', 'proofs', 'the', 'their']
of - ['with', 'of', 'of', 'through']
monetizing - ['blown', 'hundreds', 'monetizing', 'the']
account - ['have', 'a', 'coins', 'account']
pricing - ['out', 'and', 'pricing', 'over']
to - ['as', 'it', 'to', 'and']
