In [1]:
import pickle
import numpy as np
import random
from tqdm import tqdm
import os
import os.path
from collections import Counter
from get_data_for_w2v import *

import datetime

import tensorflow as tf


def loggin(*params):
    """
    Echo the arguments to stdout and append them to ``log.txt``.

    All positional arguments are converted with ``str`` and joined with single
    spaces. The line written to the log file is prefixed with the current
    local time formatted as ``"%I:%M%p on %B %d, %Y"``; the printed line has
    no timestamp.
    """
    message = " ".join(map(str, params))
    print(message)
    stamp = datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")
    with open("log.txt", "a") as logfile:
        logfile.write(stamp + " " + message + "\n")


    
# Show which TensorFlow build the notebook runs against.
# `tf.__version__` is used instead of the legacy `tf.VERSION` alias, which
# newer TensorFlow releases no longer expose at the top level.
print(tf.__version__)

1.3.0


In [None]:
# Corpus / vocabulary setup. All helpers here come from the star import of
# get_data_for_w2v at the top of the notebook.
# NOTE(review): get_emb_data() is called only for its side effects; what it
# prepares is not visible in this notebook - confirm against get_data_for_w2v.
get_emb_data()
# Two lookup tables: word -> index and index -> word (used by encode/decode below).
word_to_inx, inx_to_word = get_dicts()
# Vocabulary size is the number of entries in the word -> index table.
vocab_size = len(word_to_inx)
loggin("prepared_text len:",len(get_prepared_text()))
loggin("vocab_size:",vocab_size)

def encode(prepared_text):
    """
    Convert a whitespace-separated string of tokens into vocabulary indices.

    Each token is looked up in the module-level ``word_to_inx`` table; an
    unknown token raises ``KeyError``. Returns a numpy array (empty for an
    empty or all-whitespace string).
    """
    indices = []
    for token in prepared_text.split():
        indices.append(word_to_inx[token])
    return np.array(indices)
    
def decode(seq):
    """
    Convert an iterable of vocabulary indices back into text.

    Every index is looked up in the module-level ``inx_to_word`` table and the
    resulting words are joined with single spaces.
    """
    words = (inx_to_word[index] for index in seq)
    return ' '.join(words)


def sequenses_generator(data, batch_len, seq_length, randomized = True):
    """
    Endlessly yield batches of fixed-length windows cut from ``data``.

    Parameters
    ----------
    data : sequence of sequences
        Each element must support ``len()`` and slicing.
    batch_len : int
        Number of windows per yielded batch.
    seq_length : int
        Length of every window. Elements shorter than this are skipped.
    randomized : bool
        If True, both the element and the window offset inside it are drawn
        with ``np.random.randint``. If False, elements are visited in order
        starting from index 0 and the window always starts at offset 0; note
        that the position restarts from the first element for every batch.

    Yields
    ------
    numpy.ndarray
        ``np.array`` built from ``batch_len`` windows of ``seq_length`` items.

    Raises
    ------
    ValueError
        On the first ``next()`` call, when ``batch_len > 0`` but no element of
        ``data`` is at least ``seq_length`` long (the batch could never be
        filled and the generator would otherwise loop forever).
    """
    data_len = len(data)

    # Without at least one long-enough element the inner loop below can never
    # collect a window, so fail loudly instead of hanging.
    if batch_len > 0 and not any(len(seq) >= seq_length for seq in data):
        raise ValueError(
            "no sequence in data has at least seq_length=%s elements" % seq_length)

    while True:
        X = []
        rand_inx = -1
        while len(X) < batch_len:
            if randomized:
                rand_inx = np.random.randint(data_len)
            else:
                rand_inx = (rand_inx + 1)%data_len

            poem_len = len(data[rand_inx])

            if poem_len < seq_length:
                continue
            if randomized:
                # randint's upper bound is exclusive, so "+ 1" is required to
                # (a) accept elements that are exactly seq_length long and
                # (b) make the last valid window reachable.
                shift = np.random.randint(poem_len - seq_length + 1)
            else:
                shift = 0

            seq_in = data[rand_inx][shift:seq_length+shift]
            X.append(seq_in)
        yield np.array(X)
        
        
def generate(model, max_size, sampling = False, pattern = None):
    """
    Generate up to ``max_size`` word indices with ``model``.

    The model state is reset, then every token of ``pattern`` (a
    whitespace-separated string, encoded with ``encode``) is fed through
    ``model.step`` to warm up the state; the argmax of the last warm-up output
    becomes the first input of the generation loop. When ``pattern`` is empty
    or None there is no warm-up and the first ``model.step`` call receives
    ``None``.

    At each generation step the entry for the ``'<rear_w>'`` token is zeroed
    in the output of ``model.step`` and the vector is renormalised. The next
    index is then drawn with ``np.random.choice`` (``sampling=True``) or taken
    as the argmax. Generation stops early, without emitting it, when the
    chosen index maps to ``'#'``.

    Returns a numpy array with the generated indices.
    """
    seed_tokens = encode(pattern if pattern else '')

    model.reset_state()

    current = None
    for token in seed_tokens:
        current = model.step(token).argmax()

    result = []
    for _ in range(max_size):
        probs = model.step(current)
        # never emit the rare-word placeholder
        probs[word_to_inx['<rear_w>']] = 0
        probs = probs/np.sum(probs)

        current = np.random.choice(range(len(probs)), p = probs) if sampling else probs.argmax()

        if inx_to_word[current] == '#':
            break
        result.append(current)

    return np.array(result)

In [None]:
from os import listdir
from os.path import isfile, join

def rnn_placeholders(state):
    """
    Mirror an RNN state structure with ``placeholder_with_default`` tensors.

    Every tensor found in ``state`` is wrapped in a placeholder that defaults
    to that tensor, keeps its shape and is named after the tensor's op.
    ``LSTMStateTuple`` instances are rebuilt as ``LSTMStateTuple``; any other
    iterable is walked recursively and returned as a plain tuple.
    """
    def as_placeholder(tensor):
        return tf.placeholder_with_default(tensor, tensor.shape, tensor.op.name)

    # LSTMStateTuple must be tested before the generic iterable fallback,
    # otherwise it would be flattened into an ordinary tuple.
    if isinstance(state, tf.contrib.rnn.LSTMStateTuple):
        cell_state, hidden_state = state
        cell_ph = as_placeholder(cell_state)
        hidden_ph = as_placeholder(hidden_state)
        return tf.contrib.rnn.LSTMStateTuple(cell_ph, hidden_ph)

    if isinstance(state, tf.Tensor):
        return as_placeholder(state)

    return tuple([rnn_placeholders(part) for part in state])
    
def lstm_cell(state_size):
    """Create one ``tf.contrib.rnn.BasicLSTMCell`` with ``state_size`` units."""
    cell = tf.contrib.rnn.BasicLSTMCell(state_size)
    return cell


class Model(object):
    """
    Word-level LSTM language model built on TensorFlow 1.x graph APIs.

    The constructor builds three sub-graphs that share variables:

    1. a training graph (embedding -> stacked LSTM -> dense layer -> sequence
       loss) fed through ``self.train_input``;
    2. an accuracy node comparing the argmax prediction with the next token;
    3. a single-step prediction graph fed through ``self.input`` and
       ``self.input_state`` and used by ``step`` for generation.

    Checkpoints are written to / read from the ``model_waights`` directory
    (relative to the current working directory).
    """

    # Directory holding the checkpoints; the spelling is kept so that already
    # saved checkpoints are still found.
    _SAVE_DIR = "model_waights"

    def __init__(self, sess, seq_length, vocab_size, embed_dim, verbas = True,  state_sizes=[128,128], grad_clip = False):
        """
        Build the graph in the current default TensorFlow graph.

        Parameters
        ----------
        sess : tf.Session
            Session used for every ``run`` call made by this object.
        seq_length : int
            Number of tokens in each training sequence.
        vocab_size : int
            Size of the vocabulary (embedding rows and output units).
        embed_dim : int
            Dimension of the word embeddings.
        verbas : bool
            When True, log the shapes of the main tensors through ``loggin``.
        state_sizes : sequence of int
            Number of units of each stacked LSTM layer.
        grad_clip : bool
            When True, clip every gradient element to [-1, 1].
        """
        self.sess = sess
        # Copy so that neither the shared default list nor a caller's list is aliased.
        self.state_sizes = list(state_sizes)
        # Recurrent state used by `step`; filled in by `reset_state`.
        self.curent_state = None
        # Created lazily by `_get_saver` so that save/load reuse one Saver.
        self._saver = None

        if verbas: loggin('Creating NN ....')

        # data placeholder
        self.train_input = tf.placeholder(tf.int64, [None, seq_length])

        # 1) define the learning graph
        # embeddings for input data
        emb_layer = tf.contrib.layers.embed_sequence(self.train_input,
                                            vocab_size=vocab_size,
                                            embed_dim=embed_dim,
                                            scope="data_emb")

        # LSTM RNN layers
        cells = [lstm_cell(size) for size in self.state_sizes]
        self.rnn_cell = tf.contrib.rnn.MultiRNNCell(cells)

        with tf.variable_scope("rnn_layer"):
            lstm_output, lstm_states = tf.nn.dynamic_rnn(self.rnn_cell, emb_layer, dtype = tf.float32)

        # Define truncated sequences for output and loss calculation:
        # the first `skeep_words` positions are not scored, and the target at
        # each scored position is the input token one step later.
        skeep_words = 4
        start_position = skeep_words
        count = seq_length-skeep_words-1

        trancated_lstm_output = tf.slice(lstm_output, begin = [0,start_position,0], size = [-1,count,-1])
        targets = tf.slice(self.train_input, begin = [0,start_position+1], size = [-1,count])

        # Output layer
        output_layer = tf.contrib.layers.fully_connected(trancated_lstm_output, vocab_size,
                                                         activation_fn=None, scope='FC_out')

        # Calculate loss (all positions weighted equally)
        input_data_shape = tf.shape(targets)
        self.loss = tf.contrib.seq2seq.sequence_loss(output_layer,
                                                     targets,
                                                     tf.ones([input_data_shape[0], input_data_shape[1]]))

        self.learning_rate = tf.placeholder(tf.float32)
        optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate)
        if grad_clip:
            gvs = optimizer.compute_gradients(self.loss)
            # compute_gradients yields (None, var) for variables the loss does
            # not depend on; clip_by_value cannot handle None, so skip those.
            capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var)
                          for grad, var in gvs if grad is not None]
            self.optimize = optimizer.apply_gradients(capped_gvs)
        else:
            self.optimize = optimizer.minimize(self.loss)

        # 2) define the accuracy function
        pred_classes = tf.reshape(tf.argmax(output_layer, axis=2),[-1])
        y_classes = tf.reshape(targets,[-1])

        self.accuracy = tf.reduce_mean(tf.cast(tf.equal(pred_classes, y_classes), tf.float32))

        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('accuracy', self.accuracy)
        self.merged_summary = tf.summary.merge_all()

        # 3) graph predicting the next token from a single input token
        self.input = tf.placeholder(tf.int32)
        # Build the batch-size-1 zero state once: it is both the default value
        # of the state placeholders and what `reset_state` evaluates, so no
        # new ops are added to the graph on every reset.
        self._zero_state = self.rnn_cell.zero_state(1,tf.float32)
        self.input_state = rnn_placeholders(self._zero_state)
        pred_emb_layer = tf.contrib.layers.embed_sequence(tf.reshape(self.input, [1,1]),
                                            vocab_size=vocab_size,
                                            embed_dim=embed_dim,
                                            scope="data_emb",
                                            reuse=True)

        with tf.variable_scope("rnn_layer"):
            tf.get_variable_scope().reuse_variables()
            predict_rnn_output, self.output_state = tf.nn.dynamic_rnn(self.rnn_cell,
                                                                pred_emb_layer, dtype = tf.float32,
                                                                initial_state = self.input_state )

        predict_output_layer = tf.contrib.layers.fully_connected(predict_rnn_output, vocab_size,
                                                         activation_fn=None, scope='FC_out', reuse = True)

        # Softmax over the vocabulary, flattened to a 1-D vector.
        self.predict_output = tf.reshape(tf.nn.softmax(predict_output_layer),[-1])

        if verbas:
            loggin("input tensor shape:",self.train_input.get_shape())
            loggin("\t Learning graph:")
            loggin("emb_layer shape:",emb_layer.get_shape())
            loggin("lstm_output shape:", lstm_output.get_shape())
            loggin("start_position for slice:",start_position)
            loggin("slice count:",count)
            loggin("output_layer shape:",output_layer.get_shape())
            loggin("targets shape:",targets.get_shape())
            loggin("self.loss shape:",self.loss.get_shape())

            loggin("\t Acc:")
            loggin("pred_classes shape:", pred_classes.get_shape())
            loggin("y_classes shape:", y_classes.get_shape())

            loggin("\t Prediction graph:")
            loggin("pred_emb_layer shape:",pred_emb_layer.get_shape())
            loggin("predict_rnn_output shape:",predict_rnn_output.get_shape())
            loggin("predict_output shape:",self.predict_output.get_shape())

    def train_on_batch(self, batch, learning_rate = 1e-3):
        """Run one optimisation step on ``batch`` and return the loss of that step."""
        feed = {self.train_input: batch,self.learning_rate:learning_rate}
        return self.sess.run([ self.loss, self.optimize ], feed)[0]

    def get_loss(self, batch):
        """Return the loss on ``batch`` without updating any weights."""
        feed = {self.train_input: batch}
        return self.sess.run(self.loss, feed)

    def get_accuracy(self, batch):
        """Return the fraction of scored positions of ``batch`` predicted correctly."""
        feed = {self.train_input: batch}
        return self.sess.run(self.accuracy, feed)

    def get_summary(self, batch):
        """Return the serialized merged summary (loss and accuracy) for ``batch``."""
        feed = {self.train_input: batch}
        return self.sess.run(self.merged_summary, feed)

    def get_accuracy_for_seq(self,seq):
        """
        Next-word accuracy of the step-by-step predictor on the text ``seq``.

        The text is encoded with ``encode`` and fed token by token through
        ``step`` from a fresh state. The argmax after token ``i`` is compared
        with token ``i + 1``, and the result is the share of correct
        comparisons. Sequences with fewer than two tokens allow no
        comparison and give ``0.0``.
        """
        encoded = encode(seq)
        self.reset_state()
        predicted = [self.step(w_inx) for w_inx in encoded]

        # The output for the last token has no target, hence n - 1 comparisons.
        pairs = list(zip(predicted[:-1], encoded[1:]))
        if not pairs:
            return 0.0
        eq_count = sum(int(target == probs.argmax()) for probs, target in pairs)
        return eq_count/len(pairs)

    def step(self ,w_inx):
        """
        Feed one token index and return the predicted distribution over the vocabulary.

        The recurrent state is kept in ``self.curent_state`` between calls; it
        is initialised to zeros automatically if ``reset_state`` was never called.
        """
        if getattr(self, "curent_state", None) is None:
            self.reset_state()
        feed = {self.input: w_inx, self.input_state: self.curent_state}
        out, self.curent_state = self.sess.run([self.predict_output, self.output_state], feed)
        return out

    def reset_state(self):
        """Reset the recurrent state used by ``step`` to zeros."""
        self.curent_state = self.sess.run(self._zero_state)

    def __deffault_fname__(self):
        """Return the default checkpoint file name derived from the LSTM layout."""
        fname = "statesize-%s-cellcount-%s-.ckpt" % (
            str(self.state_sizes[-1]),str(len(self.state_sizes)) )
        return fname

    def _get_saver(self):
        """Return the ``tf.train.Saver`` of this model, creating it on first use."""
        saver = getattr(self, "_saver", None)
        if saver is None:
            # One Saver per model: constructing a new one for every save/load
            # would keep adding save/restore ops to the graph.
            saver = self._saver = tf.train.Saver()
        return saver

    def save(self, fname = None, loss = None):
        """
        Save all variables to ``model_waights/<fname>``.

        ``fname`` defaults to the name from ``__deffault_fname__``. When
        ``loss`` is given, ``"loss-<loss>-"`` is prepended so that
        ``load_best`` can rank checkpoints. The directory is created if missing.
        """
        if not fname: fname = self.__deffault_fname__()
        # `is not None` so that a loss of exactly 0 is still recorded.
        if loss is not None: fname = "loss-%s-"%(str(loss))+fname
        os.makedirs(self._SAVE_DIR, exist_ok=True)
        fname = join(self._SAVE_DIR, fname)
        save_path = self._get_saver().save(self.sess, fname)
        print("Model saved in file: %s" % save_path)

    def load(self, fname = None):
        """Restore all variables from ``model_waights/<fname>`` (default name if omitted)."""
        if not fname: fname = self.__deffault_fname__()
        fname = join(self._SAVE_DIR, fname)
        self._get_saver().restore(self.sess, fname)

    @staticmethod
    def _parse_checkpoint_name(fname):
        """
        Parse ``key-value-key-value-...`` out of a checkpoint file name.

        Returns a dict with ``loss`` (float), ``statesize`` (int) and
        ``cellcount`` (int), or None when one of them is missing or malformed.
        """
        parts = fname.split('-')
        # Consecutive pairs (0,1), (2,3), ...; a trailing unpaired part is ignored.
        fields = dict(zip(parts[0::2], parts[1::2]))
        try:
            return {"loss": float(fields["loss"]),
                    "statesize": int(fields["statesize"]),
                    "cellcount": int(fields["cellcount"])}
        except (KeyError, ValueError):
            return None

    def load_best(self):
        """
        Restore the lowest-loss checkpoint saved for this LSTM layout.

        Files in ``model_waights`` are matched by the ``statesize`` and
        ``cellcount`` encoded in their names; among the matches the one with
        the smallest ``loss`` value in its name is loaded. If the directory
        does not exist or holds no matching checkpoint, a message is logged
        and nothing is loaded.
        """
        savepath = self._SAVE_DIR
        files = []
        if os.path.isdir(savepath):
            files = [f for f in listdir(savepath) if isfile(join(savepath, f))]

        # checkpoint prefix -> loss; one checkpoint consists of several files
        # sharing the same prefix, so the dict also de-duplicates them.
        candidates = {}
        for fname in files:
            params = self._parse_checkpoint_name(fname)
            if params is None:
                continue
            if params["statesize"] != self.state_sizes[-1]:
                continue
            if params["cellcount"] != len(self.state_sizes):
                continue
            prefix = fname.split('.ckpt')[0]+'.ckpt'
            candidates.setdefault(prefix, params["loss"])

        if not candidates:
            loggin('No checkpoints for this model')
            return

        fname = min(candidates, key=candidates.get)
        loggin('loaded from %s' % fname)
        self.load(fname)
        
    

In [6]:
# Smoke test: build a very small model and call every entry point once.
with tf.Graph().as_default(), tf.Session() as sess:
    model = Model(sess, seq_length=20, vocab_size=vocab_size, embed_dim=300, verbas = True,  state_sizes=[16,16])
    sess.run(tf.global_variables_initializer())

    gen = sequenses_generator(get_data(), 16, 20)
    batch = next(gen)
    model.train_on_batch(batch, learning_rate = 1e-3)

    model.reset_state()
    loggin(model.step(1))
    loggin(model.get_accuracy_for_seq("а и а нет любовь <period>"))
    for probe in (model.get_accuracy, model.get_summary, model.get_loss):
        loggin(probe(batch))

    # argmax / sampling, each without and with a seed phrase
    generation_cases = [
        (False, None),
        (True, None),
        (False, "а и а нет"),
        (True, "а и а нет"),
    ]
    for case_number, (use_sampling, seed_text) in enumerate(generation_cases, 1):
        generated = generate(model, max_size = 20, sampling = use_sampling, pattern = seed_text)
        loggin("%d:" % case_number, decode(generated))
        loggin("\n")

Creating NN ....
input tensor shape: (?, 20)
	 Learning graph:
emb_layer shape: (?, 20, 300)
lstm_output shape: (?, 20, 16)
start_position for slice: 4
slice count: 15
output_layer shape: (?, 15, 6631)
targets shape: (?, 15)
self.loss shape: ()
	 Acc:
pred_classes shape: (?,)
y_classes shape: (?,)
	 Prediction graph:
pred_emb_layer shape: (1, 1, 300)
predict_rnn_output shape: (1, 1, 16)
predict_output shape: (6631,)
[ 0.0001508   0.0001508   0.0001508  ...,  0.00015081  0.00015082
  0.0001508 ]
0.0
0.0375
b'\n\x0b\n\x04loss\x153\xc1\x0cA\n\x0f\n\x08accuracy\x15\x9a\x99\x19='
8.79717
1: полюбив полюбив полюбив полюбив полюбив полюбив полюбив подругой подругой подругой туманных туманных туманных туманных туманных туманных судьбой судьбой судьбой судьбой


2: спят засвищет страданий страстных крови безнадежной высоко сонных вражда зарею дед красивой наугад живые смеется новь уединенный ярче <comma> второй


3: полюбив полюбив полюбив полюбив полюбив полюбив подругой подругой подругой спир

In [None]:
def train(model, eps_count = 10, batch_len = 256, batchs_in_ep = 256, learning_rate = 1e-3, seq_length = None):
    """
    Train ``model`` and checkpoint it whenever the validation loss improves.

    Parameters
    ----------
    model : Model
        Model to train; must provide ``train_on_batch``, ``get_loss``,
        ``get_accuracy`` and ``save`` (and what ``generate`` needs).
    eps_count : int
        Number of epochs.
    batch_len : int
        Number of sequences per training batch.
    batchs_in_ep : int
        Number of training batches per epoch.
    learning_rate : float
        Learning rate passed to every ``train_on_batch`` call.
    seq_length : int or None
        Length of the training windows. When None it is read from the second
        dimension of ``model.train_input``, so the function no longer depends
        on a notebook-level ``seq_length`` variable matching the model.

    Raises
    ------
    ValueError
        If the held-out split contains no sequence of at least ``seq_length``
        tokens, i.e. there is nothing to validate on.

    The first ``len(data)//100 * 2`` items returned by ``get_data()`` (about
    2%) are held out. One fixed validation batch is built from them (one
    window from the start of every long-enough item) and reused after each
    epoch. Progress and sample generations are reported through ``loggin``.
    """
    if seq_length is None:
        seq_length = int(model.train_input.get_shape()[1])

    print('gen, before training sampling:+',decode(generate(model, max_size = 20, sampling = True)),'+')
    print('gen, before training no sampling:+',decode(generate(model, max_size = 20, sampling = False)),'+')

    data = get_data()
    # ~2% of the data is held out for test/validation
    test_size = len(data)//100 * 2

    train_data = data[test_size:]
    test_data = data[:test_size]

    # One window per usable held-out item.
    test_batch_len = sum(1 for d in test_data if len(d) >= seq_length)
    if test_batch_len == 0:
        raise ValueError(
            "held-out split has no sequence of at least %s tokens" % seq_length)
    batch_for_test_loss = next(sequenses_generator(test_data, test_batch_len,
                                                   seq_length, randomized = False))

    data_gen = sequenses_generator(train_data, batch_len, seq_length)

    # Only checkpoints that beat the loss measured before training are saved.
    min_loss = model.get_loss(batch_for_test_loss)

    for ep in range(eps_count):
        # Stays None when batchs_in_ep == 0 so the log line below still works.
        train_loss = None
        for batches_processed in tqdm(range(batchs_in_ep)):
            train_x = next(data_gen)
            train_loss = model.train_on_batch(train_x, learning_rate = learning_rate)

        loss = model.get_loss(batch_for_test_loss)
        acc = model.get_accuracy(batch_for_test_loss)

        if min_loss > loss:
            min_loss = loss
            model.save(loss = loss)

        loggin('ep %s acc %s, last loss %s, train_loss: %s' % ( ep,str(acc), str(loss), str(train_loss) ) )
        loggin('gen, sampling:+',decode(generate(model, max_size = 20, sampling = True)),'+')
        loggin('gen, no sampling:+',decode(generate(model, max_size = 20, sampling = False)),'+')

In [None]:
# Network architecture
seq_length = 40
embed_dim = 300
state_sizes = [1024, 1024, 1024]

# Optimisation settings
eps_count = 10
batch_len = 256
batchs_in_ep = 300
learning_rate = 1e-4
# False: restore the best saved checkpoint for this architecture before training
new_model = False

with tf.Graph().as_default(), tf.Session() as sess:
    model = Model(
        sess,
        seq_length = seq_length,
        vocab_size = vocab_size,
        embed_dim = embed_dim,
        verbas = True,
        state_sizes = state_sizes,
        grad_clip = True,
    )

    # Initialise first; a restored checkpoint then overwrites the initial values.
    sess.run(tf.global_variables_initializer())
    if not new_model:
        model.load_best()

    train(
        model,
        eps_count = eps_count,
        batch_len = batch_len,
        batchs_in_ep = batchs_in_ep,
        learning_rate = learning_rate,
    )
        

Creating NN ....
input tensor shape: (?, 40)
	 Learning graph:
emb_layer shape: (?, 40, 300)
lstm_output shape: (?, 40, 1024)
start_position for slice: 4
slice count: 35
output_layer shape: (?, 35, 6631)
targets shape: (?, 35)
self.loss shape: ()
	 Acc:
pred_classes shape: (?,)
y_classes shape: (?,)
	 Prediction graph:
pred_emb_layer shape: (1, 1, 300)
predict_rnn_output shape: (1, 1, 1024)
predict_output shape: (6631,)
loaded from loss-5.0461-statesize-1024-cellcount-3-.ckpt
INFO:tensorflow:Restoring parameters from model_waights/loss-5.0461-statesize-1024-cellcount-3-.ckpt
gen, before training sampling:+ спирт отчий прекрасные сибирь сокола робкие прекрасные прекрасные палач прекрасные старшая молодым небесам палач знакома плащ сокола музыки знакома стужи +
gen, before training no sampling:+ прекрасные прекрасные палач прекрасные прекрасные прекрасные палач прекрасные прекрасные прекрасные палач прекрасные прекрасные прекрасные палач прекрасные прекрасные прекрасные палач прекрасные 

100%|██████████| 300/300 [05:43<00:00,  1.14s/it]


Model saved in file: model_waights/loss-5.06741-statesize-1024-cellcount-3-.ckpt
ep 0 acc 0.25291, last loss 5.06741, train_loss: 4.54541
gen, sampling:+ палач мертвой заре дожди пустыней коня палач единственной молча трудно прекрасные сокола прекрасные прекрасные класс руси палач полюбив делить бои +


  0%|          | 0/300 [00:00<?, ?it/s]

gen, no sampling:+ прекрасные прекрасные палач сокола прекрасные прекрасные прекрасные палач прекрасные прекрасные прекрасные палач прекрасные прекрасные прекрасные палач прекрасные прекрасные прекрасные палач +


100%|██████████| 300/300 [05:43<00:00,  1.14s/it]


ep 1 acc 0.251852, last loss 5.08184, train_loss: 4.39345
gen, sampling:+ палач сокола прекрасные палач приятель чужим дядя палач сокола прекрасные глыбы старшая нежно узнала палач знакома небесным прекрасные песен старшая +


  0%|          | 0/300 [00:00<?, ?it/s]

gen, no sampling:+ прекрасные палач прекрасные прекрасные прекрасные палач прекрасные прекрасные прекрасные палач прекрасные прекрасные прекрасные палач прекрасные прекрасные прекрасные палач прекрасные прекрасные +


100%|██████████| 300/300 [05:43<00:00,  1.14s/it]


ep 2 acc 0.251852, last loss 5.10235, train_loss: 4.41621
gen, sampling:+ палач растворилась красавицы палач ну союз засвищет посвящается ища зыбкой сестра посвящается прекрасные службы никак жестокой палач язвы мертвой самых +


  0%|          | 0/300 [00:00<?, ?it/s]

gen, no sampling:+ палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные +


100%|██████████| 300/300 [05:43<00:00,  1.15s/it]


ep 3 acc 0.253968, last loss 5.13099, train_loss: 4.34652
gen, sampling:+ старшая язвы гусар повеет прекрасные класс счастливым голове союз степь реже недуги палач вышли прекрасные какой палач дрожа прекрасные союз +


  0%|          | 0/300 [00:00<?, ?it/s]

gen, no sampling:+ палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные +


100%|██████████| 300/300 [05:43<00:00,  1.14s/it]


ep 4 acc 0.255026, last loss 5.16478, train_loss: 4.34507
gen, sampling:+ сошлись палач о смех прекрасные прекрасные прекрасные прекрасные палач имя плод плечами палач жалею палач утеха ловил знакома крепка полным +


  0%|          | 0/300 [00:00<?, ?it/s]

gen, no sampling:+ палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные палач прекрасные +


 15%|█▍        | 44/300 [00:50<04:52,  1.14s/it]