In [1]:
import math
import numpy as np
import h5py
import os
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  
import sys
sys.path.append("..")
import tensorflow as tf
from Vocabulary import *
import time
tf.keras.backend.clear_session()
import csv
import cloudpickle
from csv_writer import *

In [2]:
class lr_On_plato:
    """Reduce-on-plateau learning-rate tracker.

    Feed it the loss of every epoch through ``notify_loss``. When the loss has
    not improved on the best recorded value for more than ``patience`` epochs,
    the learning rate returned by ``get_lr`` is halved and the reference point
    is reset to the current epoch.
    """

    # Class-level defaults; instances overwrite them as training progresses.
    lowest_loss = -1   # -1 is the sentinel for "no loss recorded yet"
    lowest_time = 0    # epoch at which lowest_loss was recorded
    patience = 10      # epochs to wait without improvement before decaying
    factor = 0.005     # current learning rate

    def __init__(self, patience=None, factor=None):
        """Optionally override the default patience and initial learning rate.

        Calling the constructor without arguments keeps the class defaults.
        """
        if patience is not None:
            self.patience = patience
        if factor is not None:
            self.factor = factor

    def notify_loss(self,loss,epoch):
        """Record the loss of ``epoch`` and halve the learning rate on a plateau.

        Args:
            loss: loss value of the finished epoch.
            epoch: index of the finished epoch.
        """
        if self.lowest_loss == -1 or loss < self.lowest_loss:
            # First observation or a new best value: move the reference point.
            self.lowest_loss = loss
            self.lowest_time = epoch
        elif loss > self.lowest_loss and self.lowest_time + self.patience < epoch:
            # Worse than the reference for more than `patience` epochs.
            # The original compared against a literal 10 and ignored `patience`.
            self.lowest_loss = loss
            self.lowest_time = epoch
            print("decreased LR")
            self.factor = self.factor * 0.5

    def get_lr(self,epoch):
        """Return the current learning rate (``epoch`` is accepted but not used)."""
        return self.factor
        

In [3]:
class ModelTrainer:
    """Trains word and context embeddings block by block from co-occurrence files.

    The parameters are kept in an HDF5 file with the datasets ``"weights"``
    (vector_size x vocab), ``"context-weights"`` (vocab x vector_size),
    ``"bias"`` (1 x vocab) and ``"context-bias"`` (vocab x 1). They are copied
    into TensorFlow variables for training and written back after every epoch.
    The co-occurrence matrix is read one block file at a time from
    ``block_path``.

    The loss is a weighted least-squares fit of ``w . w~ + b + b~`` to
    ``log(X + 1)`` (the form resembles the GloVe objective).

    Naming: ``zeile`` is the block row index, ``spalte`` the block column index.
    """

    # Exponent and saturation threshold of the co-occurrence weighting
    # implemented in cut_function2.
    alpha = tf.constant(0.75,dtype=tf.dtypes.float32)
    XMAX = tf.constant(100.0,dtype=tf.dtypes.float32)

    def __init__(self,vocab_length,block_path,vector_size = 100,block_length = 5000):
        """Store the problem dimensions; no file is opened here.

        Args:
            vocab_length: number of words in the vocabulary.
            block_path: directory containing the co-occurrence block files.
            vector_size: embedding dimension.
            block_length: edge length of one co-occurrence block. Presumably
                this has to match the size the block files were written with
                -- TODO confirm against the code that produced them.
        """
        self.vector_size = vector_size
        self.block_length = block_length
        self.amount_split = math.ceil(vocab_length/float(self.block_length))
        print('amout_split: ' + str(self.amount_split))
        self.block_path = block_path
        self.vocab_length = vocab_length

    #start training for first time
    def prepare(self,filename):
        """Create a new parameter file for experiment ``filename`` and initialise it.

        The HDF5 file is opened in ``"w"`` mode, so an existing file with the
        same name is truncated. A CSV log ``filename + ".csv"`` is opened too.
        """
        self.f = h5py.File('S:\\{filename}.hdf5'.format(filename=filename), "w")#plus experiment name
        #initalize all the HDF files
        self.con_weights = self.f.create_dataset("context-weights", (self.vocab_length, self.vector_size))
        self.weights = self.f.create_dataset("weights",(self.vector_size,self.vocab_length))
        self.context_bias = self.f.create_dataset("context-bias", (self.vocab_length,1))
        self.bias = self.f.create_dataset("bias", (1,self.vocab_length))
        self.csv_writer = CSV_writer(filename+".csv")

        self.init_matrices()

    #return to training
    def resume(self,filename):
        """Re-open the parameter file of experiment ``filename`` to continue training.

        Raises:
            KeyError: if one of the four expected datasets is missing from the
                file (previously this surfaced later as an error on ``None``).
        """
        self.f = h5py.File('S:\\{filename}.hdf5'.format(filename=filename), "r+")#plus experiment name
        # Indexing (instead of .get) fails immediately when a dataset is absent.
        self.con_weights = self.f["context-weights"]
        self.weights = self.f["weights"]
        self.context_bias = self.f["context-bias"]
        self.bias = self.f["bias"]
        self.csv_writer = CSV_writer(filename+".csv",appendmode=True)

    def init_matrices(self,chunk_size=10000):
        """Randomly initialise all four parameter datasets, ``chunk_size`` rows/columns at a time."""
        self.init_hdf_matrix(self.weights,-0.5,0.5,chunk_size)
        self.init_hdf_matrix(self.con_weights,-0.5,0.5,chunk_size)
        self.init_hdf_matrix(self.context_bias,-0.5,0.5,chunk_size)
        self.init_hdf_matrix(self.bias,-0.5,0.5,chunk_size)

    def init_hdf_matrix(self,hdf_data,min_value,max_value,block_length):
        """Fill a 2-D HDF5 dataset with uniform random values scaled by 1/vector_size.

        Values are drawn from ``[min_value, max_value)`` and divided by
        ``self.vector_size``. The dataset is written in slabs of at most
        ``block_length`` along its longer axis so that the full matrix never
        has to be held in memory.

        NOTE: the previous implementation ignored ``min_value``/``max_value``
        and always drew from ``[0, 1)``; files created before this fix were
        initialised with non-negative values only.
        """
        rows, cols = hdf_data.shape
        if rows > cols:
            for start in range(0, rows, block_length):
                stop = min(start + block_length, rows)
                hdf_data[start:stop, :] = np.random.uniform(min_value, max_value, (stop - start, cols))/self.vector_size
        else:
            for start in range(0, cols, block_length):
                stop = min(start + block_length, cols)
                hdf_data[:, start:stop] = np.random.uniform(min_value, max_value, (rows, stop - start))/self.vector_size

    def load_block(self,zeile,spalte):
        """Load co-occurrence block (zeile, spalte) into ``self.tf_co_occurences``.

        The file name always carries the larger index first; when
        ``spalte > zeile`` the stored matrix is transposed after loading.
        """
        # load the hdf coocurence block
        template = "co_occurence_{i}_{j}_with_diag.hdf5".format(i=max(zeile,spalte),j=min(zeile,spalte))
        file_path =  self.block_path + '\\' + template

        # The context manager closes the file even if reading fails.
        with h5py.File(file_path, "r") as tmp_hf:
            coocurrence = tmp_hf["co-ocurrence"][:]
        if (spalte > zeile):
            coocurrence = np.transpose(coocurrence)
        self.tf_co_occurences = tf.convert_to_tensor(coocurrence,dtype=tf.dtypes.float32)

    def load_weights(self):
        """Copy all four parameter datasets from the HDF5 file into TensorFlow variables."""
        self.tf_weights     = tf.Variable(initial_value=self.weights[:,:],dtype=tf.dtypes.float32)
        self.tf_con_weights = tf.Variable(initial_value=self.con_weights[:,:],dtype=tf.dtypes.float32)
        self.tf_bias        = tf.Variable(initial_value=self.bias[:,:],dtype=tf.dtypes.float32)
        self.tf_con_bias    = tf.Variable(initial_value=self.context_bias[:,:],dtype=tf.dtypes.float32)

    def save_weights(self):
        """Write the current values of the TensorFlow variables back to the HDF5 file."""
        self.context_bias[:,:] = self.tf_con_bias.numpy()
        self.bias[:,:] = self.tf_bias.numpy()
        self.con_weights[:,:] = self.tf_con_weights.numpy()
        self.weights[:,:] = self.tf_weights.numpy()

    def _close_files(self):
        """Close the HDF5 file and the CSV log; the log is closed even if the former fails."""
        try:
            self.f.close()
        finally:
            self.csv_writer.close()

    def inner_loss(self,weights,context_weights,bias_mat,con_bias_mat,co_occurences):
        """Weighted squared error between the model and ``log(co_occurences + 1)``.

        Args:
            weights: (vector_size, cols) word vectors.
            context_weights: (rows, vector_size) context vectors.
            bias_mat, con_bias_mat: bias terms; any shapes that broadcast
                against the (rows, cols) product are accepted.
            co_occurences: (rows, cols) co-occurrence counts.

        Returns:
            Scalar tensor: sum over all cells of
            ``cut_function2(X) * (b + b~ + W~ W - log(X + 1))**2``.
        """
        bias_terms = bias_mat + con_bias_mat
        weight_matrix = tf.matmul(context_weights,weights)
        log_X = tf.math.log(co_occurences + 1)
        summe = bias_terms + weight_matrix - log_X
        summe = tf.math.square(summe)
        summe = self.cut_function2(co_occurences) * summe
        reduced = tf.math.reduce_sum(summe)
        return reduced

    def loss(self,zeile,spalte,co_occurences):
        """Loss of block (zeile, spalte) for the given co-occurrence block.

        The parameter slices belonging to the block are cut out of the full
        variables. Slices of the last, possibly shorter, block are zero-padded
        up to ``block_length``.

        Args:
            zeile: block row index (selects context vectors / context bias).
            spalte: block column index (selects word vectors / bias).
            co_occurences: co-occurrence tensor of the block. ``None`` falls
                back to ``self.tf_co_occurences``. (The previous implementation
                ignored this argument and always used the attribute.)
        """
        if co_occurences is None:
            co_occurences = self.tf_co_occurences

        row_start = zeile * self.block_length
        col_start = spalte * self.block_length
        n_rows = min(self.block_length, self.vocab_length - row_start)
        n_cols = min(self.block_length, self.vocab_length - col_start)

        weights     = tf.slice(self.tf_weights,    (0,col_start), (-1,n_cols))
        con_weights = tf.slice(self.tf_con_weights,(row_start,0), (n_rows,-1))
        bias        = tf.slice(self.tf_bias,       (0,col_start), (-1,n_cols))
        con_bias    = tf.slice(self.tf_con_bias,   (row_start,0), (n_rows,-1))

        #just the words context
        missing_rows = self.block_length - n_rows
        if missing_rows > 0:
            con_weights = tf.concat([con_weights,tf.zeros((missing_rows,self.vector_size),dtype=tf.dtypes.float32)],axis = 0)
            con_bias    = tf.concat([con_bias,tf.zeros((missing_rows,1),dtype=tf.dtypes.float32)],axis = 0)

        #just the words without context
        missing_cols = self.block_length - n_cols
        if missing_cols > 0:
            weights = tf.concat([weights,tf.zeros((self.vector_size,missing_cols),dtype=tf.dtypes.float32)],axis = 1)
            bias    = tf.concat([bias,tf.zeros((1,missing_cols),dtype=tf.dtypes.float32)],axis = 1)

        # The (1, B) bias row and the (B, 1) context-bias column broadcast to
        # (B, B) inside inner_loss. That replaces the explicit multiplication
        # with a dense ones matrix and yields the same values.
        return self.inner_loss(weights,con_weights,bias,con_bias,co_occurences)

    def cut_function2(self,value):
        """Per-cell loss weight ``(clip(value, 0, XMAX) / XMAX) ** alpha``."""
        clipped = tf.clip_by_value(value, clip_value_min = 0.0, clip_value_max = self.XMAX)
        return tf.pow(clipped / self.XMAX, self.alpha)

    def load_optimizer(self,epoch,zeile,spalte,optimizer_factory,lr_schedule=None):
        """Return the optimizer for block (zeile, spalte).

        In epoch 0 a fresh optimizer is created via ``optimizer_factory.create()``;
        in later epochs it is unpickled from ``S://optimizer{zeile}-{spalte}``.

        Args:
            lr_schedule: optional object with ``get_lr(epoch)``. When given,
                the loaded optimizer's learning rate is set to that value.
                (The previous code read an undefined global ``lrOnPlato`` here.)

        NOTE(review): nothing in this class writes those pickle files and
        ``train_splitted`` does not call this method -- confirm it is still needed.
        """
        #load optimizer & blocks
        if(epoch == 0):
            return optimizer_factory.create()

        name = 'S://optimizer{z}-{s}'.format(z = zeile,s = spalte)
        # SECURITY: unpickling executes arbitrary code; only load files that
        # were written by a trusted training run.
        with open(name, "rb") as file:
            optimizer = cloudpickle.load(file)
        if lr_schedule is not None:
            optimizer.learning_rate.assign(lr_schedule.get_lr(epoch))
        return optimizer

    def _train_step(self,zeile,spalte,optimizer,use_grad_clipping):
        """Apply one optimizer update for block (zeile, spalte); return the loss as a float.

        Uses the co-occurrence block currently held in ``self.tf_co_occurences``.
        """
        variables = [self.tf_con_bias,self.tf_bias,self.tf_con_weights,self.tf_weights]
        with tf.GradientTape() as tape:
            block_loss = self.loss(zeile,spalte,self.tf_co_occurences)
        grads = tape.gradient(block_loss, variables)
        if use_grad_clipping:
            grads, _ = tf.clip_by_global_norm(grads, 5.0)
        optimizer.apply_gradients(zip(grads, variables))
        # Convert to a Python float so the epoch total is accumulated in double precision.
        return float(block_loss.numpy())

    def train_splitted(self,epochs,use_grad_clipping = False,learning_rate = 0.0025):
        """Train for ``epochs`` passes over all co-occurrence blocks.

        Only the block files with ``spalte <= zeile`` are loaded; every
        off-diagonal block is trained a second time in transposed form to
        cover the mirrored position. Weights are saved and a line is appended
        to the CSV log after each epoch.

        A new Adam optimizer is created on every call, so optimizer state is
        not carried over between calls. The HDF5 file and the CSV log are
        closed when this method returns or raises (including on interrupt),
        so the trainer must be prepared/resumed again before another call.

        Args:
            epochs: number of passes.
            use_grad_clipping: clip gradients to a global norm of 5.0.
            learning_rate: Adam learning rate (default matches the previously
                hard-coded 0.0025).
        """
        try:
            self.load_weights()
            optimizer = tf.keras.optimizers.Adam(learning_rate)

            for epoch in range(0,epochs):
                cur_loss = 0.0

                for zeile in range(self.amount_split):
                    for spalte in range(zeile + 1):
                        #train one side
                        self.load_block(zeile,spalte)
                        cur_loss += self._train_step(zeile,spalte,optimizer,use_grad_clipping)

                        #train the other side
                        if spalte != zeile:
                            self.tf_co_occurences = tf.transpose(self.tf_co_occurences)
                            cur_loss += self._train_step(spalte,zeile,optimizer,use_grad_clipping)

                self.save_weights()
                print('epoch: '+str(epoch)+" loss: "+str(int(cur_loss)))
                # Log the learning rate actually used (a literal 0.01 was logged
                # before). Presumably the second column is the learning rate --
                # TODO confirm against CSV_writer.write.
                self.csv_writer.write('ADAM',learning_rate,epoch+1,cur_loss)
        finally:
            self._close_files()
        return None

    

In [None]:
# Load the baseline vocabulary; its size is the vocab_length handed to
# ModelTrainer further down in this cell.
# NOTE(review): Vocabulary presumably comes from the star-import of the
# Vocabulary module in the first cell -- confirm.
import time
vocab = Vocabulary()
vocab.load('..\\vocabs\\baseline')
size = vocab.get_size()

class AdamFactory:
    """Factory producing Adam optimizers that all use one configured learning rate."""

    def __init__(self, lr = 0.001):
        # Learning rate passed to every optimizer built by create().
        self.lr = lr

    def create(self):
        """Build and return a new ``tf.keras.optimizers.Adam`` with the stored learning rate."""
        adam = tf.keras.optimizers.Adam(self.lr)
        return adam

    def optimiser_name(self):
        """Return the name of the optimizer this factory creates."""
        label = "Adam"
        return label
    
    
# Fresh training run: reset Keras state, create a new parameter file and
# train for 50 epochs without gradient clipping.
tf.keras.backend.clear_session()
trainer = ModelTrainer(size,"S:\\base_coocurrence_hdf_5000",vector_size=100)
# prepare() opens the HDF5 file in "w" mode and re-initialises all weights,
# so re-running this cell discards a previously trained
# "baseline_100with_diag" file.
trainer.prepare("baseline_100with_diag")

# NOTE(review): startTime is only consumed by the commented-out timing
# report at the end of this cell.
startTime = time.time()

trainer.train_splitted(50)

#executionTime = (time.time() - startTime)
#print('Execution time in seconds: ' + str(executionTime))


amout_split: 30
epoch: 0 loss: 242672797.66625977
epoch: 1 loss: 193726993.4523468
epoch: 2 loss: 79954536.0752716
epoch: 3 loss: 44063401.91394043
epoch: 4 loss: 49529985.22874451
epoch: 5 loss: 45480012.53930664
epoch: 6 loss: 48630405.97711182
epoch: 7 loss: 43281221.68469238
epoch: 8 loss: 39820151.885009766
epoch: 9 loss: 45538042.136276245
epoch: 10 loss: 44126636.58528137
epoch: 11 loss: 57533896.64096069
epoch: 12 loss: 41762438.60635376
epoch: 13 loss: 52482711.447769165
epoch: 14 loss: 43313104.673339844
epoch: 15 loss: 53841548.899002075
epoch: 16 loss: 41478727.251708984
epoch: 17 loss: 52357323.512786865
epoch: 18 loss: 41855400.25175476
epoch: 19 loss: 52442989.33869934
epoch: 20 loss: 40588292.60429382
epoch: 21 loss: 51423581.44065857
epoch: 22 loss: 40540153.1832428
epoch: 23 loss: 50958921.62113953
epoch: 24 loss: 39556664.31889343
epoch: 25 loss: 50255630.20196533
epoch: 26 loss: 39178726.31492615
epoch: 27 loss: 49513928.780685425
epoch: 28 loss: 38290981.85066223


In [14]:
# Resume training from an existing parameter file for 25 more epochs,
# this time with gradient clipping, and report the wall-clock time.
import time
vocab = Vocabulary()
vocab.load('..\\vocabs\\baseline')
size = vocab.get_size()

startTime = time.time()
# A new ModelTrainer is needed here: train_splitted() closes the HDF5 file
# and the CSV log when it finishes, so a used trainer cannot simply be reused.
#trainer2 = trainer
trainer2 = ModelTrainer(size,'S:\\base_coocurrence_hdf_5000',vector_size=100)
# NOTE(review): this resumes "baseline_100", whereas the cell that calls
# prepare() creates "baseline_100with_diag" -- confirm the intended file.
trainer2.resume("baseline_100")

# train_splitted() builds a new Adam optimizer on every call, so optimizer
# state is not carried over from earlier runs.
trainer2.train_splitted(25,use_grad_clipping=True)

executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

amout_split: 30
epoch: 0 loss: 32319290.13219452
epoch: 1 loss: 28063685.399000168
epoch: 2 loss: 23469092.35553837
epoch: 3 loss: 19526608.906079292
epoch: 4 loss: 16516468.600085258
epoch: 5 loss: 14581918.278337955
epoch: 6 loss: 13521506.73195362
epoch: 7 loss: 12942247.856032372
epoch: 8 loss: 12585711.294221401
epoch: 9 loss: 12340951.821409702
epoch: 10 loss: 12164295.586251736
epoch: 11 loss: 12029207.879078388
epoch: 12 loss: 11918260.349421978
epoch: 13 loss: 11826072.363986969
epoch: 14 loss: 11753940.630680084
epoch: 15 loss: 11693549.512440205
epoch: 16 loss: 11646015.226731777
epoch: 17 loss: 11607097.475978374
epoch: 18 loss: 11574080.647561073
epoch: 19 loss: 11546544.746124744
epoch: 20 loss: 11522990.712842941
epoch: 21 loss: 11504210.00441885
epoch: 22 loss: 11484754.35866785
epoch: 23 loss: 11472839.152806282
epoch: 24 loss: 11457424.354858875
Execution time in seconds: 2612.442837715149


In [15]:
# Resume training of "baseline_100" for another 10 epochs with gradient
# clipping and report the wall-clock time.
import time
vocab = Vocabulary()
vocab.load('..\\vocabs\\baseline')
size = vocab.get_size()

startTime = time.time()
# train_splitted() closes the HDF5 file and the CSV log when it finishes,
# hence a new ModelTrainer plus resume() instead of reusing an old trainer.
#trainer2 = trainer
trainer2 = ModelTrainer(size,'S:\\base_coocurrence_hdf_5000',vector_size=100)
trainer2.resume("baseline_100")

# Each call creates a new Adam optimizer; its state starts from scratch here.
trainer2.train_splitted(10,use_grad_clipping=True)

executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

amout_split: 30
epoch: 0 loss: 11431963.595276356
epoch: 1 loss: 11494556.348935604
epoch: 2 loss: 11478202.342249632
epoch: 3 loss: 11474417.381432056
epoch: 4 loss: 11457380.481976986
epoch: 5 loss: 11443283.10721922
epoch: 6 loss: 11429455.382554531
epoch: 7 loss: 11414459.488080502
epoch: 8 loss: 11404743.955230236
epoch: 9 loss: 11393610.03084898
Execution time in seconds: 1051.6223187446594
