In [1]:
import gensim.downloader as api
#dataset = api.load("text8")
import math
import numpy as np
import h5py
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  
import sys
sys.path.append("..")
import tensorflow as tf
from Vocabulary import *
import time
tf.keras.backend.clear_session()
import csv
import cloudpickle
from csv_writer import *

In [2]:
class lr_On_plato:
    """Reduce-on-plateau learning-rate schedule.

    Tracks the lowest loss reported so far and halves ``factor`` (the current
    learning rate) once the loss has been above that reference for more than
    ``patience`` epochs.

    Attributes are declared at class level and act as per-instance defaults;
    assign to them on an instance to customise the schedule.
    """
    # Lowest loss seen so far; -1 is the sentinel for "no loss reported yet".
    lowest_loss = -1
    # Epoch at which ``lowest_loss`` was recorded.
    lowest_time = 0
    # Number of epochs without improvement tolerated before the rate is halved.
    patience = 10
    # Current learning rate returned by ``get_lr``.
    factor = 0.005

    def notify_loss(self,loss,epoch):
        """Report the loss of a finished epoch and adapt the learning rate.

        Args:
            loss: Loss value of the epoch.
            epoch: Index of the epoch the loss belongs to.
        """
        if self.lowest_loss == -1 or loss < self.lowest_loss:
            # First report or a new best value: remember it as the reference.
            self.lowest_loss = loss
            self.lowest_time = epoch
        elif loss > self.lowest_loss and self.lowest_time + self.patience < epoch:
            # Plateau: the current (higher) loss becomes the new reference so
            # the patience window starts over, then the rate is halved.
            self.lowest_loss = loss
            self.lowest_time = epoch
            print("decreased LR")
            self.factor = self.factor * 0.5

    def get_lr(self,epoch):
        """Return the current learning rate; ``epoch`` is accepted but not used."""
        return self.factor
        

In [3]:
class ModelTrainer:
    """Block-wise trainer for word vectors learned from co-occurrence counts.

    The objective minimised per block is
    ``sum(f(X) * (bias + context_bias + context_weights @ weights - log(X + 1))**2)``
    with ``f(X) = (clip(X, 0, XMAX) / XMAX) ** alpha`` (this has the form of the
    GloVe objective). The parameters live in an HDF5 file, are loaded completely
    into TensorFlow variables for training, and are written back after every
    epoch. The co-occurrence matrix is read one block file at a time.

    ``zeile`` and ``spalte`` (German for row and column) are block indices into
    the co-occurrence matrix.
    """

    # Exponent and saturation threshold of the weighting function ``cut_function2``.
    alpha = tf.constant(0.75,dtype=tf.dtypes.float32)
    XMAX = tf.constant(100.0,dtype=tf.dtypes.float32)

    def __init__(self,vocab_length,block_path,vector_size = 300,block_length = 20000):
        """Store the training geometry.

        Args:
            vocab_length: Number of words in the vocabulary.
            block_path: Directory containing the ``co_occurence_{i}_{j}.hdf5`` files.
            vector_size: Dimensionality of the word vectors.
            block_length: Edge length of one co-occurrence block. It presumably
                has to match the value used when the block files were written
                - TODO confirm against the code that creates them.
        """
        self.vector_size = vector_size
        self.block_length = block_length
        self.amount_split = math.ceil(vocab_length/float(self.block_length))
        print('amout_split: ' + str(self.amount_split))
        self.block_path = block_path
        self.vocab_length = vocab_length

    @staticmethod
    def _weights_path(filename):
        """Return the path of the HDF5 parameter file of experiment ``filename``."""
        return 'S:\\{filename}.hdf5'.format(filename=filename)

    #start training for first time
    def prepare(self,filename):
        """Create a new parameter file and CSV log, then randomly initialise the parameters.

        The HDF5 file is opened in ``"w"`` mode, i.e. an existing file of the
        same name is overwritten.
        """
        self.f = h5py.File(self._weights_path(filename), "w")#plus experiment name
        #initalize all the HDF files
        self.con_weights = self.f.create_dataset("context-weights", (self.vocab_length, self.vector_size))
        self.weights = self.f.create_dataset("weights",(self.vector_size,self.vocab_length))
        self.context_bias = self.f.create_dataset("context-bias", (self.vocab_length,1))
        self.bias = self.f.create_dataset("bias", (1,self.vocab_length))
        self.csv_writer = CSV_writer(filename+".csv")

        self.init_matrices()

    #return to training
    def resume(self,filename):
        """Re-open an existing parameter file (``"r+"``) and append to its CSV log.

        Raises:
            KeyError: If one of the expected datasets is missing from the file.
        """
        self.f = h5py.File(self._weights_path(filename), "r+")#plus experiment name
        # Index access fails immediately on a missing dataset instead of
        # storing None and failing later inside load_weights.
        self.con_weights = self.f["context-weights"]
        self.weights = self.f["weights"]
        self.context_bias = self.f["context-bias"]
        self.bias = self.f["bias"]
        self.csv_writer = CSV_writer(filename+".csv",appendmode=True)

    def init_matrices(self,chunk_size=10000):
        """Randomly initialise all four parameter datasets, ``chunk_size`` rows/columns at a time."""
        for dataset in (self.weights,self.con_weights,self.context_bias,self.bias):
            self.init_hdf_matrix(dataset,-0.5,0.5,chunk_size)

    def init_hdf_matrix(self,hdf_data,min_value,max_value,block_length):
        """Fill a 2-D HDF5 dataset with ``uniform(min_value, max_value) / vector_size``.

        The dataset is written in chunks of at most ``block_length`` along its
        longer axis so the complete matrix never has to exist in memory.

        Args:
            hdf_data: 2-D dataset to overwrite.
            min_value: Lower bound of the uniform distribution (before scaling).
            max_value: Upper bound of the uniform distribution (before scaling).
            block_length: Maximum number of rows/columns written per chunk.
        """
        rows, cols = hdf_data.shape
        if rows > cols:
            for start in range(0,rows,block_length):
                stop = min(start + block_length,rows)
                hdf_data[start:stop, :] = np.random.uniform(min_value,max_value,(stop - start,cols))/self.vector_size
        else:
            for start in range(0,cols,block_length):
                stop = min(start + block_length,cols)
                hdf_data[:, start:stop] = np.random.uniform(min_value,max_value,(rows,stop - start))/self.vector_size

    def load_block(self,zeile,spalte):
        """Load co-occurrence block (``zeile``, ``spalte``) into ``self.tf_co_occurences``.

        Block files are named with the larger index first; when the block above
        the diagonal is requested the stored block is transposed.
        """
        template = "co_occurence_{i}_{j}.hdf5".format(i=max(zeile,spalte),j=min(zeile,spalte))
        file_path =  self.block_path + '\\' + template

        # The context manager closes the file even if reading fails.
        with h5py.File(file_path, "r") as tmp_hf:
            coocurrence = tmp_hf.get("co-ocurrence")[:]
        if (spalte > zeile):
            coocurrence = np.transpose(coocurrence)
        self.tf_co_occurences = tf.convert_to_tensor(coocurrence,dtype=tf.dtypes.float32)

    def load_weights(self):
        """Read all parameter datasets completely into TensorFlow variables."""
        self.tf_weights     = tf.Variable(initial_value=self.weights[:,:],dtype=tf.dtypes.float32)
        self.tf_con_weights =  tf.Variable(initial_value=self.con_weights[:,:],dtype=tf.dtypes.float32)
        self.tf_bias        = tf.Variable(initial_value=self.bias[:,:],dtype=tf.dtypes.float32)
        self.tf_con_bias    = tf.Variable(initial_value=self.context_bias[:,:],dtype=tf.dtypes.float32)

    def save_weights(self):
        """Write the current values of the TensorFlow variables back to the HDF5 datasets."""
        self.context_bias[:,:] = self.tf_con_bias.numpy()
        self.bias[:,:] = self.tf_bias.numpy()
        self.con_weights[:,:] = self.tf_con_weights.numpy()
        self.weights[:,:] = self.tf_weights.numpy()

    def _close_files(self):
        """Close the HDF5 parameter file and the CSV log."""
        try:
            self.f.close()
        finally:
            self.csv_writer.close()

    def inner_loss(self,weights,context_weights,bias_mat,con_bias_mat,co_occurences):
        """Return the weighted squared-error loss of one block as a scalar tensor.

        ``bias_mat`` and ``con_bias_mat`` are added elementwise, so they may be
        full matrices or a row and a column vector that broadcast against each
        other.
        """
        bias_terms = bias_mat + con_bias_mat
        weight_matrix = tf.matmul(context_weights,weights)
        log_X = tf.math.log(co_occurences + 1)
        inner_sum = bias_terms + weight_matrix - log_X
        squared_sum = tf.math.square(inner_sum)
        weighted_sum = self.cut_function2(co_occurences) * squared_sum
        reduced = tf.math.reduce_sum(weighted_sum)
        return reduced

    @staticmethod
    def _pad_zeros(tensor,missing,axis):
        """Append ``missing`` zero rows (axis 0) or columns (axis 1) to a 2-D tensor."""
        if missing <= 0:
            return tensor
        pad_shape = list(tensor.shape)
        pad_shape[axis] = missing
        return tf.concat([tensor,tf.zeros(pad_shape,dtype=tf.dtypes.float32)],axis = axis)

    def loss(self,zeile,spalte,co_occurences):
        """Return the loss of block (``zeile``, ``spalte``).

        Args:
            zeile: Block row; selects the slice of context weights/bias.
            spalte: Block column; selects the slice of word weights/bias.
            co_occurences: Co-occurrence block as a float tensor. ``None`` falls
                back to ``self.tf_co_occurences``. Assumed to be
                ``block_length x block_length`` (zero padded for the last
                block) - TODO confirm against the block writer.

        Returns:
            Scalar tensor with the weighted squared error of the block.
        """
        if co_occurences is None:
            co_occurences = self.tf_co_occurences

        row_start = zeile*self.block_length
        col_start = spalte*self.block_length
        rest_zeilen = math.ceil(self.vocab_length - row_start)
        rest_spalten = math.ceil(self.vocab_length - col_start)
        n_rows = min(self.block_length,rest_zeilen)
        n_cols = min(self.block_length,rest_spalten)

        weights     = tf.slice(self.tf_weights,(0,col_start)     , (-1,n_cols))
        con_weights = tf.slice(self.tf_con_weights,(row_start,0) , (n_rows,-1))
        bias        = tf.slice(self.tf_bias,(0,col_start)        , (-1,n_cols))
        con_bias    = tf.slice(self.tf_con_bias,(row_start,0)    , (n_rows,-1))

        # The last block of a row/column can be shorter than block_length;
        # pad the parameter slices with zeros up to the full block size.
        con_weights = self._pad_zeros(con_weights,self.block_length - n_rows,axis = 0)
        con_bias    = self._pad_zeros(con_bias,self.block_length - n_rows,axis = 0)
        weights     = self._pad_zeros(weights,self.block_length - n_cols,axis = 1)
        bias        = self._pad_zeros(bias,self.block_length - n_cols,axis = 1)

        # bias is a row vector and con_bias a column vector: their sum
        # broadcasts to the full block, so no block-sized ones matrix is needed.
        return self.inner_loss(weights,con_weights,bias,con_bias,co_occurences)

    def cut_function2(self,value):
        """Weighting function ``(clip(value, 0, XMAX) / XMAX) ** alpha``."""
        clipped = tf.clip_by_value(value, clip_value_min = 0.0, clip_value_max = self.XMAX)
        return tf.pow(clipped / self.XMAX, self.alpha)

    def load_optimizer(self,epoch,zeile,spalte,optimizer_factory,lr_schedule = None):
        """Return the optimizer for block (``zeile``, ``spalte``).

        In epoch 0 a new optimizer is created by ``optimizer_factory.create()``;
        in later epochs a pickled optimizer is loaded from disk and its
        learning rate is set from the schedule.

        Args:
            epoch: Current epoch index.
            zeile: Block row.
            spalte: Block column.
            optimizer_factory: Object with a ``create()`` method.
            lr_schedule: Object with a ``get_lr(epoch)`` method. When omitted,
                the module-level name ``lrOnPlato`` is used, which must then
                exist.
        """
        #load optimizer & blocks
        if(epoch == 0):
            return optimizer_factory.create()

        name = 'S://optimizer{z}-{s}'.format(z = zeile,s = spalte)
        # SECURITY: unpickling executes arbitrary code; only load files this
        # project wrote itself.
        with open(name, "rb") as file:
            optimizer = cloudpickle.load(file)
        if lr_schedule is None:
            lr_schedule = lrOnPlato
        optimizer.learning_rate.assign(lr_schedule.get_lr(epoch))
        return optimizer

    def _train_step(self,zeile,spalte,learning_rate,use_grad_clipping):
        """Run one gradient step on the loaded block and return its loss value."""
        variables = [self.tf_con_bias,self.tf_bias,self.tf_con_weights,self.tf_weights]
        # NOTE(review): a fresh Adam instance per step discards the moment
        # estimates after every update. Kept as-is to preserve the training
        # behaviour; consider reusing optimizers if that is not intended.
        optimizer = tf.keras.optimizers.Adam(learning_rate)#first hundret with 0.005 second hundret with 0.0025
        with tf.GradientTape() as tape:
            step_loss = self.loss(zeile,spalte,self.tf_co_occurences)
        grads = tape.gradient(step_loss, variables)
        if use_grad_clipping:
            grads, _ = tf.clip_by_global_norm(grads, 5.0)
        optimizer.apply_gradients(zip(grads, variables))
        return step_loss.numpy()

    def train_splitted(self,epochs,use_grad_clipping = False,learning_rate = 0.0025):
        """Train for ``epochs`` epochs over all co-occurrence blocks.

        Only blocks on or below the diagonal are read from disk; for every
        off-diagonal block the transposed block is trained right afterwards.
        After each epoch the parameters are saved and the summed loss is
        logged to the CSV file. The HDF5 file and the CSV log are closed when
        training ends, including when it is aborted by an exception or a
        keyboard interrupt.

        Args:
            epochs: Number of passes over all blocks.
            use_grad_clipping: Clip gradients to a global norm of 5.0 when true.
            learning_rate: Learning rate of the Adam optimizer.
        """
        try:
            self.load_weights()
            for epoch in range(0,epochs):
                cur_loss = float(0.0)

                for zeile in range(self.amount_split):
                    for spalte in range(zeile + 1):
                        #train one side
                        self.load_block(zeile,spalte)
                        print(zeile,spalte)
                        cur_loss += self._train_step(zeile,spalte,learning_rate,use_grad_clipping)

                        #train the other side
                        if spalte != zeile:
                            self.tf_co_occurences = tf.transpose(self.tf_co_occurences)
                            cur_loss += self._train_step(spalte,zeile,learning_rate,use_grad_clipping)

                self.save_weights()
                print('epoch'+str(epoch)+"loss:"+str(cur_loss))
                self.csv_writer.write('ADAM',learning_rate,epoch+1,cur_loss)
        finally:
            # Release the file handles even when training is interrupted.
            self._close_files()
        return None

    

In [4]:
# Load the stored vocabulary; its size is passed to ModelTrainer as vocab_length below.
vocab = Vocabulary()
vocab.load('..\\vocabs\\baseline')
size = vocab.get_size()

class AdamFactory:
    """Factory that builds Adam optimizers with a fixed learning rate."""

    def __init__(self, lr=0.001):
        # Learning rate handed to every optimizer produced by ``create``.
        self.lr = lr

    def create(self):
        """Return a new ``tf.keras.optimizers.Adam`` using the stored learning rate."""
        adam_cls = tf.keras.optimizers.Adam
        return adam_cls(self.lr)

    def optimiser_name(self):
        """Return the name of the optimizer type this factory creates."""
        return "Adam"
    
    
# Reset the Keras backend state, then start a new 50-dimensional training run.
tf.keras.backend.clear_session()
trainer = ModelTrainer(size,"S:\\base_coocurrence_hdf",vector_size=50)
# prepare() creates the HDF5 parameter file and CSV log for this experiment name.
trainer.prepare("baseline_50dV2")
# Train for a single epoch.
trainer.train_splitted(1)


amout_split: 8
0 0
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
tf.Tensor(183779380.0, shape=(), dtype=float32)
1 0
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
tf.Tensor(27176458.0, shape=(), dtype=float32)
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
tf.Tensor(27181954.0, shape=(), dtype=float32)
1 1
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
tf.Tensor(10090964.0, shape=(), dtype=float32)
2 0
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
tf.Tensor(16585179.0, shape=(), dtype=float32)
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)


KeyboardInterrupt: 

In [6]:
# Resume an existing experiment for one more epoch and measure the wall-clock time.
import time
vocab = Vocabulary()
vocab.load('..\\vocabs\\baseline')
size = vocab.get_size()

startTime = time.time()
trainer2 = ModelTrainer(size,'S:\\base_coocurrence_hdf',vector_size=50)
# NOTE(review): this resumes "baseline_50d", whereas the earlier cell prepared
# "baseline_50dV2" - confirm that the intended experiment file is being resumed.
trainer2.resume("baseline_50d")

# One epoch with gradient clipping enabled.
trainer2.train_splitted(1,use_grad_clipping=True)

executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

amout_split: 8
0 0
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
1 0
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
1 1
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
2 0
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
2 1
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
2 2
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
3 0
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
3 1
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
3 2
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
3 3
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
4 0
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
4 1
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
(50, 20000)
(20000, 50)
(1, 20000)
(20000, 1)
4 2
(50, 20000)
(