In [1]:
import math
import numpy as np
import h5py
import os
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  
import sys
sys.path.append("..")
import tensorflow as tf
from Vocabulary import *
import time
tf.keras.backend.clear_session()
import csv
import cloudpickle
from csv_writer import *
import random

import threading, queue

In [2]:
class ModelTrainer:
    """Block-wise trainer for GloVe-like word embeddings.

    The parameters (word weights, context weights and the two bias vectors)
    are stored in an HDF5 file and mirrored in memory as one ``tf.Variable``
    per vocabulary block of ``block_length`` words.  The co-occurrence counts
    are read block by block from ``tf_cooccurence_{i}_{j}.hdf`` files found in
    ``block_path``; only blocks with ``i >= j`` are read from disk, the other
    half is obtained by transposing.
    """

    # Exponent and saturation point of the co-occurrence weighting (see scale_fn).
    alpha = tf.constant(0.75,dtype=tf.dtypes.float32)
    XMAX = tf.constant(100.0,dtype=tf.dtypes.float32)

    def __init__(self,vocab_length,block_path,vector_size = 100):
        """Store the problem dimensions; no files are touched here.

        Args:
            vocab_length: number of words in the vocabulary.
            block_path: directory containing the co-occurrence block files.
            vector_size: dimensionality of the embedding vectors.
        """
        self.vector_size = vector_size
        # Number of vocabulary entries per block; presumably has to match the
        # block size used when the co-occurrence files were written - TODO confirm.
        self.block_length = 5000
        self.amount_split = math.ceil(vocab_length/float(self.block_length))
        print('amout_split: ' + str(self.amount_split))
        self.block_path = block_path
        self.vocab_length = vocab_length
        self.optimizer = None
        # Per-instance hand-over queue between the prefetch thread and the
        # training loop (a class-level queue would be shared by all trainers).
        self.file_que = queue.Queue()
        self.thread = None

    def prepare(self,basepath,experiment_name):
        """Create the HDF5 parameter file and the CSV log for a fresh training run.

        The file ``<basepath>//<experiment_name>.hdf5`` is opened in "w" mode,
        i.e. an existing file of that name is overwritten.
        """
        self.basepath = basepath
        self.experiment_name = experiment_name
        self.f = h5py.File(basepath + '//{filename}.hdf5'.format(filename=experiment_name), "w")
        # One dataset per parameter group; weights/bias are stored column-wise,
        # the context counterparts row-wise.
        self.con_weights = self.f.create_dataset("context-weights", (self.vocab_length, self.vector_size))
        self.weights = self.f.create_dataset("weights",(self.vector_size,self.vocab_length))
        self.context_bias = self.f.create_dataset("context-bias", (self.vocab_length,1))
        self.bias = self.f.create_dataset("bias", (1,self.vocab_length))
        self.csv_writer = CSV_writer(basepath,experiment_name+".csv")

        self.init_matrices()

    def init_matrices(self,chunk_size=10000):
        """Randomly initialise all four parameter datasets, ``chunk_size`` rows/columns at a time."""
        for dataset in (self.weights, self.con_weights, self.context_bias, self.bias):
            self.init_hdf_matrix(dataset,-0.5,0.5,chunk_size)

    def init_hdf_matrix(self,hdf_data,min_value,max_value,block_length):
        """Fill a 2-D dataset with ``np.random.rand(...) / vector_size`` in slabs.

        The dataset is walked along its longer axis in slabs of ``block_length``
        so that the full matrix never has to be materialised in memory.

        NOTE(review): ``min_value`` and ``max_value`` are accepted but not used;
        the values are drawn from [0, 1/vector_size).  Left unchanged because
        altering the initialisation would change training results.
        """
        # Read the shape once instead of loading a row from disk per iteration.
        n_rows, n_cols = hdf_data.shape
        if n_rows > n_cols:
            for start in range(0, n_rows, block_length):
                stop = min(start + block_length, n_rows)
                hdf_data[start:stop, :] = np.random.rand(stop - start, n_cols)/self.vector_size
        else:
            for start in range(0, n_cols, block_length):
                stop = min(start + block_length, n_cols)
                hdf_data[:, start:stop] = np.random.rand(n_rows, stop - start)/self.vector_size

    def block_file_path(self,zeile,spalte):
        """Return the path of the file holding block (zeile, spalte).

        Only the lower triangle is addressed: the larger index always comes
        first in the file name.
        """
        larger, smaller = max(zeile, spalte), min(zeile, spalte)
        template = "tf_cooccurence_{i}_{j}.hdf".format(i=larger,j=smaller)
        # os.sep equals the previously hard-coded '\\' on Windows and keeps the
        # path valid on other platforms.
        return self.block_path + os.sep + template

    def _read_block(self,zeile,spalte):
        """Read one co-occurrence block from disk and return it as a float32 tensor."""
        file_path = self.block_file_path(zeile,spalte)
        # The context manager closes the file even if the read fails.
        with h5py.File(file_path, "r") as block_file:
            coocurrence = block_file.get("co-ocurrence")[:]
        if spalte > zeile:
            # The file stores block (spalte, zeile); flip it into the requested orientation.
            coocurrence = np.transpose(coocurrence)
        return tf.convert_to_tensor(coocurrence,dtype=tf.dtypes.float32)

    def load_block(self,zeile,spalte):
        """Synchronously load block (zeile, spalte) into ``self.tf_co_occurences``."""
        self.tf_co_occurences = self._read_block(zeile,spalte)

    def load_block_async(self,zeile,spalte):
        """Start a background thread that loads block (zeile, spalte)."""
        self.thread = threading.Thread(target=self.thread_load,args=(zeile,spalte))
        self.thread.start()

    def get_block_async(self):
        """Wait for the background load and install its result as the current block.

        Raises:
            Exception: whatever the loader thread raised while reading the block.
        """
        self.thread.join()
        result = self.file_que.get()
        if isinstance(result, Exception):
            raise result
        self.tf_co_occurences = result

    def thread_load(self,zeile,spalte):
        """Thread target: read a block and hand it over through ``file_que``.

        A failure is queued as well, so that ``get_block_async`` re-raises it
        instead of waiting forever on an empty queue.
        """
        try:
            result = self._read_block(zeile,spalte)
        except Exception as error:
            result = error
        self.file_que.put(result)

    def _discard_pending_blocks(self):
        """Drop prefetched blocks left over from an interrupted training run."""
        if self.thread is not None:
            self.thread.join()
        while True:
            try:
                self.file_que.get_nowait()
            except queue.Empty:
                return

    def _block_slices(self):
        """Yield ``(block_index, slice)`` pairs covering the vocabulary axis."""
        iterations = math.ceil(self.vocab_length/self.block_length)
        for block in range(iterations):
            # The last slice may reach past the vocabulary end; slicing clips it.
            yield block, slice(block * self.block_length, (block+1) * self.block_length)

    def load_weights(self):
        """Copy the HDF5 parameters into per-block ``tf.Variable`` lists."""
        self.tf_weights, self.tf_con_weights, self.tf_bias, self.tf_con_bias = [], [], [], []
        for _, span in self._block_slices():
            self.tf_weights.append(tf.Variable(initial_value=self.weights[:,span],dtype=tf.dtypes.float32))
            self.tf_con_weights.append(tf.Variable(initial_value=self.con_weights[span,:],dtype=tf.dtypes.float32))
            self.tf_bias.append(tf.Variable(initial_value=self.bias[:,span],dtype=tf.dtypes.float32))
            self.tf_con_bias.append(tf.Variable(initial_value=self.context_bias[span,:],dtype=tf.dtypes.float32))

    def save_weights(self):
        """Write the current values of all per-block variables back to the HDF5 file."""
        for block, span in self._block_slices():
            self.weights[:,span] = self.tf_weights[block].numpy()
            self.context_bias[span,:] = self.tf_con_bias[block].numpy()
            self.bias[:,span] = self.tf_bias[block].numpy()
            self.con_weights[span,:] = self.tf_con_weights[block].numpy()

    def _close_files(self):
        """Close the HDF5 parameter file and the CSV log."""
        self.f.close()
        self.csv_writer.close()

    def inner_loss(self,weights,context_weights,bias_mat,con_bias_mat,co_occurences):
        """Weighted squared error of one block.

        Computes ``sum(scale_fn(X) * (C @ W + b + b_c - log(X + 1))**2)`` where
        ``X`` is ``co_occurences``.
        """
        bias_terms = bias_mat + con_bias_mat
        weight_matrix = tf.matmul(context_weights,weights)
        log_X = tf.math.log(co_occurences + 1)
        residual = bias_terms + weight_matrix - log_X
        weighted = self.scale_fn(co_occurences) * tf.math.square(residual)
        return tf.math.reduce_sum(weighted)

    def loss(self,zeile,spalte,weights,context_weights,bias,con_bias,co_occurences):
        """Loss of block (zeile, spalte).

        Parameters belonging to the last vocabulary block are zero-padded up to
        ``block_length`` before the loss is evaluated.
        Assumes ``co_occurences`` is ``block_length x block_length`` (the shape
        the padded operands are combined with) - TODO confirm against the block files.

        Args:
            zeile: block index of the context words (rows).
            spalte: block index of the words (columns).
            weights, bias: word parameters of block ``spalte``.
            context_weights, con_bias: context parameters of block ``zeile``.
            co_occurences: co-occurrence tensor of the block.
        """
        ones_symetrical = tf.ones((self.block_length,self.block_length), dtype=tf.dtypes.float32)
        last_block = self.amount_split - 1

        # Context side (rows).
        if zeile == last_block:
            missing = self.block_length - con_bias.shape[0]
            context_weights = tf.concat(
                [context_weights, tf.zeros((missing,self.vector_size),dtype=tf.dtypes.float32)],axis = 0)
            con_bias = tf.concat(
                [con_bias, tf.zeros((missing,1),dtype=tf.dtypes.float32)],axis = 0)
        # Multiplying by the ones matrix expands the column vector to a full block.
        con_bias_mat = con_bias * ones_symetrical

        # Word side (columns).
        if spalte == last_block:
            missing = self.block_length - bias.shape[1]
            weights = tf.concat(
                [weights, tf.zeros((self.vector_size,missing),dtype=tf.dtypes.float32)],axis = 1)
            bias = tf.concat(
                [bias, tf.zeros((1,missing),dtype=tf.dtypes.float32)],axis=1)
        bias_mat = bias * ones_symetrical

        # Use the tensor handed in by the caller rather than silently reading
        # self.tf_co_occurences.
        return self.inner_loss(weights,context_weights,bias_mat,con_bias_mat,co_occurences)

    def scale_fn(self,value):
        """Weighting ``(min(x, XMAX) / XMAX) ** alpha`` with negative inputs clipped to 0."""
        clipped = tf.clip_by_value(value, clip_value_min = 0.0, clip_value_max=self.XMAX)
        return tf.pow(clipped / self.XMAX, self.alpha)

    def _train_step(self,zeile,spalte):
        """Run one optimizer step on block (zeile, spalte) using the currently loaded
        co-occurrence tensor and return the loss measured before the update."""
        variables = [self.tf_weights[spalte],self.tf_con_weights[zeile],
                     self.tf_bias[spalte],self.tf_con_bias[zeile]]
        with tf.GradientTape() as tape:
            step_loss = self.loss(zeile,spalte,variables[0],variables[1],
                                  variables[2],variables[3],self.tf_co_occurences)
        # Gradients are taken after leaving the tape context so the optimizer
        # update itself is not recorded.
        grads = tape.gradient(step_loss, variables)
        self.optimizer.apply_gradients(zip(grads, variables))
        return step_loss.numpy()

    def train_splitted(self,epochs,use_grad_clipping = False):
        """Train for ``epochs`` passes over all co-occurrence blocks.

        On the first call the Adagrad optimizer is created and the weights are
        loaded from the HDF5 file; later calls continue with the existing
        optimizer state and in-memory variables.  After every epoch the weights
        are written back to the HDF5 file and the accumulated loss is logged.

        Args:
            epochs: number of passes over the data.
            use_grad_clipping: only evaluated on the first call; if true the
                optimizer is created with ``clipvalue=100.0``.
        """
        if self.optimizer is None:
            if use_grad_clipping:
                self.optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.01,clipvalue=100.0)
            else:
                self.optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.01)
            self.load_weights()

        # A previously interrupted run may have left a prefetched block behind;
        # consuming it now would pair it with the wrong block indices.
        self._discard_pending_blocks()

        for epoch in range(epochs):
            cur_loss = 0.0

            # Lower triangle incl. diagonal, visited in random order.
            block_list = [(x,y) for x in range(self.amount_split) for y in range(self.amount_split) if x >= y]
            random.shuffle(block_list)
            last_index = len(block_list) - 1

            for index,(zeile,spalte) in enumerate(block_list):
                if index == 0:
                    self.load_block(zeile,spalte)
                else:
                    self.get_block_async()
                # Prefetch the following block while this one is trained on;
                # guarded so that a single-block vocabulary does not index past the list.
                if index < last_index:
                    next_zeile, next_spalte = block_list[index+1]
                    self.load_block_async(next_zeile,next_spalte)

                cur_loss += self._train_step(zeile,spalte)

                # Off-diagonal blocks are also trained in the mirrored orientation.
                if spalte != zeile:
                    self.tf_co_occurences = tf.transpose(self.tf_co_occurences)
                    cur_loss += self._train_step(spalte,zeile)

            self.save_weights()
            print('epoch: '+str(epoch)+" loss: "+str(int(cur_loss)))
            # NOTE(review): the logged 0.5 does not match the optimizer's
            # learning_rate of 0.01 - confirm what this CSV column is meant to hold.
            self.csv_writer.write('Adagrad',0.5,epoch+1,cur_loss)
        return None

    

In [3]:
import time
# Load the vocabulary; its size determines how many blocks the trainer uses.
vocab = Vocabulary()
vocab.load('..\\vocabs\\m_base')
size = vocab.get_size()

In [None]:
# Prefix shared by the HDF5/CSV files written by the trainer and by the
# embedding files exported in the cells below.
experiment_name = "m_base2021_200d_nodiag_"

tf.keras.backend.clear_session()
# NOTE(review): absolute, machine-specific paths - adjust before running elsewhere.
trainer = ModelTrainer(size,"E:\\tmp\\hdf_m",vector_size=200)
# prepare() opens the HDF5 file in "w" mode, so re-running this cell starts from scratch.
trainer.prepare('E:\\',experiment_name+"100epochs")

startTime = time.time()

# First 75 epochs; a later cell continues training on the same trainer.
trainer.train_splitted(75)

executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

amout_split: 53
epoch: 0 loss: 68890010
epoch: 1 loss: 26879963
epoch: 2 loss: 21726421
epoch: 3 loss: 19478835
epoch: 4 loss: 18208734
epoch: 5 loss: 17392871
epoch: 6 loss: 16845361
epoch: 7 loss: 16444423
epoch: 8 loss: 16154009
epoch: 9 loss: 15932526
epoch: 10 loss: 15761194
epoch: 11 loss: 15629605
epoch: 12 loss: 15523103
epoch: 13 loss: 15436366
epoch: 14 loss: 15365067
epoch: 15 loss: 15306995
epoch: 16 loss: 15256572
epoch: 17 loss: 15214546
epoch: 18 loss: 15176938
epoch: 19 loss: 15144941
epoch: 20 loss: 15114967
epoch: 21 loss: 15089505
epoch: 22 loss: 15065423
epoch: 23 loss: 15043323
epoch: 24 loss: 15023289
epoch: 25 loss: 15004226
epoch: 26 loss: 14985376
epoch: 27 loss: 14968365
epoch: 28 loss: 14951127
epoch: 29 loss: 14934990
epoch: 30 loss: 14918041
epoch: 31 loss: 14902358
epoch: 32 loss: 14885689
epoch: 33 loss: 14869042
epoch: 34 loss: 14851806
epoch: 35 loss: 14833889
epoch: 36 loss: 14815207
epoch: 37 loss: 14795444
epoch: 38 loss: 14773926
epoch: 39 loss: 147

In [None]:
# Export word + context vectors (summed) after the first 75 epochs.
# Output format: one line per word, "<word> <c1> <c2> ...".
weights = trainer.f.get('weights')[:]
context_weights = trainer.f.get('context-weights')[:]

list_of_words = vocab.id2Word
print(weights.shape)
# context-weights are stored row-wise, weights column-wise: align before adding.
matrix = weights + np.transpose(context_weights)
with open('..//embeddings//'+experiment_name+'_75e_wc','w+',encoding='utf8') as file:
    for index,word in enumerate(list_of_words):
        vector = matrix[:,index]
        coordinates = ''.join(' '+str(coord) for coord in vector)
        file.write(word + coordinates + '\n')

In [None]:
# Export the word vectors only (no context vectors) after the first 75 epochs.
# Output format: one line per word, "<word> <c1> <c2> ...".
weights = trainer.f.get('weights')[:]
context_weights = trainer.f.get('context-weights')[:]

list_of_words = vocab.id2Word
print(weights.shape)
matrix = weights
with open('..//embeddings//'+experiment_name+'_75e_w','w+',encoding='utf8') as file:
    for index,word in enumerate(list_of_words):
        vector = matrix[:,index]
        coordinates = ''.join(' '+str(coord) for coord in vector)
        file.write(word + coordinates + '\n')

Dann nochmal 25 Epochen trainieren (insgesamt 100)!

In [None]:
# Continue training on the same trainer: 25 more epochs on top of the 75 above.
trainer.train_splitted(25)

In [None]:
# Export word + context vectors (summed) after the additional training epochs.
# Output format: one line per word, "<word> <c1> <c2> ...".
weights = trainer.f.get('weights')[:]
context_weights = trainer.f.get('context-weights')[:]

list_of_words = vocab.id2Word
print(weights.shape)
# context-weights are stored row-wise, weights column-wise: align before adding.
matrix = weights + np.transpose(context_weights)
with open('..//embeddings//'+experiment_name+'_100e_wc','w+',encoding='utf8') as file:
    for index,word in enumerate(list_of_words):
        vector = matrix[:,index]
        coordinates = ''.join(' '+str(coord) for coord in vector)
        file.write(word + coordinates + '\n')

In [None]:
# Export the word vectors only (no context vectors) after the additional training epochs.
# Output format: one line per word, "<word> <c1> <c2> ...".
weights = trainer.f.get('weights')[:]
context_weights = trainer.f.get('context-weights')[:]

list_of_words = vocab.id2Word
print(weights.shape)
matrix = weights
with open('..//embeddings//'+experiment_name+'_100e_w','w+',encoding='utf8') as file:
    for index,word in enumerate(list_of_words):
        vector = matrix[:,index]
        coordinates = ''.join(' '+str(coord) for coord in vector)
        file.write(word + coordinates + '\n')

In [None]:
# Close the HDF5 parameter file and the CSV log; trainer.f cannot be read afterwards.
trainer._close_files()

In [None]:
# Audible signal that the notebook has finished (winsound is Windows-only).
import winsound
winsound.Beep(440, 500)