In [1]:
%run ../chap15/gan.ipynb
%run ../chap15/NLP_dataset.ipynb
import scipy

In [2]:
class NLP(Gan):
    def __init__(self, name, dataset, hconfigs, JJ=0, show_maps=False,
                 l2_decay=0, l1_decay=0, dump_structure=True,
                 word_vector_dimension=100, window_size=1, negative=5, load=0):
        """Store the embedding hyper-parameters, build the text dataset, then
        run the base-class constructor.

        Args:
            name, dataset, hconfigs, show_maps, l2_decay, l1_decay: forwarded
                positionally to the base-class ``__init__``.
            JJ, dump_structure: accepted but not used by this method.
            word_vector_dimension: width of each word vector.
            window_size: number of context words on each side of the target.
            negative: number of negative samples drawn per training example.
            load: when equal to 1, ``train`` loads saved parameters first.
        """
        # Hyper-parameters read by the allocation / forward / backward methods.
        self.word_vector_dimension = word_vector_dimension
        self.window_size = window_size
        self.negative = negative

        # Corpus statistics; the training ones are filled in by train().
        self.training_unique_words_number = 0
        self.training_full_text_number = 0
        self.new_words_number = 0
        self.new_full_text_number = 0

        # The dataset has to exist before the base constructor runs, because
        # init_parameters() reads self.text_dataset.
        # NOTE(review): presumably the base constructor calls
        # init_parameters() - confirm in gan.ipynb.
        self.text_dataset = corpus = NLPDataset(['initial'])
        corpus.text_find_number_words("training_text", 1)
        corpus.create_contexts_target(corpus.training_corpus, self.window_size)

        self.load = load
        super(NLP, self).__init__(name, dataset, hconfigs, show_maps,
                                  l2_decay, l1_decay)
        
    def train(self, epoch_count, batch_size, learning_rate, learning_decrease, restore, name):
        """Run the training loop and append one CSV log row per mini-batch.

        Args:
            epoch_count: number of epochs to run.
            batch_size: samples per mini-batch; also stored on self.batch_size.
            learning_rate: starting learning rate.
            learning_decrease: factor the learning rate is multiplied by at the
                start of every epoch (including the first one).
            restore: path of the CSV file that receives the per-batch log rows
                (opened in append mode).
            name: not used by this method.
        """
        corpus = self.text_dataset

        print("the number of words and the number of unique words")
        print(len(corpus.training_corpus), corpus.unique_words)
        self.training_full_text_number = len(corpus.training_corpus)
        self.training_unique_words_number = corpus.unique_words

        # Optionally resume from parameters written by save_parameters().
        if self.load == 1:
            self.load_parameters("dic")

        self.learning_rate = learning_rate
        self.n = 0
        self.batch_size = batch_size
        batch_count = int(corpus.train_count / batch_size)
        started_at = int(time.time())

        for epoch in range(epoch_count):
            self.epoch = epoch
            self.learning_rate = self.learning_rate * learning_decrease
            corpus.shuffle_train_data(batch_size * batch_count)

            for n in range(batch_count):
                self.n = n
                trX, trY = corpus.get_train_data(batch_size, n)
                self.train_step(trX, trY)

                # Row layout: 1-based epoch, batch index, return value of
                # self.test().  The file is reopened for every batch, so each
                # row is on disk as soon as the batch finishes.
                with open(restore, 'a', newline='') as log_file:
                    log_writer = csv.writer(log_file, delimiter=',')
                    log_writer.writerow([epoch + 1, n, self.test()])

            # Checkpoint every 10th epoch, skipping epoch 0.
            if epoch != 0 and epoch % 10 == 0:
                self.save_parameters("dic")

        tm_total = int(time.time()) - started_at
        print('Model {} train ended in {} secs:'.format(self.name, tm_total))
    
    def init_parameters(self, hconfigs):
        """Allocate parameters for every layer and store them in self.pm_hiddens.

        Resulting order of self.pm_hiddens: the input embedding layer, one
        entry per item of ``hconfigs``, then the output ("end") embedding layer.

        Args:
            hconfigs: hidden-layer configurations; each one is handed to
                ``alloc_layer_param`` together with the previous layer's width.
        """
        self.hconfigs = hconfigs
        self.pm_hiddens = []

        # Embedding weights: vocabulary size x vector dimension.
        vocab_size = len(self.text_dataset.word_to_id)
        embedding_pm = self.alloc_embedding_layer(vocab_size, self.word_vector_dimension)
        self.pm_hiddens.append(embedding_pm)

        # Width fed to the first hidden layer: one vector per context word,
        # with 2 * window_size context words.
        width = self.word_vector_dimension * 2 * self.window_size
        for hconfig in hconfigs:
            layer_pm, width = self.alloc_layer_param(width, hconfig)
            self.pm_hiddens.append(layer_pm)

        self.pm_hiddens.append(self.alloc_end_embedding_layer(width))
        
        
    def train_step(self, x, y):
        """Run one forward and backward pass on a mini-batch.

        Args:
            x: batch of inputs, passed to ``forward_neuralnet``.
            y: batch of targets, passed to ``forward_neuralnet``.

        Returns:
            The loss value returned by ``forward_neuralnet``.
        """
        self.is_training = True
        try:
            loss, aux_nn = self.forward_neuralnet(x, y)
            # The loss is the root of the backward pass, so its own gradient is 1.
            G_loss = 1.0
            self.backprop_neuralnet(G_loss, aux_nn)
        finally:
            # Always leave training mode, even when the step is interrupted or
            # raises, so later calls do not run with is_training stuck at True.
            self.is_training = False
        return loss
    
    def forward_neuralnet(self, x, y):
        """Forward pass: embedding layer, hidden layers, then the end embedding layer.

        Args:
            x: input batch handed to ``forward_embedding_layer``.
            y: targets handed to ``forward_end_embedding_layer``.

        Returns:
            A pair ``(output, [end_aux, aux_layers])`` where ``output`` is the
            first value returned by ``forward_end_embedding_layer`` and
            ``aux_layers`` holds the aux data of the embedding layer followed
            by that of each hidden layer.
        """
        hidden, layer_aux = self.forward_embedding_layer(x, None, self.pm_hiddens[0])
        aux_layers = [layer_aux]

        # Hidden layer k uses pm_hiddens[k]; slot 0 belongs to the embedding.
        for idx, hconfig in enumerate(self.hconfigs, start=1):
            hidden, layer_aux = self.forward_layer(hidden, hconfig, self.pm_hiddens[idx])
            aux_layers.append(layer_aux)

        output, end_aux = self.forward_end_embedding_layer(hidden, self.pm_hiddens[-1], y)
        return output, [end_aux, aux_layers]

    def backprop_neuralnet(self, G_output, aux):
        """Backward pass through the end embedding, hidden and embedding layers.

        Args:
            G_output: gradient flowing into the network output.
            aux: the ``[end_aux, aux_layers]`` list produced by
                ``forward_neuralnet``.

        Returns:
            Whatever ``backprop_embedding_layer`` returns.
        """
        aux_out, aux_layers = aux

        G_hidden = self.backprop_end_embedding_layer(G_output, self.pm_hiddens[-1], aux_out)

        # Hidden layer n lives at index n + 1 (index 0 is the embedding layer).
        for n in reversed(range(len(self.hconfigs))):
            G_hidden = self.backprop_layer(
                G_hidden, self.hconfigs[n], self.pm_hiddens[n + 1], aux_layers[n + 1])

        # The embedding layer has no hconfig: the forward pass hands it None and
        # backprop_embedding_layer ignores the argument.  Passing None here
        # (instead of self.hconfigs[0]) keeps both passes consistent and avoids
        # an IndexError when hconfigs is empty.
        return self.backprop_embedding_layer(G_hidden, None, self.pm_hiddens[0], aux_layers[0])
      
    def test(self, a='is', b='are'):
        """Print and return the cosine similarity of two word vectors.

        Args:
            a: first word; must be a key of ``text_dataset.word_to_id``.
            b: second word; must be a key of ``text_dataset.word_to_id``.

        Returns:
            The cosine similarity between the embedding rows of ``a`` and
            ``b``.  Returning it (in addition to printing) lets callers such as
            the training log record the value instead of ``None``.

        Raises:
            KeyError: if either word is missing from ``word_to_id``.
        """
        word_to_id = self.text_dataset.word_to_id
        embedding = self.pm_hiddens[0]['dic']
        vec_a = embedding[word_to_id[a]]
        vec_b = embedding[word_to_id[b]]

        norm_a = sum(vec_a * vec_a) ** 0.5
        norm_b = sum(vec_b * vec_b) ** 0.5
        similarity = np.dot(vec_a, vec_b) / (norm_a * norm_b)

        print(similarity)
        return similarity
        
    def predict_word(self,word_1):
        """Placeholder: not implemented yet; does nothing and returns None."""
        pass
    
    def alloc_embedding_layer(self, input_shape, hconfig):
        """Create the randomly initialised input embedding table.

        Args:
            input_shape: number of rows (or a shape whose product gives it).
            hconfig: layer config; its 'width' parameter gives the column
                count, and ``hconfig`` itself is passed as the fallback value
                to ``get_conf_param``.

        Returns:
            ``{'dic': weight}`` with a float32 matrix drawn from a normal
            distribution with mean 0 and standard deviation ``self.rand_std``.
        """
        row_count = np_cpu.prod(input_shape)
        col_count = get_conf_param(hconfig, 'width', hconfig)
        # Weights: vocabulary size x vector dimension.
        # NOTE(review): the dtype keyword requires `np` to be an array module
        # whose random.normal accepts it (e.g. CuPy) - confirm in gan.ipynb.
        table = np.random.normal(0, self.rand_std, [row_count, col_count], dtype='float32')
        return {'dic': table}

    def forward_embedding_layer(self, x, hconfig, pm):
        """Look up the embedding rows for ``x`` and flatten them per sample.

        Args:
            x: indices into the rows of ``pm['dic']``.
            hconfig: unused.
            pm: parameter dict holding the embedding table under 'dic'.

        Returns:
            ``(embedded, [x, embedded])`` where ``embedded`` is the gathered
            rows reshaped to ``(self.batch_size, -1)``.
        """
        # Gather and reshape once; the same array serves as the layer output
        # and as the aux entry (previously the lookup was computed twice).
        embedded = pm['dic'][x, :].reshape(self.batch_size, -1)
        return embedded, [x, embedded]

    def backprop_embedding_layer(self, G_y, hconfig, pm, aux):
        """Update the embedding table from the gradient of its flattened output.

        Args:
            G_y: gradient w.r.t. the layer output; sliced column-wise into one
                chunk of ``word_vector_dimension`` columns per context position.
            hconfig: unused.
            pm: parameter dict with the table under 'dic'; when None the
                gradient is returned untouched.
            aux: ``[x, y]`` as saved by ``forward_embedding_layer``; only ``x``
                is used.

        Returns:
            ``G_y`` when ``pm`` is None, otherwise 0.
        """
        if pm is None:
            return G_y

        x, _ = aux
        # One-hot rows for the batch indices, with all axes reversed so the
        # leading axis can be contracted against the batch axis of G_y.
        # NOTE(review): assumes x is (batch, 2 * window_size) - TODO confirm.
        one_hot_t = self.text_dataset.training_one_hot_word[x].transpose()
        dim = self.word_vector_dimension

        # One parameter update per context position.
        for pos in range(2 * self.window_size):
            grad_chunk = G_y[:, pos * dim:(pos + 1) * dim]
            G_weight = np.matmul(one_hot_t[:, pos, :], grad_chunk)
            self.update_param(pm, 'dic', G_weight)
        return 0
    
    def alloc_end_embedding_layer(self, input_shape):
        """Create the randomly initialised output ("end") embedding matrix.

        Args:
            input_shape: width of the incoming hidden vector (or a shape whose
                product gives it).

        Returns:
            ``{'dic_end': weight}`` with a float32 matrix of shape
            ``[input_cnt, vocabulary size]`` drawn from a normal distribution
            with mean 0 and standard deviation ``self.rand_std``.
        """
        row_count = np_cpu.prod(input_shape)
        vocab_size = len(self.text_dataset.word_to_id)
        # Weights: incoming width x vocabulary size.
        matrix = np.random.normal(0, self.rand_std, [row_count, vocab_size], dtype='float32')
        return {'dic_end': matrix}

    def forward_end_embedding_layer(self, x, pm, y):
        """Score the true targets and ``self.negative`` sampled words per example.

        Pass 0 uses the target ids ``y`` with an all-ones label; passes
        1..negative use one column of sampled ids each with an all-zeros
        label.  The per-pass loss is printed.

        Side effect: ``self.temp_pm_false`` is rebuilt as the list of column
        selections used in each pass (presumably consumed by ``update_param``
        - verify in the base class).

        Args:
            x: hidden activations multiplied with columns of ``pm['dic_end']``.
            pm: parameter dict holding the output matrix under 'dic_end'.
            y: target word ids, one per example.

        Returns:
            ``(loss, aux)`` where ``loss`` is the mean entropy of the LAST pass
            only and ``aux`` is ``[y_one_list, output_list, x, x.shape,
            temp_number_list, output.shape, y]``.
        """
        dataset = self.text_dataset

        # Draw `negative` word ids per example, redrawing the whole set until
        # it does not contain that example's target.
        temp_number_list = []
        for i in range(self.batch_size):
            while True:
                drawn = np.random.choice(dataset.np_arange_, size=self.negative,
                                         p=dataset.counting_list)
                candidates = [int(word_id) for word_id in drawn.tolist()]
                if y[i] not in candidates:
                    temp_number_list.append(candidates)
                    break

        output_list = []
        y_one_list = []
        self.temp_pm_false = []
        for i in range(self.negative + 1):
            is_positive = (i == 0)
            if is_positive:
                columns = np.array(y, dtype='int32').tolist()
            else:
                columns = np.array(temp_number_list, dtype='int32')[:, i - 1].tolist()

            output = np.matmul(x, pm['dic_end'][:, columns])
            # The positive pass records `y` itself, the others the id list.
            self.temp_pm_false.append(y if is_positive else columns)

            # NOTE(review): the sigmoid output is then passed to a function
            # named ..._with_logits - confirm this double application is
            # intended.
            output = self.activate(output, {'actfunc': 'sigmoid'})  # step 3
            if is_positive:
                y_one = np.ones(output.shape, dtype='float32')
            else:
                y_one = np.zeros(output.shape, dtype='float32')
            entropy = sigmoid_cross_entropy_with_logits(y_one, output)  # step 2
            loss = np.mean(entropy)  # step 1
            print('loss:', loss)
            output_list.append(output)
            y_one_list.append(y_one)

        aux = [y_one_list, output_list, x, x.shape, temp_number_list, output.shape, y]
        return loss, aux

    def backprop_end_embedding_layer(self, G_loss, pm, aux):
        """Backward pass of the end embedding layer.

        Walks the passes of ``forward_end_embedding_layer`` in reverse order
        (last negative sample first, positive targets last), updates
        ``pm['dic_end']`` once per pass and sums the input gradients.

        While ``update_param`` runs, ``self.i_i_i`` holds the current pass
        index and ``self.inter_stop`` is 1 (presumably read by
        ``update_param`` - verify in the base class).

        Args:
            G_loss: gradient of the loss.
            pm: parameter dict holding the output matrix under 'dic_end';
                when None the incoming gradient is returned untouched.
            aux: the list produced by ``forward_end_embedding_layer``.

        Returns:
            The summed input gradient reshaped to the forward input's shape,
            or ``G_loss`` when ``pm`` is None.
        """
        # The original returned an undefined name here (NameError); pass the
        # incoming gradient through, as backprop_embedding_layer does.
        if pm is None:
            return G_loss

        y_one_list, output_list, x, x_org_shape, temp_number_list, output_shape, y = aux
        shape = output_list[-1].shape

        # Loop-invariant pieces: gradient of the mean (step 1) scaled by the
        # incoming gradient, and the transposed layer input.
        G_entropy = (np.ones(shape) / np_cpu.prod(shape)) * G_loss
        g_affine_weight = x.transpose()

        G_input = None
        for i in reversed(range(self.negative + 1)):
            self.i_i_i = i
            g_entropy_output = sigmoid_cross_entropy_with_logits_derv(
                y_one_list[i], output_list[i])  # step 2
            G_affine = g_entropy_output * G_entropy
            G_affine = self.activate_derv(
                G_affine, output_list[i], {'actfunc': 'sigmoid'})  # step 3

            # Same column selection as in the forward pass: targets for pass
            # 0, the (i - 1)-th sampled id of every example otherwise.
            if i != 0:
                columns = np.array(temp_number_list, dtype='int32')[:, i - 1].tolist()
            else:
                columns = np.array(y, dtype='int32').tolist()
            g_affine_input = pm['dic_end'][:, columns].transpose()

            G_weight = np.matmul(g_affine_weight, G_affine)

            # Accumulate the input gradient over all passes.  (Original
            # author's note: this might be replaceable by the plain,
            # non-accumulating product.)
            contribution = np.matmul(G_affine, g_affine_input)
            G_input = contribution if G_input is None else G_input + contribution

            self.inter_stop = 1
            self.update_param(pm, 'dic_end', G_weight)
            self.inter_stop = 0
        return G_input.reshape(x_org_shape)
    
    def save_parameters(self, char):    # for word vector
        """Write the layer parameters to CSV files in the working directory.

        Files written (``<name>`` is ``self.name``):
            parameters_<name><char>.csv       one row holding self.hconfigs
            parameters_<name><i>_<char>_.csv  one file per entry of
                                              self.pm_hiddens

        Each per-layer file starts with the matrix shape, followed by one row
        per matrix row; hidden layers ('w') get one extra trailing row for
        the bias 'b'.  The layer kind is detected from the keys of its
        parameter dict ('dic', 'w' or 'dic_end') rather than from its
        position, so any number of hidden layers is handled.  Layers without
        one of these keys produce an empty file.

        Args:
            char: parameter-set tag; only 'dic' is supported.

        Raises:
            ValueError: if ``char`` is not 'dic' (raised before any file is
                touched).
        """
        if char != 'dic':
            raise ValueError(
                "save_parameters only supports char='dic', got {!r}".format(char))

        time1 = int(time.time())
        prefix = "parameters_" + self.name

        with open(prefix + char + ".csv", 'w', newline='') as aa:
            csv.writer(aa, delimiter=',').writerow(self.hconfigs)

        for i, pm in enumerate(self.pm_hiddens):
            with open(prefix + str(i) + '_' + char + "_.csv", 'w', newline='') as aa:
                writer = csv.writer(aa, delimiter=',')

                key = None
                if isinstance(pm, dict):
                    key = next((k for k in ('dic', 'w', 'dic_end') if k in pm), None)
                if key is None:
                    # Nothing this format knows how to store for this layer.
                    continue

                matrix = pm[key]
                meta = matrix.shape
                writer.writerow(meta)
                for k in range(meta[0]):
                    writer.writerow(matrix[k][:])

                # Hidden layers also carry a bias vector, stored as the last row.
                if key == 'w':
                    writer.writerow(pm['b'][:])

        time2 = int(time.time())
        print(time2-time1, "걸린시간(초)")

    def load_parameters(self, char):  # for word vector
        """Load layer parameters from the CSV files written by save_parameters.

        For every entry of self.pm_hiddens whose parameter dict contains
        'dic', 'w' or 'dic_end', the file
        ``parameters_<self.name><i>_<char>_.csv`` is read: the first row gives
        the matrix shape, the following ``shape[0]`` rows are the matrix, and
        for hidden layers ('w') one more row holds the bias 'b'.  The values
        replace the corresponding entries of the parameter dict as float32
        arrays.  Layers without one of these keys are skipped.

        Args:
            char: parameter-set tag; only 'dic' is supported.

        Raises:
            ValueError: if ``char`` is not 'dic', or if a file is empty or has
                fewer matrix rows than its header announces.
            FileNotFoundError: if a parameter file is missing.
        """
        if char != 'dic':
            raise ValueError(
                "load_parameters only supports char='dic', got {!r}".format(char))

        time1 = int(time.time())

        for i, pm in enumerate(self.pm_hiddens):
            key = None
            if isinstance(pm, dict):
                key = next((k for k in ('dic', 'w', 'dic_end') if k in pm), None)
            if key is None:
                continue

            path = "parameters_" + self.name + str(i) + '_' + char + "_.csv"
            with open(path, 'r', newline='') as aa:
                reader = csv.reader(aa, delimiter=',')

                header = next(reader, None)
                if header is None:
                    raise ValueError("parameter file {!r} is empty".format(path))
                row_count = int(header[0])

                # The header occupies the first row, so the matrix is the next
                # row_count rows.  zip() stops after row_count rows without
                # consuming the bias row that may follow.
                rows = [list(map(float, row))
                        for _, row in zip(range(row_count), reader)]
                if len(rows) != row_count:
                    raise ValueError(
                        "parameter file {!r} is truncated: expected {} rows, found {}"
                        .format(path, row_count, len(rows)))
                pm[key] = np.array(rows, dtype='float32')

                if key == 'w':
                    print(pm['w'].shape)
                    # A missing bias row leaves the current bias untouched.
                    bias_row = next(reader, None)
                    if bias_row is not None:
                        pm['b'] = np.array(list(map(float, bias_row)), dtype='float32')

        time2 = int(time.time())
        print(time2-time1, "걸린시간(초)")
                    

 