In [1]:
"""数据导入"""

import re
filename =open('data0502.txt','r',encoding='utf-8')        #打开数据文件

text = filename.read()        #将数据读取到字符串text中
text = ' '.join(re.split(' |\t|\v',text))        #将数据中的空格符统一，便于后期处理(原始数据中空格符包含\t、\v等)   
text = re.split('([: ,.\n(){}\[\]=])',text)        #将字符串数据按照括号中的符号进行分割，分割成列表格式，并且在列表中保留分隔符

text = list(filter(lambda x: x!=' 'and x!='',text))        #将列表中的空格和非空格筛选掉
list_text = text        #保留一份列表格式的数据
text = ' '.join(text)        #将列表转换成字符串

In [2]:
"""文本词频统计"""

def word_count(list_text):        #定义计算文本词频的函数，传入list_text列表
    import collections
    word_freq = collections.defaultdict(int)        #定义一个int型的词频词典，并提供默认值
    for w in list_text:        #遍历列表中的元素，元素出现一次，频次加一
        word_freq[w] += 1
    return word_freq        #返回词频词典
    
    #return word_freq.items()   该语句返回值的类型为list（这句话有语法问题，不必考虑）

In [3]:
"""根据text文本创建代码词词典"""

def build_dict(text, min_word_freq=50):
    word_freq = word_count(text)         #文本词频统计，返回一个词频词典
    word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())          # filter将词频数量低于指定值的代码词删除。
    word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))         # key用于指定排序的元素，因为sorted默认使用list中每个item的第一个元素从小到大排列，所以这里通过lambda进行前后元素调序，并对词频去相反数，从而将词频最大的排列在最前面
    words, _ = list(zip(*word_freq_sorted))         #获取每一个代码词
    word_idx = dict(zip(words, range(len(words))))         #构建词典（不包含词频）
    word_idx['<unk>'] = len(words)         #unk表示unknown，未知单词
    return words         #这里只返回了words，倒数两行代码还用不上。返回的是一个不含重复的代码词词典，不包含词频。

In [4]:
"""数据预处理-字符串序列向量化"""

import numpy as np
import keras

maxlen = 50         #提取50个代码词组成的序列
step = 5         #每5个代码词采样一个新序列
sentences = []         #保存所提取的序列
next_words = []         #保存目标代码词

cut_words = list_text         #将列表形式的元数据保存在cut_words中
for i in range(0,len(cut_words) - maxlen,step):
    sentences.append(cut_words[i:i + maxlen])         #将元数据按照步长来存储在每个序列中       
    next_words.append(cut_words[i + maxlen])         #将目标代码词存储在next_words中
    
    
print('Number of sequences:', len(sentences))


words = list(build_dict(list_text,0))         #创建代码词词典，返回的是一个不含重复的代码词词典，不包含词频。
print('Unique words:',len(words))
#print(words)

word_indices = dict((word,words.index(word)) for word in words)         #创建一个包含代码词唯一索引的代码词词典，返回的是一个字典
#print(word_indices)

print('Vectorization...')
x = np.zeros((len(sentences),maxlen))         #初始化x
y = np.zeros((len(sentences)))         #初始化y
for i,sentence in enumerate(sentences):
    for t,word in enumerate(sentence):
        x[i,t] = word_indices[word]         #将代码词转换成向量形式的编码
    y[i] = word_indices[next_words[i]]

y = keras.utils.to_categorical(y, len(words))         #将int型数组y转换成one-hot编码

"""说明一下，为什么要把x转换成向量形式、而非one-hot编码，而把y转换成one-hot
x之所以转换成向量形式，是为了便于将x输入到embedding层中。
one-hot格式的x无法直接作为embedding层的输入（在我们这个模型中是不行的，其他模型就不一定了）。
y之所以转换成one-hot，是因为后面模型训练的时候，y必须是one-hot编码格式
其实全用one-hot编码也可以，只不过要转换格式才能导入到embedding层中，我觉得比较麻烦就没有这样做
"""

Using TensorFlow backend.


Number of sequences: 104432
Unique words: 5573
Vectorization...


'说明一下，为什么要把x转换成向量形式、而非one-hot编码，而把y转换成one-hot\nx之所以转换成向量形式，是为了便于将x输入到embedding层中。\none-hot格式的x无法直接作为embedding层的输入（在我们这个模型中是不行的，其他模型就不一定了）。\ny之所以转换成one-hot，是因为后面模型训练的时候，y必须是one-hot编码格式\n其实全用one-hot编码也可以，只不过要转换格式才能导入到embedding层中，我觉得比较麻烦就没有这样做\n'

In [5]:
"""定义下一个代码词的采样函数---temperature越大，代码生成的创造性越强---"""

def sample(preds,temperature=0.1):         #《python深度学习》文本生成那一章介绍过temperature了，不明白的自行翻书
    preds = np.asarray(preds).astype('float')
    preds = np.log(preds) /temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1,preds,1)
    return np.argmax(probas)

In [6]:
"""将字符串写到指定文件中"""

def save(filename, contents): 
      file = open(filename, 'a', encoding='utf-8')
      file.write(contents)
      file.close()

In [None]:
"""上面属于公共部分的代码，把它们提取出来了"""

In [None]:
----------------------------------------------------------------我是分割线------------------------------------------------------------------------------------

In [7]:
"""模型尝试一：yk_model_local_gpu-0507-01：Embedding + 单层LSTM(加入dropout)"""

import keras
from keras import layers
from keras.layers import LSTM, Dense, Dropout

def create_model(words):         #定义创建模型的函数
    model = keras.models.Sequential()         #模型初始化
    model.add(layers.Embedding(len(words),128))         #模型第一层为embedding层
    model.add(layers.LSTM(128,dropout=0.2,recurrent_dropout=0.2))         #模型第二层为LSTM层，加入dropout减少过拟合
    model.add(layers.Dense(len(words),activation='softmax'))         #模型第三层为全连接层

    optimizer = keras.optimizers.RMSprop(lr=0.003)         #定义优化器
    model.compile(loss='categorical_crossentropy',optimizer=optimizer)         #模型编译
    
    return model

In [8]:
"""创建模型实例"""
model = create_model(words)         #创建模型
model.summary()         #打印模型结构

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         713344    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 5573)              718917    
Total params: 1,563,845
Trainable params: 1,563,845
Non-trainable params: 0
_________________________________________________________________


In [9]:
"""模型保存"""
from keras.callbacks import ModelCheckpoint
filepath = "yk_model_local_gpu-0507-01.hdf5"         #尽量将模型名字和前面的标题统一，这样便于查找
checkpoint = ModelCheckpoint(filepath, save_weights_only=False,verbose=1,save_best_only=False, period=1)         #回调函数，实现断点续训功能

In [11]:
"""训练模型"""

#这一部分的代码我改的比较多，看的时候要稍微耐心点，有看不懂的地方可以随时问我

import random
import sys
import os

strings =''        #将生成的代码保存下来，一轮epoch结束后，将生成的代码写入到文件中

#mark、last_word、start_gen定义的目的是为了让最终的生成的代码符合标准代码的格式要求----简言之就是该空格的地方空格，不该的就不空格
mark = '.,()[]:{}\n'        #将后面不需要空格的元素保存在字符串中
last_word = ''
start_gen=''

for epoch in range(0,50):
    print('\n' + '---------------------------------------------------------epoch=' + str(epoch) + '--------------------------------------------------------------' + '\n')
    strings += '\n' + '---------------------------------------------------------epoch=' + str(epoch) + '--------------------------------------------------------------' + '\n'
    
    if os.path.exists(filepath):        #如果模型存在，则从现有模型开始训练
        model.load_weights(filepath)
        print("==============================正在从断点开始续训模型==============================")
        strings += "==============================正在从断点开始续训模型=============================="
        
        model.fit(x,y,batch_size=128,epochs=1,callbacks=[checkpoint])        #开始训练模型

    else:
        model.fit(x,y,batch_size=128,epochs=1,callbacks=[checkpoint])
    
    
    for temperature in [0.1,0.4,0.8]:        #定义随时数，随机数越高，文本生成的创造性越强，规则表示越弱
        print('\n' + '------ temperature:' ,str(temperature) +'\n' )
        strings += '\n' + '------ temperature:' + str(temperature) +'\n' 
        
        for j in range(30):       #生成30行代码
            if temperature == 0.1:        #随机选择起始代码序列
                if j == 0:
                    start_index = random.randint(0,len(cut_words) - step - 1)        #随机选择一个代码序列作为代码生成的开头，这个随机序列必须位于某一个代码句的开头
                    while cut_words[start_index] != '\n':        #如果起始代码词不是换行符，则选择下一个代码词
                        start_index += 1
                        if start_index >= (len(cut_words) - step - 1):       #如果起始代码词的下标超过了最大长度，重新随机一个起始位置
                            start_index = random.randint(0,len(cut_words) - step - 1)

                    generated_text = cut_words[start_index + 1:start_index + 1 + step]        #选择起始代码序列（一个步长）
                    start_gen = generated_text[:]        #将起始代码序列保存在start_gen中
                    
                    for k in range(len(generated_text)):        #将生成的代码转换成标准代码格式，并打印出来
                        if generated_text[k] not in mark and last_word not in mark:
                            sys.stdout.write(' ' + generated_text[k])
                            strings += ' ' + generated_text[k]
                        else:
                            sys.stdout.write(generated_text[k])
                            strings += generated_text[k]
                        last_word = generated_text[k]
            else:
                if j == 0:
                    generated_text = start_gen[:]        #若temperature不等于0.1，则让别的temperature的初始序列和0.1的一样
                    last_word = ''
                    for k in range(len(generated_text)):        #将生成的代码转换成标准代码格式，并打印出来
                            if generated_text[k] not in mark and last_word not in mark:
                                sys.stdout.write(' ' + generated_text[k])
                                strings += ' ' + generated_text[k]
                            else:
                                sys.stdout.write(generated_text[k])
                                strings += generated_text[k]
                            last_word = generated_text[k]
                    
            for i in range(50):        #生成一个代码句
                sampled = np.zeros((1,len(generated_text)))        #根据现有代码词长度，初始化一个相同长度的sampled
                for t,word in enumerate(generated_text):        #将已有代码词向量化
                    sampled[0,t] = word_indices[word]

                preds = model.predict(sampled,verbose=0)[0]        #预测并生成下一个代码词
                next_index = sample(preds,temperature = 0.3)
                next_word = words[next_index]


                generated_text.append(next_word)        #将生成的代码词加到已生成的代码序列中
                
                #如果下面这两句话不标注掉，那么本模型是按照滑动框（n-gram）来训练的。标注掉以后，每一个新的代码词都是从已生成的代码序列来进行预测的。
                #if len(generated_text) == maxlen: 
                #    generated_text = generated_text[1:]

                if next_word not in mark and last_word not in mark:        #将生成的代码转换成标准代码格式，并打印出来
                    sys.stdout.write(' ' + next_word)
                    strings += ' ' + next_word
                else:
                    sys.stdout.write(next_word)
                    strings +=  next_word
                
                last_word = next_word
    
                if next_word == '\n':        #如果预测的代码词为\n，那么表示这一句结束
                    break
            
            last_word = ''
                
        save('model_training_result/yk_model_local_gpu-0507-01.hdf5.txt',strings)        #将生成的代码保存
        strings = ''
        generated_text = ''
    start_gen=''


---------------------------------------------------------epoch=5--------------------------------------------------------------

Epoch 1/1

------ temperature: 0.1

from tensorflow.import keras
import numpy as np
from sklearn.metrics import preprocessing
from keras.layers.core import Dense,Dropout,Activation
from

  """


 keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import BatchNormalization
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import BatchNormalization
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.convol

def __init__(self,callbacks =,validation_split =,verbose =):
self.model = Sequential()
self.model.add(LSTM(units =,input_shape =(),return_sequences =))
self.model.add(Dense(self.compile())
self.model = Sequential()
self.model.add(LSTM(units =,return_sequences =,input_shape =()))
self.model.add(LSTM())
model.add(LSTM(self.nb_classes,activation =))
self.model.compile(loss =,optimizer =(),metrics =[])
self.model.fit(x_train,y_train,batch_size =,verbose =)
model.compile(loss =,optimizer =,metrics =[])
return model
def.():
self.model = Sequential()
self.model.add(LSTM(self.model.add(Dense(self.return self.model,input_shape =()))
self.model = self.model.def input_shape():
self.model = self.models = self.fit()
def __init__(self,dropout:):from keras.layers.convolutional import Conv1D
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers import Dense

------ temperature: 0.4

model.add(Dense())
model.add(Activation())
model.compile(loss =,optimizer =,metr

from keras.layers.convolutional import MaxPooling1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.convolutional import MaxPooling1D

------ temperature: 0.8

history = model.predict()
return model
def.():
self.model = self.input_shape =()
self.model = self.X()
def __init__():
self.dropout = self.model.add(Bidirectional(LSTM(units =,return_sequences =,input_shape =(),units =,return_sequences =))
self.model.add(Dropout())
self.model.add(Dense(self.rnn_size,return_sequences =))
self.model.add(Dropout())
self.model.add(Dense(units =,activation =))
self.model.add(Dropout())
self.model.add(Dense(self.nb_classes,activation =))
self.model = Sequential()
self.model.add(Dense(self.rnn_size,return_sequences =))
model.add(Dr

from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Activation
from keras.layers import LSTM
from keras.layers import LSTM
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import Activation
from keras.layers import LSTM
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import Flatten

---------------------------------------------------------epoch=12--------------------------------------------------------------

Epoch 1/1

------ temperature: 0.1

model.reset_states()
return model
def Dense(self.nb_classes,activation =):
return self.model.predict()
def __init__(self,batch_size =,dropout =):
self.model = self.lstm()
elif model = =:
self.model.save()
def __init__():
self.model = self.lstm()
def layers(self.nb_classes,self.nb_classes,],self.nb_classes,self.if self.model.add(Dropout()))
self.model.add(Dense(self.Dense(self.Dense(self.model.add(Activation())))
self.model.compile(loss =,optimize

def build_model(top_words,embedding_vecor_length,max_review_length,show_summaries =):
input_layer = Embedding(top_words,embedding_vecor_length,input_length =)
branch_2 = Sequential()
branch_3 = Sequential()
branch_3.add()
branch_3.add(Conv1D(filters =,kernel_size =,padding =,kernel_regularizer =()))
branch_3.add(Activation())
branch_3.add(MaxPooling1D(pool_size =))
branch_3.add(Dropout())
branch_3.add(LSTM())
branch_3 = Sequential()
branch_3.add()
branch_3.add(Conv1D(filters =,kernel_size =,padding =,kernel_regularizer =()))
branch_3.add(Activation())
branch_3.add(MaxPooling1D(pool_size =))
branch_3.add(Dropout())

------ temperature: 0.4

from master import run_model,generate_read_me,get_text_data,load_word2vec
import time
import numpy as np
import matplotlib
import argparse
import keras
import csv
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense,Dropout,Activation,Flatten,Flatten,Embedding,LSTM,TimeDistributed,Activation,Conv1D,MaxPoo

from keras.regularizers import l2
def build_model(top_words,embedding_vecor_length,max_review_length,show_summaries =):
input_layer = Embedding(top_words,embedding_vecor_length,input_length =)
branch_2 = Sequential()
branch_2.add()
branch_2.add(Conv1D(filters =,kernel_size =,padding =,kernel_regularizer =()))
branch_2.add(Activation())
branch_2.add(MaxPooling1D(pool_size =))
branch_2.add(Dropout())
branch_2.add(BatchNormalization())
branch_2.add(LSTM())
branch_2.add(Dropout())
branch_2.add(LSTM())
branch_2.add(Dropout())
branch_2.add(LSTM())

------ temperature: 0.8

model.add(LSTM(units,return_sequences =,input_shape =()))
model.add(LSTM())
model.add(Dense())
model.add(Activation())
model.compile(loss =,optimizer =,metrics =[])
return model
def add(Dense(1,activation =):):
model.compile(loss =,optimizer =,metrics =[])
model.fit(X_train,y_train,batch_size =,epochs =,batch_size =)
model.save()
def __init__():
self.model = Sequential()
self.model.add(Dense(self.branch_7.type =))
self.mod

if self.model.add(Bidirectional(LSTM(units =,return_sequences =,dropout =,recurrent_dropout =)))
self.model.add(Dropout())
self.model.add(Dense(self.rnn_size,1,activation =))
self.model.add(Dropout())
self.model.add(Dense(self.rnn_size,1,activation =))
model.add(Dropout())
model.add(Dense(self.nb_classes,activation =))
return model
def,(self,input_shape =():
self.model = Sequential()
self.model.add(Embedding(output_dim =[1],input_length =,dropout =))
self.model.add(Dropout())
self.model.add(Dense(1,activation =))
self.model.compile(optimizer =,loss =,metrics =[])

------ temperature: 0.8

units = 3
def,Dense,activation =):
model = Sequential()
model.add(LSTM(32,return_sequences =,input_shape =()))
model.add(LSTM(units =,return_sequences =,activation =,kernel_initializer =))
model.add(LSTM(units =,return_sequences =,activation =,name =))
model.add(Dropout())
model.add(Dense(loss =,optimizer =,metrics =[]))
return model
def,():
def __init__():
model = Sequential()
model.add(LSTM(units =,

model.add(Activation())
model.compile(loss =,optimizer =,metrics =[])
return model
def,Dropout():
model = Sequential()
model.add(Embedding(top_words,embedding_vecor_length,input_length =))
model.add(LSTM())
model.add(Dense())
model.add(Activation())
model.compile(loss =,optimizer =,metrics =[])
if show_summaries:
return model
os.environ[]=
parser = argparse.ArgumentParser(description =)
parser.add_argument(,action =,default =,help =)

---------------------------------------------------------epoch=21--------------------------------------------------------------

Epoch 1/1

------ temperature: 0.1

model.compile(loss =,optimizer =)
return model
def model():
model = Sequential()
model.add(LSTM(input_shape =(),return_sequences =))
model.add(LSTM())
model.add(Dense())
model.add(Activation())
model.compile(loss =,optimizer =,metrics =[])
model.fit(X_train,y_train,batch_size =,epochs =)
from keras.models import Sequential
from keras.layers import Dense,Dropout,Activation,Flatten,Flatten,Dropo

KeyboardInterrupt: 

In [None]:
"""开始测试"""

In [32]:
"""定义测试函数--代码句推荐"""

def generate_text_sentence(seed_text,model_filename):        #测试代码和上面训练模型的代码基本一样，就不再介绍
    model.load_weights(model_filename)
    
    strings=''
    last_word=''
    seed_text = re.split('([: ,.\n(){}\[\]=])',seed_text)
    seed_text = list(filter(lambda x: x!=' 'and x!='',seed_text))
    
    generated_text = seed_text[:]
    
    for temperature in [0.1,0.4,0.8]:
        strings += '\n' + '-------------temperature:' + str(temperature) +'-------------\n' +'\n'
        
        for i in range(50):
            if i == 0:
                for k in range(len(generated_text)):
                    if generated_text[k] not in mark and last_word not in mark:
                        strings += ' ' + generated_text[k]
                    else:
                        strings += generated_text[k]
                    last_word = generated_text[k]

            sampled = np.zeros((1,len(generated_text)))
            for t,word in enumerate(generated_text):
                sampled[0,t] = word_indices[word]

            preds = model.predict(sampled,verbose=0)[0]
            next_index = sample(preds,temperature = 0.3)
            next_word = words[next_index]


            generated_text.append(next_word)

            #if len(generated_text) == maxlen:
            #    generated_text = generated_text[1:]

            if next_word not in mark and last_word not in mark:
                strings += ' ' + next_word
            else:
                strings +=  next_word

            last_word = next_word

            if next_word == '\n':
                break
        
        generated_text = seed_text[:]
        
    return strings


In [29]:
"""定义测试函数--代码段推荐"""

def generate_text_paragraph(seed_text,model_filename):        #测试代码和上面训练模型的代码基本一样，就不再介绍
    model.load_weights(model_filename)
    
    strings=''
    seed_text = re.split('([: ,.\n(){}\[\]=])',seed_text)
    seed_text = list(filter(lambda x: x!=' 'and x!='',seed_text))
    
    for temperature in [0.1,0.4,0.8]:
        strings += '\n' + '-------------temperature:' + str(temperature) +'-------------\n' +'\n'
        
        for i in range(10):
            if i == 0:
                last_word=''
                generated_text = seed_text[:]
                for k in range(len(generated_text)):
                    if generated_text[k] not in mark and last_word not in mark:
                        strings += ' ' + generated_text[k]
                    else:
                        strings += generated_text[k]
                    last_word = generated_text[k]            
                        
            for j in range(30):
                sampled = np.zeros((1,len(generated_text)))
                for t,word in enumerate(generated_text):
                    sampled[0,t] = word_indices[word]

                preds = model.predict(sampled,verbose=0)[0]
                next_index = sample(preds,temperature = 0.3)
                next_word = words[next_index]


                generated_text.append(next_word)

                #if len(generated_text) == maxlen:
                #    generated_text = generated_text[1:]

                if next_word not in mark and last_word not in mark:
                    strings += ' ' + next_word
                else:
                    strings +=  next_word

                last_word = next_word

                if next_word == '\n':
                    break
        
    return strings


In [40]:
"""进行测试"""

input_strings = input("请输入代码词：")
model_filename = 'yk_model_local_gpu-0507-01.hdf5'
strings = generate_text_paragraph(input_strings,model_filename)
print(strings)

请输入代码词：def model()


  """



-------------temperature:0.1-------------

def model():
model = Sequential()
model.add(LSTM(input_shape =(),return_sequences =))
model.add(Dropout())
model.add(LSTM())
model.add(Dropout())
model.add(Dense())
model.add(Activation())
model.compile(loss =,optimizer =,metrics =[])
model.fit(x_train,y_train,batch_size =,epochs =,validation_split =)

-------------temperature:0.4-------------

def model():
model = Sequential()
model.add(LSTM(input_shape =(),return_sequences =))
model.add(LSTM())
model.add(Dropout())
model.add(Dense())
model.add(Activation())
model.compile(loss =,optimizer =,metrics =[])
model.fit(X_train,y_train,batch_size =,nb_epoch =,validation_split =)
from keras.models import Sequential

-------------temperature:0.8-------------

def model():
model = Sequential()
model.add(LSTM(32,return_sequences =,input_shape =()))
model.add(Dropout())
model.add(LSTM())
model.add(Dense())
model.add(Activation())
model.compile(loss =,optimizer =,metrics =[])
model.fit(X_train,y_train,ba