**Based on [C code Generation using LSTM](https://blogs.oracle.com/meena/code-generation-using-lstm-long-short-term-memory-rnn-network)**

In [2]:
import tensorflow as tf
import os
import numpy as np

from tensorflow import keras

In [3]:
tf.__version__

'2.0.0'

In [4]:
path_name = "data/julia_sources/" # julia repo

## concat all files into training file
with open("data/julia_sources/julia_training.txt", "w") as a:
    for file in os.listdir(path_name):
        f = os.path.join(path_name, file)
        current_file = open(f).read()
        a.write(current_file)

In [5]:
text = open("data/julia_sources/julia_training.txt", 'r').read()

Clean up -- remove comments

- `#` ...
- #= 

  ...
  
  =#
    
- """ 

  ... 

  """


In [6]:
import re

def remove_comments(text):
    single_comments_r = r'#.*'
    multiple_comments_r = r'#.*\n.*\n+=#'
    string_comments_r = r'"{3}([\s\S]*?"{3})'
    
    text = re.sub(multiple_comments_r, '', text)
    text = re.sub(string_comments_r, '', text)
    text = re.sub(single_comments_r, '', text)
    return text.strip()


text_without_comments = remove_comments(text)

In [7]:
chars = sorted(list(set(text_without_comments)))
VOCAB_SIZE = len(chars)

print(f"Length of file: {len(text)}")
print(f"Length of file without comments: {len(text_without_comments)}")
print(f"Total vocab length: {VOCAB_SIZE}")

Length of file: 4766833
Length of file without comments: 3294376
Total vocab length: 161


**Mapping of unique chars to integers and a reverse mapping**

In [8]:
char_to_int = {c:i for i, c in enumerate(chars)}
int_to_char = {i:c for i, c in enumerate(chars)}

print(char_to_int)
print(int_to_char)

{'\t': 0, '\n': 1, ' ': 2, '!': 3, '"': 4, '$': 5, '%': 6, '&': 7, "'": 8, '(': 9, ')': 10, '*': 11, '+': 12, ',': 13, '-': 14, '.': 15, '/': 16, '0': 17, '1': 18, '2': 19, '3': 20, '4': 21, '5': 22, '6': 23, '7': 24, '8': 25, '9': 26, ':': 27, ';': 28, '<': 29, '=': 30, '>': 31, '?': 32, '@': 33, 'A': 34, 'B': 35, 'C': 36, 'D': 37, 'E': 38, 'F': 39, 'G': 40, 'H': 41, 'I': 42, 'J': 43, 'K': 44, 'L': 45, 'M': 46, 'N': 47, 'O': 48, 'P': 49, 'Q': 50, 'R': 51, 'S': 52, 'T': 53, 'U': 54, 'V': 55, 'W': 56, 'X': 57, 'Y': 58, 'Z': 59, '[': 60, '\\': 61, ']': 62, '^': 63, '_': 64, '`': 65, 'a': 66, 'b': 67, 'c': 68, 'd': 69, 'e': 70, 'f': 71, 'g': 72, 'h': 73, 'i': 74, 'j': 75, 'k': 76, 'l': 77, 'm': 78, 'n': 79, 'o': 80, 'p': 81, 'q': 82, 'r': 83, 's': 84, 't': 85, 'u': 86, 'v': 87, 'w': 88, 'x': 89, 'y': 90, 'z': 91, '{': 92, '|': 93, '}': 94, '~': 95, '\xa0': 96, '²': 97, '³': 98, '´': 99, '÷': 100, 'Δ': 101, 'Ω': 102, 'β': 103, 'γ': 104, 'η': 105, 'θ': 106, 'λ': 107, 'ξ': 108, 'π': 109, 'ρ'

**In this example we would use word based model**

In [9]:
SEQ_LENGTH = 100
EPOCHES = 10
BATCH_SIZE = 128

In [10]:
X = []
y = []

for i in range(len(text_without_comments) - SEQ_LENGTH):
    seq_in = text_without_comments[i:i + SEQ_LENGTH]
    seq_out = text_without_comments[i + SEQ_LENGTH] # we try to predict next character
    X.append([char_to_int[char] for char in seq_in])
    y.append(char_to_int[seq_out])
    
dataX = X
dataY = y

In [11]:
print(''.join([int_to_char[x] for x in X[12]]))
print(f"Next character: {int_to_char[y[12]]}")

dule) = ccall(:jl_module_name, Ref{Symbol}, (Any,), m)


parentmodule(m::Module) = ccall(:jl_module_
Next character: p


In [13]:
samples = len(X)
print(f"Total samples: {samples}")

Total samples: 3294276


In [14]:
X = np.reshape(X, (samples, SEQ_LENGTH, 1))
X = X / float(VOCAB_SIZE) # normalization

y = keras.utils.to_categorical(y)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (3294276, 100, 1)
y shape: (3294276, 161)


**Create simple model with 2 LSTMs**

In [50]:
def build_model(vocab_size, embedding_size, rnn_units, batch_size):
    model = keras.Sequential([
        keras.layers.LSTM(rnn_units, input_shape=(X.shape[1], X.shape[2])),
        keras.layers.Dropout(0.4),
        keras.layers.Dense(vocab_size, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [51]:
EMBEDDING_SIZE = 128
RNN_UNITS = 256
model = build_model(VOCAB_SIZE, EMBEDDING_SIZE, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               264192    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 161)               41377     
Total params: 305,569
Trainable params: 305,569
Non-trainable params: 0
_________________________________________________________________


In [52]:
checkpoint_dir = './training_checkpoints_julia'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [53]:
earlystop_callback = tf.keras.callbacks.EarlyStopping(
  monitor='val_accuracy', min_delta=0.0001,
  patience=1)

In [None]:
history = model.fit(X,
                    y, 
                    epochs=EPOCHES, 
                    batch_size=BATCH_SIZE, 
                    verbose=1, 
                    callbacks=[checkpoint_callback, earlystop_callback])

In [None]:
checkpoint_output = 'output/checkpoints_lstms_small_character_model' 
latest = tf.train.latest_checkpoint(checkpoint_output)
print(latest)

# remember to change output layer size to 95 (smaller dict)
model.load_weights(latest)

In [None]:
import sys 

start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print('Seed:')
print("".join([int_to_char[value] for value in pattern]))
print('\nGenerated:\n\n')

In [None]:
for i in range(500):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(VOCAB_SIZE)
    pred = model.predict(x, verbose=0)
    index = np.argmax(pred)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]