**Based on [C code Generation using LSTM](https://blogs.oracle.com/meena/code-generation-using-lstm-long-short-term-memory-rnn-network)**

In [62]:
import tensorflow as tf
import os
import numpy as np

from tensorflow import keras

In [4]:
# Display the installed TensorFlow version so readers know which release produced the outputs below.
tf.__version__

'2.0.0'

In [38]:
# I used only kernel folder from linux repo (https://github.com/torvalds/linux/tree/master/kernel)
path_name = "data/linux_kernel/"

## concat all files into training file
with open("data/linux_kernel_training.txt", "w") as training_file:
    # os.listdir returns entries in arbitrary order; sort so the corpus
    # (and therefore every sample index) is reproducible across runs.
    for entry in sorted(os.listdir(path_name)):
        source_path = os.path.join(path_name, entry)
        # Skip sub-directories: open() on a directory would raise.
        if not os.path.isfile(source_path):
            continue
        # Context manager closes each source file as soon as it has been copied.
        with open(source_path) as source_file:
            training_file.write(source_file.read())

In [63]:
# Load the concatenated training corpus; the context manager closes the handle
# right after reading instead of leaving it to the garbage collector.
with open("data/linux_kernel_training.txt", 'r') as corpus_file:
    text = corpus_file.read()

# Unique characters in sorted order, so the vocabulary ordering is deterministic.
chars = sorted(set(text))

VOCAB_SIZE = len(chars)

print(f"Length of file: {len(text)}")
print(f"Total vocab length: {VOCAB_SIZE}")

Length of file: 2099427
Total vocab length: 99


**Mapping of unique chars to integers and a reverse mapping**

In [51]:
# Build the index -> character table first, then invert it to get the
# character -> index table; both follow the ordering of `chars`.
int_to_char = dict(enumerate(chars))
char_to_int = {symbol: index for index, symbol in int_to_char.items()}

print(char_to_int)
print(int_to_char)

{'\t': 0, '\n': 1, ' ': 2, '!': 3, '"': 4, '#': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, '*': 12, '+': 13, ',': 14, '-': 15, '.': 16, '/': 17, '0': 18, '1': 19, '2': 20, '3': 21, '4': 22, '5': 23, '6': 24, '7': 25, '8': 26, '9': 27, ':': 28, ';': 29, '<': 30, '=': 31, '>': 32, '?': 33, '@': 34, 'A': 35, 'B': 36, 'C': 37, 'D': 38, 'E': 39, 'F': 40, 'G': 41, 'H': 42, 'I': 43, 'J': 44, 'K': 45, 'L': 46, 'M': 47, 'N': 48, 'O': 49, 'P': 50, 'Q': 51, 'R': 52, 'S': 53, 'T': 54, 'U': 55, 'V': 56, 'W': 57, 'X': 58, 'Y': 59, 'Z': 60, '[': 61, '\\': 62, ']': 63, '^': 64, '_': 65, '`': 66, 'a': 67, 'b': 68, 'c': 69, 'd': 70, 'e': 71, 'f': 72, 'g': 73, 'h': 74, 'i': 75, 'j': 76, 'k': 77, 'l': 78, 'm': 79, 'n': 80, 'o': 81, 'p': 82, 'q': 83, 'r': 84, 's': 85, 't': 86, 'u': 87, 'v': 88, 'w': 89, 'x': 90, 'y': 91, 'z': 92, '{': 93, '|': 94, '}': 95, '~': 96, 'Ă': 97, 'Ą': 98}
{0: '\t', 1: '\n', 2: ' ', 3: '!', 4: '"', 5: '#', 6: '$', 7: '%', 8: '&', 9: "'", 10: '(', 11: ')', 12: '*', 13: '

**In this example we use a character-based model**

In [52]:
# Training hyper-parameters.
SEQ_LENGTH = 100   # number of characters in each input window
BATCH_SIZE = 128   # samples per gradient update passed to model.fit
EPOCHES = 10       # number of passes over the training data passed to model.fit

In [53]:
# Encode the whole corpus once up front, so each training window is a plain
# slice instead of SEQ_LENGTH dictionary lookups repeated for every position.
encoded_text = [char_to_int[char] for char in text]

X = []
y = []

for i in range(len(encoded_text) - SEQ_LENGTH):
    # Input: SEQ_LENGTH consecutive character codes starting at position i.
    X.append(encoded_text[i:i + SEQ_LENGTH])
    # Target: the code of the character right after the window (we try to predict next character).
    y.append(encoded_text[i + SEQ_LENGTH])

In [56]:
# Sanity check: decode one encoded window back to text and show its target character.
decoded_window = ''.join(int_to_char[code] for code in X[12345])
print(decoded_window)
print(f"Next character: {int_to_char[y[12345]]}")

ct *acct)
{
	acct_t ac;
	unsigned long flim;
	const struct cred *orig_cred;
	struct file *file = acc
Next character: t


In [59]:
# One sample per (window, next character) pair.
samples = len(X)
print(f"Total samples: {samples}")

# Convert to an array and add a trailing feature axis: (samples, SEQ_LENGTH, 1).
X = np.asarray(X).reshape(samples, SEQ_LENGTH, 1)

Total samples: 2099327


In [64]:
# NOTE: this cell rebinds X and y to their transformed versions, so it is not
# idempotent -- re-run the preceding data-preparation cells before running it again.
X = X / float(VOCAB_SIZE) # normalization: character codes 0..VOCAB_SIZE-1 are scaled into [0, 1)

# One-hot encode the targets. num_classes is pinned to the vocabulary size so
# the output width always equals VOCAB_SIZE, even when the highest-indexed
# character never occurs as a target (to_categorical would otherwise infer
# the width from max(y) + 1).
y = keras.utils.to_categorical(y, num_classes=VOCAB_SIZE)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (2099327, 100, 1)
y shape: (2099327, 99)


**Create a simple model with two stacked bidirectional LSTM layers**

In [69]:
# Stack two bidirectional LSTM layers, each followed by dropout, and finish
# with a softmax layer that has one unit per one-hot target column.
model = keras.Sequential()
model.add(keras.layers.Bidirectional(
    # return_sequences=True hands the full sequence to the next recurrent layer
    keras.layers.LSTM(256, return_sequences=True),
    input_shape=(X.shape[1], X.shape[2]),
))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(256)))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(y.shape[1], activation='softmax'))

In [70]:
# Targets are one-hot vectors, hence categorical cross-entropy as the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [71]:
# Print the layer-by-layer output shapes and parameter counts of the compiled model.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_4 (Bidirection (None, 100, 512)          528384    
_________________________________________________________________
dropout_4 (Dropout)          (None, 100, 512)          0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 512)               1574912   
_________________________________________________________________
dropout_5 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 99)                50787     
Total params: 2,154,083
Trainable params: 2,154,083
Non-trainable params: 0
_________________________________________________________________


In [74]:
# Checkpoints go under checkpoint_dir; the "{epoch}" placeholder in the file
# name is filled in by ModelCheckpoint when it saves.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights (not the full model) at each checkpoint.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [75]:
# Train on the full dataset; weights are checkpointed through checkpoint_callback.
history = model.fit(
    X, y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHES,
    callbacks=[checkpoint_callback],
    verbose=1,
)

Train on 2099327 samples
Epoch 1/10
  86400/2099327 [>.............................] - ETA: 58:49 - loss: 2.9293

KeyboardInterrupt: 