In [1]:
import pandas as pd
import numpy as np
import re
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Flatten, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
import warnings
warnings.simplefilter(action='ignore')
#using GPU to accelerate training
import tensorflow as tf
config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 56} ) 
sess = tf.Session(config=config) 
keras.backend.set_session(sess)

Using TensorFlow backend.


In [4]:
from google.colab import files

uploaded = files.upload()

Saving conala-mined.jsonl to conala-mined.jsonl


In [0]:
# data loading --> This might differ from system so system
mined = pd.read_json("conala-mined.jsonl", lines=True)
mined = mined.sample(400000)

In [6]:
# Since our data is very heavy on the low side (below 20%) we shift the weights towards this spectrum, to get more nuanced results
from matplotlib import pyplot as plt
mined['prob'] = mined['prob'].apply(lambda x: np.log(x))
mined['prob'] = mined['prob'] + 10
mined['prob'] = mined['prob'].apply(lambda x: x/10)
mined['prob'].plot()

# train test split
X = mined[['intent', 'snippet']]
y = mined['prob'].as_matrix()

0.6630625189608299


OverflowError: ignored

<Figure size 432x288 with 1 Axes>

In [0]:
# data cleanup to save memory
# usually you would want to clean up special characters like "'" or ";" but since we are dealing with code here this would be a bad idea
X['intent'] = X['intent'].apply(lambda x: x.lower())
X['intent'] = X['intent'].apply(lambda x: re.sub(pattern='[^a-z0-9]', string=x, repl=' '))

In [0]:
# concatenate snippet and intent
# the network is powerful enough and gets enough data
# to learn that our border word marks a border
X['intent'] = X['intent'] + ' STOP ' + X['snippet']

In [0]:
X = X['intent']

In [0]:
X = X.astype(str)

In [0]:
# we tokenize the text and pad the tokens
MAX_NB_WORDS = 250
MAX_SEQUENCE_LENGTH  = 125
EMBEDDING_DIM = 250
# keras Tokenizer class which: Creates a dictionary based on the given texts
# based on word frequency where every words gets it's own unique integer
# this dictionary (tokenizer.word_index) will be exported for predictions made
# later
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True)
tokenizer.fit_on_texts(X.values)
X = tokenizer.texts_to_sequences(X.values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

In [13]:
print(tokenizer.word_index)



In [20]:
# let's export the word_index
to_export = tokenizer.word_index
dirty_list = []
# this marks every entry in our index which will not be used
for word, index in to_export.items():
  if index > MAX_NB_WORDS:
    dirty_list.append(word)
# this deletes every marked entry
for mark in dirty_list:
  del to_export[mark]
print(to_export)
# write dictionary to json
import json
with open('vocab.json', 'w') as json_file:
  json.dump(to_export, json_file)

{'stop': 1, 'in': 2, 'a': 3, 'python': 4, 'to': 5, "'": 6, 'how': 7, '1': 8, '0': 9, 'for': 10, 'of': 11, 'with': 12, 'i': 13, 'from': 14, 'x': 15, 'list': 16, 'self': 17, '2': 18, 'the': 19, 'def': 20, 'print': 21, 'pass': 22, 'import': 23, 'if': 24, 'return': 25, 'data': 26, 's': 27, 'np': 28, 'and': 29, 'file': 30, '3': 31, 'get': 32, 'using': 33, 'string': 34, 'array': 35, 'is': 36, 'name': 37, 'df': 38, 'numpy': 39, 'as': 40, 'b': 41, 'y': 42, 'django': 43, 'class': 44, 'value': 45, 'set': 46, 'n': 47, '5': 48, 'do': 49, 'f': 50, 'pandas': 51, 'on': 52, '4': 53, 'key': 54, 'an': 55, 'can': 56, 'c': 57, 'd': 58, 'values': 59, 'line': 60, 'range': 61, 'text': 62, 'true': 63, 'all': 64, 'index': 65, 'path': 66, 'plt': 67, 'function': 68, 'add': 69, 'matplotlib': 70, 'len': 71, 'request': 72, 'open': 73, 'time': 74, 'find': 75, '10': 76, 'os': 77, 'dataframe': 78, 'none': 79, 'way': 80, 'dict': 81, 'object': 82, 'datetime': 83, 'by': 84, 'into': 85, 't': 86, 'random': 87, 'not': 88, '

In [15]:
# this is a rather simple LSTM
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(Dropout(0.2))
model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2, return_sequences=False))
model.add(Dense(300))
model.add(Dropout(0.2)))
model.add(Dense(1, activation='relu'))
model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mae', 'mse'])

epochs = 17
batch_size = 128

history = model.fit(X, y, epochs=epochs,verbose=1, batch_size=batch_size, validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss', patience=9, min_delta=0.0001)])

W0814 23:30:29.880508 140568500856704 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0814 23:30:29.883676 140568500856704 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0814 23:30:29.898814 140568500856704 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0814 23:30:29.920298 140568500856704 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0814 23:30:29.934668 

Train on 320000 samples, validate on 80000 samples
Epoch 1/17
  3584/320000 [..............................] - ETA: 41:23 - loss: 0.2108 - mean_absolute_error: 0.2108 - mean_squared_error: 0.0897

KeyboardInterrupt: ignored

In [0]:
# we calculate the linearized test loss
from scipy import stats
xi = np.arange(0, len(history.history['val_loss']))
y = history.history['val_loss']
slope, intercept, r_value, p_value, std_err = stats.linregress(xi,y)
line = slope*xi+intercept

In [0]:
# we visualize the training process
from matplotlib import pyplot as plt
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.plot(line)
plt.title('model loss [Mean Absolute Error]')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train loss', 'test loss', 'linearized test loss'], loc='upper left')
plt.savefig('loss.pdf')
plt.show()

In [0]:
# we save our model
# lstm.h5 is the version which will actually be used when exporting to tensorflowjs
from keras.models import load_model
model.save('lstm.h5')
model.save_weights('model.h5')
model_json = model.to_json()
with open('model.json', "w") as json_file:
    json_file.write(model_json)
json_file.close()