In [1]:
!module list


Currently Loaded Modules:
  1) anaconda3/5.1.0-gcc/8.3.1     4) cudnn/8.0.0.180-11.0-linux-x64-gcc/7.5.0
  2) anaconda3/2019.10-gcc/8.3.1   5) openjdk/1.8.0_222-b10-gcc/8.3.1
  3) cuda/11.0.3-gcc/7.5.0         6) hadoop/3.2.1-gcc/8.3.1

 



In [2]:
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Flatten, TimeDistributed, Dropout, LSTMCell, RNN, Bidirectional, Concatenate, Layer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import tf_utils
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle


import unicodedata
import re
import os
import time
import shutil
import requests
import tarfile
import glob

import pandas as pd
import numpy as np
import string, os
tf.__version__

'2.4.0'

In [4]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices
tf.config.experimental.set_memory_growth(physical_devices[0], True)
# tf.config.experimental.set_memory_growth(physical_devices[1], True)

In [8]:
file_name = "Project_CodeNet_MLM.tar.gz"
data_url = f"https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/{file_name}"

# Download tar archive to local disk
with open(file_name, "wb") as f:
    f.write(requests.get(data_url).content)
    
# Extract contents of archive to local disk
if os.path.exists("tokens"):
    shutil.rmtree("tokens")    
with tarfile.open(file_name) as tfile:
    tfile.extractall()
    
!ls tokens

test  train


In [10]:
# Read all files and return content as list of lines.
def get_text_list_from_files(files):
    text_list = []
    for name in files:
        with open(name) as f:
            for line in f:
                text_list.append(line)
    return text_list

# Compose the full path names to the token files.
# Creates and returns a dataframe with single key "tokens".
def get_data_from_text_files(folder_name):
    files = glob.glob(folder_name + '/*.toks')
    texts = get_text_list_from_files(files)
    df = pd.DataFrame({'tokens': texts})
    df = df.sample(len(df)).reset_index(drop=True)
    return df

train_data = get_data_from_text_files('tokens/train')
print(train_data.tail())

                                                  tokens
49995  # include < id . id > int main ( void ) { int ...
49996  # include < id . id > # include < id . id > # ...
49997  # include < id . id > # include < id . id > in...
49998  # include < id . id > int main ( void ) { int ...
49999  # include < id . id > int main ( void ) { int ...


In [15]:
pd.set_option('display.max_colwidth', 1000)
train_data.head()

Unnamed: 0,tokens
0,"# include < id . id > int main ( ) { int id , id , id , id , id , id ; scanf ( string , operator id ) ; scanf ( string , operator id ) ; scanf ( string , operator id ) ; scanf ( string , operator id ) ; scanf ( string , operator id ) ; if ( id < number ) id = number ; if ( id < number ) id = number ; if ( id < number ) id = number ; if ( id < number ) id = number ; if ( id < number ) id = number ; id = ( id operator id operator id operator id operator id ) operator number ; printf ( string , id ) ; return 0 ; }"
1,"# include < id . id > int main ( void ) { int id ; int id ; int id ; for ( id = 1 ; ; id operator ) { scanf ( string , operator id ) ; if ( id operator 0 ) { printf ( string , id , id ) ; } if ( id operator 0 ) { break ; } } return 0 ; }"
2,"# include < id . id > # include < id . id > int main ( ) { int id ; double id , id , id , id , id , id , id , id [ number ] = { } ; while ( 1 ) { double id [ number ] = { } ; id = 0 ; id = 0 ; scanf ( string , operator id ) ; if ( id operator 0 ) break ; for ( id = 0 ; id < id ; id operator ) { scanf ( string , operator id [ id ] ) ; id = id operator id [ id ] ; } id = id operator id ; for ( id = 0 ; id < id ; id operator ) { id = id [ id ] operator id ; if ( id < 0 ) id = id operator ( operator 1 ) ; id = pow ( id , number ) ; id = id operator id ; } id = id operator id ; id = pow ( id , number ) ; printf ( string , id ) ; } return 0 ; }"
3,"# include < id . id > # include < string . id > int main ( ) { int id , id ; scanf ( string , operator id ) ; char id [ number ] ; scanf ( string , id ) ; for ( id = 0 ; id < strlen ( id ) ; id operator ) { int id = id [ id ] operator id ; if ( id operator number ) { id [ id ] = id ; } else if ( id > number ) { id [ id ] = id operator number ; } } printf ( string , id ) ; return 0 ; }"
4,"# include < id . id > int main ( ) { int id , id [ number ] [ number ] , id , id , id , id , id , id , id , id , id ; int id [ ] = { operator 1 , 0 , 1 , 0 } ; int id [ ] = { 0 , operator 1 , 0 , 1 } ; while ( scanf ( string , operator id ) , id ) { id = id = id = id = id [ 0 ] [ 0 ] = id [ 0 ] [ 1 ] = 0 ; for ( id = 1 ; id < id ; id operator ) { scanf ( string , operator id , operator id ) ; id [ id ] [ 0 ] = id [ id ] [ 0 ] operator id [ id ] ; id [ id ] [ 1 ] = id [ id ] [ 1 ] operator id [ id ] ; if ( id [ id ] [ 0 ] < id ) id = id [ id ] [ 0 ] ; if ( id [ id ] [ 0 ] > id ) id = id [ id ] [ 0 ] ; if ( id [ id ] [ 1 ] > id ) id = id [ id ] [ 1 ] ; if ( id [ id ] [ 1 ] < id ) id = id [ id ] [ 1 ] ; } printf ( string , id operator id operator 1 , id operator id operator 1 ) ; } return 0 ; }"


In [17]:
class Config:
    MAX_LEN = 256               # length of each input sample in tokens
    BATCH_SIZE = 32             # batch size
    LR = 0.001                  # learning rate
    VOCAB_SIZE = 256            # max. number of words in vocabulary
    EMBED_DIM = 128             # word embedding vector size
    NUM_HEAD = 8                # number of attention heads (BERT)
    FF_DIM = 128                # feedforward dimension (BERT)
    NUM_LAYERS = 1              # number of BERT module layers

config = Config()

In [19]:
# No special text filtering.
def custom_standardization(input_data):
    return input_data


from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Create TextVectorization layer.
def get_vectorize_layer(texts, vocab_size, max_seq):
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode='int',
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    # Create vocabulary over all texts:
    vectorize_layer.adapt(texts)
    # Insert special mask token in vocabulary:
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2:len(vocab)-1] + ['[mask]']
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer

vectorize_layer = get_vectorize_layer(
    train_data.tokens.values.tolist(),
    config.VOCAB_SIZE,
    config.MAX_LEN,
)

vocab = vectorize_layer.get_vocabulary()
print('vocabulary size:', len(vocab))
print('padding token vocab[0]: "%s"' % vocab[0])
print('OOV token vocab[1]: "%s"' % vocab[1])
print('mask token vocab[%d]: "%s"' % (len(vocab)-1, vocab[len(vocab)-1]))

vocabulary size: 203
padding token vocab[0]: ""
OOV token vocab[1]: "[UNK]"
mask token vocab[202]: "[mask]"


In [6]:
corpus = py_tokenize("train")
corpus_new = []
for code in corpus:
    corpus_new.extend(code.split('<EOL>'))
print(corpus_new[0:20])
print(len(corpus_new))

train: are done
['arr = [ <NUM_LIT:0> ] * <NUM_LIT:100> ', ' while True : ', ' try : ', ' x , y , s = map ( int , input ( ) . split ( "<STR_LIT:U+002C>" ) ) ', ' if s == <NUM_LIT:3> : ', ' if x <= <NUM_LIT:7> : ', ' arr [ <NUM_LIT:10> * y + x + <NUM_LIT:2> ] += <NUM_LIT:1> ', ' if x >= <NUM_LIT:2> : ', ' arr [ <NUM_LIT:10> * y + x - <NUM_LIT:2> ] += <NUM_LIT:1> ', ' if y <= <NUM_LIT:7> : ', ' arr [ <NUM_LIT:10> * y + x + <NUM_LIT:20> ] += <NUM_LIT:1> ', ' if y >= <NUM_LIT:2> : ', ' arr [ <NUM_LIT:10> * y + x - <NUM_LIT:20> ] += <NUM_LIT:1> ', ' if s >= <NUM_LIT:2> : ', ' if x != <NUM_LIT:9> and y != <NUM_LIT:9> : ', ' arr [ <NUM_LIT:10> * y + x + <NUM_LIT:11> ] += <NUM_LIT:1> ', ' if x != <NUM_LIT:9> and y != <NUM_LIT:0> : ', ' arr [ <NUM_LIT:10> * y + x - <NUM_LIT:9> ] += <NUM_LIT:1> ', ' if x != <NUM_LIT:0> and y != <NUM_LIT:0> : ', ' arr [ <NUM_LIT:10> * y + x - <NUM_LIT:11> ] += <NUM_LIT:1> ']
3815


In [7]:
full_corpus_tokens = ''.join(corpus).split()
print(full_corpus_tokens[0:100])

['arr', '=', '[', '<NUM_LIT:0>', ']', '*', '<NUM_LIT:100>', '<EOL>', 'while', 'True', ':', '<EOL>', 'try', ':', '<EOL>', 'x', ',', 'y', ',', 's', '=', 'map', '(', 'int', ',', 'input', '(', ')', '.', 'split', '(', '"<STR_LIT:U+002C>"', ')', ')', '<EOL>', 'if', 's', '==', '<NUM_LIT:3>', ':', '<EOL>', 'if', 'x', '<=', '<NUM_LIT:7>', ':', '<EOL>', 'arr', '[', '<NUM_LIT:10>', '*', 'y', '+', 'x', '+', '<NUM_LIT:2>', ']', '+=', '<NUM_LIT:1>', '<EOL>', 'if', 'x', '>=', '<NUM_LIT:2>', ':', '<EOL>', 'arr', '[', '<NUM_LIT:10>', '*', 'y', '+', 'x', '-', '<NUM_LIT:2>', ']', '+=', '<NUM_LIT:1>', '<EOL>', 'if', 'y', '<=', '<NUM_LIT:7>', ':', '<EOL>', 'arr', '[', '<NUM_LIT:10>', '*', 'y', '+', 'x', '+', '<NUM_LIT:20>', ']', '+=', '<NUM_LIT:1>', '<EOL>', 'if', 'y']


In [8]:
train_len = 3+1
text_sequences = []
for i in range(train_len,len(full_corpus_tokens)):
    seq = full_corpus_tokens[i-train_len:i]
    text_sequences.append(seq)
sequences = {}
count = 1
for i in range(len(full_corpus_tokens)):
    if full_corpus_tokens[i] not in sequences:
        sequences[full_corpus_tokens[i]] = count
        count += 1
print(text_sequences[:10])

[['arr', '=', '[', '<NUM_LIT:0>'], ['=', '[', '<NUM_LIT:0>', ']'], ['[', '<NUM_LIT:0>', ']', '*'], ['<NUM_LIT:0>', ']', '*', '<NUM_LIT:100>'], [']', '*', '<NUM_LIT:100>', '<EOL>'], ['*', '<NUM_LIT:100>', '<EOL>', 'while'], ['<NUM_LIT:100>', '<EOL>', 'while', 'True'], ['<EOL>', 'while', 'True', ':'], ['while', 'True', ':', '<EOL>'], ['True', ':', '<EOL>', 'try']]


In [9]:
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences) 

#Collecting some information   
vocabulary_size = len(tokenizer.word_counts)+1

n_sequences = np.empty([len(sequences),train_len], dtype='int32')
for i in range(len(sequences)):
    n_sequences[i] = sequences[i]

In [10]:
train_inputs = n_sequences[:,:-1]
train_targets = n_sequences[:,-1]
train_targets = to_categorical(train_targets, num_classes=vocabulary_size)
seq_len = train_inputs.shape[1]
train_inputs.shape
#print(train_targets[0])

(40546, 3)

In [11]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
#model = load_model("mymodel.h5")

model = Sequential()
model.add(Embedding(vocabulary_size, seq_len, input_length=seq_len))
model.add(LSTM(50,return_sequences=True))
model.add(LSTM(50))
model.add(Dense(50,activation='relu'))
model.add(Dense(vocabulary_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_inputs,train_targets,epochs=50,verbose=1)
model.save("mymodel_lstm.h5")

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 3, 3)              2322      
_________________________________________________________________
lstm (LSTM)                  (None, 3, 50)             10800     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dense_1 (Dense)              (None, 774)               39474     
Total params: 75,346
Trainable params: 75,346
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Ep

In [12]:
model.save('lstm_port.h5')
model = load_model('lstm_port.h5')

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

input_text = input().strip().lower()
while(input_text != 'stop'):
    encoded_text = tokenizer.texts_to_sequences([input_text])[0]
    pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
#     print(encoded_text, pad_encoded)
    predictions = []
    for i in (model.predict(pad_encoded)[0]).argsort()[-3:][::-1]:
        pred_word = tokenizer.index_word[i]
        predictions.append(pred_word)
    print("Next word suggestion:",predictions)
    input_text = input().strip().lower()

 if x


Next word suggestion: ['+', 'in', '==']


 for i


Next word suggestion: ['in', ',', '(']


 for i,


Next word suggestion: ['j', 'i', 'x']


 for i in


Next word suggestion: ['range', 'xrange', 'sys']


 for i in range


Next word suggestion: ['(', '-', '<num_lit:2>']


 import


Next word suggestion: ["'<str_lit>'", '[', 'input']


 import math


Next word suggestion: ['.', ',', ')']


 import math.


Next word suggestion: ["'<str_lit>'", '[', 'input']


 import sys


Next word suggestion: ['.', ',', ':']


 import sys,


Next word suggestion: ["'<str_lit>'", '[', 'input']


 return


Next word suggestion: ['[', 'int', 'sys']


 stop
