In [1]:
import os
import numpy as np
import re
import requests
import tarfile
import shutil
import glob

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle

In [2]:
file_name = "Project_CodeNet_LangClass.tar.gz"
data_url = f"https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/{file_name}"

# Download tar archive to local disk
with open(file_name, "wb") as f:
    f.write(requests.get(data_url).content)
    
# Extract contents of archive to local disk
if os.path.exists("data"):
    shutil.rmtree("data")    
with tarfile.open(file_name) as tf:
    tf.extractall()

In [3]:
!ls data data/train

data:
test  train

data/train:
C  C#  C++  D  Haskell	Java  JavaScript  PHP  Python  Rust


In [4]:
langs = [
  "C",
  "C#",
  "C++",
  "D",
  "Haskell",
  "Java",
  "JavaScript",
  "PHP",
  "Python",
  "Rust"
]

In [5]:
file_list = glob.glob(os.path.join(os.getcwd(),"data/train/Python","*.*"))
corpus = []

for file_path in file_list:
    with open(file_path) as f_input:
        text = re.sub(r'#.*\n?', '', f_input.read(), flags=re.MULTILINE)
        corpus.append(text)

print(corpus[0:2])

["def check(x, y):\n    return 0 <= x <= 9 and 0 <= y <= 9\n\n\ndef small(x, y, area):\n    if check(x+1, y):\n        area[x+1][y] += 1\n    if check(x, y+1):\n        area[x][y+1] += 1\n    if check(x-1, y):\n        area[x-1][y] += 1\n    if check(x, y-1):\n        area[x][y-1] += 1\n    area[x][y] += 1\n    return area\n\n\ndef mediam(x, y, area):\n    area = small(x, y, area)\n    if check(x+1, y+1):\n        area[x+1][y+1] += 1\n    if check(x+1, y-1):\n        area[x+1][y-1] += 1\n    if check(x-1, y+1):\n        area[x-1][y+1] += 1\n    if check(x-1, y-1):\n        area[x-1][y-1] += 1\n    return area\n\n\ndef large(x, y, area):\n    area = mediam(x, y, area)\n    if check(x+2, y):\n        area[x+2][y] += 1\n    if check(x, y+2):\n        area[x][y+2] += 1\n    if check(x-2, y):\n        area[x-2][y] += 1\n    if check(x, y-2):\n        area[x][y-2] += 1\n    return area\n\narea = [[0 for i in range(10)] for j in range(10)]\n\nwhile True:\n    try:\n        x, y, s = map(int, 

In [6]:
print(len(corpus))

90


In [7]:
lang_full_corpus = '\n'.join(corpus)

In [8]:
len(lang_full_corpus)

116635

In [9]:
#lang_full_corpus = lang_full_corpus.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('\t','').replace('  ',' ')
lang_full_corpus = lang_full_corpus.replace('\n', '').replace('\ufeff', '').replace('  ',' ')

In [10]:
len(lang_full_corpus)

98578

In [11]:
lang_full_corpus[0:360]

'def check(x, y):  return 0 <= x <= 9 and 0 <= y <= 9def small(x, y, area):  if check(x+1, y):    area[x+1][y] += 1  if check(x, y+1):    area[x][y+1] += 1  if check(x-1, y):    area[x-1][y] += 1  if check(x, y-1):    area[x][y-1] += 1  area[x][y] += 1  return areadef mediam(x, y, area):  area = small(x, y, area)  if check(x+1, y+1):    area[x+1][y+1] += 1  i'

In [12]:
from string import digits,punctuation
remove_digits = str.maketrans('', '', digits)
# remove_punt = str.maketrans(punctuation, ' '*len(punctuation)) 
res = lang_full_corpus.translate(remove_digits)
# res = res.translate(remove_punt)
res[0:360]

'def check(x, y):  return  <= x <=  and  <= y <= def small(x, y, area):  if check(x+, y):    area[x+][y] +=   if check(x, y+):    area[x][y+] +=   if check(x-, y):    area[x-][y] +=   if check(x, y-):    area[x][y-] +=   area[x][y] +=   return areadef mediam(x, y, area):  area = small(x, y, area)  if check(x+, y+):    area[x+][y+] +=   if check(x+, y-):    ar'

In [13]:
# import nltk
# allWords = nltk.tokenize.word_tokenize(lang_full_corpus)
# allWordDist = nltk.FreqDist(w.lower() for w in allWords)
# mostCommon= allWordDist.most_common(100)

In [14]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([res])

pickle.dump(tokenizer, open('tokenizer1.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([res])[0]
sequence_data[:10]

[14, 76, 1, 2, 12, 1, 25, 2, 14, 73]

In [15]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

765


In [16]:
sequences = []

for i in range(1, len(sequence_data)):
    words = sequence_data[i-1:i+1]
    sequences.append(words)
    
print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  15465


array([[14, 76],
       [76,  1],
       [ 1,  2],
       [ 2, 12],
       [12,  1],
       [ 1, 25],
       [25,  2],
       [ 2, 14],
       [14, 73],
       [73,  1]])

In [17]:
X = []
y = []

for i in sequences:
    X.append(i[0])
    y.append(i[1])
    
X = np.array(X)
y = np.array(y)

In [18]:
print("The Data is: ", X[:5])
print("The responses are: ", y[:5])

The Data is:  [14 76  1  2 12]
The responses are:  [76  1  2 12  1]


In [19]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [21]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [22]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             7650      
_________________________________________________________________
lstm_2 (LSTM)                (None, 1, 1000)           4044000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dense_3 (Dense)              (None, 765)               765765    
Total params: 13,822,415
Trainable params: 13,822,415
Non-trainable params: 0
_________________________________________________________________


In [26]:
# from tensorflow.keras.utils.vis_utils import plot_model

# plot_model(model, to_file='model.png', show_layer_names=True)

In [27]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard

checkpoint = ModelCheckpoint("nextword1.h5", monitor='loss', verbose=1,
    save_best_only=True, mode='auto')

reduce = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=0.0001, verbose = 1)

logdir='logsnextword1'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

In [28]:
model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=0.001))

In [29]:
model.fit(X, y, epochs=150, batch_size=64, callbacks=[checkpoint, reduce, tensorboard_Visualization])

Epoch 1/150

Epoch 00001: loss improved from inf to 4.91015, saving model to nextword1.h5
Epoch 2/150

Epoch 00002: loss improved from 4.91015 to 4.54270, saving model to nextword1.h5
Epoch 3/150

Epoch 00003: loss improved from 4.54270 to 4.23619, saving model to nextword1.h5
Epoch 4/150

Epoch 00004: loss improved from 4.23619 to 4.03522, saving model to nextword1.h5
Epoch 5/150

Epoch 00005: loss improved from 4.03522 to 3.84579, saving model to nextword1.h5
Epoch 6/150

Epoch 00006: loss improved from 3.84579 to 3.69185, saving model to nextword1.h5
Epoch 7/150

Epoch 00007: loss improved from 3.69185 to 3.56718, saving model to nextword1.h5
Epoch 8/150

Epoch 00008: loss improved from 3.56718 to 3.47351, saving model to nextword1.h5
Epoch 9/150

Epoch 00009: loss improved from 3.47351 to 3.39259, saving model to nextword1.h5
Epoch 10/150

Epoch 00010: loss improved from 3.39259 to 3.32553, saving model to nextword1.h5
Epoch 11/150

Epoch 00011: loss improved from 3.32553 to 3.2587

<tensorflow.python.keras.callbacks.History at 0x145a6d1cbc90>

In [32]:
from tensorflow.keras.models import load_model
model = load_model('nextword1.h5')
tokenizer = pickle.load(open('tokenizer1.pkl', 'rb'))

In [33]:
def Predict_Next_Words(model, tokenizer, text):
    """
        In this function we are using the tokenizer and models trained
        and we are creating the sequence of the text entered and then
        using our model to predict and return the the predicted word.
    
    """
    for i in range(3):
        sequence = tokenizer.texts_to_sequences([text])[0]
        sequence = np.array(sequence)
        
        preds = model.predict_classes(sequence)
#         print(preds)
        predicted_word = ""
        
        for key, value in tokenizer.word_index.items():
            if value == preds:
                predicted_word = key
                break
        
        print(predicted_word)
        return predicted_word

In [None]:
while(True):

    text = input("Enter your line: ")
    
    if text == "stop the script":
        print("Ending The Program.....")
        break
    
    else:
        try:
            text = text.split(" ")
            text = text[-1]

            text = ''.join(text)
            Predict_Next_Words(model, tokenizer, text)
            
        except:
            continue

Enter your line:  raw_input




Enter your line:  input


split


Enter your line:  map


int


Enter your line:  =
Enter your line:  if


x


Enter your line:  else


return


Enter your line:  for


i


Enter your line:  range


for


Enter your line:  import


sysdef


Enter your line:  os
Enter your line:  math


sqrt


Enter your line:  str.


data


Enter your line:  math.


sqrt


Enter your line:  range(


for


Enter your line:  for i in


range


Enter your line:  range(


for


Enter your line:  [[
Enter your line:  [
Enter your line:  list


points


Enter your line:  type
Enter your line:  list(


points


Enter your line:  tuple(


is


Enter your line:  digit


data


Enter your line:  if x==0


y


Enter your line:  if x==0 and 
Enter your line:  if x==0:


y


Enter your line:  numpy
Enter your line:  import os
Enter your line:  import numpy as np
