In [1]:
import os
import numpy as np
import re
import requests
import tarfile
import shutil
import glob

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle

In [2]:
np.__version__

'1.19.5'

In [3]:
tf.__version__

'2.4.0'

In [4]:
!module list


Currently Loaded Modules:
  1) anaconda3/5.1.0-gcc/8.3.1     4) cudnn/8.0.0.180-11.0-linux-x64-gcc/7.5.0
  2) anaconda3/2019.10-gcc/8.3.1   5) openjdk/1.8.0_222-b10-gcc/8.3.1
  3) cuda/11.0.3-gcc/7.5.0         6) hadoop/3.2.1-gcc/8.3.1

 



In [5]:
file_name = "Project_CodeNet_LangClass.tar.gz"
data_url = f"https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/{file_name}"

# Download tar archive to local disk
with open(file_name, "wb") as f:
    f.write(requests.get(data_url).content)
    
# Extract contents of archive to local disk
if os.path.exists("data"):
    shutil.rmtree("data")    
with tarfile.open(file_name) as tfile:
    tfile.extractall()

In [6]:
!ls data data/train

data:
test  train

data/train:
C  C#  C++  D  Haskell	Java  JavaScript  PHP  Python  Rust


In [7]:
langs = [
  "C",
  "C#",
  "C++",
  "D",
  "Haskell",
  "Java",
  "JavaScript",
  "PHP",
  "Python",
  "Rust"
]

In [8]:
file_list = glob.glob(os.path.join(os.getcwd(),"data/train/Python","*.*"))
corpus = []

for file_path in file_list:
    with open(file_path) as f_input:
        text = re.sub(r'#.*\n?', '', f_input.read(), flags=re.MULTILINE)
        corpus.append(text)

print(corpus[0:2])

["mass = [[0 for p in xrange(14)] for q in xrange(14)]\nwhile True:\n\ttry:\n\t\tx,y,size = map(int,raw_input().split(','))\n\t\tx += 2 ; y += 2\n\t\tif   size == 1:\n\t\t\tmass[x-2][y-2]+=0;mass[x-1][y-2]+=0;mass[x][y-2]+=0;mass[x+1][y-2]+=0;mass[x+2][y-2]+=0\n\t\t\tmass[x-2][y-1]+=0;mass[x-1][y-1]+=0;mass[x][y-1]+=1;mass[x+1][y-1]+=0;mass[x+2][y-1]+=0\n\t\t\tmass[x-2][y]  +=0;mass[x-1][y]  +=1;mass[x][y]  +=1;mass[x+1][y]  +=1;mass[x+2][y]  +=0\n\t\t\tmass[x-2][y+1]+=0;mass[x-1][y+1]+=0;mass[x][y+1]+=1;mass[x+1][y+1]+=0;mass[x+2][y+1]+=0\n\t\t\tmass[x-2][y+2]+=0;mass[x-1][y+2]+=0;mass[x][y+2]+=0;mass[x+1][y+2]+=0;mass[x+2][y+2]+=0\n\t\telif size == 2:\n\t\t\tmass[x-2][y-2]+=0;mass[x-1][y-2]+=0;mass[x][y-2]+=0;mass[x+1][y-2]+=0;mass[x+2][y-2]+=0\n\t\t\tmass[x-2][y-1]+=0;mass[x-1][y-1]+=1;mass[x][y-1]+=1;mass[x+1][y-1]+=1;mass[x+2][y-1]+=0\n\t\t\tmass[x-2][y]  +=0;mass[x-1][y]  +=1;mass[x][y]  +=1;mass[x+1][y]  +=1;mass[x+2][y]  +=0\n\t\t\tmass[x-2][y+1]+=0;mass[x-1][y+1]+=1;mass[x][y+

In [9]:
print(len(corpus))

90


In [10]:
lang_full_corpus = '\n'.join(corpus)

In [11]:
len(lang_full_corpus)

116635

In [12]:
#lang_full_corpus = lang_full_corpus.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('\t','').replace('  ',' ')
lang_full_corpus = lang_full_corpus.replace('\n', '').replace('\ufeff', '').replace('  ',' ')

In [13]:
len(lang_full_corpus)

98578

In [14]:
lang_full_corpus[0:360]

"mass = [[0 for p in xrange(14)] for q in xrange(14)]while True:\ttry:\t\tx,y,size = map(int,raw_input().split(','))\t\tx += 2 ; y += 2\t\tif  size == 1:\t\t\tmass[x-2][y-2]+=0;mass[x-1][y-2]+=0;mass[x][y-2]+=0;mass[x+1][y-2]+=0;mass[x+2][y-2]+=0\t\t\tmass[x-2][y-1]+=0;mass[x-1][y-1]+=0;mass[x][y-1]+=1;mass[x+1][y-1]+=0;mass[x+2][y-1]+=0\t\t\tmass[x-2][y] +=0;mass[x-1][y] +="

In [15]:
from string import digits,punctuation
remove_digits = str.maketrans('', '', digits)
# remove_punt = str.maketrans(punctuation, ' '*len(punctuation)) 
res = lang_full_corpus.translate(remove_digits)
# res = res.translate(remove_punt)
res[0:360]

"mass = [[ for p in xrange()] for q in xrange()]while True:\ttry:\t\tx,y,size = map(int,raw_input().split(','))\t\tx +=  ; y += \t\tif  size == :\t\t\tmass[x-][y-]+=;mass[x-][y-]+=;mass[x][y-]+=;mass[x+][y-]+=;mass[x+][y-]+=\t\t\tmass[x-][y-]+=;mass[x-][y-]+=;mass[x][y-]+=;mass[x+][y-]+=;mass[x+][y-]+=\t\t\tmass[x-][y] +=;mass[x-][y] +=;mass[x][y] +=;mass[x+][y] +=;mass[x+]["

In [16]:
# import nltk
# allWords = nltk.tokenize.word_tokenize(lang_full_corpus)
# allWordDist = nltk.FreqDist(w.lower() for w in allWords)
# mostCommon= allWordDist.most_common(100)

In [17]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([res])

pickle.dump(tokenizer, open('tokenizer1.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([res])[0]
sequence_data[:10]

[36, 9, 11, 8, 73, 9, 94, 8, 73, 57]

In [18]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

764


In [19]:
sequences = []

for i in range(1, len(sequence_data)):
    words = sequence_data[i-1:i+1]
    sequences.append(words)
    
print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  15466


array([[36,  9],
       [ 9, 11],
       [11,  8],
       [ 8, 73],
       [73,  9],
       [ 9, 94],
       [94,  8],
       [ 8, 73],
       [73, 57],
       [57, 39]])

In [20]:
X = []
y = []

for i in sequences:
    X.append(i[0])
    y.append(i[1])
    
X = np.array(X)
y = np.array(y)

In [21]:
print("The Data is: ", X[:5])
print("The responses are: ", y[:5])

The Data is:  [36  9 11  8 73]
The responses are:  [ 9 11  8 73  9]


In [22]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [23]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 10)             7640      
_________________________________________________________________
lstm (LSTM)                  (None, 1, 1000)           4044000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 1000)              1001000   
_________________________________________________________________
dense_1 (Dense)              (None, 764)               764764    
Total params: 13,821,404
Trainable params: 13,821,404
Non-trainable params: 0
_________________________________________________________________


In [25]:
# from tensorflow.keras.utils.vis_utils import plot_model

# plot_model(model, to_file='model.png', show_layer_names=True)

In [26]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard

checkpoint = ModelCheckpoint("nextword1.h5", monitor='loss', verbose=1,
    save_best_only=True, mode='auto')

reduce = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=0.0001, verbose = 1)

logdir='logsnextword1'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

In [27]:
model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=0.001))

In [28]:
model.fit(X, y, epochs=150, batch_size=64, callbacks=[checkpoint, reduce, tensorboard_Visualization])

Epoch 1/150

Epoch 00001: loss improved from inf to 4.91762, saving model to nextword1.h5
Epoch 2/150

Epoch 00002: loss improved from 4.91762 to 4.49342, saving model to nextword1.h5
Epoch 3/150

Epoch 00003: loss improved from 4.49342 to 4.19296, saving model to nextword1.h5
Epoch 4/150

Epoch 00004: loss improved from 4.19296 to 4.00627, saving model to nextword1.h5
Epoch 5/150

Epoch 00005: loss improved from 4.00627 to 3.85152, saving model to nextword1.h5
Epoch 6/150

Epoch 00006: loss improved from 3.85152 to 3.73318, saving model to nextword1.h5
Epoch 7/150

Epoch 00007: loss improved from 3.73318 to 3.62035, saving model to nextword1.h5
Epoch 8/150

Epoch 00008: loss improved from 3.62035 to 3.53317, saving model to nextword1.h5
Epoch 9/150

Epoch 00009: loss improved from 3.53317 to 3.46220, saving model to nextword1.h5
Epoch 10/150

Epoch 00010: loss improved from 3.46220 to 3.39668, saving model to nextword1.h5
Epoch 11/150

Epoch 00011: loss improved from 3.39668 to 3.3341

<tensorflow.python.keras.callbacks.History at 0x14b95c7788d0>

In [29]:
from tensorflow.keras.models import load_model
model = load_model('nextword1.h5')
tokenizer = pickle.load(open('tokenizer1.pkl', 'rb'))

In [30]:
def Predict_Next_Words(model, tokenizer, text):
    """
        In this function we are using the tokenizer and models trained
        and we are creating the sequence of the text entered and then
        using our model to predict and return the the predicted word.
    
    """
    for i in range(3):
        sequence = tokenizer.texts_to_sequences([text])[0]
        sequence = np.array(sequence)
        
        preds = model.predict_classes(sequence)
#         print(preds)
        predicted_word = ""
        
        for key, value in tokenizer.word_index.items():
            if value == preds:
                predicted_word = key
                break
        
        print(predicted_word)
        return predicted_word

In [31]:
while(True):

    text = input("Enter your line: ")
    
    if text == "stop the script":
        print("Ending The Program.....")
        break
    
    else:
        try:
            text = text.split(" ")
            text = text[-1]

            text = ''.join(text)
            Predict_Next_Words(model, tokenizer, text)
            
        except:
            continue

Enter your line:  for i in




range


Enter your line:  math.


sqrt


Enter your line:  import m


m


Enter your line:  import


sysimport


Enter your line:  math.pi
Enter your line:  print


f


Enter your line:  if


x


Enter your line:  if x==0


y


Enter your line:  return


false


Enter your line:  while


true


Enter your line:  while(x
Enter your line:  stop the script


Ending The Program.....
