In [None]:
# Metrics-repository endpoint requested by get_sdp_data(); its response is parsed as CSV rows.
url_of_data = "http://snow.iiar.pwr.wroc.pl:8080/MetricsRepoCsv/repo?id=446&id=447&metrics=bugs"

In [None]:
#import javalang
import string
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.utils import np_utils
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import Dense, Dropout,LSTM
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from keras.layers import BatchNormalization, UpSampling2D
from keras import optimizers
from time import time
from datetime import datetime
import os
from bs4 import BeautifulSoup
from fastai import *
from fastai.text import *
import requests
import csv
from keras.preprocessing.text import text_to_word_sequence
import time

In [None]:
#Hyperparameters
seq_length = 10 #sequence length: number of words per input window in create_training_data
learning_rate = 0.01 #passed to the Adam optimizer in create_model
# NOTE(review): n_epochs and batch_size are not referenced by any visible cell -- confirm they are still needed.
n_epochs = 50
batch_size = 128

dropout_rate = 0.1 #fraction of inputs to drop

#LSTM
num_lstm_units = 256 #number of LSTM units in one layer

#reduce learning rate on plateau
#NOTE(review): none of the four settings below is read by a visible cell -- confirm whether this is still planned.
use_it = 0 #1 if yes, 0 if no
reduce_learning_rate_fraction = 0.2 # new learning rate = current_lr * fraction
n_patience = 10 #number of epochs after which need to decrease LR arises
# NOTE(review): this floor equals learning_rate above, so no reduction could take effect if it were enforced -- confirm the intended value.
min_learning_rate = 0.01 # minimum learning rate. LR will not be allowed to fall below this value.

In [None]:
#get data
def get_sdp_data(url_of_data):
  """Download the metrics table at ``url_of_data`` and return it as parsed CSV rows.

  Args:
    url_of_data: URL requested with ``requests.get``.

  Returns:
    A list of rows as produced by ``csv.reader``; each row is a list of
    strings. Nothing is assumed about how many rows the response holds, so
    callers that index into the result must check its length themselves.
  """
  response = requests.get(url_of_data)

  # The body is passed through BeautifulSoup and str() before being split
  # into lines for the CSV reader.
  data = BeautifulSoup(response.content, "html.parser")
  return list(csv.reader(str(data).splitlines()))

In [None]:
# Root URL of the source tree for each project name handled by get_code_text.
_SOURCE_ROOTS = {
  'poi': "http://svn.apache.org/repos/asf/poi/trunk/src/java/",
  'synapse': "http://svn.apache.org/repos/asf/synapse/trunk/java/modules/core/src/main/java/",
  'xalan': "http://svn.apache.org/repos/asf/xalan/java/trunk/src/",
  'xerces': "http://svn.apache.org/repos/asf/xerces/java/trunk/src/",
  'camel': "http://svn.apache.org/repos/asf/camel/trunk/camel-core/src/main/java/",
  'jEdit': "https://raw.githubusercontent.com/romuloceccon/jedit/master/",
  'lucene': "http://svn.apache.org/repos/asf/lucene/java/branches/lucene_3_0/src/java/",
}


def get_code_text(mylist, i, nameofproject):
  """Download the Java source file of the class described by row ``i``.

  Args:
    mylist: rows as returned by get_sdp_data; ``mylist[i][0]`` is a
      ';'-separated record whose second-to-last field is the dotted class name.
    i: index of the row to resolve.
    nameofproject: project name used to pick the source-tree root URL. An
      unknown name contributes an empty root.

  Returns:
    A tuple ``(codetext, classname, i)``: the response body as text, the
    third field of the record, and the row index that was passed in.
  """
  fields = mylist[i][0].split(';')
  class_path = fields[-2].replace('.', '/')
  # Keep only the part before "$" (e.g. "Outer$Inner" -> "Outer"). partition()
  # returns the whole path when there is no "$"; slicing with find() would
  # instead cut off the last character because find() yields -1.
  strlink = class_path.partition('$')[0]
  srclink = _SOURCE_ROOTS.get(nameofproject, "")

  codelink = srclink + strlink + "." + "java"
  response = requests.get(codelink)
  codetext = str(BeautifulSoup(response.content, "html.parser"))
  classname = fields[2]
  return codetext, classname, i

In [None]:
def create_vocab(codetext):
  """Tokenise ``codetext`` and build word <-> integer lookup tables.

  Args:
    codetext: the text to tokenise with ``text_to_word_sequence``
      (case is preserved, tokens are split on single spaces).

  Returns:
    A tuple ``(words, word_to_int, int_to_word, n_vocab)``: the token
    sequence, a dict mapping each distinct token to its rank in sorted
    order, the inverse dict, and the number of distinct tokens.
  """
  # The backslash is escaped explicitly; the filter set is unchanged and still
  # contains a literal "\" followed by "^".
  words = text_to_word_sequence(codetext, filters='#$%&()*+-/<=>@\\^_`{|}~', lower=False, split=' ')
  vocab = sorted(set(words))
  n_vocab = len(vocab)
  word_to_int = {w: i for i, w in enumerate(vocab)}
  int_to_word = dict(enumerate(vocab))
  return words, word_to_int, int_to_word, n_vocab

In [None]:
def create_training_data(words, word_to_int, int_to_word):
  """Slide a window over ``words`` and encode (input window, next word) pairs.

  The window width is the module-level ``seq_length``.

  Args:
    words: token sequence.
    word_to_int: dict mapping every token to its integer id.
    int_to_word: accepted for symmetry with create_vocab; not used here.

  Returns:
    A tuple ``(train_X, train_Y)``: a list of integer-id windows and the
    list of integer ids of the token following each window.
  """
  train_X, train_Y = [], []
  for start in range(len(words) - seq_length):
    stop = start + seq_length
    train_X.append([word_to_int[token] for token in words[start:stop]])
    train_Y.append(word_to_int[words[stop]])

  print("Total Sequences: ", len(train_X))
  return train_X, train_Y

In [None]:
def create_model():
  """Build and compile the two-layer LSTM classifier.

  Reads the module-level settings ``num_lstm_units``, ``dropout_rate`` and
  ``learning_rate``, and sizes the input/output layers from the shapes of
  ``train_X_norm`` and ``train_Y_oh``.
  NOTE(review): train_X_norm and train_Y_oh are not defined in any visible
  cell; they must exist before this function is called.

  Returns:
    The compiled ``Sequential`` model (categorical cross-entropy loss,
    Adam optimizer, accuracy metric).
  """
  model = Sequential()
  # return_sequences=True because another LSTM layer consumes the output.
  model.add(LSTM(num_lstm_units,
                 input_shape=(train_X_norm.shape[1], train_X_norm.shape[2]),
                 return_sequences=True))
  model.add(Dropout(dropout_rate))
  model.add(LSTM(num_lstm_units))
  model.add(Dense(64, activation = 'relu'))
  # The layer must be handed to model.add(); merely constructing it leaves
  # the network unchanged.
  model.add(BatchNormalization())
  model.add(Dense(train_Y_oh.shape[1], activation='softmax'))

  #ADAM OPTIMIZER
  my_adam = optimizers.Adam(lr=learning_rate)
  model.compile(loss='categorical_crossentropy', optimizer=my_adam, metrics=['accuracy'])
  return model

In [None]:
# Fetch the metrics rows. Row 0 is the header, so the project name is taken
# from the first field of the first data row.
mylist = get_sdp_data(url_of_data)
nameofproject = mylist[1][0].split(';')[0]

In [None]:
# Preview the header and the first few records rather than echoing every row.
mylist[:5]

[['Project', 'Version', 'Class;bugs'],
 ['jEdit;4.2;org.gjt.sp.jedit.options.AbbrevsOptionPane$ActionHandler;0'],
 ['jEdit;4.2;org.gjt.sp.jedit.gui.ViewRegisters$ListHandler;0'],
 ['jEdit;4.2;org.gjt.sp.jedit.options.ToolBarOptionPane;0'],
 ['jEdit;4.2;bsh.BSHImportDeclaration;0'],
 ['jEdit;4.2;gnu.regexp.RETokenLookAhead;0'],
 ['jEdit;4.2;org.gjt.sp.jedit.gui.DockableWindowManager$6;0'],
 ['jEdit;4.2;org.objectweb.asm.ByteVector;0'],
 ['jEdit;4.2;org.gjt.sp.jedit.search.HyperSearchResults$ResultVisitor;0'],
 ['jEdit;4.2;org.gjt.sp.jedit.browser.BrowserView$1;0'],
 ['jEdit;4.2;org.gjt.sp.jedit.jEdit$4;0'],
 ['jEdit;4.2;org.gjt.sp.jedit.browser.VFSBrowser;1'],
 ['jEdit;4.2;org.gjt.sp.jedit.pluginmgr.PluginList;0'],
 ['jEdit;4.2;org.gjt.sp.jedit.options.BrowserColorsOptionPane;0'],
 ['jEdit;4.2;org.gjt.sp.jedit.pluginmgr.PluginManager$ActionHandler;0'],
 ['jEdit;4.2;org.gjt.sp.jedit.pluginmgr.Roster;1'],
 ['jEdit;4.2;org.gjt.sp.jedit.proto.jeditresource.PluginResURLConnection;0'],
 ['jEd

In [None]:
# Build the language-model corpus from the first TARGET_FILES classes whose
# source file can be downloaded.
TARGET_FILES = 20       # stop once this many sources have been collected
MAX_ROWS_TO_TRY = 100   # exclusive upper bound on the row index that is tried

code_chunks = []
count = 0
# Never index past the end of mylist when it holds fewer rows than the cap.
for i in range(1, min(MAX_ROWS_TO_TRY, len(mylist))):
  print(count, "/", TARGET_FILES)
  codetext, classname, indexindata = get_code_text(mylist, i, nameofproject)
  # A missing file comes back as this exact body; skip it.
  # NOTE(review): only a plain-text 404 body matches; an HTML error page from
  # another host would be kept as if it were source code -- verify per project.
  if codetext == '404: Not Found\n':
    continue
  code_chunks.append(codetext)
  count = count + 1
  if count == TARGET_FILES:
    break

# Join once at the end instead of growing a string inside the loop.
full_codetext = "".join(code_chunks)
print(count)

0 / 20
1 / 20
1 / 20
1 / 20
1 / 20
1 / 20
2 / 20
2 / 20
3 / 20
4 / 20
5 / 20
5 / 20
5 / 20
5 / 20
6 / 20
6 / 20
6 / 20
7 / 20
8 / 20
8 / 20
8 / 20
9 / 20
9 / 20
10 / 20
10 / 20
10 / 20
10 / 20
10 / 20
11 / 20
11 / 20
11 / 20
12 / 20
13 / 20
14 / 20
14 / 20
15 / 20
15 / 20
16 / 20
17 / 20
17 / 20
18 / 20
19 / 20
19 / 20
19 / 20
20


In [None]:
# Show only the beginning of the corpus; printing all of it floods the output.
print(full_codetext[:1000])

/*
 * AbbrevsOptionPane.java - Abbrevs options panel
 * :tabSize=4:indentSize=4:noTabs=false:
 * :folding=explicit:collapseFolds=1:
 *
 * Copyright (C) 1999, 2000, 2001, 2002 Slava Pestov
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

package org.gjt.sp.jedit.options;

//{{{ Imports
import javax.swing.border.EmptyBorder;
import javax.swing

In [None]:
# Character-level samples: each row pairs a window of seq_len characters with
# the single character that follows it.
seq_len = 100
# Collect the rows first and build the frame in one call; appending to a
# DataFrame row by row copies the whole frame on every iteration.
records = [
  [full_codetext[i:i + seq_len], full_codetext[i + seq_len]]
  for i in range(len(full_codetext) - seq_len)
]
df = pd.DataFrame(records, columns=['text', 'target'])

In [None]:
# Label the columns, shuffle the rows, then hold out the leading `cut` rows
# for validation and wrap both parts in a language-model data bunch.
df.columns = ['text', 'target']
valid_pct = 0.05  # fraction of rows reserved for validation
df = df.iloc[np.random.permutation(len(df))]
cut = 1 + int(valid_pct * len(df))
valid_df = df[:cut]
train_df = df[cut:]
data_lm = TextLMDataBunch.from_df('data', train_df, valid_df, text_cols='text')

In [None]:
# NOTE(review): this cell repeats the split and data-bunch construction of the
# previous cell without reshuffling, so it rebuilds the same train/validation
# partition -- confirm whether it can be removed.
valid_pct = 0.05 #validation percent
cut = int(valid_pct * len(df)) + 1
train_df, valid_df = df[cut:], df[:cut]
data_lm = TextLMDataBunch.from_df('data', train_df, valid_df, text_cols='text')

In [None]:
# Optimiser settings for the second training pass.
wd = 1e-7
lr = 0.001
lrs = lr

# Start from the pretrained AWD-LSTM, unfreeze every layer, run one epoch with
# the one-cycle policy and then one more epoch with a plain fit.
learn = language_model_learner(data_lm, arch=AWD_LSTM, pretrained=True, drop_mult=0.7)
learn.unfreeze()
learn.fit_one_cycle(1, 1e-3)
learn.fit(1, lrs, wd)

epoch,train_loss,valid_loss,accuracy,time
0,1.237784,1.061444,0.807062,23:29


epoch,train_loss,valid_loss,accuracy,time
0,0.98664,0.865374,0.832514,23:32


In [None]:
# Sanity check: let the model continue the prompt 'int' (n_words=5).
learn.predict('int', n_words=5, temperature=0.8 )

'int have a public constructor .'

In [None]:
# Inspect the last parameter tensor of the trained model.
list(learn.model.parameters())[-1]

Parameter containing:
tensor([ 6.1178, -6.3831,  0.5089,  ..., -2.6316, -2.9272, -2.8727],
       device='cuda:0', requires_grad=True)

In [None]:
# Inspect the 9th-from-last submodule of the model.
list(learn.model.modules())[-9]

LSTM(1152, 400, batch_first=True)

In [None]:
from fastai.callbacks.hooks import *
# Attach an output hook to the 5th-from-last submodule, then run a prediction;
# the hook must be registered before predict() for hook.stored to be filled.
# NOTE(review): the hook is never removed in this cell -- confirm that is intended.
nn_module = list(learn.model.modules())[-5]
hook = hook_output(nn_module)
learn.predict('System.out.println', n_words=4, temperature=0.8 )

'System.out.println ( " \t <'

In [None]:
# Take element [0][0] of the captured output as a NumPy array
# (presumably the first batch item at the first position -- TODO confirm).
hook.stored.cpu().numpy()[0][0]

array([-9.479024e-03, -3.428975e-02, -4.614540e-03, -4.043110e-05, ..., -2.775651e-02,  5.020733e-03,  5.038312e-02,
        2.584061e-03], dtype=float32)

In [None]:
# Length of the all-zero placeholder stored for classes whose source is missing.
EMBEDDING_SIZE = 1152

indexesindata = []
classnames = []
codevectorlist = []
bugslist = []

# Register a single hook for the whole run. A hook stays attached to the module
# until it is removed, so creating one per class would leave every earlier hook
# firing on each forward pass.
nn_module = list(learn.model.modules())[-5]
hook = hook_output(nn_module)
try:
  for i in range(1, len(mylist)):
    codetext, classname, indexindata = get_code_text(mylist, i, nameofproject)
    nbugs = mylist[i][0].split(';')[-1]  # last field of the record, kept as text

    found = codetext != '404: Not Found\n'
    if found:
      # predict() runs the model on the source text; the hook captures the
      # hooked module's most recent output.
      learn.predict(codetext, n_words=5, temperature=0.8)
      codevector = hook.stored.cpu().numpy()[0][0]
    else:
      codevector = np.zeros(EMBEDDING_SIZE)

    indexesindata.append(indexindata)
    classnames.append(classname)
    codevectorlist.append(codevector)
    bugslist.append(nbugs)
    if found:
      print(i, "/", len(mylist), " COMPLETED")
finally:
  # Detach the hook even if a download or prediction fails midway.
  hook.remove()




1 / 1938  COMPLETED
6 / 1938  COMPLETED
8 / 1938  COMPLETED
9 / 1938  COMPLETED


In [None]:
# One row per class: its name, its extracted vector and its defect field.
finaldf = pd.DataFrame(
  data=list(zip(classnames, codevectorlist, bugslist)),
  columns=['Classnames', 'Vector', 'Defects'],
)

In [None]:
# Spot-check one component (index 500) of the first class's vector.
print(finaldf.Vector[0][500])

In [None]:
# Expand the per-class vectors into one column per vector component.
df3 = pd.DataFrame(finaldf['Vector'].values.tolist())

In [None]:
# Preview the first rows of the expanded vector columns.
df3.head()

In [None]:
# Place the expanded vector columns next to the class names and defect fields;
# the original list-valued 'Vector' column is left out of the result.
superfinal = pd.concat([df3, finaldf.drop(columns=['Vector'])], axis=1)

In [None]:
# Display the combined feature table.
superfinal

In [None]:
from google.colab import files

# Write the feature table to a CSV file named after the project.
filename = "".join([nameofproject, '_super_df.csv'])
superfinal.to_csv(filename)

In [None]:
# Hand the CSV file named by `filename` to google.colab's files.download.
files.download(filename)

In [None]:
#from sklearn.ensemble import RandomForestClassifier

In [None]:
#clf = RandomForestClassifier(max_depth=100, random_state=0)

In [None]:
#from sklearn.model_selection import train_test_split

In [None]:
# Number of rows in the combined feature table.
len(superfinal)

In [None]:
# Boolean mask of the rows whose Defects field is the string '1'
# (the field is kept as text when it is split out of the CSV record).
print(superfinal.Defects == '1')

In [None]:
#x_train1, x_test, y_train1, y_test= train_test_split(superfinal.drop(['Classnames'], axis = 1), (superfinal.Defects == '1'), test_size = .2,random_state=12)

In [None]:
#clf.fit(x_train1, y_train1)

In [None]:
#clf.predict(x_test)

In [None]:
#from sklearn.metrics import accuracy_score

In [None]:
#from sklearn.metrics import *


In [None]:
#f1_score(y_test, clf.predict(x_test))

In [None]:
"""from google.colab import files

filename = nameofproject + '_df.csv'
finaldf.to_csv(filename)"""

In [None]:
#time.sleep(10)

In [None]:
#time.sleep(1)

In [None]:
#files.download(filename)

In [None]:
#ellen = pd.Series(finaldf.Vector)

In [None]:
#X = np.stack(ellen)

In [None]:
#np.stack(finaldf.Defects)

In [None]:
#len(X)

In [None]:
"""autoencoder = Sequential()
autoencoder.add(Dense(1024, input_dim=1152, activation='relu'))
autoencoder.add(Dense(16, activation='relu'))
autoencoder.add(Dense(1152, activation='sigmoid'))

my_adam = optimizers.Adam(lr=learning_rate)
autoencoder.compile(loss='categorical_crossentropy', optimizer=my_adam, metrics=['accuracy'])"""

In [None]:
"""count = 0
for i in range(len(X)):
  if (sum(X[i]) == 0):
    count+=1
print(count)"""

In [None]:
#X[0].shape

In [None]:
#y = np.stack(finaldf.Defects)

In [None]:
"""from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
# Train the model on training data
rf.fit(X, y);"""

In [None]:
#pred = rf.predict(X)

In [None]:
#yvalues= np.array(y).astype(None)

In [None]:
#correct = (np.floor(pred) == yvalues)

In [None]:
"""countofzero = 0
for i in range(len(yvalues)):
  if (yvalues[i] == 0):
    countofzero+=1
print(countofzero/len(yvalues))"""

In [None]:
#correct

In [None]:
"""counthaha = 0
for i in range(len(correct)):
  if (correct[i] == True):
    counthaha+=1

print(counthaha/len(correct))"""