In [125]:
import json
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout,Conv1D, GlobalMaxPooling1D

from keras.utils import to_categorical

In [98]:
def load_inferred_bugs(dataset_path):
    bug_data = []
    for repo in os.listdir(dataset_path):
        repo_path = os.path.join(dataset_path, repo)
        if os.path.isdir(repo_path):
            for bug_id in os.listdir(repo_path):
                bug_path = os.path.join(repo_path, bug_id)
                bug_json_path = os.path.join(bug_path, 'bug.json')
                method_before_path = os.path.join(bug_path, 'method_before.txt')

                if os.path.exists(bug_json_path,) and os.path.exists(method_before_path):
                    with open(bug_json_path, 'r',encoding='utf-8') as bug_file, open(method_before_path, 'r',encoding='utf-8') as method_file:
                        bug_info = json.load(bug_file)
                        method_before = method_file.read()
                        bug_type = bug_info.get("bug_type","unknown")
                        bug_class = bug_info.get("bug_class","unknown")
                        kind = bug_info.get("kind","unknown")
                        visibility = bug_info.get("visibility","unknown")
                        severity = bug_info.get("severity","unknown")
                        bug_data.append({'bug_class': bug_class, 'kind':kind,'visibility':visibility, 'severity': severity, 'method_before': method_before, 'bug_type':bug_type})
    return bug_data

In [99]:
dataset_path = r"C:\Users\simra\Downloads\data_bug_detection"
bug_data = load_inferred_bugs(dataset_path)

In [100]:
#convert to csv
def convert_json_to_csv_with_pandas(bug_data, csv_file_path):
    df = pd.DataFrame(bug_data)
    df.to_csv(csv_file_path, index=False)

# Example usage
convert_json_to_csv_with_pandas(bug_data, "output_bugs_pandas.csv")

In [101]:
df = pd.read_csv(r"C:\Users\simra\Downloads\output_bugs_pandas.csv")
df.head()

Unnamed: 0,bug_class,kind,visibility,severity,method_before,bug_type
0,PROVER,ERROR,user,HIGH,public PreparedStatement prepareUpdateFields(...,NULL_DEREFERENCE
1,PROVER,ERROR,user,HIGH,"public Map<CQLQueryType, PreparedStatement> p...",NULL_DEREFERENCE
2,PROVER,ERROR,user,HIGH,public boolean initForceBatchStatementsOrd...,NULL_DEREFERENCE
3,PROVER,ERROR,user,HIGH,public void addInterceptorsToEntityMetas(L...,NULL_DEREFERENCE
4,PROVER,ERROR,user,HIGH,private TypeParsingResult parseComputedTyp...,NULL_DEREFERENCE


In [102]:
target_labels = df["bug_type"].unique()
print("bug_type: ",target_labels)
bug_class_values = df["bug_class"].unique()
print("bug_class: ",bug_class_values)
bug_kind_values = df["kind"].unique()
print("bug_kind_values: ",bug_kind_values)
visibility_values = df["visibility"].unique()
print("visibility_values: ",visibility_values)
severity_values = df["severity"].unique()
print("severity_values: ",severity_values)


bug_type:  ['NULL_DEREFERENCE' 'RESOURCE_LEAK' 'THREAD_SAFETY_VIOLATION'
 'CHECKERS_IMMUTABLE_CAST' 'CHECKERS_PRINTF_ARGS'
 'INTERFACE_NOT_THREAD_SAFE' 'UNSAFE_GUARDED_BY_ACCESS']
bug_class:  ['PROVER']
bug_kind_values:  ['ERROR']
visibility_values:  ['user']
severity_values:  ['HIGH']


In [103]:
df.shape

(3003, 6)

##### We see that the columns 'bug_class', 'kind', 'visibility', 'severity' have only one type of value, so they do not effect our target column "bug_type". Hence we can discard them.

In [74]:
df['bug_type'].value_counts()

bug_type
NULL_DEREFERENCE             1216
RESOURCE_LEAK                1197
THREAD_SAFETY_VIOLATION       497
CHECKERS_PRINTF_ARGS           43
INTERFACE_NOT_THREAD_SAFE      36
CHECKERS_IMMUTABLE_CAST         9
UNSAFE_GUARDED_BY_ACCESS        5
Name: count, dtype: int64

##### Here, the majority of the samlples are grouped into 3 major bug types namely 'NULL_DEREFERENCE', 'RESOURCE_LEAK' and 'THREAD_SAFETY_VIOLATION'. Apart from these three types, we discard the other types as they contain very less number of samples in comparison.

In [104]:
df = df.loc[(df['bug_type'] == 'NULL_DEREFERENCE') | 
            (df['bug_type'] == 'RESOURCE_LEAK') | 
            (df['bug_type'] == 'THREAD_SAFETY_VIOLATION')]
df['bug_type'].value_counts()

bug_type
NULL_DEREFERENCE           1216
RESOURCE_LEAK              1197
THREAD_SAFETY_VIOLATION     497
Name: count, dtype: int64

In [105]:
df['bug_type'].unique()

array(['NULL_DEREFERENCE', 'RESOURCE_LEAK', 'THREAD_SAFETY_VIOLATION'],
      dtype=object)

In [106]:
df.shape

(2910, 6)

In [107]:


# Assuming 'bug_type' has 3 categories (for example: ['NULL_DEREFERENCE', 'RESOURCE_LEAK', 'THREAD_SAFETY_VIOLATION'])
dict1 = {'NULL_DEREFERENCE':0, 'RESOURCE_LEAK':1, 'THREAD_SAFETY_VIOLATION':2}
df['bug_type'] = df['bug_type'].map(dict1)
# Convert the integer-encoded labels to one-hot encoded vectors
#one_hot_labels = to_categorical(encoded_labels, num_classes=num_classes)
df['bug_type'].unique()

array([0, 1, 2], dtype=int64)

In [108]:
df.head()

Unnamed: 0,bug_class,kind,visibility,severity,method_before,bug_type
0,PROVER,ERROR,user,HIGH,public PreparedStatement prepareUpdateFields(...,0
1,PROVER,ERROR,user,HIGH,"public Map<CQLQueryType, PreparedStatement> p...",0
2,PROVER,ERROR,user,HIGH,public boolean initForceBatchStatementsOrd...,0
3,PROVER,ERROR,user,HIGH,public void addInterceptorsToEntityMetas(L...,0
4,PROVER,ERROR,user,HIGH,private TypeParsingResult parseComputedTyp...,0


In [109]:
one_hot_labels = to_categorical(df['bug_type'], num_classes=3)

In [110]:
#Tokenization and Padding
# Initialize the tokenizer
tokenizer = Tokenizer()

# Fit the tokenizer on the text data in 'method_before' column
tokenizer.fit_on_texts(df['method_before'])

# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(df['method_before'])
sequence_lengths = [len(seq) for seq in sequences]

# Find the maximum length
max_length = max(sequence_lengths)

print("Maximum length of sequences in the dataset:", max_length)

# Define maximum sequence length
max_length = 10000  # adjusted this based on my dataset

# Pad sequences to ensure uniform length
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')


Maximum length of sequences in the dataset: 8288


In [111]:
# Load GloVe embeddings from a file
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Path to the GloVe file
glove_file_path = r"C:\Users\simra\Desktop\MISC COURSES\ABSA\glove.6B.200d.txt"
embeddings_index = load_glove_embeddings(glove_file_path)

word_index = tokenizer.word_index

# Initialize the embedding matrix
vocab_size = len(word_index) + 1
embedding_dim = 200  # The dimension of GloVe vectors
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Populate the embedding matrix with GloVe vectors
for word, i in word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

In [121]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df["bug_type"], test_size=0.2, random_state=42)

In [122]:
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Dense

# Define a simple CNN model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,),
                embeddings_initializer=Constant(embedding_matrix),trainable=False,))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train CNN model
model.fit(X_train, y_train, epochs=10, batch_size=32,validation_data=(X_test, y_test))


Epoch 1/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m447s[0m 6s/step - accuracy: 0.6018 - loss: 0.8614 - val_accuracy: 0.8436 - val_loss: 0.4159
Epoch 2/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m450s[0m 6s/step - accuracy: 0.9337 - loss: 0.2303 - val_accuracy: 0.8797 - val_loss: 0.3326
Epoch 3/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m448s[0m 6s/step - accuracy: 0.9868 - loss: 0.0747 - val_accuracy: 0.8900 - val_loss: 0.3199
Epoch 4/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m424s[0m 6s/step - accuracy: 0.9886 - loss: 0.0469 - val_accuracy: 0.8814 - val_loss: 0.3515
Epoch 5/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m446s[0m 6s/step - accuracy: 0.9928 - loss: 0.0512 - val_accuracy: 0.9107 - val_loss: 0.3053
Epoch 6/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m463s[0m 6s/step - accuracy: 0.9902 - loss: 0.1153 - val_accuracy: 0.9089 - val_loss: 0.2972
Epoch 7/10
[1m73/73[0m [32m━━━━

<keras.src.callbacks.history.History at 0x21983934970>

In [123]:
cnn_accuracy_gl = model.evaluate(X_test, y_test)[1]
print("cnn_accuracy with glove: ",cnn_accuracy_gl) 

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 268ms/step - accuracy: 0.9153 - loss: 0.2591
cnn_accuracy with glove:  0.9140893220901489


In [119]:
cnn_accuracy = model.evaluate(X_test, y_test)[1]
print("cnn_accuracy: ",cnn_accuracy) #without glove

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 280ms/step - accuracy: 0.9373 - loss: 0.1833
cnn_accuracy:  0.9278350472450256


In [120]:
#Saving the CNN Model trained without GloVe
model.save('cnn_model.h5')



In [126]:
model = load_model('cnn_model.h5')



In [136]:
code_snippet = """ public void printState (PrintStream out) {
		Alphabet a = instances.getDataAlphabet();
		out.println ("#doc pos typeindex type topic");
		for (int di = 0; di < topics.length; di++) {
			FeatureSequence fs = (FeatureSequence) instances.get(di).getData();
			for (int token = 0; token < topics[di].length; token++) {
				int type = fs.getIndexAtPosition(token);
				out.print(di); out.print(' ');
				out.print(token); out.print(' ');
				out.print(type); out.print(' ');
				out.print(a.lookupObject(type)); out.print(' ');
				out.print(topics[di][token]); out.println();
			}
		}
	}
"""
# Tokenize and pad the new code snippet
sequence = tokenizer.texts_to_sequences([code_snippet])
padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')   
# Get the model's prediction (returns probabilities for each class)
prediction = model.predict(padded_sequence) 
# Get the class with the highest probability
predicted_class = np.argmax(prediction, axis=-1)[0]
if predicted_class == 0:
    print("NULL_DEREFERENCE")
elif predicted_class == 1:
    print("RESOURCE_LEAK")
elif predicted_class == 2:
    print("THREAD_SAFETY_VIOLATION")
#print(predicted_class) 
# Convert the class index back to the original label
#bug_type = label_encoder.inverse_transform([predicted_class])
    

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
NULL_DEREFERENCE


In [96]:
num_classes = 3
# Build and compile the model
input_length = padded_sequences.shape[1]
# Build the LSTM model
def build_lstm_model(vocab_size, embedding_dim, embedding_matrix, input_length, num_classes):
    model = Sequential()
    
    # Embedding layer
    embedding_layer = Embedding(input_dim=vocab_size, 
                        output_dim=embedding_dim,  
                        input_shape=(max_length,),
                        embeddings_initializer=Constant(embedding_matrix), 
                        trainable=False)  # Freezing the GloVe embeddings
    
    model.add(embedding_layer)
    # LSTM layer
    model.add(LSTM(128, return_sequences=False))
    model.add(Dropout(0.5))
    
    # Dense output layer for multi-class classification
    model.add(Dense(num_classes, activation='softmax'))  # Use 'softmax' for multi-class
    
    # Compile the model
    model.compile(optimizer='adam', 
                  loss='categorical_crossentropy',  # Use categorical crossentropy for multi-class
                  metrics=['accuracy'])
    
    return model

model = build_lstm_model(vocab_size, embedding_dim, embedding_matrix, input_length, num_classes)

# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, one_hot_labels, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

  super().__init__(**kwargs)


Epoch 1/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1029s[0m 14s/step - accuracy: 0.4163 - loss: 1.0468 - val_accuracy: 0.4107 - val_loss: 1.0473
Epoch 2/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1035s[0m 14s/step - accuracy: 0.4027 - loss: 1.0369 - val_accuracy: 0.3986 - val_loss: 1.0475
Epoch 3/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1037s[0m 14s/step - accuracy: 0.4156 - loss: 1.0452 - val_accuracy: 0.3986 - val_loss: 1.0510
Epoch 4/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1033s[0m 14s/step - accuracy: 0.4170 - loss: 1.0557 - val_accuracy: 0.3986 - val_loss: 1.0575
Epoch 5/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m997s[0m 14s/step - accuracy: 0.4090 - loss: 1.0387 - val_accuracy: 0.3986 - val_loss: 1.0483
Epoch 6/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1041s[0m 14s/step - accuracy: 0.4060 - loss: 1.0331 - val_accuracy: 0.3986 - val_loss: 1.0554
Epoch 7/10
[1m73/73[0

<keras.src.callbacks.history.History at 0x2195ab3fca0>

In [97]:
glove_accuracy = model.evaluate(X_test, y_test)[1]
print(glove_accuracy)

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 5s/step - accuracy: 0.3984 - loss: 1.0481
0.41065293550491333


In [None]:
# Predict bug type for a new code snippet
def predict_bug_type(model, tokenizer, code_snippet, max_length, label_encoder):
    # Tokenize and pad the new code snippet
    sequence = tokenizer.texts_to_sequences([code_snippet])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    
    # Get the model's prediction (returns probabilities for each class)
    prediction = model.predict(padded_sequence)
    
    # Get the class with the highest probability
    predicted_class = np.argmax(prediction, axis=-1)[0]
    
    # Convert the class index back to the original label
    bug_type = label_encoder.inverse_transform([predicted_class])
    
    return bug_type[0]

# Example usage
new_code = "public void someFunction() { // code }"
predicted_bug_type = predict_bug_type(model, tokenizer, new_code, max_length, label_encoder)
print("Predicted Bug Type:", predicted_bug_type)
