In [262]:
import os
import json
import random
import re

import numpy as np
import pandas as pd


from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Bidirectional
from keras.layers import Flatten, LSTM, Conv1D, MaxPooling1D
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate
from keras.utils.vis_utils import plot_model
from tensorflow.keras.utils import to_categorical
from keras.layers.merge import concatenate, add
from keras.layers import Lambda



### Preparation of Training Data

In [263]:
def data_split(source_fr, source_other, fold_num):
    with open(source_fr, 'r') as f:
        frs = f.readlines()
    with open(source_other, 'r') as f:
        others = f.readlines()
    all_data = frs + others
    print(len(all_data))
    fr = []
    other = []
    for d in all_data:
        d = d.strip()
        if not d or len(d) == 0:
            continue
        js = json.loads(d)
        if js['body'] is not None:
            if js['label'] == "feature":
                fr.append(js)
            elif js['label'] == 'other':
                other.append(js)
    class_statistic(fr + other)
    random.shuffle(fr)
    random.shuffle(other)
    fr_fold_num = len(fr) // fold_num
    other_fold_num = len(other) // fold_num
    pos_folds = []
    neg_folds = []
    for i in range(fold_num):
        if i == fold_num - 1:
            pos_folds.append(fr[i * fr_fold_num:])
            neg_folds.append(other[i * other_fold_num:])
        else:
            pos_folds.append(fr[i * fr_fold_num:(i + 1) * fr_fold_num])
            neg_folds.append(other[i * other_fold_num:(i + 1) * other_fold_num])
    train_folds = []
    test_folds = []
    for i in range(fold_num):
        train = []
        test = []
        for j in range(fold_num):
            if j == i:
                test.extend(neg_folds[j])
                test.extend(pos_folds[j])
            else:
                train.extend(pos_folds[j])
                train.extend(neg_folds[j])
        train_folds.append(train)
        test_folds.append(test)

    return train_folds, test_folds


def class_statistic(data):
    fr_cnt = 0
    other_cnt = 0
    for d in data:
        labels = d['label']
        if "feature" in labels:
            fr_cnt += 1
        else:
            other_cnt += 1
    print(f"Feature Request: {fr_cnt}, Others: {other_cnt}, Rate: {fr_cnt / (other_cnt + fr_cnt + 1e-6)}")
    

def replace_tokens(content):
    """
    This method clean the data 
    """
    content = re.sub(r"\*\*I'm submitting a.+?\\r\\n\\r\\n\*\*", "", content)
    content = re.sub(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', 'URL ', content)
    content = re.sub(r'^[0-9a-zA-Z_]{0,19}@[0-9a-zA-Z]{1,13}\.[com,cn,net]{1,3}$', 'EMAIL ', content)
    content = re.sub(r'[(0-9)+(a-z)+]{10,}', 'HASH_ID ', content)
    content = re.sub(r'#\d+\s', 'PR_ID ', content)
    content = re.sub(r"'''.*'''", "CODE ", content)
    content = re.sub(r'<[^>]*>|<\/[^>]*>', 'HTML ', content)
    content = re.sub(r'-\s\[\s*x?\s*\]\s((feature\srequest)|(bug\sreport)|other)', '', content)
    return content

In [264]:
project = "angular_target"
fold_num = 3
train_folds, test_folds = data_split(f"../data/{project}_feature.txt", f"../data/{project}_other.txt", fold_num)

320
Feature Request: 36, Others: 280, Rate: 0.11392405027239225


In [265]:
train_folds

[[{'id': '8addb269',
   'body': '',
   'comments': [{'timstamp': '2017-02-11 00:58',
     'user': 'colonel-panic',
     'body': 'is there a dropdown menu or something?'},
    {'timstamp': '2017-02-11 00:58',
     'user': 'colonel-panic',
     'body': 'How much of the HTML do I have to reproduce?'},
    {'timstamp': '2017-02-11 00:58',
     'user': 'colonel-panic',
     'body': "this thing I'm working on is pieced together out of templates"},
    {'timstamp': '2017-02-11 00:59',
     'user': 'wafflejock',
     'body': 'colonel-panic,'},
    {'timstamp': '2017-02-11 01:18',
     'user': 'colonel-panic',
     'body': 'wafflejock, thanks for that plunker.'},
    {'timstamp': '2017-02-11 01:18',
     'user': 'colonel-panic',
     'body': "That's how it's supposed to work, but for some reason I can't seem to pass this variable from my controller into the template"},
    {'timstamp': '2017-02-11 01:19',
     'user': 'colonel-panic',
     'body': 'My code looks very similar to that, except tha

In [266]:
project = "angular_target"
frs_file_name = f"../data/{project}_feature.txt"
other_file_name = f"../data/{project}_other.txt"

with open(frs_file_name, 'r') as f:
    frs = f.readlines()
with open(other_file_name, 'r') as f:
    others = f.readlines()
all_data = frs + others
print(len(all_data))

320


In [267]:
# taking all data and converting into list from json
labels = []

comment_list = []


for d in all_data:
    d = d.strip()
    if not d or len(d) == 0:
        continue
    js = json.loads(d)
    if js['body'] is not None:
        if js['label'] == "feature":
            #fr.append(js)
            _comments = js['comments']
            text = []
            for j in range(len(_comments)):
                text.append(replace_tokens(_comments[j]['body']))
            
            comment_list.append(" ".join(text))
            labels.append(js['label'])
        
        elif js['label'] == 'other':
            #other.append(js)
            _comments = js['comments']
            text = []
            for j in range(len(_comments)):
                text.append(replace_tokens(_comments[j]['body']))
            comment_list.append(" ".join(text))
            labels.append(js['label'])

print(f"total labeled data {len(comment_list)} ")

# creating dataframe for training data for angular js
df = pd.DataFrame()
df['comments'] = comment_list
df['label'] = labels
#replacing feature with 1 and others as 0
df.replace({'label':{'feature':1,'other':0}}, inplace=True)
df = df.sample(frac=1)

total labeled data 316 


##### Glimple of data

In [268]:
df.head()

Unnamed: 0,comments,label
31,is it possible to use the service of an HASH_I...,1
271,doing some more serious doctrine stuff for the...,0
59,can someone see if there is something wrong wi...,0
41,Morning fellas o/ SargoDarya: hey How's it goi...,0
248,"guys using a table like this in plunker , on c...",0


In [269]:
#y_train = to_categorical(df['label'])
y_train = df['label'].values
#y_test = to_categorical(y_test)

In [270]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['comments'])

X_train = tokenizer.texts_to_sequences(df['comments'])
# X_test = tokenizer.texts_to_sequences(X_test)

In [271]:
vocab_size = len(tokenizer.word_index) + 1

maxlen = 200
embedding_dim = 50

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
#X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [272]:
from numpy import array
from numpy import asarray
from numpy import zeros
glove_file_name = "glove.6B/glove.6B.50d.txt"
embeddings_dictionary = dict()

f = open(glove_file_name, 'r', encoding = 'utf-8')

for line in f:
    line = f.readline()
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
f.close()   

In [273]:
embedding_matrix = zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

## BiLSTM Model architrecture

In [274]:
deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(deep_inputs)
lstm_layer_1 = Bidirectional(LSTM(128))(embedding_layer)
dense_layer_1 = Dense(1, activation='sigmoid')(lstm_layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)

In [275]:
!pip install pydot
!pip install graphviz
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [276]:
print(model.summary())

Model: "model_32"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_53 (InputLayer)       [(None, 200)]             0         
                                                                 
 embedding_38 (Embedding)    (None, 200, 50)           339850    
                                                                 
 bidirectional_24 (Bidirecti  (None, 256)              183296    
 onal)                                                           
                                                                 
 dense_15 (Dense)            (None, 1)                 257       
                                                                 
Total params: 523,403
Trainable params: 183,553
Non-trainable params: 339,850
_________________________________________________________________
None


In [155]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
history = model.fit(X_train, y_train, batch_size=32, epochs=10, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Preparing Paired Data

In [277]:
# randomly select one data point from each group of data and if both are having same label then label is one else its 0
first_comment_list = []
second_comment_list = []
paired_label_list = []
for _ in range(1000):
    sample_df = df.sample(n=2)
    _cmt_list = list(sample_df['comments'])
    _lbl_list = list(sample_df['label'])
    first_comment_list.append(_cmt_list[0])
    second_comment_list.append(_cmt_list[1])
    #print(_lbl_list)
    
    # for same labeled data
    if _lbl_list[0] == _lbl_list[1]:
        paired_label_list.append(1)
    else:
        paired_label_list.append(0)
paired_data_df = pd.DataFrame()
paired_data_df['c1'] = first_comment_list
paired_data_df['c2'] = second_comment_list
paired_data_df['label'] = paired_label_list

In [278]:
# now paired data label distribution
paired_data_df['label'].value_counts()

1    799
0    201
Name: label, dtype: int64

In [279]:
# spliting data into train and test 
X_train_df, X_test_df = train_test_split(paired_data_df, test_size=0.20, random_state=13)

In [280]:
# building tokenizer 
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train_df['c1'])



In [284]:
vocab_size = len(tokenizer.word_index) + 1

maxlen = 200
embedding_dim = 50

X_train_c1 = tokenizer.texts_to_sequences(X_train_df['c1'])
X_train_c2 = tokenizer.texts_to_sequences(X_train_df['c2'])
X_train_c1 = pad_sequences(X_train_c1, padding='post', maxlen=maxlen)
X_train_c2 = pad_sequences(X_train_c2, padding='post', maxlen=maxlen)

#final_X_train = np.concatenate((X_train_c1, X_train_c2), axis=0)


X_test_c1 = tokenizer.texts_to_sequences(X_test_df['c1'])
X_test_c2 = tokenizer.texts_to_sequences(X_test_df['c2'])
X_test_c1 = pad_sequences(X_test_c1, padding='post', maxlen=maxlen)
X_test_c2 = pad_sequences(X_test_c2, padding='post', maxlen=maxlen)

#final_X_test = np.concatenate(X_test_c1, X_test_c2)
#X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

## Building Siamese Network

In [285]:
def base_network():    
    deep_inputs = Input(shape=(maxlen,))
    embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(deep_inputs)

    convs = []
    kernel_sizes = [2,3,4,5] # in the loop, first apply 3 as size, then 4 then 5

    for kernel in kernel_sizes:
        l_conv = Conv1D(filters=25, kernel_size=kernel, activation='relu')(embedding_layer)
        #kernel is the filter
        l_pool = MaxPooling1D(pool_size=3)(l_conv)
        convs.append(l_pool)

    l_merge = concatenate(convs, axis=1)

    lstm_layer_1 = Bidirectional(LSTM(150))(l_merge)
    #dense_layer_1 = Dense(300, activation='sigmoid')(lstm_layer_1)
    return Model(inputs=deep_inputs, outputs=lstm_layer_1)

In [None]:
base_model = base_network()
input_a = Input(shape=(maxlen,))
input_b = Input(shape=(maxlen,))
vect_input_a = base_model(input_a)
vect_input_b = base_model(input_b)

In [261]:
base_model.summary()

Model: "model_30"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_49 (InputLayer)          [(None, 200)]        0           []                               
                                                                                                  
 embedding_36 (Embedding)       (None, 200, 50)      339850      ['input_49[0][0]']               
                                                                                                  
 conv1d_74 (Conv1D)             (None, 199, 25)      2525        ['embedding_36[0][0]']           
                                                                                                  
 conv1d_75 (Conv1D)             (None, 198, 25)      3775        ['embedding_36[0][0]']           
                                                                                           

In [287]:
from keras import backend as K

def euclidean_distance(vects):
    x, y = vects
    sum_square = K.sum(K.square(x-y), axis=1, keepdims=True)
    return K.sqrt(K.maximum(sum_square, K.epsilon()))


def eucl_dist_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)


In [288]:
output = Lambda(euclidean_distance, output_shape=eucl_dist_shape)([vect_input_a, vect_input_b])
model = Model([input_a, input_b], output)

In [255]:
model.summary()

Model: "model_31"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_50 (InputLayer)          [(None, 200)]        0           []                               
                                                                                                  
 input_51 (InputLayer)          [(None, 200)]        0           []                               
                                                                                                  
 model_30 (Functional)          (None, 300)          568650      ['input_50[0][0]',               
                                                                  'input_51[0][0]']               
                                                                                                  
 lambda_9 (Lambda)              (None, 1)            0           ['model_30[0][0]',        

In [252]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
history = model.fit([X_train_c1,X_train_c2], X_train_df['label'], batch_size=32, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [294]:
predictions = model.predict([X_test_c1,X_test_c2])
pair_labels = X_test_df['label']

In [None]:
for p, pair_label in zip(predictions, pair_labels):
    if same_label == p:
                
        if ff_label == pair_label:
            self.true_positive += 1
        if oo_label == pair_label:
            self.true_negative += 1
        if fo_label == pair_label:
            self.false_positive += 1
        if of_label == pair_label:
            self.false_negative += 1
    if diff_label == p:
        if fo_label == pair_label:
            self.true_negative += 1
        if of_label == pair_label:
            self.true_positive += 1
        if ff_label == pair_label:
            self.false_negative += 1
        if oo_label == pair_label:
            self.false_positive += 1

def get_metric(self, reset: bool):
    precision = self.true_positive * 1.0 / (self.true_positive + self.false_positive + 1e-6)
    recall = self.true_positive * 1.0 / (self.true_positive + self.false_negative + 1e-6)
    fmeasure = (2.0 * precision * recall) / (precision + recall + 1e-6)
    return precision, recall, fmeasure