In [1]:
import pandas as pd 
import numpy as np 
import tensorflow as tf 
import itertools
import texthero as hero
# from zeugma import EmbeddingTransformer

In [2]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all up front.
# Looping over every detected GPU keeps the setting uniform across devices and makes
# this cell a no-op (rather than an IndexError) on a CPU-only machine.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

In [3]:
# Read the few-shot train/test CSVs and keep only the message text and its class label.
df_train = pd.read_csv('./final_fewshot_train.csv')[['text', 'class']]
df_test = pd.read_csv('./final_fewshot_test.csv')[['text', 'class']]

In [4]:
# Preview the first rows of the training frame (columns: text, class).
df_train.head()

Unnamed: 0,text,class
0,[ALLUXIO-2743] Fix failing unit tests,1
1,#2 Refactored structure of Argument,3
2,Remove some features from JwtTokenStore,4
3,Remove duplicated 1.613 section from changelog,2
4,* webapp structure refactoring,3


In [5]:
# Count training rows per class to check how balanced the labels are.
df_train.groupby('class').count()

Unnamed: 0_level_0,text
class,Unnamed: 1_level_1
1,20
2,20
3,20
4,20
5,20


The training set is balanced: 20 examples for each of the 5 classes.

In [6]:
# Show size, dtypes and non-null counts of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3277 entries, 0 to 3276
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3277 non-null   object
 1   class   3277 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 51.3+ KB


In [7]:
# Distinct class labels found in the training set; the pair-building loop iterates over these.
labels = df_train['class'].unique()
labels

array([1, 3, 4, 2, 5], dtype=int64)

In [8]:
# Clean text
# texthero's default `clean` pipeline is applied as-is to both splits; judging by the
# preview displayed below it lower-cases the text and strips digits, punctuation and
# stopwords. A hand-rolled pipeline built from the individual hero.remove_* steps was
# considered and dropped in favour of the default.
df_train['cleaned_text'] = df_train['text'].pipe(hero.clean)
df_test['cleaned_text'] = df_test['text'].pipe(hero.clean)

In [9]:
# Compare raw `text` with the new `cleaned_text` column on the first rows.
df_train.head()

Unnamed: 0,text,class,cleaned_text
0,[ALLUXIO-2743] Fix failing unit tests,1,alluxio fix failing unit tests
1,#2 Refactored structure of Argument,3,refactored structure argument
2,Remove some features from JwtTokenStore,4,remove features jwttokenstore
3,Remove duplicated 1.613 section from changelog,2,remove duplicated section changelog
4,* webapp structure refactoring,3,webapp structure refactoring


In [10]:
# Turn the labelled training texts into (text_left, text_right, target) pairs for the
# siamese network: target 1.0 = same class, target 0.0 = different classes.
PAIR_SEED = 42
pair_rng = np.random.default_rng(PAIR_SEED)  # seeded so the random negative pairs are reproducible

text_left = []
text_right = []
target = []

for label in labels:
    in_class = df_train['class'] == label
    similar_texts = df_train.loc[in_class, 'cleaned_text'].to_numpy()
    dissimilar_texts = df_train.loc[~in_class, 'cleaned_text'].to_numpy()

    # Positive pairs: every unordered pair of texts that share this label.
    group_similar_texts = list(itertools.combinations(similar_texts, 2))
    n_pairs = len(group_similar_texts)
    text_left.extend(pair[0] for pair in group_similar_texts)
    text_right.extend(pair[1] for pair in group_similar_texts)
    target.extend([1.] * n_pairs)

    # Negative pairs: as many as there are positives, each made of a random text from
    # this class and a random text from any other class (sampled with replacement).
    text_left.extend(pair_rng.choice(similar_texts, size=n_pairs))
    text_right.extend(pair_rng.choice(dissimilar_texts, size=n_pairs))
    target.extend([0.] * n_pairs)

dataset = pd.DataFrame({'text_left':text_left,
                    'text_right':text_right,
                    'target': target})

# The loop above emits rows grouped by class (positives, then negatives, per label).
# Keras' `validation_split` holds out the tail of the data without shuffling, so the
# rows are shuffled here to give the validation slice a mix of classes and targets.
dataset = dataset.iloc[pair_rng.permutation(len(dataset))].reset_index(drop=True)

In [11]:
# Eyeball ten random pairs and their similarity targets.
dataset.sample(10)

Unnamed: 0,text_left,text_right,target
203,cloudstack ui network guest network ip address...,add notes coptic bug fix scalar performance im...,0.0
738,reverted earlier design change allow use regul...,improved mmap management buffer pool full perf...,0.0
1791,improved mmap management buffer pool full perf...,introduced thunk structure intermediate repres...,0.0
960,updated new grouping features,docearevent structure refactored,0.0
1521,improved performance clearing map instead recr...,add notes coptic bug fix scalar performance im...,1.0
412,webapp structure refactoring,introduced thunk structure intermediate repres...,1.0
1195,smallfix fix duplicated properties alias prope...,remove duplicate scripts move remaining items ...,1.0
70,related ui change api bug fixed,npe payload causing ssvm agent fix also make s...,1.0
1284,small refactor avoid duplicate code,need duplicate close method already inherited ...,1.0
1309,remove duplicate obsolete tests,commandlinerunner handle uris refactored dupli...,1.0


In [12]:
# Confirm the number of generated pairs and the column dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1900 entries, 0 to 1899
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   text_left   1900 non-null   object 
 1   text_right  1900 non-null   object 
 2   target      1900 non-null   float64
dtypes: float64(1), object(2)
memory usage: 44.7+ KB


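From a training set of 100 samples, we were able to create 1,900 pairs for training the Siamese network: each of the 5 classes yields C(20, 2) = 190 similar pairs, matched by 190 randomly drawn dissimilar pairs (5 × 380 = 1,900).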

## Model

In [13]:
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda, Subtract, LSTM, Embedding, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import Sequential, Model

In [14]:
# Length every token sequence is padded/truncated to (passed as `maxlen` to pad_sequences
# and as the model's input length).
MAX_SEQ_LENGTH = 100
# Upper bound on vocabulary size (passed as `num_words` to the Tokenizer and used to cap
# the embedding matrix).
VOCAB_SIZE = 10000

In [15]:
# Learn the vocabulary from the cleaned training texts only.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df_train['cleaned_text'])

# Encode each arm of every pair as a list of word indices.
sequences_left = tokenizer.texts_to_sequences(dataset['text_left'])
sequences_right = tokenizer.texts_to_sequences(dataset['text_right'])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Bring both arms to the fixed length the network expects.
x_left = pad_sequences(sequences_left, maxlen=MAX_SEQ_LENGTH)
x_right = pad_sequences(sequences_right, maxlen=MAX_SEQ_LENGTH)

for padded in (x_left, x_right):
    print(padded.shape)

Found 584 unique tokens.
(1900, 100)
(1900, 100)


In [16]:
def exponent_neg_manhattan_distance(arms_difference):
    """Map the element-wise difference of the two arms to a similarity score.

    Computes exp(-||d||_1), where ||d||_1 is the sum of absolute values of
    `arms_difference` along axis 1 (kept as a size-1 axis). Because the L1 norm
    is non-negative, the result lies in (0, 1]: 1 when the two representations
    are identical, approaching 0 as they move apart.
    """
    l1_distance = K.sum(K.abs(arms_difference), axis=1, keepdims=True)
    return K.exp(-l1_distance)

In [17]:
# Load the pre-trained 100-dimensional GloVe vectors into a {word: vector} dictionary.
embeddings_index = {}

# `with` guarantees the file is closed even if a line fails to parse.
with open('./glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        # Each line is "<word> <coef_1> ... <coef_n>"; drop the trailing newline first.
        values = line.rstrip().split(' ')
        word = values[0]  # the first entry is the word
        coefs = np.asarray(values[1:], dtype='float32')  # the remaining entries are the embedding vector
        embeddings_index[word] = coefs

print('GloVe data loaded')

GloVe data loaded


In [18]:
# Width of each word vector; must match the GloVe file loaded above (glove.6B.100d).
EMBEDDING_DIM = 100

In [19]:
# One row per word index, plus one extra row so that the largest index is addressable.
# Rows start as zeros and are overwritten only for words that have a GloVe vector.
num_words = 1 + min(VOCAB_SIZE, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Indices beyond the vocabulary cap have no row in the matrix.
    if i <= VOCAB_SIZE:
        embedding_vector = embeddings_index.get(word)  # None when GloVe does not know the word
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector



In [20]:
def siamese_lstm_model(max_length):
    """Build and compile the siamese Bi-LSTM similarity network.

    Both inputs are integer sequences of length `max_length`. They are encoded by
    the *same* sub-network (frozen embedding initialised from `embedding_matrix`,
    followed by a bidirectional LSTM), the two encodings are subtracted, and the
    difference is squashed to a similarity score by
    `exponent_neg_manhattan_distance`.

    Reads the module-level `num_words`, `EMBEDDING_DIM` and `embedding_matrix`.

    Returns:
        A compiled Keras `Model` taking `[input_left, input_right]` and producing
        one similarity value per pair (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    shape = (max_length,)
    input_left = Input(shape, name='input_left')
    input_right = Input(shape, name='input_right')

    # Shared encoder: pre-trained embeddings are kept fixed (trainable=False),
    # only the recurrent layer on top of them is learned.
    encoder = Sequential(name='sequential_network')
    encoder.add(
        Embedding(
            num_words,
            EMBEDDING_DIM,
            embeddings_initializer=Constant(embedding_matrix),
            input_length=max_length,
            trainable=False,
        )
    )
    encoder.add(Bidirectional(LSTM(32, dropout=0.3, recurrent_dropout=0.)))

    # The same encoder instance is applied to both arms, so its weights are shared.
    output_left = encoder(input_left)
    output_right = encoder(input_right)

    # Element-wise difference of the two encodings, then exp(-L1 norm) as similarity.
    difference = Subtract(name='pair_representations_difference')([output_left, output_right])
    similarity = Lambda(exponent_neg_manhattan_distance,
                        name='masltsm_distance')(difference)

    siamese_net = Model(inputs=[input_left, input_right], outputs=similarity)
    siamese_net.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])
    return siamese_net


# Instantiate the network for the padded sequence length and print its layer summary.
siamese_lstm = siamese_lstm_model(MAX_SEQ_LENGTH)
siamese_lstm.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_left (InputLayer)         [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_right (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
sequential_network (Sequential) (None, 64)           92548       input_left[0][0]                 
                                                                 input_right[0][0]                
__________________________________________________________________________________________________
pair_representations_difference (None, 64)           0           sequential_network[0][

In [21]:
# Train on the (left, right) pair arrays against the 0/1 similarity target.
# NOTE(review): Keras documents that `validation_split` holds out the *last* fraction of
# the provided rows, taken before any shuffling, so the row order of `dataset` decides
# which pairs land in the validation set — confirm that order is not grouped by class.
siamese_lstm.fit([x_left,x_right], dataset.target, validation_split=0.3, epochs=12);

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


## Predictions

In [22]:
# Encode every training text once; these serve as the reference set each query
# is compared against at prediction time.
reference_sequences = tokenizer.texts_to_sequences(df_train['cleaned_text'])
x_reference_sequences = pad_sequences(
    reference_sequences,
    maxlen=MAX_SEQ_LENGTH,
)

In [27]:
import itertools

def flatten_text_sequence(text):
    """Concatenate an iterable of iterables into one flat list (one level deep)."""
    return [item for chunk in text for item in chunk]

def get_prediction(text):
    """Classify `text` by its nearest neighbour in the training set.

    The text is tokenised word by word, flattened into a single index sequence
    and padded to MAX_SEQ_LENGTH. It is then paired with every reference
    sequence from the training set and scored by the siamese network; the
    training example with the highest similarity wins.

    This scores the query against the whole training set on every call, which
    is far from optimal but sufficient for this notebook.

    Returns:
        (prediction, most_similar_example): the class of the most similar
        training example and that example's cleaned text.
    """
    token_ids = flatten_text_sequence(tokenizer.texts_to_sequences(text.split()))
    query = pad_sequences([token_ids], maxlen=MAX_SEQ_LENGTH)

    # Repeat the single query row so it lines up with every reference row.
    n_references = len(x_reference_sequences)
    query_batch = np.repeat(query, n_references, axis=0)
    similarities = siamese_lstm.predict([query_batch, x_reference_sequences])

    # The predicted category is that of the most similar training example.
    most_similar_index = np.argmax(similarities)
    prediction = df_train['class'].iloc[most_similar_index]
    most_similar_example = df_train['cleaned_text'].iloc[most_similar_index]
    return prediction, most_similar_example

https://github.com/amansrivastava17/lstm-siamese-text-similarity

https://github.com/nkthiebaut/nkthiebaut.github.io/blob/source/content/fewshot_learning_nlp.ipynb

In [28]:
# Sanity check of the query-batch construction used inside get_prediction:
# encode one training text, pad it, and tile it once per reference sequence.
x = df_train['cleaned_text'].iloc[34]
x = flatten_text_sequence(tokenizer.texts_to_sequences(x.split()))
x = pad_sequences([x], maxlen=MAX_SEQ_LENGTH)
result = np.repeat(x, len(x_reference_sequences), axis=0)
print(result.shape)

(100, 100)


In [40]:
# Classify one test example and show it next to its nearest training example.
sample_idx = 22

# Fetch the query text positionally, the same way the printed text and true class are
# fetched, so all three always refer to the same row whatever index df_test carries.
sample_text = df_test['cleaned_text'].iloc[sample_idx]
pred, most_sim = get_prediction(sample_text)

print(f'Sampled Text: {sample_text}')
print(f'True Class: {df_test["class"].iloc[sample_idx]}')
print(f'Predicted Class : {pred}')
print(f'Most similar example in train set: {most_sim}')

Sampled Text: revert cloudstack automation fix test failure test 02 revert vm snapshots smoke test vm snapshots py
True Class: 1
Predicted Class : 1
Most similar example in train set: automation fix test failure test 02 revert vm snapshots smoke test vm snapshots py


In [26]:
# from sklearn.metrics import accuracy_score

# df_eval = df_test[:50]

# y_pred = [get_prediction(text)[0] for text in df_eval['cleaned_text']]
# accuracy = accuracy_score(y_pred, df_eval['class'])

# print(f'Test accuracy (siamese model): {100*accuracy:.2f} %')