In [3]:
#!pip install flair
#!pip install torch
#!pip install protobuf
#!pip install transformers
#!pip install sentence-transformers
#!pip install dataset

In [1]:

import pandas as pd
import numpy as np
import torch
from flair.data import Sentence
import warnings
warnings.filterwarnings('ignore') # setting ignore as a parameter


In [2]:
similar=[
("A black dog walking beside a pool.","A black dog is walking along the side of a pool."),

("A blonde woman looks for medical supplies for work in a suitcase. "," The blond woman is searching for medical supplies in a suitcase."),

("A doubly decker red bus driving down the road.","A red double decker bus driving down a street."),

("There is a black dog jumping into a swimming pool.","A black dog is leaping into a swimming pool."),

("The man used a sword to slice a plastic bottle. ","A man sliced a plastic bottle with a sword.")

]

In [3]:
sim_df = pd.DataFrame(similar, columns=["senl", "sen2"])

In [4]:
sim_df

Unnamed: 0,senl,sen2
0,A black dog walking beside a pool.,A black dog is walking along the side of a pool.
1,A blonde woman looks for medical supplies for ...,The blond woman is searching for medical supp...
2,A doubly decker red bus driving down the road.,A red double decker bus driving down a street.
3,There is a black dog jumping into a swimming p...,A black dog is leaping into a swimming pool.
4,The man used a sword to slice a plastic bottle.,A man sliced a plastic bottle with a sword.


In [5]:
dissimilar= [

("A little girl and boy are reading books","An older child is playing with the doll while gazing out through window") ,

("Two horses standing in a field with trees in the background.","A black and white bird on the body of water with grass in the background."),

("Two people are walking by the ocean" , "Two men in fleeces and hats looking at the camera"),

("A cat is pouncing on a trampoline","A man is selling tomatoes"),
 ("A woman is riding on a horse","A man is turning over tables in anger")
 ]

In [6]:
disim_df = pd.DataFrame(dissimilar, columns=["senl", "sen2"])

In [7]:
disim_df

Unnamed: 0,senl,sen2
0,A little girl and boy are reading books,An older child is playing with the doll while ...
1,Two horses standing in a field with trees in t...,A black and white bird on the body of water wi...
2,Two people are walking by the ocean,Two men in fleeces and hats looking at the camera
3,A cat is pouncing on a trampoline,A man is selling tomatoes
4,A woman is riding on a horse,A man is turning over tables in anger


In [8]:
def sim(s1,s2):
    # cosine similarity function outputs in the range 0-1
    s1=s1.embedding.unsqueeze(0)
    s2=s2.embedding.unsqueeze(0)
    sim = torch.cosine_similarity(s1,s2).item()
    return np.round(sim,2)


def evaluate (embeddings, myPairList):
# it evaluates embeddings for a given list of sentence pair
    scores=[]
    for s1, s2 in myPairList:
        s1,s2=Sentence (s1), Sentence (s2)
        embeddings.embed(s1)
        embeddings.embed(s2)
        score=sim(s1,s2)
        scores.append(score)
    return scores, np.round(np.mean(scores),2)
    

In [9]:
import os
os.environ['CURL_CA_BUNDLE'] = ''

In [10]:
from flair.embeddings import WordEmbeddings,DocumentPoolEmbeddings
#GLOVE Average Based Embeddings

glove_embed = WordEmbeddings('glove')
glove_pool_embeds = DocumentPoolEmbeddings([glove_embed])

In [11]:
evaluate(glove_pool_embeds,similar)

([0.97, 0.99, 0.97, 0.99, 0.98], 0.98)

In [12]:
evaluate(glove_pool_embeds,dissimilar)

([0.91, 0.97, 0.92, 0.85, 0.91], 0.91)

In [13]:
from flair.embeddings import WordEmbeddings,DocumentRNNEmbeddings

#GRU based embeddings
glove_embed = WordEmbeddings('glove')
glove_pool_embeds = DocumentRNNEmbeddings([glove_embed])
print(evaluate(glove_pool_embeds,similar))
print(evaluate(glove_pool_embeds,dissimilar))

([0.98, 1.0, 0.92, 1.0, 0.85], 0.95)
([0.61, 1.0, 0.66, 0.12, 0.46], 0.57)


In [14]:
from flair.embeddings import TransformerDocumentEmbeddings
# Gneral Bert based embeddings
bert_embeddings = TransformerDocumentEmbeddings('bert-base-uncased')
print(evaluate(bert_embeddings,similar))
print(evaluate(bert_embeddings,dissimilar))

([0.85, 0.9, 0.96, 0.91, 0.89], 0.9)
([0.88, 0.93, 0.81, 0.91, 0.93], 0.89)


In [16]:
from flair.embeddings import SentenceTransformerDocumentEmbeddings
#SENTENCE Bert based embeddings
sent_bert_embeddings = SentenceTransformerDocumentEmbeddings('bert-base-nli-mean-tokens')
print(evaluate(sent_bert_embeddings,similar))
print(evaluate(sent_bert_embeddings,dissimilar))

([0.98, 0.95, 0.96, 0.99, 0.98], 0.97)
([0.46, 0.41, 0.19, -0.04, 0.01], 0.21)


### FINE TUNING TRANSFORMER MODELS

In [17]:
from sentence_transformers import SentenceTransformer,util

model_id = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_id)

In [18]:
df = pd.read_csv("dup_data.csv")

In [19]:
df.columns

Index(['Unnamed: 0', 'I_Title', 'D_Title', 'Score', 'Is_Duplicate'], dtype='object')

In [20]:
df=df.drop(['Unnamed: 0'],axis=1)

In [21]:
import fasttext
import pandas as pd
import re,string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import numpy as np

In [22]:
def clean_text(input):
    input = str(input)
    puctuation_pattern = re.compile(r'[%s]'%string.punctuation)
    cleantext = BeautifulSoup(input,features="html.parser").text
    cleantext = BeautifulSoup(cleantext,features="html.parser").text
    html_pattern = re.compile(r'&lt|br|div|&gt|\r|\n|\\|;|\d+|/')
    input = html_pattern.sub('', cleantext)
    input = re.sub("-","",input)
    input = re.sub("_","",input)
    input = puctuation_pattern.sub('', input)
    input = re.sub(r"\d+",'',input)
    input = input.lower()
    return input

In [23]:
df['I_Title'] = df['I_Title'].apply(clean_text)
df['D_Title'] = df['D_Title'].apply(clean_text)

In [24]:
input="RDB AI VALIDATION FOR USER STORY ENTITY jan 16th"

In [25]:
cleantext = BeautifulSoup(input,features="html.parser").text
cleantext = BeautifulSoup(cleantext,features="html.parser").text
cleaned_text = re.sub(r"\d+","", "RDB AI VALIDATION FOR USER STORY ENTITY jan 16th")
cleaned_text

'RDB AI VALIDATION FOR USER STORY ENTITY jan th'

In [26]:
df

Unnamed: 0,I_Title,D_Title,Score,Is_Duplicate
0,rdb ai validation for user story entity jan th,rdb ai validation for user story entity jan th...,100.0,1
1,rdb ai validation for user story entity jan th,rdb ai validation for user story entity,75.0,0
2,rdb ai validation for user story entity jan th,rdb ai validation for user story entity jan th,100.0,1
3,story,rra ai backtracking story,100.0,1
4,story,asa th jan story,100.0,1
...,...,...,...,...
1624,new,new product backlog,100.0,1
1625,new,added new story st feb,100.0,1
1626,new,new story check,100.0,1
1627,new,new,100.0,1


In [27]:
df['Is_Duplicate'].value_counts()

0    982
1    647
Name: Is_Duplicate, dtype: int64

In [28]:
len(df)//2

814

In [29]:
from sentence_transformers import InputExample

train_examples = []
#train_data = dataset['train']['set']
# For agility we only 1/2 of our available data
n_examples = len(df)//2

for i in range(n_examples):
    example = list(df.loc[i])
    train_examples.append(InputExample(texts=[example[0], example[1]],label=example[3]))


In [33]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)


In [34]:
train_examples[0]

<sentence_transformers.readers.InputExample.InputExample at 0x2441cd1fc70>

In [35]:
from sentence_transformers import losses

train_loss = losses.ContrastiveLoss(model=model)


In [36]:
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/51 [00:00<?, ?it/s]

Iteration:   0%|          | 0/51 [00:00<?, ?it/s]

Iteration:   0%|          | 0/51 [00:00<?, ?it/s]

Iteration:   0%|          | 0/51 [00:00<?, ?it/s]

Iteration:   0%|          | 0/51 [00:00<?, ?it/s]

Iteration:   0%|          | 0/51 [00:00<?, ?it/s]

Iteration:   0%|          | 0/51 [00:00<?, ?it/s]

Iteration:   0%|          | 0/51 [00:00<?, ?it/s]

Iteration:   0%|          | 0/51 [00:00<?, ?it/s]

Iteration:   0%|          | 0/51 [00:00<?, ?it/s]

In [36]:
n_examples1 = len(df)//2

In [37]:
d= model.encode(["rra ai Validation"," rra ai data validation"])
cos_sim = util.cos_sim(d[0],d[1])
cos_sim

tensor([[0.9639]])

In [38]:
x="story entity "
y="rdb ai validation for user story entity"
d= model.encode([x,y])
cos_sim = util.cos_sim(d[0],d[1])
cos_sim

tensor([[0.6789]])

BUILDING MODEL FOR PREDICTION DUPLICATE

In [1]:
#!pip install tensorflow

In [56]:
#pip install daal==2021.4.0
#pip install numpy==1.21.6
#!pip install packaging>=21.3
#import tensorflow as tf
#

In [39]:
import tensorflow as tf
import transformers

In [41]:
max_length = 256  # Maximum length of input sentence to the model.
batch_size = 32
epochs = 2

# Labels in our dataset.
labels = [0, 1]

In [42]:
df.shape

(1629, 4)

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
X= df.drop(columns=['Is_Duplicate',"Score"],axis=1)

In [45]:
Y= df.drop(columns=['I_Title','D_Title','Score'],axis=1)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [47]:
print("Number of missing values")
print(X_train.isnull().sum())
X_train.dropna(axis=0, inplace=True)

Number of missing values
I_Title    0
D_Title    0
dtype: int64


In [48]:
print("Train Target Distribution")
print(y_train.Is_Duplicate.value_counts())

Train Target Distribution
0    584
1    393
Name: Is_Duplicate, dtype: int64


In [49]:
# train_df = (
#     train_df[train_df.Is_Duplicate != "-"]
#     .sample(frac=1.0, random_state=42)
#     .reset_index(drop=True)
# )

In [50]:
y_train = tf.keras.utils.to_categorical(y_train.Is_Duplicate, num_classes=2)
y_val = tf.keras.utils.to_categorical(y_val.Is_Duplicate, num_classes=2)
y_test = tf.keras.utils.to_categorical(y_test.Is_Duplicate, num_classes=2)


In [51]:
print(X_train.iloc[567])
print(y_train[567])

I_Title               story
D_Title    asa th jan story
Name: 4, dtype: object
[0. 1.]


In [54]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.

    Args:
        sentence_pairs: Array of premise and hypothesis input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to incude the labels.

    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

In [55]:
# Create the model under a distribution strategy scope.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="token_type_ids"
    )
    # Loading pretrained BERT model.
    bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")
    # Freeze the BERT model to reuse the pretrained features without modifying them.
    bert_model.trainable = False

    bert_output = bert_model.bert(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    sequence_output = bert_output.last_hidden_state
    pooled_output = bert_output.pooler_output
    # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
    bi_lstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )(sequence_output)
    # Applying hybrid pooling approach to bi_lstm sequence output.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    concat = tf.keras.layers.concatenate([avg_pool, max_pool])
    dropout = tf.keras.layers.Dropout(0.3)(concat)
    output = tf.keras.layers.Dense(2, activation="softmax")(dropout)
    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="categorical_crossentropy",
        metrics=["acc"],
    )


print(f"Strategy: {strategy}")
model.summary()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Strategy: <tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x0000024422921880>
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_masks (InputLayer)   [(None, 256)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 256)]        0           []                               
                                                                                     

In [56]:
train_data = BertSemanticDataGenerator(
    X_train[["I_Title", "D_Title"]].values.astype("str"),
    y_train,
    batch_size=batch_size,
    shuffle=True,
)
valid_data = BertSemanticDataGenerator(
    X_val[["I_Title", "D_Title"]].values.astype("str"),
    y_val,
    batch_size=batch_size,
    shuffle=False,
)

In [57]:
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
)

Epoch 1/2
Epoch 2/2


In [58]:
bert_model.trainable = True
# Recompile the model to make the change effective.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_masks (InputLayer)   [(None, 256)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_masks[0][0]',    

In [59]:
# Unfreeze the bert_model.
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
)

Epoch 1/2
Epoch 2/2


In [60]:
def check_similarity(sentence1, sentence2):
    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
    test_data = BertSemanticDataGenerator(
        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
    )

    proba = model.predict(test_data[0])[0]
    idx = np.argmax(proba)
    proba = f"{proba[idx]: .2f}%"
    pred = labels[idx]
    return pred, proba

In [63]:
s1="rsp app service data Validation"
s2 ="rsp service data Validation"

In [64]:
check_similarity(s1,s2)



(1, ' 0.93%')

In [65]:
s1="data Validation"
s2="rra ai backtracking story"

In [66]:
check_similarity(s1,s2)



(0, ' 0.97%')

In [67]:
s1="A black dog walking beside a pool."
s2="A black dog is walking along the side of a pool."
check_similarity(s1,s2)



(1, ' 0.98%')

In [69]:
s1="A woman is riding on a horse"
s2="A man is turning over tables in anger"
check_similarity(s1,s2)



(1, ' 0.96%')

'A woman is riding on a horse'