Test case similarity
Cosine similarity (of test case representation vectors) with distance between test case name embeddings (using Word2Vec)

In [None]:

import os
import gc
import pandas as pd
import numpy as np
import math
import statistics as st
import re
import string
import time

from scipy import spatial
import matplotlib.pyplot as plt
from collections import defaultdict 

from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer, word_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer 
import nltk


**Load clusters obtained by the best approach (ensemble)**

In [None]:

# Map every step id to the cluster id assigned by the ensemble approach.
# Each non-empty line is split on whitespace: the first token is the cluster id
# (optionally wrapped in '[', ']' and/or followed by ':'), the second token is a
# comma-separated list of step ids.
approach_ensemble_dict = {}
with open('ensemble_cluster_labels.txt') as cluster_file:  # closed even if parsing fails
    for line in cluster_file:
        full_line = line.split()
        if not full_line:
            # Blank line (e.g. a trailing newline at end of file): nothing to parse.
            continue
        cluster_id = int(full_line[0].replace('[', '').replace(']', '').replace(':', ''))
        step_id_list = full_line[1].split(',')
        for step_id in step_id_list:
            # Step ids may be written as floats ("12.0"); normalise them to int keys.
            approach_ensemble_dict[int(float(step_id))] = cluster_id

Read and preprocess files with test cases and build dataframe

In [None]:

# Locate every .xlsx file below the "filtered_data" folder that sits next to the
# current working directory.
current_dir = os.getcwd()
# Build the path with os.path.join so the platform's separator is used; the empty
# last component keeps the trailing separator the path has always ended with.
parent_dir = os.path.join(os.path.dirname(current_dir), "filtered_data", "")
xlsxfiles = [os.path.join(root, name)
             for root, dirs, files in os.walk(parent_dir)
             for name in files
             if name.endswith(".xlsx")]

In [None]:

# Load the test cases and keep only the columns used by the rest of the notebook.
column_names = ["Type", "Key", "Case_Name", "Step_ID", "Steps"]

print("Reading input data...")   

test_file='/content/drive/MyDrive/test_cases.xlsx'
test_data_df = pd.read_excel(test_file)

# Select the columns in one operation instead of appending row by row with .loc
# (which re-allocates the frame on every insertion). The copy is independent of
# `test_data_df`, and the index is renumbered 0..n-1 as the row-by-row build did.
# Columns keep the dtypes pandas inferred when reading the Excel file.
test_steps_df = test_data_df[column_names].reset_index(drop=True).copy()

# Kept for compatibility with the previous version of this cell: number of rows added.
index_to_add = len(test_steps_df)

print("Done!")
print("Shape of data => ", test_steps_df.shape)

Reading input data...
Done!
Shape of data =>  (162, 5)


In [None]:

# Clean the test-step data before the similarity analysis.
# The return value is discarded, so this call only has an effect if the helper
# modifies `test_steps_df` in place — presumably it does; TODO confirm.
# NOTE(review): `preprocess_clean_data` is neither defined nor imported in the
# visible part of this notebook; unless it is defined elsewhere, this cell raises
# NameError on a fresh kernel.
preprocess_clean_data(test_steps_df)

Cleaning test step field...
Number of unique words across all test steps:  352
Number of words that occurred less than 10 times in test steps:  158
Dataset size after preprocessing:  (162, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()


In [None]:

# Collect, for every row of the dataframe:
#   * a (step id, step content) pair, and
#   * the step content normalised to a list (a list of lists overall).
step_id_text_tuple_list = []
test_steps_clustering_list = []
for _, step_row in test_steps_df.iterrows():
    steps_value = step_row["Steps"]
    step_id_text_tuple_list.append((step_row["Step_ID"], steps_value))

    # Build list of lists of tokens (words): a list is copied element by element,
    # a plain string becomes a one-element list, anything else yields an empty list.
    if isinstance(steps_value, list):
        tokens = [elem for elem in steps_value]
    elif isinstance(steps_value, str):
        tokens = [steps_value]
    else:
        tokens = []
    test_steps_clustering_list.append(tokens)

print("Length of list of tuples:" , len(step_id_text_tuple_list))
print("Length of list with test steps: " , len(test_steps_clustering_list))

Length of list of tuples: 162
Length of list with test steps:  162


In [None]:

# Drop every step whose token list is empty, keeping both lists aligned.
# Positions of the empty steps (kept as a list under the original name).
steps_to_remove = [position
                   for position, step in enumerate(test_steps_clustering_list)
                   if len(step) == 0]

# A set gives constant-time membership tests while filtering.
positions_to_drop = set(steps_to_remove)
step_id_text_tuple_list = [pair
                           for position, pair in enumerate(step_id_text_tuple_list)
                           if position not in positions_to_drop]
test_steps_clustering_list = [step
                              for position, step in enumerate(test_steps_clustering_list)
                              if position not in positions_to_drop]
print("Length of list of tuples:" , len(step_id_text_tuple_list))
print("Length of list with test steps: " , len(test_steps_clustering_list))


Length of list of tuples: 162
Length of list with test steps:  162


Build a count matrix of [test_cases] x [clusters]: each entry holds the number of steps of that test case that belong to that cluster (0 if the cluster is not related to the test case)

In [None]:
# Group the step ids by test case key: {test case key -> [step ids]}.
test_case_steps_dict = {}
for _, row in test_steps_df.iterrows():
    test_steps = row['Steps']
    # Skip rows without usable steps: empty content, or a value that has no
    # length at all (e.g. NaN/None), which would otherwise make len() raise.
    if not hasattr(test_steps, '__len__') or len(test_steps) == 0:
        continue
    test_case_steps_dict.setdefault(row['Key'], []).append(row['Step_ID'])
print("Number of test cases: ", len(test_case_steps_dict))


Number of test cases:  61


In [None]:
# One row per test case, one column per cluster id.
# The matrix is at least 161 columns wide (the width this notebook has always
# used) and is widened automatically when the loaded cluster labels contain a
# larger cluster id, so filling it by cluster id cannot index out of bounds.
n_cluster_columns = max(161, max(approach_ensemble_dict.values(), default=-1) + 1)
numeric_matrix = np.zeros((len(test_case_steps_dict), n_cluster_columns))

In [None]:
# Fill one matrix row per test case (in dictionary order): for every cluster that
# contains at least one step of the test case, store how many of its steps fall
# into that cluster.
for matrix_row, step_ids in enumerate(test_case_steps_dict.values()):
    steps_per_cluster = {}
    for step_id in step_ids:
        assigned_cluster = approach_ensemble_dict[step_id]
        steps_per_cluster[assigned_cluster] = steps_per_cluster.get(assigned_cluster, 0) + 1

    touched_clusters = list(steps_per_cluster.keys())
    step_counts = list(steps_per_cluster.values())
    numeric_matrix[matrix_row, touched_clusters] = step_counts

In [None]:
# Sanity check: print every entry greater than 1, i.e. the cases where a test case
# has more than one step in the same cluster. Cells are visited row by row.
for cell in np.ndindex(len(test_case_steps_dict), 161):
    cell_value = numeric_matrix[cell]
    if cell_value > 1:
        print(cell_value)

Compute cosine similarity score and build similarity matrix with this score

In [None]:

# Square [test_cases] x [test_cases] matrix, zero-initialised; it is filled with
# pairwise scores in the next cell.
n_test_cases = len(test_case_steps_dict)
dist_matrix = np.zeros((n_test_cases, n_test_cases))


     

In [None]:
# Row/column position -> test case key, in dictionary order.
test_case_dict_keys_list = [case_key for case_key in test_case_steps_dict]

# Despite its name, dist_matrix stores cosine *similarity* (1 - cosine distance).
# Only the upper triangle (diagonal included) is computed; each value is mirrored.
n_cases = len(test_case_steps_dict)
for first in range(n_cases):
    first_vector = numeric_matrix[first, :]
    for second in range(first, n_cases):
        similarity = 1 - spatial.distance.cosine(first_vector, numeric_matrix[second, :])
        dist_matrix[first, second] = similarity
        dist_matrix[second, first] = similarity


Search over different thresholds for the cosine similarity score

In [None]:
# Thresholds 0.10, 0.15, ..., 1.00 (19 values, both ends included).
# np.linspace fixes the number of points explicitly; np.arange with a float step
# can gain or lose the last element through rounding error.
similarity_thresholds = [np.around(x, 2) for x in np.linspace(0.1, 1.0, 19)]

In [None]:
# Group test cases by cosine similarity for every threshold and save one file per
# threshold with lines of the form "<test case key>:<cluster id>".
# The loop runs over `similarity_thresholds`: the previous `overlap_thresholds` is
# only defined further down the notebook, so a fresh Restart & Run All failed here.
# NOTE(review): this cell writes the same approach_3 files as the cell that
# follows it — one of the two is probably redundant; confirm before removing.
for threshold in similarity_thresholds:
    print("Analyzing threshold : " , threshold)

    n_cases = len(dist_matrix)

    # Every test case starts in its own group, labelled with its own index. When a
    # pair reaches the threshold, the two groups are merged under the smaller
    # label, so the final groups are the connected components of the "similar"
    # relation and no test case can end up in two groups.
    group_label = list(range(n_cases))
    for i in range(n_cases - 1):
        for j in range(i + 1, n_cases):
            if dist_matrix[i, j] >= threshold and group_label[i] != group_label[j]:
                kept, absorbed = sorted((group_label[i], group_label[j]))
                group_label = [kept if label == absorbed else label for label in group_label]

    # Collect members per label; insertion order sorts groups by smallest member.
    members_by_label = {}
    for case_index, label in enumerate(group_label):
        members_by_label.setdefault(label, []).append(case_index)
    similar_test_cases_list = [set(members) for members in members_by_label.values() if len(members) > 1]
    print("Number of groups of similar test cases: ", len(similar_test_cases_list))

    # Indices of all test cases that belong to a group of two or more.
    test_case_key_unique = sorted(index for group in similar_test_cases_list for index in group)
    grouped_cases = set(test_case_key_unique)
    print("Number of test cases that have at least another similar case: ", len(test_case_key_unique))
    print("Number of test cases that do NOT have any similar case: ", ( len(test_case_steps_dict) - len(test_case_key_unique) ))

    # Groups get cluster ids 0..k-1; every remaining test case gets its own id.
    approach_3_dict = {}
    cluster_id = 0
    for each_set in similar_test_cases_list:
        for elem in sorted(each_set):
            approach_3_dict[test_case_dict_keys_list[elem]] = cluster_id
        cluster_id += 1

    for elem in range(len(test_case_steps_dict)):
        if elem not in grouped_cases:
            approach_3_dict[test_case_dict_keys_list[elem]] = cluster_id
            cluster_id += 1

    # save groups of similar test cases
    file_name = '/content/approach_3_similar_test_cases_' + str(threshold) + '.txt'
    with open(file_name, 'w') as output_file:
        for key in approach_3_dict:
            output_file.write(key + ":" + str(approach_3_dict[key]) + "\n")


Analyzing threshold :  0.1
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  0.15
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  0.2
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  0.25
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  0.3
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  

In [None]:
# Group test cases by cosine similarity for every threshold and save one file per
# threshold with lines of the form "<test case key>:<cluster id>".
for threshold in similarity_thresholds:
    print("Analyzing threshold : " , threshold)

    n_cases = len(dist_matrix)

    # Every test case starts in its own group, labelled with its own index. When a
    # pair reaches the threshold, the two groups are merged under the smaller
    # label, so the final groups are the connected components of the "similar"
    # relation and no test case can end up in two groups.
    group_label = list(range(n_cases))
    for i in range(n_cases - 1):
        for j in range(i + 1, n_cases):
            if dist_matrix[i, j] >= threshold and group_label[i] != group_label[j]:
                kept, absorbed = sorted((group_label[i], group_label[j]))
                group_label = [kept if label == absorbed else label for label in group_label]

    # Collect members per label; insertion order sorts groups by smallest member.
    members_by_label = {}
    for case_index, label in enumerate(group_label):
        members_by_label.setdefault(label, []).append(case_index)
    similar_test_cases_list = [set(members) for members in members_by_label.values() if len(members) > 1]
    print("Number of groups of similar test cases: ", len(similar_test_cases_list))

    # Indices of all test cases that belong to a group of two or more.
    test_case_key_unique = sorted(index for group in similar_test_cases_list for index in group)
    grouped_cases = set(test_case_key_unique)
    print("Number of test cases that have at least another similar case: ", len(test_case_key_unique))
    print("Number of test cases that do NOT have any similar case: ", ( len(test_case_steps_dict) - len(test_case_key_unique) ))

    # Groups get cluster ids 0..k-1; every remaining test case gets its own id.
    approach_3_dict = {}
    cluster_id = 0
    for each_set in similar_test_cases_list:
        for elem in sorted(each_set):
            approach_3_dict[test_case_dict_keys_list[elem]] = cluster_id
        cluster_id += 1

    for elem in range(len(test_case_steps_dict)):
        if elem not in grouped_cases:
            approach_3_dict[test_case_dict_keys_list[elem]] = cluster_id
            cluster_id += 1

    # save groups of similar test cases
    file_name = '/content/approach_3_similar_test_cases_' + str(threshold) + '.txt'
    with open(file_name, 'w') as output_file:
        for key in approach_3_dict:
            output_file.write(key + ":" + str(approach_3_dict[key]) + "\n")


Analyzing threshold :  0.1
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  0.15
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  0.2
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  0.25
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  0.3
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  

Perform search with different overlap thresholds


In [None]:
# Thresholds 0.10, 0.15, ..., 1.00 (19 values, both ends included).
# np.linspace fixes the number of points explicitly; np.arange with a float step
# can gain or lose the last element through rounding error.
overlap_thresholds = [np.around(x, 2) for x in np.linspace(0.1, 1.0, 19)]

In [None]:
# Group test cases for every overlap threshold and save one file per threshold
# with lines of the form "<test case key>:<cluster id>".
# NOTE(review): the scores compared against the threshold come from `dist_matrix`,
# the same matrix used for the cosine-score search; if a separate overlap matrix
# was intended here, it is not built in the visible notebook — confirm.
for threshold in overlap_thresholds:
    print("Analyzing threshold : " , threshold)

    n_cases = len(dist_matrix)

    # Every test case starts in its own group, labelled with its own index. When a
    # pair reaches the threshold, the two groups are merged under the smaller
    # label, so the final groups are the connected components of the "similar"
    # relation and no test case can end up in two groups.
    group_label = list(range(n_cases))
    for i in range(n_cases - 1):
        for j in range(i + 1, n_cases):
            if dist_matrix[i, j] >= threshold and group_label[i] != group_label[j]:
                kept, absorbed = sorted((group_label[i], group_label[j]))
                group_label = [kept if label == absorbed else label for label in group_label]

    # Collect members per label; insertion order sorts groups by smallest member.
    members_by_label = {}
    for case_index, label in enumerate(group_label):
        members_by_label.setdefault(label, []).append(case_index)
    similar_test_cases_list = [set(members) for members in members_by_label.values() if len(members) > 1]
    print("Number of groups of similar test cases: ", len(similar_test_cases_list))

    # Indices of all test cases that belong to a group of two or more.
    test_case_key_unique = sorted(index for group in similar_test_cases_list for index in group)
    grouped_cases = set(test_case_key_unique)
    print("Number of test cases that have at least another similar case: ", len(test_case_key_unique))
    print("Number of test cases that do NOT have any similar case: ", ( len(test_case_steps_dict) - len(test_case_key_unique) ))

    # Groups get cluster ids 0..k-1; every remaining test case gets its own id.
    approach_1_dict = {}
    cluster_id = 0
    for each_set in similar_test_cases_list:
        for elem in sorted(each_set):
            approach_1_dict[test_case_dict_keys_list[elem]] = cluster_id
        cluster_id += 1

    for elem in range(len(test_case_steps_dict)):
        if elem not in grouped_cases:
            approach_1_dict[test_case_dict_keys_list[elem]] = cluster_id
            cluster_id += 1

    # save groups of similar test cases
    file_name = '/content/approach_1_similar_test_cases_' + str(threshold) + '.txt'
    with open(file_name, 'w') as output_file:
        for key in approach_1_dict:
            output_file.write(key + ":" + str(approach_1_dict[key]) + "\n")


Analyzing threshold :  0.1
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  0.15
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  0.2
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  0.25
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  0.3
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  61
Number of test cases that do NOT have any similar case:  0
Analyzing threshold :  