In [None]:
import os
import gc
import pandas as pd
import numpy as np
import math
import statistics as st
import re
import string
import time
import matplotlib.pyplot as plt
from collections import defaultdict  

from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer, word_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer 
import nltk
# Fetch the NLTK resources named below. The value returned by the last call is
# what the cell displays as its output.
# NOTE(review): presumably these back the stopwords / WordNetLemmatizer imports
# above — confirm which of them the preprocessing code actually needs.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:

# Map every clustered test-step ID to the ID of the cluster it was assigned to.
# Each line is split on whitespace: the first token is the cluster ID (any
# '[', ']' and ':' characters are dropped) and the second token is a
# comma-separated list of step IDs.
# NOTE(review): only the second whitespace-separated token is parsed, so IDs
# written as "1, 2, 3" (with spaces) would be ignored after the first one —
# confirm against the real file format.
approach_ensemble_dict = {}
# The context manager closes the file even when a line fails to parse.
with open('/content/drive/MyDrive/ensemble_cluster_labels_updated.txt') as cluster_file:
    for line in cluster_file:
        full_line = line.split()
        if len(full_line) < 2:
            # Blank lines and lines without a step-ID field have nothing to record.
            continue
        cluster_id = int(full_line[0].replace('[', '').replace(']', '').replace(':', ''))
        for step_id in full_line[1].split(','):
            if not step_id:
                # A trailing or doubled comma yields an empty token; skip it.
                continue
            # Going through float() accepts IDs written as "12.0" as well as "12".
            approach_ensemble_dict[int(float(step_id))] = cluster_id


In [None]:
# Report how many distinct step IDs were read from the cluster-label file.
print("Number of test steps which were clustered by the approach: ", len(approach_ensemble_dict))


Number of test steps which were clustered by the approach:  8


In [None]:

def get_number_unique_words(df):
    """Count the distinct elements found across all cells of ``df["Steps"]``.

    Every cell of the "Steps" column is iterated element by element, so a
    list cell contributes its items and a string cell contributes its
    individual characters.

    Args:
        df: DataFrame with a "Steps" column whose cells are iterable.

    Returns:
        The number of distinct elements seen over the whole column.
    """
    distinct_words = {word for step in df["Steps"] for word in step}
    return len(distinct_words)

In [None]:

def get_word_frequency(df, occurrence_threshold=2):
    """Return the words that occur fewer than ``occurrence_threshold`` times.

    Occurrences are counted over every element of every cell in
    ``df["Steps"]``.

    Args:
        df: DataFrame with a "Steps" column whose cells are iterables of
            hashable words.
        occurrence_threshold: Words counted strictly fewer times than this
            are returned. Defaults to 2, i.e. words that appear exactly once.

    Returns:
        A list of the rare words, ordered by their first appearance in the
        column, so the result is reproducible from run to run.
    """
    # Single counting pass; dict insertion order keeps first-seen ordering.
    occurrence_counts = defaultdict(int)
    for step in df["Steps"]:
        for word in step:
            occurrence_counts[word] += 1

    return [
        word
        for word, occurrence in occurrence_counts.items()
        if occurrence < occurrence_threshold
    ]


In [None]:

def remove_problematic_words(df):
    """Drop every word listed in 'word2vec_vocab_problematic.txt' from ``df["Steps"]``.

    The file is read from the current working directory, one word per line
    (surrounding whitespace is stripped). The "Steps" column of ``df`` is
    replaced in place; each cell becomes a list of its elements that are not
    listed in the file. The number of unique words is printed before and
    after the removal.

    Args:
        df: DataFrame with a "Steps" column whose cells are iterables of words.

    Returns:
        None. ``df`` is modified in place.
    """
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps: ", number_unique_words)

    # A set gives constant-time membership tests; the context manager closes the file.
    with open('word2vec_vocab_problematic.txt', 'r') as problematic_words:
        problematic_words_set = {word.strip() for word in problematic_words}

    # Assign the whole column at once. The previous chained form
    # df.loc[index]["Steps"] = ... can write to a temporary copy and leave df
    # untouched (pandas reports this as SettingWithCopyWarning).
    df["Steps"] = df["Steps"].apply(
        lambda step: [elem for elem in step if elem not in problematic_words_set]
    )

    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps after removing problematic words: ", number_unique_words)



In [None]:

def fix_problematic_words(df):
    """Replace words in ``df["Steps"]`` using 'word2vec_vocab_to_fix.txt'.

    The file is read from the current working directory. Each line is split
    on ':'; the text before the first ':' is the word to replace and the text
    between the first and second ':' is a comma-separated list of
    replacement words (newline characters are removed). Lines without a ':'
    are skipped. The "Steps" column of ``df`` is replaced in place: a word
    found in the mapping is substituted by all of its replacements, any other
    word is kept. The number of unique words is printed before and after.

    Args:
        df: DataFrame with a "Steps" column whose cells are iterables of words.

    Returns:
        None. ``df`` is modified in place.
    """
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps: ", number_unique_words)

    problematic_words_dict = {}
    with open('word2vec_vocab_to_fix.txt', 'r') as problematic_words:
        for line in problematic_words:
            full_line = line.split(':')
            if len(full_line) < 2:
                # No ':' on this line (e.g. a blank line): nothing to map.
                # The former bare ``except`` repeated the same failing index
                # access, so such lines crashed instead of being handled.
                continue
            problematic_words_dict[full_line[0]] = [
                x.replace('\n', '') for x in full_line[1].split(',')
            ]

    def _replace_words(step):
        # Expand mapped words into their replacements; keep the rest as is.
        modified_step = list()
        for word in step:
            if word in problematic_words_dict:
                modified_step.extend(problematic_words_dict[word])
            else:
                modified_step.append(word)
        return modified_step

    # Assign the whole column at once. The previous chained form
    # df.loc[index]["Steps"] = ... can write to a temporary copy and leave df
    # untouched (pandas reports this as SettingWithCopyWarning).
    df["Steps"] = df["Steps"].apply(_replace_words)

    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps after fixing problematic words: ", number_unique_words)


In [None]:

# Columns copied from the spreadsheet into the working frame, in this order.
column_names = ["Type", "Key", "Case_Name", "Step_ID", "Steps"]
test_steps_df = pd.DataFrame(columns = column_names)

test_file='/content/drive/MyDrive/result_test_step_clustering/test_cases.xlsx'

print("Reading input data...")
test_data_df = pd.read_excel(test_file)

# Copy the spreadsheet row by row under fresh consecutive labels 0, 1, 2, ...
index_to_add = 0
for _, source_row in test_data_df.iterrows():
    test_steps_df.loc[index_to_add] = [source_row[name] for name in column_names]
    index_to_add += 1

print("Done!")
print("Shape of data => ", test_steps_df.shape)

Reading input data...
Done!
Shape of data =>  (369, 5)


In [None]:

# Run the cleaning pipeline on the loaded test steps.
# NOTE(review): preprocess_clean_data is not defined in the visible cells —
# confirm it is defined before this cell so Restart & Run All succeeds.
# NOTE(review): the return value is discarded, so the function presumably
# modifies test_steps_df in place — confirm.
preprocess_clean_data(test_steps_df)


Cleaning test step field...
Number of unique words across all test steps:  818
Number of unique words in test steps after stopword removal:  742
Number of unique words in test steps after lemmatization:  683
Number of words that occurred less than 10 times in test steps:  331
Dataset size after preprocessing:  (369, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()


In [None]:

# Build two parallel lists, one entry per row of test_steps_df:
#   step_id_text_tuple_list    -> (Step_ID, Steps) pairs
#   test_steps_clustering_list -> the row's "Steps" value as a list
step_id_text_tuple_list = list()
test_steps_clustering_list = list()
for _, row in test_steps_df.iterrows():
    steps_value = row["Steps"]
    step_id_text_tuple_list.append((row["Step_ID"], steps_value))

    if isinstance(steps_value, list):
        # Shallow copy of the list.
        step_tokens = list(steps_value)
    elif isinstance(steps_value, str):
        # A plain string becomes a one-element list.
        step_tokens = [steps_value]
    else:
        # Any other value contributes an empty entry.
        step_tokens = []
    test_steps_clustering_list.append(step_tokens)

print("Length of list of tuples:" , len(step_id_text_tuple_list))
print("Length of list with test steps: " , len(test_steps_clustering_list))


Length of list of tuples: 369
Length of list with test steps:  369


In [None]:

# Positions of the steps that ended up with no elements at all.
steps_to_remove = [
    position
    for position, step in enumerate(test_steps_clustering_list)
    if len(step) == 0
]

# Drop those positions from both parallel lists so they stay aligned.
step_id_text_tuple_list = [
    id_text_pair
    for position, id_text_pair in enumerate(step_id_text_tuple_list)
    if position not in steps_to_remove
]
test_steps_clustering_list = [
    step
    for position, step in enumerate(test_steps_clustering_list)
    if position not in steps_to_remove
]
print("Length of list of tuples:" , len(step_id_text_tuple_list))
print("Length of list with test steps: " , len(test_steps_clustering_list))


Length of list of tuples: 369
Length of list with test steps:  369


In [None]:
# Group the non-empty "Steps" values by test-case key, preserving row order.
test_case_steps_dict = {}
for _, row in test_steps_df.iterrows():
    test_steps = row['Steps']
    if len(test_steps) == 0:
        # Rows whose steps are empty do not contribute to any test case.
        continue
    test_case_steps_dict.setdefault(row['Key'], []).append(test_steps)
print("Number of test cases: ", len(test_case_steps_dict))



Number of test cases:  126


In [None]:
# Test-case keys in dictionary order; later cells address cases by position in this list.
test_case_steps_keys_list = [*test_case_steps_dict]


In [None]:
# Baseline 1: two test cases are duplicates when they contain the same set of
# steps (order and repetition of steps are ignored by the set comparison).
# Convert each case's steps to a set of tuples once, instead of rebuilding
# both sets for every pair inside the nested loop.
test_case_step_sets = [
    {tuple(step) for step in test_case_steps_dict[case_key]}
    for case_key in test_case_steps_keys_list
]

# Collect every pair (i, j) with i < j of positions in
# test_case_steps_keys_list whose step sets are equal.
duplicate_test_case_tuples = list()
for i in range(len(test_case_step_sets) - 1):
    for j in range(i + 1, len(test_case_step_sets)):
        if test_case_step_sets[i] == test_case_step_sets[j]:
            duplicate_test_case_tuples.append((i, j))


In [None]:

# Merge the duplicate pairs into groups: a pair joins the first existing group
# that already holds either of its members, otherwise it starts a new group.
duplicate_test_cases_list = list()
for case_pair in duplicate_test_case_tuples:
    first_index, second_index = case_pair
    for case_group in duplicate_test_cases_list:
        if first_index in case_group or second_index in case_group:
            case_group.update(case_pair)
            break
    else:
        # No existing group contains either member.
        duplicate_test_cases_list.append({first_index, second_index})
print("Number of groups of similar test cases: ", len(duplicate_test_cases_list))


Number of groups of similar test cases:  4


In [None]:
# Assign a cluster ID to every test case: members of one duplicate group share
# an ID, and every remaining case receives an ID of its own.
baseline_1_dict = {}
indices_of_similar_cases = list()
for group_number, case_group in enumerate(duplicate_test_cases_list):
    for case_index in case_group:
        indices_of_similar_cases.append(case_index)
        baseline_1_dict[test_case_steps_keys_list[case_index]] = group_number

# Singleton clusters continue numbering right after the last group.
cluster_id = len(duplicate_test_cases_list)
for case_index, case_key in enumerate(test_case_steps_keys_list):
    if case_index not in indices_of_similar_cases:
        baseline_1_dict[case_key] = cluster_id
        cluster_id += 1


In [None]:
# Summary for baseline 1: cases that belong to a duplicate group versus the
# remaining cases (total number of test cases minus the grouped ones).
print("Number of test cases that have at least another similar case: ", len(indices_of_similar_cases))
print("Number of test cases that do NOT have any similar case: ", ( len(test_case_steps_dict) - len(indices_of_similar_cases) ))


Number of test cases that have at least another similar case:  9
Number of test cases that do NOT have any similar case:  117


In [None]:
# Open (and truncate) the output file for the baseline 1 assignments.
# The handle stays open across cells; the following cell writes to it and closes it.
file_name = '/content/baseline_1_similar_test_cases.txt'
output_file = open(file_name, 'w')


In [None]:
# Write one "<case key>:<cluster id>" line per test case of baseline 1.
counter = 0  # NOTE(review): never read in the visible cells — confirm before removing
try:
    for key in baseline_1_dict:
        # str(key) keeps the write working when a key is not a string
        # (e.g. a numeric or missing "Key" cell in the spreadsheet).
        output_file.write(str(key) + ":" + str(baseline_1_dict[key]) + "\n")
finally:
    # Always release the handle opened by the previous cell, even if a write fails.
    output_file.close()

In [None]:

# Collect one (key, name) pair per test case, in order of first appearance,
# ignoring rows whose "Steps" value is empty.
test_cases_list = list()
cases_added = list()

for _, row in test_steps_df.iterrows():
    if len(row["Steps"]) == 0:
        continue

    case_key = row["Key"]
    if case_key in cases_added:
        # This test case was already recorded by an earlier row.
        continue

    cases_added.append(case_key)
    test_cases_list.append((case_key, row["Case_Name"]))

print("Length of list with test cases: " , len(test_cases_list))

Length of list with test cases:  126


In [None]:
# Baseline 2: two test cases are similar when their names are identical.
# Collect every pair (i, j) with i < j of positions in test_cases_list.
case_names = [case_name for _, case_name in test_cases_list]
similar_test_case_tuples = [
    (i, j)
    for i in range(len(case_names) - 1)
    for j in range(i + 1, len(case_names))
    if case_names[i] == case_names[j]
]

In [None]:

# Merge the same-name pairs into groups: a pair joins the first existing group
# that already holds either of its members, otherwise it starts a new group.
similar_test_cases_list = list()
for case_pair in similar_test_case_tuples:
    first_index, second_index = case_pair
    for case_group in similar_test_cases_list:
        if first_index in case_group or second_index in case_group:
            case_group.update(case_pair)
            break
    else:
        # No existing group contains either member.
        similar_test_cases_list.append({first_index, second_index})
print("Number of groups of similar test cases: ", len(similar_test_cases_list))

Number of groups of similar test cases:  4


In [None]:
# Assign a cluster ID to every test case: members of one same-name group share
# an ID, and every remaining case receives an ID of its own.
baseline_2_dict = {}
indices_of_similar_cases = list()
for group_number, case_group in enumerate(similar_test_cases_list):
    for case_index in case_group:
        indices_of_similar_cases.append(case_index)
        baseline_2_dict[test_cases_list[case_index][0]] = group_number

# Singleton clusters continue numbering right after the last group.
cluster_id = len(similar_test_cases_list)
for case_index, case_entry in enumerate(test_cases_list):
    if case_index not in indices_of_similar_cases:
        baseline_2_dict[case_entry[0]] = cluster_id
        cluster_id += 1


In [None]:
# Summary for baseline 2: cases that belong to a same-name group versus the
# remaining cases (total number of test cases minus the grouped ones).
print("Number of test cases that have at least another similar case: ", len(indices_of_similar_cases))
print("Number of test cases that do NOT have any similar case: ", ( len(test_cases_list) - len(indices_of_similar_cases) ))


Number of test cases that have at least another similar case:  8
Number of test cases that do NOT have any similar case:  118


In [None]:
# Open (and truncate) the output file that the following cell fills with the
# baseline 2 assignments and then closes.
# NOTE(review): the file name does not follow the "baseline_1_..." naming used
# for the first baseline — confirm this is intentional.
file_name = '/content/print_similar_test_cases.txt'
output_file = open(file_name, 'w')


In [None]:
# Write one "<case key>:<cluster id>" line per test case of baseline 2.
counter = 0  # NOTE(review): never read in the visible cells — confirm before removing
try:
    for key in baseline_2_dict:
        # str(key) keeps the write working when a key is not a string
        # (e.g. a numeric or missing "Key" cell in the spreadsheet).
        output_file.write(str(key) + ":" + str(baseline_2_dict[key]) + "\n")
finally:
    # Always release the handle opened by the previous cell, even if a write fails.
    output_file.close()
