In [116]:
from urllib.parse import urlparse, urlunparse
from tensorflow.keras.models import load_model
import numpy as np

import re
from ml_lib_remla.preprocessing import Preprocessing



In [117]:
N_DATAPOINTS = 1000
DATASET_PATH = "./../data/DL Dataset/test.txt"
MODEL_PATH = "./../model/model.keras"

In [118]:
def load_dataset(data_path: str):
    """Loads the data split from the path. The path should be a .txt file that
    has been created from the get_data step. his should be stored in the data folder.

    Args:
        data_path (str): The path to the split .txt file.

    Returns:
        Tuple[List[str], List[str]]: Returns a tuple of raw_x and raw_y. raw_x is a
        list of strings for all the sentences in the split and raw_y is their corresponding label.
    """
    print(f"Loading dataset: {data_path}")

    try:
        with open(data_path, "r") as data_file:
            loaded_data = [line.strip() for line in data_file.readlines()[1:]]
    except FileNotFoundError as file_not_found_error:
        raise FileNotFoundError(f"Could not find file {data_path}.") from file_not_found_error
    except OSError as exception:
        raise OSError(f"An error occurred accessing file {data_path}: {exception}") from exception

    raw_x = [line.split("\t")[1] for line in loaded_data]
    raw_y = [line.split("\t")[0] for line in loaded_data]
    return raw_x, raw_y


In [119]:
X_test, y_test = load_dataset(DATASET_PATH)
X_test = X_test[:N_DATAPOINTS]
y_test = y_test[:N_DATAPOINTS]

Loading dataset: ./../data/DL Dataset/test.txt


In [129]:
def test_input_generation(urls):
    # Parse the URLs from the input list
    parsed_urls = [urlparse(url) for url in X_test]

    def replace_scheme(scheme):
        # Replace the scheme of the URL
        if scheme == 'http':
            return 'https'
        elif scheme == 'https':
            return 'http'
        else:
            return scheme

    def replace_tld(netloc, tld):
        # Replace the top-level domain (TLD) in the netloc
        tld_pattern = re.compile(r'\.(com|org|de|net|uk|us|mobi|co\.uk|gov|edu|io|ai|dev|biz|info|mil|int|arpa)\b', re.IGNORECASE)
        new_netloc = tld_pattern.sub(lambda match: '.' + tld.lstrip('.'), netloc)
        return new_netloc

    tld_list = ['.io', '.ai', '.dev']

    # Generate mutant candidates by replacing scheme and TLD
    parsed_urls_scheme = [url._replace(scheme=replace_scheme(url.scheme)) for url in parsed_urls]
    parsed_urls_scheme_tld = [url._replace(scheme=replace_tld(url.netloc, tld=tld)) for url in parsed_urls_scheme for tld in tld_list]

    # Filter out empty mutants
    mutated_urls = np.array([urlunparse(url) for url in parsed_urls_scheme_tld if urlunparse(url) != ""])

    return mutated_urls

def test_oracle_generation(y_pred_original, y_pred_mutant, threshold=0.5):
    n_mutants = len(y_pred_mutant) // len(y_pred_original)
    y_pred_original = y_pred_original.flatten()
    y_pred_mutant = y_pred_mutant.reshape(len(y_pred_original), n_mutants).T

    labels_original = (np.array(y_pred_original) > threshold).astype(int)
    labels_mutant = (np.array(y_pred_mutant) > threshold).astype(int)
    
    failing_tests = np.argwhere(labels_original != labels_mutant)
    
    return failing_tests 

def test_mutamorphic(X_orig):
    # Generate mutant candidates
    mutant_candidates = test_input_generation(X_orig)

    preprocessor = Preprocessing()
    X_orig = preprocessor.tokenize_batch(X_orig)
    X_mutator = preprocessor.tokenize_batch(mutant_candidates)

    model = load_model(MODEL_PATH)
    y_pred_original = model.predict(X_orig)
    y_pred_mutant = model.predict(X_mutator)

    failing_tests = test_oracle_generation(y_pred_original, y_pred_mutant)
    
    return failing_tests

# Perform mutamorphic testing and assert the number of failing tests
failing_tests = test_mutamorphic(X_orig=X_test)
print(failing_tests)
assert len(failing_tests) < len(y_test)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step
[[  0   0]
 [  0  12]
 [  0  19]
 [  0  20]
 [  0  36]
 [  0  47]
 [  0  49]
 [  0  52]
 [  0  67]
 [  0  87]
 [  0 101]
 [  0 127]
 [  0 131]
 [  0 137]
 [  0 158]
 [  0 169]
 [  0 171]
 [  0 174]
 [  0 177]
 [  0 198]
 [  0 199]
 [  0 211]
 [  0 227]
 [  0 246]
 [  0 254]
 [  0 255]
 [  0 265]
 [  0 269]
 [  0 273]
 [  0 279]
 [  0 285]
 [  0 287]
 [  0 300]
 [  0 304]
 [  0 307]
 [  0 309]
 [  0 312]
 [  0 319]
 [  0 323]
 [  0 333]
 [  0 351]
 [  0 360]
 [  0 363]
 [  0 371]
 [  0 372]
 [  0 377]
 [  0 398]
 [  0 408]
 [  0 411]
 [  0 417]
 [  0 418]
 [  0 421]
 [  0 432]
 [  0 433]
 [  0 443]
 [  0 445]
 [  0 453]
 [  0 455]
 [  0 487]
 [  0 491]
 [  0 494]
 [  0 495]
 [  0 507]
 [  0 516]
 [  0 523]
 [  0 524]
 [  0 527]
 [  0 535]
 [  0 536]
 [  0 537]
 [  0 542]
 [  0 546]
 [  0 558]
 [  0 560]
 [  0 565]
 [  0 571]
 [  0 590]
 [  0