In [69]:
from urllib.parse import urlparse, urlunparse
from tensorflow.keras.models import load_model
import numpy as np

import re
from ml_lib_remla.preprocessing import Preprocessing


import warnings
warnings.filterwarnings('ignore')

In [103]:
N_DATAPOINTS = 5000
DATASET_PATH = "./../data/DL Dataset/test.txt"
MODEL_PATH = "./../model/model.keras"


N_SAMPLES = 100
TLD_LIST = ['.io', '.ai', '.dev']
TLD_REPAIR_LIST = ['.uk', '.org', '.de']

THRESHOLD = 0.5




In [71]:
def load_dataset(data_path: str):
    """Loads the data split from the path. The path should be a .txt file that
    has been created from the get_data step. his should be stored in the data folder.

    Args:
        data_path (str): The path to the split .txt file.

    Returns:
        Tuple[List[str], List[str]]: Returns a tuple of raw_x and raw_y. raw_x is a
        list of strings for all the sentences in the split and raw_y is their corresponding label.
    """
    print(f"Loading dataset: {data_path}")

    try:
        with open(data_path, "r") as data_file:
            loaded_data = [line.strip() for line in data_file.readlines()[1:]]
    except FileNotFoundError as file_not_found_error:
        raise FileNotFoundError(f"Could not find file {data_path}.") from file_not_found_error
    except OSError as exception:
        raise OSError(f"An error occurred accessing file {data_path}: {exception}") from exception

    raw_x = [line.split("\t")[1] for line in loaded_data]
    raw_y = [line.split("\t")[0] for line in loaded_data]
    return raw_x, raw_y


In [72]:
X_test, y_test = load_dataset(DATASET_PATH)
X_test = np.array(X_test[3500:N_DATAPOINTS])
y_test = np.array(y_test[3500:N_DATAPOINTS])

Loading dataset: ./../data/DL Dataset/test.txt


In [116]:
def input_generation(urls, tld_list = TLD_LIST):
    # Parse the URLs from the input list
    print(urls)
    parsed_urls = [urlparse(url) for url in urls]

    def replace_scheme(scheme):
        # Replace the scheme of the URL
        if scheme == 'http':
            return 'https'
        elif scheme == 'https':
            return 'http'
        else:
            return scheme

    def replace_tld(netloc, tld):
        # Replace the top-level domain (TLD) in the netloc
        tld_pattern = re.compile(r'\.(com|org|de|net|uk|us|mobi|co\.uk|gov|edu|io|ai|dev|biz|info|mil|int|arpa)\b', re.IGNORECASE)
        new_netloc = tld_pattern.sub(lambda match: '.' + tld.lstrip('.'), netloc)
        return new_netloc

    # Generate mutant candidates by replacing scheme and TLD
    parsed_urls_scheme = [url._replace(scheme=replace_scheme(url.scheme)) for url in parsed_urls]
    parsed_urls_scheme_tld = [url._replace(scheme=replace_tld(url.netloc, tld=tld)) for url in parsed_urls_scheme for tld in tld_list]

    # Filter out empty mutants
    mutated_urls = np.array([urlunparse(url) for url in parsed_urls_scheme_tld if urlunparse(url) != ""])

    return mutated_urls

def get_labels(y_pred_original, y_pred_mutant, threshold=0.5):
    labels_original = (np.array(y_pred_original) > threshold).astype(int)
    labels_mutants = (np.array(y_pred_mutant) > threshold).astype(int)
    
    labels_mutant = np.max(labels_mutants, axis=0)
    
    return labels_original, labels_mutant

def oracle_generation(y_pred_original, y_pred_mutant):
    n_mutants = len(y_pred_mutant) // len(y_pred_original)
    y_pred_original = y_pred_original.flatten()
    y_pred_mutant = y_pred_mutant.reshape(len(y_pred_original), n_mutants).T

    labels_original, labels_mutant = get_labels(y_pred_original, y_pred_mutant)
    failing_tests = np.argwhere(labels_original != labels_mutant)
    
    return failing_tests 

def automatic_repair(model,preprocessor, X_failing_mutants, y_prob_original):
    
    mutant_candidates = input_generation(X_failing_mutants, tld_list=TLD_REPAIR_LIST)
    
    X_mutants = preprocessor.tokenize_batch(mutant_candidates)
    y_prob_mutants = model.predict(X_mutants)
    y_prob_mutants.reshape(len(X_failing_mutants), len(TLD_REPAIR_LIST))
    
    labels_original, labels_repaired = get_labels(y_pred_original=y_prob_original, y_pred_mutant=y_prob_mutants)
    labels_final = np.equal(labels_original,labels_repaired)
    
    
    return labels_final

def test_mutamorphic(X_orig):
    # Generate mutant candidates
    mutant_candidates = input_generation(X_orig)

    preprocessor = Preprocessing()
    X_orig_processed = preprocessor.tokenize_batch(X_orig)
    X_mutants = preprocessor.tokenize_batch(mutant_candidates)
    model = load_model(MODEL_PATH)
    y_prob_original = model.predict(X_orig_processed)
    y_pred_mutants = model.predict(X_mutants)
    
    failing_tests = oracle_generation(y_prob_original, y_pred_mutants)

    X_failing_mutants = mutant_candidates.reshape(len(X_orig), len(TLD_LIST))
    y_prob_mutants = y_pred_mutants.reshape(len(X_orig), len(TLD_LIST))
    X_failing_mutants = X_failing_mutants[failing_tests].flatten()
    y_prob_mutants = y_prob_mutants[failing_tests]
    
    labels_final = automatic_repair(model, preprocessor=preprocessor, X_failing_mutants=X_failing_mutants, y_prob_original=y_prob_original)
    
    return labels_final

# Perform mutamorphic testing and assert the number of failing tests.
failing_tests = test_mutamorphic(X_orig=X_test)
# apply heuristic to get less than 10% wrong labels.
assert np.sum(failing_tests) > len(y_test) // 1.1

['http://jxnblk.com/stepkit/'
 'http://businessbanking.53.com.session2120332.versuse.cn/clientbase/form.asp'
 'http://www.abogadosbyg.cl/Accountpaypalaccsecuresummarytoken115323redirectupdateinfo/?cmd=_home&dispatch=5885d80a13c0db1f8e&ee=da61bdbd5191abca069324270ceaed1a'
 ... 'https://www.justeasy.cn/3d/id-341682.html'
 'http://www.usaa.com.kjlkjqw.org.uk/inet/ent_formversionnew/do_action?id=62147325999120009511348897229802282465531106735919176484110704'
 'http://hapoalim.co.il.vvc1.in/webscr.php']
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step
['signin.ebay.io.513422301.513422301.513422301.513422301.513422.7fdeerewreg4rkjw4eergererwx.io://signin.ebay.com.513422301.513422301.513422301.513422301.513422.7fdeerewreg4rkjw4eergererwx.com/sc/saw-cgi/eBayISAPI.dll/'
 'signin.ebay.ai.513422301.513422301.513422301.513422301.513422.7fdeerewreg4rkjw4eergererwx.ai://signin.ebay.com.513422301.5134