In [42]:
from urllib.parse import urlparse, urlunparse
from tensorflow.keras.models import load_model
import numpy as np
import re
from ml_lib_remla.preprocessing import Preprocessing


In [43]:
N_DATAPOINTS = 10
DATASET_PATH = "./../data/DL Dataset/test.txt"
MODEL_PATH = "./../model/model.keras"

In [44]:
def load_dataset(data_path: str):
    """Loads the data split from the path. The path should be a .txt file that
    has been created from the get_data step. his should be stored in the data folder.

    Args:
        data_path (str): The path to the split .txt file.

    Returns:
        Tuple[List[str], List[str]]: Returns a tuple of raw_x and raw_y. raw_x is a
        list of strings for all the sentences in the split and raw_y is their corresponding label.
    """
    print(f"Loading dataset: {data_path}")

    try:
        with open(data_path, "r") as data_file:
            loaded_data = [line.strip() for line in data_file.readlines()[1:]]
    except FileNotFoundError as file_not_found_error:
        raise FileNotFoundError(f"Could not find file {data_path}.") from file_not_found_error
    except OSError as exception:
        raise OSError(f"An error occurred accessing file {data_path}: {exception}") from exception

    raw_x = [line.split("\t")[1] for line in loaded_data]
    raw_y = [line.split("\t")[0] for line in loaded_data]
    return raw_x, raw_y


In [45]:
X_test, y_test = load_dataset(DATASET_PATH)
X_test = X_test[:N_DATAPOINTS]
y_test = y_test[:N_DATAPOINTS]

Loading dataset: ./../data/DL Dataset/test.txt


In [46]:
parsed_urls = [urlparse(url) for url in X_test]
print(parsed_urls[:10])

[ParseResult(scheme='http', netloc='business.hsbc.com.camaract.mobi', path='/system_directory/isa/file.aspx', params='', query='session=61810342760774852870650739159893454615115912022877845677244', fragment=''), ParseResult(scheme='http', netloc='facebook.com-source-page.com', path='/help/contact/4359439512093023/', params='', query='', fragment=''), ParseResult(scheme='http', netloc='michaelnielsen.org', path='/blog/lecture-course-the-google-technology-stack/', params='', query='', fragment=''), ParseResult(scheme='http', netloc='messagerie-17fr.com', path='/fr/91da56ae94f5f4ff2b9dedcbcba90e2b/spg.php', params='', query='amp=&intid=8e63a4d8384a843ee3b10f5b5c48dfef&rnv=026', fragment=''), ParseResult(scheme='https', netloc='www.juventus.com', path='/wps/portal/en/news/diritti%20di%20opzione%2022giugno2011/!ut/p/b1/vzpfbtsgfmafzq_qca7ygc7txhgdgpwh7nw-idj1m-luys6qrfbtj2htpsykdtoniyehvspvowcghwkxajtsfxlia-ko25-779ux3fgwft6vo76hpjtvokgaollcqoo0szftskbo0j4faimmmarrciq0xgv1gfkgof2d_77nspdfi6

In [47]:
def replace_scheme(scheme):
    
    if scheme == 'http':
        return 'https'
    elif scheme == 'https':
        return 'http'
    else:
        return scheme
    
def replace_tld(netloc):
    
    tld_list = ['.io', '.ai', '.dev']
    tld_pattern = re.compile(r'\.(com|org|de|net|uk|us|mobi|co\.uk|gov|edu|io|ai|dev|biz|info|mil|int|arpa)\b', re.IGNORECASE)

    # Use regex to find and replace the TLD in the netloc
    new_netloc = tld_pattern.sub(lambda match: '.' + np.random.choice(tld_list).lstrip('.'), netloc)

    return new_netloc
    

parsed_urls_scheme = [url._replace(scheme=replace_scheme(url.scheme)) for url in parsed_urls]
parsed_urls_scheme_tld = [url._replace(scheme=replace_tld(url.netloc)) for url in parsed_urls_scheme]
mutated_urls = np.array([urlunparse(url) for url in parsed_urls_scheme_tld])
    

In [48]:
preprocessor = Preprocessing()

X_original = preprocessor.tokenize_batch(X_test)
y_original = preprocessor.encode_label_batch(y_test)

X_mutator = preprocessor.tokenize_batch(mutated_urls)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [49]:

model = load_model(MODEL_PATH)
original_predictions = model.predict(X_original)
mutator_predictions = model.predict(X_mutator)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step


In [50]:
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(original_predictions, mutator_predictions))


[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
