# **URL Malware Recognition**

In [1]:
import pandas as pd
import re
from urllib.parse import urlparse
from tldextract import extract
import math
import requests

### **Dataset preparation**

In [2]:
# Load the raw kaggle1.csv file. Forward slashes keep the relative path valid
# on Windows, Linux and macOS alike (a backslash is not a separator on POSIX).
data1 = pd.read_csv("row/kaggle1.csv")
data1.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [3]:
# Collapse the "type" column into a binary label: 0 for "benign", 1 for any
# other value, then drop the original column.
data1["label"] = data1["type"].apply(lambda x: 0 if x == "benign" else 1)
data1.drop("type", axis=1, inplace=True)
# Forward slashes keep the output path portable across operating systems.
data1.to_csv("data/data1.csv", index=False)
data1.head()

Unnamed: 0,url,label
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


In [4]:
# Load the raw kaggle2.csv file (portable forward-slash path).
data2 = pd.read_csv("row/kaggle2.csv")
data2.head()

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0
2,2,https://www.facebook.com,benign,0
3,3,https://www.baidu.com,benign,0
4,4,https://www.wikipedia.org,benign,0


In [5]:
# Drop the textual label and the stray saved-index column, then promote the
# numeric "result" column to be the label.
data2 = data2.drop(columns=["label", "Unnamed: 0"])
data2.rename(columns={'result': 'label'}, inplace=True)
# Forward slashes keep the output path portable across operating systems.
data2.to_csv("data/data2.csv", index=False)
data2.head()

Unnamed: 0,url,label
0,https://www.google.com,0
1,https://www.youtube.com,0
2,https://www.facebook.com,0
3,https://www.baidu.com,0
4,https://www.wikipedia.org,0


In [6]:
# Load the raw kaggle3.csv file (portable forward-slash path).
data3 = pd.read_csv("row/kaggle3.csv")
data3.head()

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0


In [7]:
# Drop the textual label and promote the numeric "result" column to be the label.
data3 = data3.drop(columns=["label"])
data3.rename(columns={"result": "label"}, inplace=True)
# Forward slashes keep the output path portable across operating systems.
data3.to_csv("data/data3.csv", index=False)
data3.head()

Unnamed: 0,url,label
0,https://www.google.com,0
1,https://www.youtube.com,0
2,https://www.facebook.com,0
3,https://www.baidu.com,0
4,https://www.wikipedia.org,0


In [8]:
# Load the raw phishtank_.csv file (portable forward-slash path).
data4 = pd.read_csv("row/phishtank_.csv")
data4.head()

Unnamed: 0,url,malware
0,https://fhantommwallet.gitbook.io/us,1
1,https://formularul-web-al-contului.webflow.io/,1
2,https://docs.google.com/presentation/d/e/2PACX...,1
3,https://geemoniahlogiinx.gitbook.io/usa,1
4,https://geimenizlogen.gitbook.io/us,1


In [9]:
# Rename "malware" to "label" so this frame matches the other datasets.
data4.rename(columns={"malware": "label"}, inplace=True)
# Forward slashes keep the output path portable across operating systems.
data4.to_csv("data/phishtank.csv", index=False)
data4.head()

Unnamed: 0,url,label
0,https://fhantommwallet.gitbook.io/us,1
1,https://formularul-web-al-contului.webflow.io/,1
2,https://docs.google.com/presentation/d/e/2PACX...,1
3,https://geemoniahlogiinx.gitbook.io/usa,1
4,https://geimenizlogen.gitbook.io/us,1


In [10]:
# Load the raw URLhaus_.csv file (portable forward-slash path).
data5 = pd.read_csv("row/URLhaus_.csv")
data5.head()

Unnamed: 0,url,malware
0,"""http://117.194.218.20:56416/i""",1
1,"""http://120.61.13.70:41627/bin.sh""",1
2,"""http://59.182.81.104:54634/i""",1
3,"""http://182.57.236.204:51822/i""",1
4,"""http://117.255.24.102:40653/i""",1


In [11]:
def remove_quotes_from_url(url):
    """Return ``url`` with every double-quote character removed."""
    pieces = url.split('"')
    return "".join(pieces)


# Strip the literal double quotes that wrap every URL in this dataset.
data5.loc[:, "url"] = data5["url"].apply(remove_quotes_from_url)
data5.rename(columns={"malware": "label"}, inplace=True)
# Forward slashes keep the output path portable across operating systems.
data5.to_csv("data/URLhaus.csv", index=False)
print(len(data5))
# Shuffle with a fixed seed so the 100,000-row subsample kept below is the
# same on every run of the notebook.
data5 = data5.sample(frac=1, random_state=42).reset_index(drop=True)
data5 = data5[:100000]
data5.head()

237793


Unnamed: 0,url,label
0,http://117.204.202.97:58028/Mozi.m,1
1,http://gerlia.shop:8888/3451.dll,1
2,http://117.248.175.138:52953/i,1
3,http://flameshamer.shop:8888/3193.dll,1
4,http://hild.shop:8888/4276.dll,1


In [12]:
# Load the raw top_1m.csv file (portable forward-slash path).
data6 = pd.read_csv("row/top_1m.csv")
data6.head()

Unnamed: 0,url
0,connectify.me
1,trendmicro.com
2,samsung.com
3,vmware.com
4,cisco.com


In [13]:
# Label every row as benign (0) and keep only the first 2000 rows.
data6 = data6.assign(label=0).iloc[:2000]
print(len(data6))
data6.head()

2000


Unnamed: 0,url,label
0,connectify.me,0
1,trendmicro.com,0
2,samsung.com,0
3,vmware.com,0
4,cisco.com,0


In [14]:
data1["label"].value_counts(), data2["label"].value_counts(), data3["label"].value_counts()

(label
 0    428103
 1    223088
 Name: count, dtype: int64,
 label
 0    345738
 1    104438
 Name: count, dtype: int64,
 label
 0    316254
 1    316254
 Name: count, dtype: int64)

In [15]:
merged_df = pd.concat([data1, data2, data3, data4, data5, data6])

In [16]:
merged_df["label"].value_counts()

label
0    1092095
1     811203
Name: count, dtype: int64

In [17]:
# Report the row count before and after removing rows whose "url" value
# repeats. Only the url column is compared, so a URL that appears with
# different labels in different sources is kept once.
# NOTE(review): which label survives depends on drop_duplicates' `keep`
# default and the concat order above — confirm that is the intended policy.
print("len data before removing duplicates" ,len(merged_df))
data = merged_df.drop_duplicates(subset=["url"])
print("len data after removing duplicates" ,len(data))

len data before removing duplicates 1903298
len data after removing duplicates 1259572


In [18]:
data["label"].value_counts()

label
0    775813
1    483759
Name: count, dtype: int64

In [19]:
data.isnull().sum()

url      0
label    0
dtype: int64

In [22]:
# Shuffle the merged dataset with a fixed seed so the saved row order (and
# anything derived from it later) is reproducible across runs.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.to_csv(r"data/full_data.csv", index=False)

### **Feature Extraction**

In [None]:
data_copy = data.copy()

In [None]:
def _split_host_and_path(url):
    """Split ``url`` into ``(host, path)`` after dropping a leading scheme.

    A leading ``scheme://`` (or a bare ``//``) is removed first. The host is
    everything up to the first "/" (so it still includes a port or userinfo
    when present); the path is everything after that slash.
    """
    without_scheme = re.sub(r"^(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//", "", url)
    host, _, path = without_scheme.partition("/")
    return host, path


# Many URLs in the dataset carry no scheme (e.g. "mp3raid.com/music/..."), so
# indexing url.split("/")[2] would measure a path segment instead of the
# domain. Splitting after removing the optional scheme handles both forms.
host_and_path = data["url"].apply(_split_host_and_path)

# URL length
data_copy["url_length"] = data["url"].apply(len)


# Domain length
data_copy["domain_length"] = host_and_path.apply(lambda parts: len(parts[0]))

# Path length

data_copy["path_length"] = host_and_path.apply(lambda parts: len(parts[1]))

# Num dots in domain
data_copy["num_dots_domain"] = host_and_path.apply(lambda parts: parts[0].count("."))

In [None]:
# Substrings whose occurrence counts each become one feature column.
features = [
    "@", "?", "-", "=", ".", "#", "%", "+", "$", "!", "*", ",",
    "//", "http", "https", "ftp/", ".com", "www.", ".ai", ".org",
    "index", "&", ";", "_", "~", "php", "asp", "action", "view",
    "cgi", "login",
]


def count_features(url):
    """Return a dict mapping each entry of ``features`` to its count in ``url``.

    Keys appear in the same order as ``features``; counts are non-overlapping
    occurrences as reported by ``str.count``.
    """
    counts = {}
    for token in features:
        counts[token] = url.count(token)
    return counts

# Count every entry of `features` in each URL; the result is a Series of dicts.
feature_counts = data["url"].apply(count_features)
# Expand the dicts into one column per feature and attach them to data_copy,
# reusing data's index so the rows stay aligned.
data_copy = data_copy.join(pd.DataFrame(feature_counts.tolist(), index=data.index))

# Sum each feature column to see how often each substring occurs overall.
total_counts = data_copy[features].sum()
print(total_counts)

In [None]:
def contains_ipv4(url):
    """Return 1 when ``url`` contains a dotted-quad IPv4 address, otherwise 0.

    Each of the four octets must be in the range 0-255 and the whole address
    must sit between word boundaries.
    """
    dotted_quad = r"\b((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
    if re.search(dotted_quad, url) is None:
        return 0
    return 1



data_copy.loc[:, "contains_IPv4"] = data["url"].apply(contains_ipv4)
data_copy["contains_IPv4"].value_counts()

In [None]:
def contains_ipv6(url):
    """Return 1 if the IPv6 regex below matches anywhere in ``url``, else 0.

    The pattern is one large alternation wrapped in ``\\b( ... )\\b``. Its
    branches cover, in order: the full eight-group form, the various
    ``::``-compressed forms, ``fe80:`` addresses followed by a ``%zone``
    suffix, and addresses that end in a dotted-quad IPv4 part.

    NOTE(review): the match is unanchored (``re.search``), so any substring of
    the URL that satisfies one branch counts — confirm that host:port or
    hex-like path text cannot trigger unwanted matches.
    """
    ipv6_pattern = (
        r"\b(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|"
        + r"([0-9a-fA-F]{1,4}:){1,7}:|"
        + r"([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|"
        + r"([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|"
        + r"([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|"
        + r"([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|"
        + r"([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|"
        + r"[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|"
        + r":((:[0-9a-fA-F]{1,4}){1,7}|:)|"
        + r"fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|"
        + r"::(ffff(:0{1,4}){0,1}:){0,1}"
        + r"((25[0-5]|(2[0-4]|1{0,1}[0-9]|[1-9]?)[0-9])\.){3,3}"
        + r"(25[0-5]|(2[0-4]|1{0,1}[0-9]|[1-9]?)[0-9])|"
        + r"([0-9a-fA-F]{1,4}:){1,4}:"
        + r"((25[0-5]|(2[0-4]|1{0,1}[0-9]|[1-9]?)[0-9])\.){3,3}"
        + r"(25[0-5]|(2[0-4]|1{0,1}[0-9]|[1-9]?)[0-9]))\b"
    )

    # Normalise the match object / None into the 1 / 0 flag used as a feature.
    return 1 if bool(re.search(ipv6_pattern, url)) else 0


data_copy.loc[:, "contains_IPv6"] = data["url"].apply(contains_ipv6)
data_copy["contains_IPv6"].value_counts()

In [None]:
def contains_ip_address(url):
    """Return 1 if ``url`` contains an IPv4 or an IPv6 address, otherwise 0.

    Combines the flags produced by ``contains_ipv4`` and ``contains_ipv6``.
    """
    has_v4 = contains_ipv4(url)
    has_v6 = contains_ipv6(url)
    if has_v4 + has_v6 != 0:
        return 1
    return 0


data_copy.loc[:, "IP_exist"] = data["url"].apply(contains_ip_address)
data_copy["IP_exist"].value_counts()

In [None]:
def get_number_of_subdomains(url):
    """Return how many dot-separated labels precede the last two in the host.

    A leading ``http://`` or ``https://`` is removed, the host is taken as the
    text before the first "/", and every label beyond the final two counts as
    one subdomain. Hosts with two labels or fewer yield 0.
    """
    host = re.sub(r"^https?://", "", url).partition("/")[0]
    label_count = host.count(".") + 1
    return max(label_count - 2, 0)



data_copy.loc[:, "num_subdomains"] = data["url"].apply(get_number_of_subdomains)

In [None]:
def contains_hexadecimal(url):
    """Return 1 if ``url`` has a percent-encoded byte ("%" + two hex digits), else 0."""
    found = re.search(r"%[0-9A-Fa-f]{2}", url)
    return int(found is not None)



data_copy.loc[:, "contains_hexadecimal"] = data["url"].apply(contains_hexadecimal)


data_copy["contains_hexadecimal"].value_counts()

In [None]:
def calculate_entropy(url):
    """Return the Shannon entropy, in bits per character, of ``url``.

    Surrounding whitespace is stripped first; if nothing is left the result
    is 0.0. The entropy is H = -sum(p * log2(p)) over the relative frequency
    p of each distinct character, so the result is always >= 0.
    """
    string = url.strip()
    if not string:
        return 0.0  # Return 0 entropy for empty strings
    length = len(string)
    # Count the frequency of each character
    frequency = {c: string.count(c) for c in set(string)}
    # -p * log2(p) is written as p * log2(1 / p) = (count / length) *
    # log2(length / count): every term is non-negative, which carries the
    # minus sign of the Shannon formula and never produces -0.0.
    return sum(
        (count / length) * math.log2(length / count) for count in frequency.values()
    )


data_copy.loc[:, "entropy"] = data["url"].apply(calculate_entropy)
data_copy["entropy"].head()

In [None]:
def count_chars_domain_extension(url):
    """Return the length of the host's final dot-separated label, capped at 4.

    The host is the text after the last "//" split piece and before the first
    "/". Returns -1 when the host has no dot or when the final label is not
    purely alphabetic (for example an IP address or a host with a port).
    """
    # Keep only what follows the protocol separator, then cut at the path.
    after_scheme = url.split("//")[-1]
    host = after_scheme.partition("/")[0]

    _, dot, ext = host.rpartition(".")
    if not dot:
        # No dot in the host, hence no extension.
        return -1
    if not ext.isalpha():
        return -1

    # Extensions longer than four characters are reported as 4.
    return len(ext) if len(ext) < 4 else 4


data_copy.loc[:, "count_num_domain_extension"] = data["url"].apply(count_chars_domain_extension)
data_copy["count_num_domain_extension"].value_counts()

In [None]:
pd.crosstab(data_copy['label'], data_copy['count_num_domain_extension'])

In [None]:
def is_pre_domain(url):
    """Return 1 if tldextract finds both a domain and a suffix in ``url``, else 0."""
    parts = extract(url)
    has_domain_and_suffix = bool(parts.domain) and bool(parts.suffix)
    return 1 if has_domain_and_suffix else 0


data_copy["is_pre_domain"] = data["url"].apply(is_pre_domain)
pd.crosstab(data_copy["label"], data_copy["is_pre_domain"])

In [None]:
def is_port(url):
    """Return 1 if ``url`` carries an explicit, valid port number, else 0.

    URLs that cannot be parsed, or whose port is not a valid number, yield 0.

    NOTE(review): ``urlparse`` only recognises a network location after
    "//", so URLs without a scheme report 0 here — confirm that is intended.
    """
    try:
        # Accessing .port validates the port text and raises ValueError when
        # it is malformed or out of range.
        port = urlparse(url).port
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL or port; the other two guard against
        # non-string values reaching this function.
        return 0
    # Compare against None rather than truthiness so that port 0 still counts
    # as an explicit port.
    return 1 if port is not None else 0


data_copy["is_port"] = data["url"].apply(is_port)
pd.crosstab(data_copy["label"], data_copy["is_port"])

In [None]:
data_copy.head()

In [None]:
def count_digits_in_url(url):
    """Return the number of digit characters in ``url``."""
    return sum(1 for ch in url if ch.isdigit())


# Number of digit characters per URL.
data_copy["digits_count"] = data["url"].apply(count_digits_in_url)
data_copy["digits_count"].head()

In [None]:
def count_chars_in_url(url):
    """Return the number of alphabetic characters in ``url``."""
    letters = [ch for ch in url if ch.isalpha()]
    return len(letters)


data_copy["alpha_count"] = data["url"].apply(count_chars_in_url)
data_copy["alpha_count"].head()

In [None]:
def count_special_in_url(url):
    """Return the number of characters in ``url`` that are neither letters nor digits."""
    return sum(1 for ch in url if not ch.isalnum())


data_copy["special_chars_count"] = data["url"].apply(count_special_in_url)
data_copy["special_chars_count"].head()

In [None]:
def host_exist(url):
    """Return 1 if a hostname can be parsed from ``url`` and occurs in it, else 0.

    URLs that cannot be parsed, and URLs for which ``urlparse`` reports no
    hostname (for example URLs without a "//" network-location part), yield 0.
    """
    try:
        # Parse the URL to get the hostname
        hostname = urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed network location (e.g. an unclosed IPv6
        # bracket); the other two guard against non-string values.
        return 0
    if hostname is None:
        return 0
    # urlparse lower-cases the hostname, so compare case-insensitively;
    # otherwise a host written with capitals would never be found in the
    # original URL text. A plain substring test replaces the escaped regex.
    return 1 if hostname.lower() in url.lower() else 0


# Flag URLs from which a hostname can be parsed, then compare against the label.
data_copy["is_host"] = data["url"].apply(host_exist)
pd.crosstab(data_copy["label"], data_copy["is_host"])

In [None]:
def num_parameters(url):
    """Return the number of "&" separators in ``url``."""
    return url.count("&")

data_copy["num_params"] = data["url"].apply(num_parameters)
pd.crosstab(data_copy["label"], data_copy["num_params"])

In [None]:
def num_fragments(url):
    """Return the number of "#" characters in ``url``."""
    return url.count("#")


data_copy["num_fragments"] = data["url"].apply(num_fragments)
pd.crosstab(data_copy["label"], data_copy["num_fragments"])

In [None]:
def num_sub_domains(url):
    """Return the number of "/" separators left after trimming the URL prefix.

    The text after the last "http" piece, and then after the last "//" piece,
    is kept; the result is how many "/" characters remain in it.
    """
    remainder = url.split("http")[-1]
    remainder = remainder.split("//")[-1]
    return remainder.count("/")


data_copy["num_sub_domains"] = data["url"].apply(num_sub_domains)
pd.crosstab(data_copy["label"], data_copy["num_sub_domains"])

In [None]:
def domain_extension(url):
    """Return the last dot-separated label of the URL's network location.

    Anything from the first ":" onward is cut from the netloc before it is
    split on dots. The last label is returned only when the host has between
    two and four labels; otherwise, or when parsing fails, "" is returned.
    """
    try:
        host = urlparse(url).netloc.partition(":")[0]
        labels = host.split(".")
        if 1 < len(labels) < 5:
            return labels[-1]
        # Single-label hosts and hosts with five or more labels get no extension.
        return ""
    except Exception as e:
        print(f"Error processing URL '{url}': {e}")
        return ""


data_copy["domain_extension"] = data["url"].apply(domain_extension)
pd.crosstab(data_copy["label"], data_copy["domain_extension"])

In [None]:
data_copy.to_csv("data.csv", index=False)

In [None]:
def get_host_length(url):
    """Return the length of the URL's network location (netloc), or 0.

    0 is returned when the URL has no netloc or when parsing raises; in the
    latter case the error is printed.
    """
    try:
        return len(urlparse(url).netloc)
    except Exception as e:
        # Parsing failed: report it and fall back to 0.
        print(f"Error processing URL '{url}': {e}")
        return 0


data_copy["len_host"] = data["url"].apply(get_host_length)
pd.crosstab(data_copy["label"], data_copy["len_host"])

In [None]:
def get_path_length(url):
    """Return the length of the URL's path component, or 0.

    0 is returned when the path is empty or when parsing raises; in the
    latter case the error is printed.
    """
    try:
        return len(urlparse(url).path)
    except Exception as e:
        # Parsing failed: report it and fall back to 0.
        print(f"Error processing URL '{url}': {e}")
        return 0


data_copy["len_path"] = data["url"].apply(get_path_length)
pd.crosstab(data_copy["label"], data_copy["len_path"])

In [None]:
data_copy.columns

In [None]:
data_copy.head()

In [None]:
data_copy["count_num_domain_extension"].head()