In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import tldextract
from bs4 import BeautifulSoup
from urllib.parse import urlparse, unquote, urljoin
import ipaddress
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Locate data/index.csv one directory above the current working directory.
index_csv_path = Path.cwd().parent.joinpath("data", "index.csv")
print(index_csv_path)

# Read the index into a DataFrame and show it as the cell's output.
index_df = pd.read_csv(filepath_or_buffer=index_csv_path)
index_df

/Users/venkateshmunaga/Desktop/python_practice/url_phishing/data/index.csv


Unnamed: 0,rec_id,url,website,result,created_date
0,1,http://intego3.info/EXEL/index.php,1613573972338075.html,1,2021-02-17 20:29:32
1,2,https://www.mathopenref.com/segment.html,1635698138155948.html,0,2021-10-31 16:35:38
2,3,https://www.computerhope.com/issues/ch000254.htm,1635699228889266.html,0,2021-10-31 16:53:48
3,4,https://www.investopedia.com/terms/n/next-elev...,1635750062162701.html,0,2021-11-01 12:31:02
4,5,https://jobs.emss.org.uk/lcc.aspx,161356510250721.html,0,2021-02-17 18:01:42
...,...,...,...,...,...
79995,79996,https://bestjobmanage.com/adminservicedesk/True/,1622125898152592.html,1,2021-05-27 14:31:38
79996,79997,http://vieuxshack.com/download/adobe/b51f18074...,160822961118687.html,1,2020-12-17 23:56:51
79997,79998,https://www.tumblr.com/search/gas%20mask%20tattoo,163570726902772.html,0,2021-10-31 19:07:49
79998,79999,https://www.magnetic-shield.com/pdf/wc_4.pdf,1635701992217159.html,0,2021-10-31 17:39:52


# Extracting URL Features

In [3]:
def safe_get_tld(url):
    """Return the suffix that ``tldextract`` reports for ``url``.

    The literal string ``'no tld'`` is returned instead when the extracted
    suffix is empty, or when the extraction raises any exception.
    """
    fallback = 'no tld'
    try:
        suffix = tldextract.extract(url).suffix
    except Exception:
        # Best-effort feature: any extraction failure maps to the fallback label.
        return fallback
    return suffix or fallback


In [4]:
def is_domain_IP_check(url):
    """Return 1 when the host part of ``url`` is a literal IP address, else 0.

    Parameters
    ----------
    url : str
        URL to inspect. A scheme is optional: ``"192.168.0.1/login"`` and
        ``"example.com/path"`` are parsed as if they started with ``//``.

    Returns
    -------
    int
        1 if the hostname parses as an IPv4 or IPv6 address, 0 otherwise.
        Non-string input, URLs without any host, and URLs that ``urlparse``
        rejects all yield 0.
    """
    if not isinstance(url, str):
        # e.g. NaN cells coming from a DataFrame column
        return 0
    try:
        hostname = urlparse(url).hostname
        if hostname is None:
            # Without a scheme urlparse puts everything into the path, so
            # retry as a network-path reference to recover the host.
            hostname = urlparse('//' + url.lstrip('/')).hostname
        if not hostname:
            # No host at all is not evidence of an IP address.
            return 0
        ipaddress.ip_address(hostname)
        return 1  # hostname parsed as an IP address
    except ValueError:
        # Raised by ip_address for domain names and by urlparse for
        # malformed netlocs (e.g. unbalanced IPv6 brackets).
        return 0

In [5]:
def subdomain_count(url):
    """Count the dot-separated labels in the subdomain part of ``url``.

    Returns 0 whenever ``is_domain_IP_check`` reports a non-zero value for the
    URL, and also when ``tldextract`` finds no subdomain.
    """
    if is_domain_IP_check(url) != 0:
        return 0

    subdomain = tldextract.extract(url).subdomain
    if not subdomain:
        return 0

    # "a.b" holds two labels: one more than the number of dots.
    return subdomain.count('.') + 1

# Example usage: one URL with nested subdomains, one bare registered domain
url1 = 'https://sub.sub2.example.co.uk/path'
url2 = 'https://example.com'
for sample_url in (url1, url2):
    # Prints 2 for url1 (sub and sub2), then 0 for url2 (no subdomain)
    print(subdomain_count(sample_url))

2
0


In [6]:
# def count_obfuscated_chars(url):
#     # Count percent-encoded characters (%XX)
#     percent_encoded = len(re.findall(r'%[0-9A-Fa-f]{2}', url))
    
#     # Count special characters commonly used in obfuscation
#     special_chars = ['@', '!', '$', '&', '*', '+', ';', '=', '?', '#', '[', ']', '(', ')', '{', '}']
#     special_count = sum(url.count(char) for char in special_chars)
    
#     # Count unicode escape sequences (\uXXXX)
#     unicode_escapes = len(re.findall(r'\\u[0-9A-Fa-f]{4}', url))
    
#     # Count hex representations (0xXX)
#     hex_representations = len(re.findall(r'0x[0-9A-Fa-f]+', url))
    
#     # Total obfuscated characters
#     total_obfuscated = percent_encoded + special_count + unicode_escapes + hex_representations
    
#     return total_obfuscated

def count_obfuscated_chars(url):
    """Score ``url`` by how many obfuscation-related characters it contains.

    The score is the number of occurrences of the listed special characters
    (the list includes the backslash and ``%``) plus the number of ``%XX``
    hex escapes. Because ``%`` is itself in the list, every well-formed
    percent escape contributes 2 to the total.
    """
    special_hits = sum(map(url.count, '@!$&*+;=?#[\\](){}%'))
    percent_escapes = re.findall(r'%[0-9A-Fa-f]{2}', url)
    return special_hits + len(percent_escapes)

# Example usage: a URL with a userinfo '@' and percent escapes vs. a plain URL
url1 = "http://google.com@190.211.254.196/path%20with%20spaces"
url2 = "https://example.com/normal/path"

for sample_url in (url1, url2):
    # url1 gives the higher count, url2 the lower one
    print(count_obfuscated_chars(sample_url))


5
0


In [7]:
# Add the lexical URL features as new columns, all derived from the 'url' column.
index_df = index_df.assign(
    # Features computed by the helper functions defined above
    tld=index_df['url'].apply(safe_get_tld),
    url_len=index_df['url'].str.len(),
    is_domain_IP=index_df['url'].apply(is_domain_IP_check),
    no_of_sub_domain=index_df['url'].apply(subdomain_count),
    no_of_obfuscated_chars=index_df['url'].apply(count_obfuscated_chars),
    # 1 when the URL scheme is exactly 'https', otherwise 0
    is_https=index_df['url'].apply(lambda link: urlparse(link).scheme == 'https').astype(int),
    # Raw character counts; '?' is escaped because str.count takes a regex
    no_equal=index_df['url'].str.count('='),
    no_qmark=index_df['url'].str.count(r'\?'),
    no_amp=index_df['url'].str.count('&'),
)

In [8]:
# Display the frame so the newly added URL feature columns can be inspected.
index_df

Unnamed: 0,rec_id,url,website,result,created_date,tld,url_len,is_domain_IP,no_of_sub_domain,no_of_obfuscated_chars,is_https,no_equal,no_qmark,no_amp
0,1,http://intego3.info/EXEL/index.php,1613573972338075.html,1,2021-02-17 20:29:32,info,34,0,0,0,0,0,0,0
1,2,https://www.mathopenref.com/segment.html,1635698138155948.html,0,2021-10-31 16:35:38,com,40,0,1,0,1,0,0,0
2,3,https://www.computerhope.com/issues/ch000254.htm,1635699228889266.html,0,2021-10-31 16:53:48,com,48,0,1,0,1,0,0,0
3,4,https://www.investopedia.com/terms/n/next-elev...,1635750062162701.html,0,2021-11-01 12:31:02,com,52,0,1,0,1,0,0,0
4,5,https://jobs.emss.org.uk/lcc.aspx,161356510250721.html,0,2021-02-17 18:01:42,org.uk,33,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,79996,https://bestjobmanage.com/adminservicedesk/True/,1622125898152592.html,1,2021-05-27 14:31:38,com,48,0,0,0,1,0,0,0
79996,79997,http://vieuxshack.com/download/adobe/b51f18074...,160822961118687.html,1,2020-12-17 23:56:51,com,245,0,0,8,0,3,1,2
79997,79998,https://www.tumblr.com/search/gas%20mask%20tattoo,163570726902772.html,0,2021-10-31 19:07:49,com,49,0,1,4,1,0,0,0
79998,79999,https://www.magnetic-shield.com/pdf/wc_4.pdf,1635701992217159.html,0,2021-10-31 17:39:52,com,44,0,1,0,1,0,0,0


# Extract HTML Content

In [None]:
DATA_DIR = Path.cwd().parent / "data" / "html data"

# Names of the features produced by check_html, in the order it returns them.
HTML_FEATURE_COLUMNS = [
    "has_title", "has_description", "has_external_form_submit", "has_favicon",
    "no_of_images", "no_of_js", "has_password_field", "has_submit_button",
    "has_copyright_info",
]

# Returned when a page cannot be read or parsed: every feature is 0.
_NO_HTML_FEATURES = (0,) * len(HTML_FEATURE_COLUMNS)

_COPYRIGHT_RE = re.compile("\N{COPYRIGHT SIGN}")
# HTML attribute values for `type` are case-insensitive, so match them loosely.
_PASSWORD_TYPE_RE = re.compile(r"^\s*password\s*$", re.IGNORECASE)
_SUBMIT_TYPE_RE = re.compile(r"^\s*submit\s*$", re.IGNORECASE)


def _is_external_action(page_url, page_netloc, action):
    """Return True when a form ``action`` resolves to a netloc other than the page's."""
    try:
        return urlparse(urljoin(page_url, action)).netloc != page_netloc
    except ValueError:
        # urljoin/urlparse reject some malformed targets; treat them as unresolved.
        return False


def check_html(filename, url):
    """Extract content features from one saved HTML page.

    Parameters
    ----------
    filename : str
        Name of the HTML file inside ``DATA_DIR``.
    url : str
        URL the page was captured from; used to resolve relative form actions
        and to decide whether a form posts to another netloc.

    Returns
    -------
    tuple of int
        One value per entry of ``HTML_FEATURE_COLUMNS``, in that order:
        has_title, has_description, has_external_form_submit, has_favicon,
        no_of_images, no_of_js, has_password_field, has_submit_button,
        has_copyright_info. All zeros are returned when either argument is not
        a string, the file is missing, or it cannot be read or parsed.
    """
    if not isinstance(filename, str) or not isinstance(url, str):
        # e.g. NaN cells in the index; there is nothing to open or resolve.
        return _NO_HTML_FEATURES

    file_path = DATA_DIR / filename
    if not file_path.is_file():
        return _NO_HTML_FEATURES
    try:
        text = file_path.read_text(encoding="utf-8", errors="ignore")
        soup = BeautifulSoup(text, "lxml")
    except Exception:
        # Best-effort extraction: an unreadable page contributes no features.
        return _NO_HTML_FEATURES

    try:
        parsed_domain = urlparse(url).netloc
    except ValueError:
        # Malformed page URL (e.g. broken IPv6 brackets): no netloc to compare.
        parsed_domain = ""

    title = soup.title
    has_title = int(bool(title and title.string and title.string.strip()))
    has_desc = int(bool(soup.find("meta", attrs={"name": "description"})))
    has_external_form = int(any(
        _is_external_action(url, parsed_domain, form.get("action", ""))
        for form in soup.select("form[action]")
    ))
    icon_link = soup.find("link", rel=lambda x: x and "icon" in x.lower())
    has_favicon = int(bool(icon_link and icon_link.get("href")))
    no_of_images = len(soup.find_all("img"))
    no_of_js = len(soup.find_all("script"))
    # Any <input type="password"> counts, whatever its id or name is.
    has_password_field = int(bool(soup.find("input", attrs={"type": _PASSWORD_TYPE_RE})))
    # Any submit control counts, whatever its label is.
    has_submit_button = int(bool(
        soup.find("input", attrs={"type": _SUBMIT_TYPE_RE})
        or soup.find("button", attrs={"type": _SUBMIT_TYPE_RE})
    ))
    has_copyright_info = int(bool(soup.find(string=_COPYRIGHT_RE)))

    return (
        has_title, has_desc, has_external_form, has_favicon, no_of_images,
        no_of_js, has_password_field, has_submit_button, has_copyright_info,
    )


def run_checks(df, workers=14):
    """Run ``check_html`` for every row of ``df`` and store the features on it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"website"`` (HTML file name) and ``"url"``.
        The frame is modified in place.
    workers : int, default 14
        Number of threads used to process pages.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with one column per ``HTML_FEATURE_COLUMNS``
        entry added (or overwritten).
    """
    try:
        from tqdm.auto import tqdm
    except ImportError:
        # The progress bar is optional; extraction works without tqdm.
        tqdm = None

    with ThreadPoolExecutor(max_workers=workers) as executor:
        # executor.map yields results in input order, which keeps every feature
        # tuple aligned with the row it was computed from.
        ordered = executor.map(check_html, df["website"], df["url"])
        if tqdm is not None:
            ordered = tqdm(ordered, total=len(df), desc="HTML Feature Extract")
        results = list(ordered)

    features = pd.DataFrame(results, columns=HTML_FEATURE_COLUMNS)
    for name in HTML_FEATURE_COLUMNS:
        # Positional assignment: row i of the results belongs to row i of df.
        df[name] = features[name].to_numpy()
    return df

# Extract the HTML features for every indexed page using 30 worker threads,
# then preview the first rows of the result.
index_df = run_checks(df=index_df, workers=30)
index_df.head()

In [None]:
# Scratch check: does one saved page contain the copyright sign?
html_file = Path.cwd().parent.joinpath("data", "html data", "1635750062162701.html")
raw_html = html_file.read_text(encoding="utf-8", errors="ignore")
soup = BeautifulSoup(raw_html, "lxml")

# Round-tripping through UTF-8 leaves the one-character string unchanged.
symbol = u'\N{COPYRIGHT SIGN}'.encode('utf-8').decode('utf-8')
pattern = r'' + symbol

# 1 when at least one text node matches the pattern, otherwise 0.
copyright_nodes = soup.find_all(string=re.compile(pattern=pattern))
int(bool(copyright_nodes))

In [None]:
# Look up the URL recorded for the sample HTML file.
sample_rows = index_df['website'] == "1635750062162701.html"
index_df.loc[sample_rows, 'url'].item()

In [None]:
# Scratch check: find the page's icon <link> and resolve its href against the page URL.
base_url = 'https://www.investopedia.com/terms/n/next-eleven.asp'
icon_link = soup.find('link', rel=lambda rel_value: rel_value and 'icon' in rel_value.lower())
if icon_link and icon_link.get('href'):
    if base_url:
        favicon_url = urljoin(base_url, icon_link['href'])
    else:
        # Without a base URL the href is reported as written in the page.
        favicon_url = icon_link['href']
    print(True, favicon_url)

In [None]:
# Show the parent of the current working directory (the base used for the data paths above).
Path.cwd().parent

In [None]:
import requests

# Scratch check against a live page: list the text nodes containing the copyright sign.
url = 'https://www.python.org'
response = requests.get(url, timeout=5)
response.raise_for_status()  # stop here on an HTTP error status
soup = BeautifulSoup(response.text, 'lxml')

# Round-tripping through UTF-8 leaves the one-character string unchanged.
symbol = u'\N{COPYRIGHT SIGN}'.encode('utf-8').decode('utf-8')
pattern = r'' + symbol
copyright_matcher = re.compile(pattern=pattern)
soup.find_all(string=copyright_matcher)

In [None]:
# Display the copyright character used as the search pattern.
symbol

In [None]:
# Flatten the page text: newlines are removed first (joining adjacent lines
# without a separator), then remaining whitespace runs collapse to one space.
text_without_newlines = soup.text.replace("\n", '')
re.sub(r'\s+', ' ', text_without_newlines).strip()