In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_wikipedia_safe_urls(seed_urls, max_urls=7000):
    visited = set()
    to_visit = set(seed_urls)
    all_links = set()

    while to_visit and len(all_links) < max_urls:
        url = to_visit.pop()
        if url in visited:
            continue

        try:
            response = requests.get(url, timeout=5)
            soup = BeautifulSoup(response.text, "html.parser")

            for link in soup.find_all("a", href=True):
                href = link["href"]
                if href.startswith("/wiki/") and ":" not in href:
                    to_visit.add("https://en.wikipedia.org" + href)
                elif "http" in href:
                    all_links.add(href)
            
            visited.add(url)
        except Exception as e:
            print(f"Error processing {url}: {e}")

    return list(all_links)

# Seed pages
seed_urls = [
    "https://en.wikipedia.org/wiki/List_of_most_popular_websites",
    "https://en.wikipedia.org/wiki/List_of_search_engines",
    "https://en.wikipedia.org/wiki/List_of_social_networking_websites"
]

# Get Wikipedia safe URLs
safe_urls = get_wikipedia_safe_urls(seed_urls, max_urls=7000)

# Save to CSV
df = pd.DataFrame(safe_urls, columns=["url"])
df["type"] = "benign"
df.to_csv("wikipedia_safe_urls.csv", index=False)

print(f"✔ Collected {len(safe_urls)} safe URLs and saved to CSV!")


✔ Collected 7410 safe URLs and saved to CSV!


In [3]:
import requests
import pandas as pd
import csv

# List of phishing URL sources
PHISHING_FEEDS = [
    "https://openphish.com/feed.txt",
    # "http://data.phishtank.com/data/online-valid.csv",
    "https://urlhaus.abuse.ch/downloads/text_online/",
]

def get_phishing_urls():
    phishing_urls = set()  # Use a set to avoid duplicates

    for feed in PHISHING_FEEDS:
        try:
            response = requests.get(feed, timeout=10)
            if response.status_code == 200:
                content = response.text.split("\n")
                for line in content:
                    line = line.strip()
                    if line and ("http" in line or "." in line):  # Ensure it's a valid URL
                        phishing_urls.add(line)
            else:
                print(f"❌ Failed to fetch from {feed}")
        except Exception as e:
            print(f"⚠ Error fetching {feed}: {e}")

    return list(phishing_urls)

# Get phishing URLs
phishing_urls = get_phishing_urls()

# Save to CSV
df = pd.DataFrame(phishing_urls, columns=["url"])
df["type"] = "phishing"
df.to_csv("phishing_urls.csv", index=False, quoting=csv.QUOTE_ALL)

print(f"✔ Collected {len(phishing_urls)} phishing URLs and saved to CSV!")


✔ Collected 8934 phishing URLs and saved to CSV!


In [4]:
import requests
import pandas as pd
import csv

# List of defacement URL sources
DEFACEMENT_FEEDS = [
    "https://cybercrime-tracker.net/ccamlist.php",
    "https://urlhaus.abuse.ch/downloads/text_online/",
    # "http://www.malwaredomainlist.com/mdlcsv.php",
    "https://feodotracker.abuse.ch/downloads/ipblocklist_recommended.txt"
]

def get_defacement_urls():
    defacement_urls = set()  # Use a set to remove duplicates

    for feed in DEFACEMENT_FEEDS:
        try:
            response = requests.get(feed, timeout=10)
            if response.status_code == 200:
                content = response.text.split("\n")
                for line in content:
                    line = line.strip()
                    if line and ("http" in line or "." in line):  # Ensure it's a valid URL
                        defacement_urls.add(line)
            else:
                print(f"❌ Failed to fetch from {feed}")
        except Exception as e:
            print(f"⚠ Error fetching {feed}: {e}")

    return list(defacement_urls)

# Get defacement URLs
defacement_urls = get_defacement_urls()

# Save to CSV
df = pd.DataFrame(defacement_urls, columns=["url"])
df["type"] = "defacement"
df.to_csv("defacement_urls.csv", index=False, quoting=csv.QUOTE_ALL)

print(f"✔ Collected {len(defacement_urls)} defacement URLs and saved to CSV!")


✔ Collected 8439 defacement URLs and saved to CSV!


In [5]:
# Load malicious dataset
df_malicious = pd.read_csv("malicious_phish.csv")

# Load verified safe URLs
df_safe = pd.read_csv("wikipedia_safe_urls.csv")

# Merge datasets
df_final = pd.concat([df_malicious, df_safe], ignore_index=True)

# Save final dataset
df_final.to_csv("final_url_dataset.csv", index=False)

print("✔ Final dataset with real benign URLs saved!")

✔ Final dataset with real benign URLs saved!


In [6]:
import requests
import pandas as pd
import csv

# List of malware URL sources
MALWARE_FEEDS = [
    "https://urlhaus.abuse.ch/downloads/text/",
    # "https://bazaar.abuse.ch/export/txt/recent/",
    # "http://www.malwaredomainlist.com/mdlcsv.php",
    "https://feodotracker.abuse.ch/downloads/ipblocklist_recommended.txt",
    # "https://cybercrime-tracker.net/all.php",
    "http://vxvault.net/URL_List.php"
]

def get_malware_urls():
    malware_urls = set()  # Use a set to avoid duplicates

    for feed in MALWARE_FEEDS:
        try:
            response = requests.get(feed, timeout=10)
            if response.status_code == 200:
                content = response.text.split("\n")
                for line in content:
                    line = line.strip()
                    if line and not line.startswith("#") and ("http" in line or "." in line):  # Clean URLs
                        malware_urls.add(line)
            else:
                print(f"❌ Failed to fetch from {feed}")
        except Exception as e:
            print(f"⚠ Error fetching {feed}: {e}")

    return list(malware_urls)

# Get malware URLs
malware_urls = get_malware_urls()

# Save to CSV
df = pd.DataFrame(malware_urls, columns=["url"])
df["type"] = "malware"
df.to_csv("malware_urls.csv", index=False, quoting=csv.QUOTE_ALL)

print(f"✔ Collected {len(malware_urls)} malware URLs and saved to CSV!")


✔ Collected 147089 malware URLs and saved to CSV!


In [7]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

def get_wikipedia_links(start_url, limit=15000):
    """
    Crawls Wikipedia pages to collect external links (safe URLs).
    """
    queue = [start_url]  # Start from this URL
    visited = set()
    safe_urls = set()

    while queue and len(safe_urls) < limit:
        url = queue.pop(0)  # Get next URL to process
        if url in visited:
            continue
        visited.add(url)

        print(f"Fetching: {url} ({len(safe_urls)}/{limit})")
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")

                # Extract external links
                for link in soup.find_all("a", href=True):
                    href = link["href"]

                    # Convert relative Wikipedia links to absolute
                    if href.startswith("/wiki/"):
                        full_url = "https://en.wikipedia.org" + href
                        if full_url not in visited and len(queue) < 500:  # Avoid infinite loops
                            queue.append(full_url)

                    # Extract external safe links
                    if href.startswith("http") and "wikipedia.org" not in href:
                        safe_urls.add(href)

                    if len(safe_urls) >= limit:
                        break

            else:
                print(f"❌ Failed to fetch {url}")

        except Exception as e:
            print(f"⚠ Error fetching {url}: {e}")

        time.sleep(1)  # Respect Wikipedia’s servers

    return list(safe_urls)

# List of Wikipedia pages to start crawling from
seed_pages = [
    "https://en.wikipedia.org/wiki/List_of_most_popular_websites",
    "https://en.wikipedia.org/wiki/Category:Websites",
    "https://en.wikipedia.org/wiki/Category:Online_services",
    "https://en.wikipedia.org/wiki/List_of_social_media_platforms",
    "https://en.wikipedia.org/wiki/List_of_video_sharing_websites",
    "https://en.wikipedia.org/wiki/List_of_news_websites",
]

# Collect Safe URLs
safe_urls = set()
for page in seed_pages:
    safe_urls.update(get_wikipedia_links(page, limit=15000))
    if len(safe_urls) >= 15000:
        break

# Save to CSV
df = pd.DataFrame(list(safe_urls), columns=["url"])
df["type"] = "benign"
df.to_csv("safe_urls.csv", index=False)

print(f"✔ {len(safe_urls)} Safe URLs saved!")


Fetching: https://en.wikipedia.org/wiki/List_of_most_popular_websites (0/15000)
Fetching: https://en.wikipedia.org/wiki/Main_Page (54/15000)
Fetching: https://en.wikipedia.org/wiki/Wikipedia:Contents (87/15000)
Fetching: https://en.wikipedia.org/wiki/Portal:Current_events (90/15000)
Fetching: https://en.wikipedia.org/wiki/Special:Random (207/15000)
Fetching: https://en.wikipedia.org/wiki/Wikipedia:About (218/15000)
Fetching: https://en.wikipedia.org/wiki/Help:Contents (239/15000)
Fetching: https://en.wikipedia.org/wiki/Help:Introduction (255/15000)
Fetching: https://en.wikipedia.org/wiki/Wikipedia:Community_portal (261/15000)
Fetching: https://en.wikipedia.org/wiki/Special:RecentChanges (339/15000)
Fetching: https://en.wikipedia.org/wiki/Wikipedia:File_upload_wizard (342/15000)
Fetching: https://en.wikipedia.org/wiki/Special:SpecialPages (350/15000)
Fetching: https://en.wikipedia.org/wiki/Special:Search (351/15000)
Fetching: https://en.wikipedia.org/wiki/Special:MyContributions (351/15

In [8]:
import requests
import pandas as pd
import time

def fetch_malicious_urls(source_url, limit=5000):
    """
    Fetches malicious URLs from a given threat intelligence feed.
    """
    try:
        response = requests.get(source_url, timeout=10)
        if response.status_code == 200:
            urls = response.text.split("\n")
            urls = [u.strip() for u in urls if u.strip() and not u.startswith("#")]
            return urls[:limit]  # Limit results to avoid excessive duplicates
        else:
            print(f"❌ Failed to fetch: {source_url}")
            return []
    except Exception as e:
        print(f"⚠ Error fetching {source_url}: {e}")
        return []

# Real-time Malicious URL sources
malicious_sources = {
    "phishing": "https://openphish.com/feed.txt",
    "malware": "https://urlhaus.abuse.ch/downloads/text/",
    "defacement": "https://cybercrime-tracker.net/ccamlist.php",
}

# Fetch malicious URLs
malicious_urls = []
for category, url in malicious_sources.items():
    print(f"Fetching {category} URLs from {url}...")
    malicious_urls.extend([(u, category) for u in fetch_malicious_urls(url, limit=5000)])
    time.sleep(2)  # Respect servers

# Save to CSV
df = pd.DataFrame(malicious_urls, columns=["url", "type"])
df.to_csv("malicious_urls.csv", index=False)

print(f"✔ {len(malicious_urls)} Malicious URLs saved!")


Fetching phishing URLs from https://openphish.com/feed.txt...
Fetching malware URLs from https://urlhaus.abuse.ch/downloads/text/...
Fetching defacement URLs from https://cybercrime-tracker.net/ccamlist.php...
✔ 5944 Malicious URLs saved!


In [9]:
import pandas as pd

# List of CSV files
csv_files = ["wikipedia_safe_urls.csv", "phishing_urls.csv", "defacement_urls.csv", "malware_urls.csv","malicious_phish.csv"]

# Read and combine all CSV files
df_list = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(df_list, ignore_index=True)

# Save to a new CSV file
combined_df.to_csv("final_url_dataset.csv", index=False)

print(f"✔ Combined {len(combined_df)} rows into 'final_url_dataset.csv'!")


✔ Combined 177816 rows into 'final_url_dataset.csv'!


In [10]:
import pandas as pd

def encode_url(url):
    """Convert a URL into ASCII Hex Encoding (UTF-8 Safe)."""
    return url.encode('utf-8').hex()  # UTF-8 encoding prevents errors

# Load the final URLs CSV
df = pd.read_csv("final_url_dataset.csv")

# Drop any rows with missing URLs
df = df.dropna(subset=["url"])

# Encode URLs safely
df["encoded_url"] = df["url"].apply(encode_url)

# Save the encoded URLs to a new CSV
df[["encoded_url", "type"]].to_csv("final_encoded_urls.csv", index=False)

print(f"✔ Encoded URLs saved to final_encoded_urls.csv")


✔ Encoded URLs saved to final_encoded_urls.csv
