In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from dataclasses import dataclass
import logging

# Send every log record (INFO and above) to a file so the notebook output
# stays free of per-request log noise.
logging.basicConfig(
    filename='phish_scrap.log',
    level='INFO',
    format='%(asctime)s - %(levelname)s - %(message)s',
)
# Root logger, shared by all scraping helpers in this notebook.
logger = logging.getLogger()

# Base address that every search-page and detail-page URL is built from.
host_url = "https://phishtank.org"

In [10]:
@dataclass
class ErrorResponse:
    """Stand-in for a ``requests.Response`` when a request could not be completed.

    Exposes the response attributes the scraper reads (``status_code`` and
    ``text``) so a failed request can be handled with the same attribute
    access as a real response.

    Attributes:
        error: Human-readable description of what went wrong.
        status_code: Pseudo HTTP status reported to callers; defaults to 500.
        text: Placeholder response body; empty by default because a failed
            request has no body.
    """
    error: str
    status_code: int = 500
    # Keeps `response.text` readable on a failed request instead of raising
    # AttributeError in code that parses the body.
    text: str = ""

In [11]:
def send_api(url, timeout=10):
    """Issue an HTTP GET that never raises; failures come back as ``ErrorResponse``.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host, which would stall the whole scrape.

    Returns:
        The ``requests.Response`` on success; otherwise an ``ErrorResponse``
        (``status_code`` 500) whose ``error`` describes the failure.
    """
    try:
        return requests.get(url, timeout=timeout)
    except Exception as e:
        # Deliberately broad: scraping is best-effort, so any failure (bad
        # URL, DNS error, timeout, ...) is logged and reported to the caller
        # rather than aborting the run.
        message = f"while processing your {url=}\nwe got the exception {e}"
        logger.error(message)
        return ErrorResponse(message)

In [12]:
def check_page_up_or_down(url):
    """Report whether ``url`` currently answers with HTTP 200.

    Returns:
        ``'Active'`` when the fetched response has status code 200,
        ``'Not Active'`` for any other status (including failed requests).
    """
    if send_api(url).status_code == 200:
        return 'Active'
    return 'Not Active'

In [13]:
def get_phishing_url(id_url, delay=3):
    """Fetch a phish detail page and extract the phishing URL it reports.

    Args:
        id_url: URL of the detail page; surrounding whitespace is ignored.
        delay: Seconds to pause before the request, throttling successive
            calls. Defaults to 3.

    Returns:
        A ``(phish_url, is_active)`` tuple. ``phish_url`` is the text of the
        first ``<b>`` element containing ``'http'``; ``is_active`` is the
        ``'Active'`` / ``'Not Active'`` label from ``check_page_up_or_down``.
        Both are ``None`` when the detail page cannot be fetched or holds no
        such URL.
    """
    time.sleep(delay)
    logger.info(f"{id_url=} is processing")
    id_url = id_url.strip()
    response = send_api(id_url)

    if response.status_code != 200:
        logger.error(f"Failed to fetch data from {id_url}")
        return None, None

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever one
    # is installed (and warns), so results could differ between machines.
    soup = BeautifulSoup(response.text, "html.parser")
    candidates = [
        # str() detaches the text from the parse tree so the returned value
        # does not keep the whole document alive.
        str(text)
        for tag in soup.find_all(name='b')
        # `.string` is None for tags with nested markup; skip those.
        if (text := tag.string) and 'http' in text
    ]
    if not candidates:
        # Nothing to probe: return the same "unknown" result as a failed
        # fetch instead of requesting a None URL.
        logger.warning(f"No phishing URL found in {id_url}")
        return None, None

    phish_url = candidates[0]
    return phish_url, check_page_up_or_down(phish_url)

In [14]:
def get_phish_ids_in_page(page):
    """Scrape one search-results page and resolve every phish it links to.

    Args:
        page: Page index placed in the ``page`` query parameter of
            ``phish_search.php``.

    Returns:
        Dict mapping each detail-page URL found on the results page to the
        ``(phish_url, is_active)`` tuple returned by ``get_phishing_url``.
        Empty when the results page cannot be fetched.
    """
    logger.info(f"processing {page=}")
    search_url = f"phish_search.php?page={page}&active=y&verified=u"
    response = send_api(f"{host_url}/{search_url}")
    if response.status_code != 200:
        # A failed request carries no usable HTML; stop before parsing.
        logger.error(f"Failed to fetch search results for {page=}")
        return {}

    # Name the parser explicitly so parsing does not depend on which
    # optional parser happens to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    ids = {}
    for link in soup.find_all(name='a'):
        href = link.get('href')
        if not href or 'phish_detail.php?phish_id=' not in href:
            continue
        detail_url = f"{host_url}/{href}"
        # The same detail page may be linked more than once; each lookup is
        # a throttled network round-trip, so resolve it only the first time.
        if detail_url not in ids:
            ids[detail_url] = get_phishing_url(detail_url)
    return ids

In [15]:
def scrap_phish_ids(pagination=1000):
    """Scrape search-result pages ``0 .. pagination - 1`` into one mapping.

    Args:
        pagination: Number of result pages to walk, starting at page 0.

    Returns:
        Dict merging the per-page results of ``get_phish_ids_in_page``; when
        a key appears on several pages, the entry from the later page wins.
    """
    collected = {}
    for page_number in range(pagination):
        collected.update(get_phish_ids_in_page(page_number))
    return collected

In [16]:
# Demo run: scrape a single search-results page (page index 0) rather than
# the function's default of 1000 pages.
phish_url_dict = scrap_phish_ids(pagination=1)

In [17]:
# Flatten {detail_url: (phish_url, active_state)} into one row per phish,
# walking the mapping once instead of once per column.
phish_ids_df = pd.DataFrame(
    [
        (detail_url, reported_url, state)
        for detail_url, (reported_url, state) in phish_url_dict.items()
    ],
    columns=["phish_id_url", "phish_url", 'active_state'],
)

In [18]:
# Sanity check: (rows, columns) of the scraped table.
phish_ids_df.shape

(20, 3)

In [19]:
# Count rows that exactly repeat an earlier row across all columns.
phish_ids_df.duplicated().sum()

0

In [20]:
# Persist the scraped table as CSV, leaving out the DataFrame index column.
phish_ids_df.to_csv('phish_urls_demo.csv', index=False)