In [38]:
# EXTRACT_IMAGES = True
DEBUG_OUTPUT = True
IMAGE_PATH = './images'
JSON_PATH = 'whois_info.json'

import json
import os
import pytesseract
import re
import requests
import whois

from bs4 import BeautifulSoup
from tqdm import tqdm

# set to tesseract path where necessary
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

In [4]:
target = "https://securelist.com/apt-phantomlance/96772/"

"""
Modified from:
https://daringfireball.net/2010/07/improved_regex_for_matching_urls
Changes made:
1. Matched actual URLs without any resource links behind
2. Allowed matching of [.] in URL names to extra data from site
"""
url_regex = r"""(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+(?:[.]|\[\.\])[a-z]{1,4}\/?)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))*(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))"""

"""
MD5 hash matcher
"""
hash_regex = r"""[a-f0-9]{32,}"""

In [40]:
def debug(*s):
    if DEBUG_OUTPUT:
        print(s)


In [6]:
def validate_url(link):
    # this can be written directly as a single line but it's harder to read
    
    if (len(link) == 0 or 
            link.startswith('com.') or  # java packages
            link.startswith('org.') or
            # first occurrence of : is not proceeded with /
            (link.find(':') != -1 and link.find('/') - link.find(':') != 1) or
            # parentheses before query in url
            (link.find('(') != -1 and (link.find('?') > link.find('(') or link.find('?') == -1)) or
            link == "data.raw"): # custom exclusion
        return False
    
    # technically possible to do a reference check against all TLDs and then filter unknowns
    domain = process_url(link)[1]
    if domain.find(',') != -1: # comma in domain name
        return False 
    
    return True
        

In [7]:
def process_url(link):
    if "hxxp" in link:
        link = link.replace("hxxp", "http") 
    link = link.replace('[.]', '.')
    
    domain = link
    if "://" in domain:
        domain = domain.split("://")[1]
    domain = domain.split('/')[0]
    domain = domain.split('?')[0]
    return (link, domain)

In [27]:
def download(url, pathname):
    # stolen from https://www.thepythoncode.com/article/download-files-python
    if not os.path.isdir(pathname):
        os.makedirs(pathname)
    response = requests.get(url, stream = True)
    file_size = int(response.headers.get("Content-Length", 0))
    filename = os.path.join(pathname, url.split('/')[-1])
    progress = tqdm(response.iter_content(1024), f"Downloading {filename}",
                    total=file_size, unit="B", unit_scale=True, unit_divisor=1024)
    with open(filename, 'wb') as f:
        for data in progress:
            f.write(data)
            progress.update(len(data))

In [9]:
raw = requests.get(target)
page = raw.content

soup = BeautifulSoup(page, 'html5lib')
article = soup.find_all('div', class_ = 'c-wysiwyg')[0]


In [10]:
# get urls from raw site
urls = [l[0] for l in re.findall(url_regex, article.get_text())]
urls = [process_url(x) for x in filter(validate_url, urls)] 

In [11]:
# process images
# if EXTRACT_IMAGES:
debug("Extracting Images")
images = [i.get('src') for i in soup.find_all('div', class_ = 'c-wysiwyg')[0].findChildren('img')]
image_paths = [IMAGE_PATH + '/' + i.split('/')[-1] for i in images]
for url in images:
    download(url, IMAGE_PATH)

Extracting Images


Downloading ./images\sl_phantomlance_01.png:   0%|                                   | 113/113k [00:00<00:03, 37.9kB/s]
Downloading ./images\sl_phantomlance_02.png:   0%|                                   | 200/199k [00:00<00:15, 13.4kB/s]
Downloading ./images\sl_phantomlance_03.png:   0%|                                 | 24.0/23.3k [00:00<00:01, 12.0kB/s]
Downloading ./images\sl_phantomlance_04.png:   0%|                                 | 29.0/28.3k [00:00<00:03, 9.58kB/s]
Downloading ./images\sl_phantomlance_05.png:   0%|                                 | 41.0/40.3k [00:00<00:03, 13.7kB/s]
Downloading ./images\sl_phantomlance_06.png:   0%|                                 | 25.0/24.4k [00:00<00:02, 8.33kB/s]
Downloading ./images\sl_phantomlance_07.png:   0%|                                 | 35.0/34.9k [00:00<00:02, 17.7kB/s]
Downloading ./images\sl_phantomlance_08.png:   0%|                                 | 48.0/47.6k [00:00<00:06, 8.00kB/s]
Downloading ./images\sl_phantomlance_09.

In [12]:
image_urls = []
image_hashes = []
for img in image_paths:
    img_text = pytesseract.image_to_string(img)
    image_urls += [l[0] for l in re.findall(url_regex, img_text)]
    image_hashes += re.findall(hash_regex, img_text)

In [13]:
image_urls = [process_url(x) for x in filter(validate_url, image_urls)]

In [14]:
# get hashes
hashes = re.findall(hash_regex, article.get_text())

In [15]:
# remove urls duplicates
urls = list(set(urls))
image_urls = list(set(urls))
# generate URL output
all_urls = urls + image_urls
max_domain_length = len(max([_[1] for _ in all_urls], key = len)) + 4
print("SRC  DOMAIN".ljust(max_domain_length) + "URL")
print("-" * (max_domain_length + 15))
for url, domain in urls:
    print("SRC  " + domain.ljust(max_domain_length) + url)
for url, domain in image_urls:
    print("IMG  " + domain.ljust(max_domain_length) + url)


SRC  DOMAIN                        URL
--------------------------------------------------
SRC  video.viodger.com                  video.viodger.com
SRC  log4jv.info                        log4jv.info
SRC  viodger.com                        viodger.com
SRC  cerisecaird.com                    cerisecaird.com
SRC  bulknewsexpress.news               bulknewsexpress.news
SRC  ps.andreagahuvrauvin.com           ps.andreagahuvrauvin.com
SRC  kaspersky.com                      kaspersky.com
SRC  androidappsapk.co                  https://androidappsapk.co/detail-cham-soc-be-yeu-babycare/
SRC  luxury.BeerAddress                 luxury.BeerAddress
SRC  mine.remaariegarcia.com            mine.remaariegarcia.com
SRC  egg.stralisemariegar.com           egg.stralisemariegar.com
SRC  us.jaxonsorensen.club              us.jaxonsorensen.club
SRC  ming.chujong.com                   ming.chujong.com
SRC  quam.viperse.com                   quam.viperse.com
SRC  sqllitlever.info                   sqllitlev

In [16]:
# remove duplicate hashes
hashes = list(set(hashes))
image_hashes = list(set(image_hashes))
max_hash_length = len(max(hashes + image_hashes, key = len))
print("SRC  HASHES (VERIFY HASH IF SRC IS IMG)\n" + "-" * (max_hash_length + 5))
for h in hashes:
    print("SRC  " + h)
for h in image_hashes:
    print("IMG  " + h)

SRC  HASHES (VERIFY HASH IF SRC IS IMG)
---------------------------------------------
SRC  8b35b3956078fc28e5709c5439e4dcb0
SRC  a795f662d10040728e916e1fd7570c1d
SRC  d23472f47833049034011cad68958b46
SRC  d1eb52ef6c2445c848157beaba54044f
SRC  6bf9b834d841b13348851f2dc033773e
SRC  3285ae59877c6241200f784b62531694
SRC  83c423c36ecda310375e8a1f4348a35e
SRC  8d5c64fdaae76bb74831c0543a7865c3
SRC  ce5bae8714ddfca9eb3bb24ee60f042d
SRC  306d3ed0a7c899b5ef9d0e3c91f05193
SRC  0d5c03da348dce513bf575545493f3e3
SRC  af44bb0dd464680395230ade0d6414cd
SRC  7285f44fa75c3c7a27bbb4870fc0cdca
SRC  94a3ca93f1500b5bd7fd020569e46589
SRC  51f9a7d4263b3a565dec7083ca00340f
SRC  b4706f171cf98742413d642b6ae728dc
SRC  07e01c2fa020724887fc39e5c97eccee
SRC  243e2c6433815f2ecc204ada4821e7d6
SRC  7048d56d923e049ca7f3d97fb5ba9812
SRC  0e7b59b601a1c7ecd6f2f54b5cd8416a
SRC  2e49775599942815ab84d9de13e338b3
SRC  79f06cb9281177a51278b2a33090c867
SRC  e648a2cc826707aec33208408b882e31
SRC  54777021c34b0aed226145fde8424991
SR

In [36]:
# part 2: extract whois information
print("Extracting whois information from domain list.")
domain = {'src': [], 'img' : []}
for u in urls:
    try:
        a = whois.query(u[1])
        domain['src'].append(a.__dict__)
    except:
        print("unknown domain: " + u[1])
        
for u in image_urls:
    try:
        a = whois.query(u[1])
        domain['img'].append(a.__dict__)
    except:
        print("unknown domain: " + u[1])


Extracting whois information from domain list.
unknown domain: bulknewsexpress.news
unknown domain: luxury.BeerAddress
unknown domain: ming.chujong.com
unknown domain: inc.graceneufville.com
unknown domain: apkpourandroid.com
unknown domain: sadma.knrowz.com
unknown domain: apk.support
unknown domain: apkcombo.com
unknown domain: game2015.net
unknown domain: anofrio.com
unknown domain: osloger.biz
unknown domain: apk.support
unknown domain: apkpure.ai
unknown domain: nhaccuatui.android.zyngacdn.com
unknown domain: jang.goongnam.com
unknown domain: itpk.mostmkru.com
unknown domain: log.osloger.biz
unknown domain: 113.171.224.175
unknown domain: mtk.baimind.com
unknown domain: codedexon.churchaddress
unknown domain: browsersyn.com
unknown domain: term.ursulapaulet.com
unknown domain: mokkha.goongnam.com
unknown domain: taiphanmemfacebookmoi.info
unknown domain: download.com.vn
unknown domain: nhaccuatui.android.zyngacdn.com
unknown domain: bulknewsexpress.news
unknown domain: luxury.Beer

In [39]:
json_output = json.dumps(domain, indent = 4, default = str)
print(json_output)

with open(JSON_PATH, 'w') as f:
    f.write(json_output)


{
    "src": [
        {
            "name": "viodger.com",
            "registrar": "NameSilo, LLC",
            "creation_date": "2017-05-16 07:35:14",
            "expiration_date": "2021-05-16 07:35:14",
            "last_updated": null,
            "status": "clientTransferProhibited https://icann.org/epp#clientTransferProhibited",
            "name_servers": "{'ns2.dnsowl.com\\r\\r', 'ns1.dnsowl.com\\r', 'ns3.dnsowl.com\\r\\r', 'ns3.dnsowl.com\\r', 'ns2.dnsowl.com\\r', 'ns1.dnsowl.com\\r\\r'}"
        },
        {
            "name": "log4jv.info",
            "registrar": "Namesilo, LLC",
            "creation_date": "2015-12-09 08:43:58",
            "expiration_date": "2021-12-09 08:43:58",
            "last_updated": null,
            "status": "clientTransferProhibited https://icann.org/epp#clientTransferProhibited",
            "name_servers": "{'ns2.qhoster.net\\r', 'ns2.qhoster.net\\r\\r', 'ns3.qhoster.net\\r\\r', 'ns1.qhoster.net\\r\\r', 'ns4.qhoster.net\\r\\r', 'ns4.qho