In [19]:
EXTRACT_IMAGES = True
DEBUG_OUTPUT = True
IMAGE_PATH = './images'

import os
import pytesseract
import re
import requests

from bs4 import BeautifulSoup
from tqdm import tqdm

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

In [90]:
target = "https://securelist.com/apt-phantomlance/96772/"

"""
Modified from:
https://daringfireball.net/2010/07/improved_regex_for_matching_urls
Changes made:
1. Matched actual URLs without any resource links behind
2. Allowed matching of [.] in URL names to extra data from site
"""
url_regex = r"""(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+(?:[.]|\[\.\])[a-z]{1,4}\/?)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))*(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))"""

"""
MD5 hash matcher
"""
hash_regex = r"""[a-f0-9]{32,}"""

In [9]:
def debug(s):
    if DEBUG_OUTPUT:
        print(s)


In [53]:
def validate_url(link):
    # this can be written directly as a single line but it's harder to read
    
    if (len(link) == 0 or 
            link.startswith('com.') or  # java packages
            link.startswith('org.') or
            # first occurrence of : is not proceeded with /
            (link.find(':') != -1 and link.find('/') - link.find(':') != 1) or
            # parentheses before query in url
            (link.find('(') != -1 and (link.find('?') > link.find('(') or link.find('?') == -1)) or
            link == "data.raw"): # custom exclusion
        return False
    
    # technically possible to do a reference check against all TLDs and then filter unknowns
    domain = process_url(link)[1]
    if domain.find(',') != -1: # comma in domain name
        return False 
    
    return True
        

In [57]:
def process_url(link):
    if "hxxp" in link:
        link = link.replace("hxxp", "http") 
    link = link.replace('[.]', '.')
    
    domain = link
    if "://" in domain:
        domain = domain.split("://")[1]
    domain = domain.split('/')[0]
    domain = domain.split('?')[0]
    return (link, domain)

In [12]:
def download(url, pathname):
    # stolen from https://www.thepythoncode.com/article/download-files-python
    if not os.path.isdir(pathname):
        os.makedirs(pathname)
    response = requests.get(url, stream = True)
    file_size = int(response.headers.get("Content-Length", 0))
    filename = os.path.join(pathname, url.split('/')[-1])
    progress = tqdm(response.iter_content(1024), f"Downloading {filename}",
                    total=file_size, unit="B", unit_scale=True, unit_divisor=1024)
    with open(filename, 'wb') as f:
        for data in progress:
            f.write(data)
            progress.update(len(data))

In [13]:
raw = requests.get(target)
page = raw.content

soup = BeautifulSoup(page, 'html5lib')
article = soup.find_all('div', class_ = 'c-wysiwyg')[0]


In [58]:
# get urls from raw site
urls = [l[0] for l in re.findall(url_regex, article.get_text())]
urls = [process_url(x) for x in filter(validate_url, urls)] 

[('kaspersky.com', 'kaspersky.com'),
 ('cloud.anofrio.com', 'cloud.anofrio.com'),
 ('video.viodger.com', 'video.viodger.com'),
 ('api.anaehler.com', 'api.anaehler.com'),
 ('https://apkcombo.com', 'apkcombo.com'),
 ('https://apk.support/', 'apk.support'),
 ('https://apkpure.com', 'apkpure.com'),
 ('https://apkpourandroid.com', 'apkpourandroid.com'),
 ('https://androidappsapk.co/detail-cham-soc-be-yeu-babycare/',
  'androidappsapk.co'),
 ('https://apkpure.ai/ads-skipper', 'apkpure.ai'),
 ('osloger.biz', 'osloger.biz'),
 ('log4jv.info', 'log4jv.info'),
 ('sqllitlever.info', 'sqllitlever.info'),
 ('anofrio.com', 'anofrio.com'),
 ('anaehler.com', 'anaehler.com'),
 ('viodger.com', 'viodger.com'),
 ('browsersyn.com', 'browsersyn.com'),
 ('cerisecaird.com', 'cerisecaird.com'),
 ('luxury.BeerAddress', 'luxury.BeerAddress'),
 ('codedexon.churchaddress', 'codedexon.churchaddress'),
 ('https://apk.support/app-en/com.codedexon.churchaddress', 'apk.support'),
 ('bulknewsexpress.news', 'bulknewsexpre

In [31]:
# process images
# if EXTRACT_IMAGES:
debug("Extracting Images")
images = [i.get('src') for i in soup.find_all('div', class_ = 'c-wysiwyg')[0].findChildren('img')]
image_paths = [IMAGE_PATH + '/' + i.split('/')[-1] for i in images]
for url in images:
    download(url, IMAGE_PATH)

Extracting Images


Downloading ./images\sl_phantomlance_01.png:   0%|                                   | 113/113k [00:00<00:31, 3.65kB/s]
Downloading ./images\sl_phantomlance_02.png:   0%|                                   | 200/199k [00:00<00:13, 15.4kB/s]
Downloading ./images\sl_phantomlance_03.png:   0%|                                 | 24.0/23.3k [00:00<00:02, 7.99kB/s]
Downloading ./images\sl_phantomlance_04.png:   0%|                                 | 29.0/28.3k [00:00<00:01, 14.5kB/s]
Downloading ./images\sl_phantomlance_05.png:   0%|                                 | 41.0/40.3k [00:00<00:01, 40.0kB/s]
Downloading ./images\sl_phantomlance_06.png:   0%|                                 | 25.0/24.4k [00:00<00:01, 24.9kB/s]
Downloading ./images\sl_phantomlance_07.png:   0%|                                 | 35.0/34.9k [00:00<00:02, 17.5kB/s]
Downloading ./images\sl_phantomlance_08.png:   0%|                                 | 48.0/47.6k [00:00<00:07, 6.86kB/s]
Downloading ./images\sl_phantomlance_09.

In [93]:
image_urls = []
image_hashes = []
for img in image_paths:
    img_text = pytesseract.image_to_string(img)
    image_urls += [l[0] for l in re.findall(url_regex, img_text)]
    image_hashes += re.findall(hash_regex, img_text)

In [94]:
image_urls = [process_url(x) for x in filter(validate_url, image_urls)]

In [95]:
# get hashes
hashes = re.findall(hash_regex, article.get_text())

In [77]:
# remove urls duplicates
urls = list(set(urls))
image_urls = list(set(urls))
# generate URL output
all_urls = urls + image_urls
max_domain_length = len(max([_[1] for _ in all_urls], key = len)) + 4
print("SRC  DOMAIN".ljust(max_domain_length) + "URL")
print("-" * (max_domain_length + 15))
for url, domain in urls:
    print("SRC  " + domain.ljust(max_domain_length) + url)
for url, domain in image_urls:
    print("IMG  " + domain.ljust(max_domain_length) + url)


SRC  DOMAIN                        URL
--------------------------------------------------
SRC  osloger.biz                        osloger.biz
SRC  viodger.com                        viodger.com
SRC  anaehler.com                       anaehler.com
SRC  bulknewsexpress.news               bulknewsexpress.news
SRC  ssl.arkouthrie.com                 ssl.arkouthrie.com
SRC  mokkha.goongnam.com                mokkha.goongnam.com
SRC  cerisecaird.com                    cerisecaird.com
SRC  aki.viperse.com                    aki.viperse.com
SRC  api.anaehler.com                   api.anaehler.com
SRC  game2015.net                       game2015.net
SRC  apkcombo.com                       https://apkcombo.com
SRC  apk.support                        https://apk.support/app-en/com.codedexon.churchaddress
SRC  s3.hiahornber.com                  s3.hiahornber.com
SRC  file.log4jv.info                   file.log4jv.info
SRC  browsersyn.com                     browsersyn.com
SRC  luxury.BeerAddress  

In [98]:
# remove duplicate hashes
hashes = list(set(hashes))
image_hashes = list(set(image_hashes))
max_hash_length = len(max(hashes + image_hashes, key = len))
print("SRC  HASHES (VERIFY HASH IF SRC IS IMG)\n" + "-" * (max_hash_length + 5))
for h in hashes:
    print("SRC  " + h)
for h in image_hashes:
    print("IMG  " + h)

SRC  HASHES (VERIFY HASH IF SRC IS IMG)
---------------------------------------------
SRC  94a3ca93f1500b5bd7fd020569e46589
SRC  65d399e6a77acf7e63ba771877f96f8e
SRC  872a3dd2cd5e01633b57fa5b9ac4648d
SRC  a097b8d49386c8aab0bb38bbfdf315b2
SRC  7048d56d923e049ca7f3d97fb5ba9812
SRC  83c423c36ecda310375e8a1f4348a35e
SRC  315f8e3da94920248676b095786e26ad
SRC  0e7c2adda3bc65242a365ef72b91f3a8
SRC  8008bedaaebc1284b1b834c5fd9a7a71
SRC  8d5c64fdaae76bb74831c0543a7865c3
SRC  ce5bae8714ddfca9eb3bb24ee60f042d
SRC  c630ab7b51f0c0fa38a4a0f45c793e24
SRC  b107c35b4ca3e549bdf102de918749ba
SRC  83cd59e3ed1ba15f7a8cadfe9183e156
SRC  c399d93146f3d12feb32da23b75304ba
SRC  641f0cc057e2ab43f5444c5547e80976
SRC  810ef71bb52ea5c3cfe58b8e003520dc
SRC  0e7b59b601a1c7ecd6f2f54b5cd8416a
SRC  d61c18e577cfc046a6252775da12294f
SRC  306d3ed0a7c899b5ef9d0e3c91f05193
SRC  51f9a7d4263b3a565dec7083ca00340f
SRC  d23472f47833049034011cad68958b46
SRC  b4706f171cf98742413d642b6ae728dc
SRC  d1eb52ef6c2445c848157beaba54044f
SR