In [108]:
import common
import pandas as pd
from bs4 import BeautifulSoup
import re
import requests
import os, pathlib, enum
import json
from pathlib import Path

# Base location of the public MCFP dataset listing. It is kept as a Path
# because callers derive child locations with ``joinpath``.
# NOTE(review): pathlib collapses the "//" after the scheme, so str(root_url)
# is "https:/mcfp.felk.cvut.cz/publicDatasets"; the scheme separator has to be
# restored before the value is handed to an HTTP client.
root_url = Path("https://mcfp.felk.cvut.cz/publicDatasets/")
# Local directory mirroring the remote folder layout (used as download cache).
directory_path = Path("mcfp/publicDataset/")

# "name: <identifier>" entry; group 1 is the identifier.
re_name = re.compile(r"name:\s+([\w\.\-\_]*)\b", re.MULTILINE | re.IGNORECASE)
# Labelled digests that end a line; group 1 is the hex digest.
# The MD5 pattern previously looked for the label "sha256:" (copy-paste slip).
re_li_md5 = re.compile(r"md5:\s+\b([A-Fa-f0-9]{32})\b$", re.MULTILINE | re.IGNORECASE)
re_li_sha256 = re.compile(r"sha256:\s+\b([A-Fa-f0-9]{64})\b$", re.MULTILINE | re.IGNORECASE)
# Unlabelled fallbacks: a hex run of *exactly* the digest length. The
# look-arounds stop the MD5 pattern from returning the first 32 characters of
# a longer digest such as a SHA-256.
re_md5 = re.compile(r"(?<![A-Fa-f0-9])([A-Fa-f0-9]{32})(?![A-Fa-f0-9])", re.MULTILINE | re.IGNORECASE)
re_sha256 = re.compile(r"(?<![a-f0-9])([a-f0-9]{64})(?![a-f0-9])", re.MULTILINE | re.IGNORECASE)

def re_run(self, rgx, string):
    """Return capture group 1 of the first ``rgx`` hit in ``string``.

    ``self`` is accepted but never used. ``None`` is returned when the
    compiled pattern does not match anywhere in ``string``.
    """
    found = rgx.search(string)
    if found is None:
        return None
    return found.group(1)

def scrape_table(html_text):
    """Extract the cell texts of the first ``<table>`` in ``html_text``.

    Returns ``None`` when the document has no table. Otherwise returns one
    list of stripped ``<td>`` texts per ``<tr>``; rows without any ``<td>``
    are left out.
    """
    first_table = BeautifulSoup(html_text, 'html.parser').find('table')
    if first_table is None:
        return None
    per_row_cells = (
        [cell.text.strip() for cell in tr.find_all('td')]
        for tr in first_table.find_all('tr')
    )
    return [cells for cells in per_row_cells if cells]

class File:
    """One entry of a remote MCFP folder listing, cached on local disk.

    Attributes:
        folder: owning folder object; must expose ``dir`` (local directory
            Path) and ``url`` (remote location Path).
        name: entry name relative to the folder; ``""`` denotes the folder's
            own index page, stored locally as ``index.html``.
        ts, size: timestamp and size strings exactly as given by the caller.
    """

    # The annotation is a string on purpose: MCFPFolder is defined further
    # down, so a bare name would raise NameError on a fresh kernel.
    def __init__(self, folder: "MCFPFolder", name: str, size: str = "", ts: str = ""):
        self.folder = folder
        self.name = name
        self.ts = ts
        self.size = size
        self._content = None

    def _remote_url(self) -> str:
        """Build the HTTP URL of this entry as a plain string."""
        url = self.folder.url.joinpath(self.name).as_posix()
        # Path collapses "https://" into "https:/"; restore the separator.
        url = re.sub(r"^(https?):/+", r"\1://", url)
        # The folder index is a directory listing: ask for it with the
        # trailing slash that Path dropped.
        if not self.name and not url.endswith("/"):
            url += "/"
        return url

    @property
    def content(self):
        """Raw bytes of the entry, or ``None`` if it could not be fetched.

        The bytes are read from the local cache when the file exists there;
        otherwise they are downloaded and, on HTTP 200, written to the cache.
        The result is memoised on the instance.
        """
        if self._content is not None:
            return self._content
        fpath = self.folder.dir.joinpath(self.name if len(self.name) else "index.html")
        if fpath.is_file():
            self._content = fpath.read_bytes()
            return self._content
        # A timeout keeps one stalled connection from hanging the notebook.
        response = requests.get(self._remote_url(), timeout=30)
        if response.status_code == 200:
            # Assign the backing field: ``content`` itself has no setter.
            self._content = response.content
            fpath.write_bytes(response.content)
        return self._content

    @property
    def txt(self):
        """Content decoded as UTF-8; ``""`` when no content is available."""
        raw = self.content
        return raw.decode("utf-8") if raw is not None else ""

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name
                

class Readme(File):
    """A README entry of a folder, with helpers that mine its text.

    The helpers search the decoded text (``txt``) with the module-level
    patterns ``re_name``, ``re_li_md5``/``re_md5`` and
    ``re_li_sha256``/``re_sha256``.
    """

    # String annotation: MCFPFolder is defined later in the notebook.
    def __init__(self, folder: "MCFPFolder", name: str, size: str = "", ts: str = ""):
        super().__init__(folder, name, size, ts)

    @property
    def lists(self):
        """Text of every ``<li>`` element of the README."""
        soup = BeautifulSoup(self.txt, 'html.parser')
        return [li.text for li in soup.find_all('li')]

    @property
    def sha256(self):
        """First SHA-256 digest found (labelled form first), else ``None``."""
        text = self.txt
        for pattern in (re_li_sha256, re_sha256):
            match = pattern.search(text)
            if match:
                return match.group(1)
        return None

    @property
    def md5(self):
        """First MD5 digest found (labelled form first), else ``None``."""
        text = self.txt
        # The patterns are compiled with IGNORECASE already. The second
        # positional argument of Pattern.search is ``pos``, so passing
        # re.IGNORECASE there used to skip the first two characters.
        for pattern in (re_li_md5, re_md5):
            match = pattern.search(text)
            if match:
                return match.group(1)
        return None

    @property
    def mwname(self):
        """Lower-cased value of the first ``name:`` entry, else ``None``."""
        # Searching the lower-cased text is what makes the result lower-case.
        match = re_name.search(self.txt.lower())
        if match:
            return match.group(1)
        return None

class MCFPFolderType(enum.Enum):
    """Category of an MCFP folder, derived from its name."""

    BOTNET = 0
    MIXED = 1
    NORMAL = 2
    OTHER = 3

    @staticmethod
    def translate(name):
        """Map a folder name to its category.

        The name is checked, case-sensitively and in this order, for the
        substrings "Botnet", "Mixed" and "Normal"; the first hit wins and
        ``OTHER`` is returned when none of them occurs.
        """
        ordered_markers = (
            ("Botnet", MCFPFolderType.BOTNET),
            ("Mixed", MCFPFolderType.MIXED),
            ("Normal", MCFPFolderType.NORMAL),
        )
        for marker, kind in ordered_markers:
            if name.find(marker) != -1:
                return kind
        return MCFPFolderType.OTHER

class MCFPFolder:
    """A folder of the MCFP public dataset listing.

    Attributes:
        name: folder name as passed in.
        url: ``root_url`` joined with the name.
        dir: ``directory_path`` joined with the name; created on construction.
        htmlindex: ``File`` for the folder's own index page.
        type: ``MCFPFolderType`` derived from the name.
        id: the name with the type-specific prefix and any "/" removed
            (the unchanged name for ``OTHER`` folders).
    """

    def __init__(self, name):
        self.name = name
        self.url = root_url.joinpath(name)
        self.dir = directory_path.joinpath(name)
        # parents=True: the cache root may not exist yet on a fresh checkout.
        self.dir.mkdir(parents=True, exist_ok=True)
        self.htmlindex = File(self, "")
        self.type = MCFPFolderType.translate(self.name)
        if self.type == MCFPFolderType.BOTNET:
            self.id = self.name.replace("CTU-Malware-Capture-Botnet-", "").replace("/", "")
        elif self.type == MCFPFolderType.NORMAL:
            self.id = self.name.replace("CTU-Malware-Capture-Normal-", "").replace("/", "")
        elif self.type == MCFPFolderType.MIXED:
            self.id = self.name.replace("CTU-Mixed-Capture-", "").replace("/", "")
        else:
            self.id = self.name

    @property
    def table(self):
        """Rows of the first table of the index page, as lists of cell texts.

        Raises:
            Exception: if the index page has no text or contains no table rows.
        """
        html = self.htmlindex.txt
        if not html:
            raise Exception("No htmlindex for %s" % self.name)
        table = scrape_table(html)
        if not table:
            raise Exception("No table in htmlindex for %s" % self.name)
        return list(table)

    @property
    def files(self):
        """``File`` objects for every listing row except the "parent" link.

        Columns 1, 2 and 3 of a row are used as name, timestamp and size.
        """
        return [
            File(self, row[1], ts=row[2], size=row[3])
            for row in self.table
            # Rows too short to carry name/ts/size are skipped instead of
            # aborting the whole listing with an IndexError.
            if len(row) > 3 and row[1].lower().find("parent") == -1
        ]

    @property
    def readmes(self):
        """Files whose name contains "readme" (case-insensitive)."""
        return [file for file in self.files if file.name.lower().find("readme") >= 0]

    @property
    def readme(self):
        """Preferred README as a ``Readme`` object, or ``None`` if there is none.

        A file named exactly ``readme.html`` (case-insensitive) wins;
        otherwise the first README-like file is used.
        """
        # Evaluate the listing once; every access of ``readmes`` re-parses it.
        candidates = self.readmes
        if not candidates:
            return None
        exact = [file for file in candidates if file.name.lower() == "readme.html"]
        file = exact[0] if exact else candidates[0]
        return Readme(self, file.name, ts=file.ts, size=file.size)

    @property
    def pcaps(self):
        """Files whose name ends with ".pcap" (case-insensitive)."""
        # File objects are not subscriptable: use the attribute, not file["name"].
        return [file for file in self.files if file.name.lower().endswith(".pcap")]

    def __str__(self):
        return self.name
        

# Database handle built by the project-local ``common`` module.
database = common.Database()

# Module-level mapping, created empty here.
botnets = dict()


In [165]:
def _first_non_null(values):
    """Return the first non-null, non-empty entry of ``values``, else ``None``."""
    return next((v for v in values if pd.notna(v) and v), None)


# Infected captures as stored in the database.
pcaps = pd.read_sql("SELECT * FROM pcap WHERE infected is true", database.engine)

# For CTU-13 captures the text before the first "_" of the name is taken as
# the MCFP folder id; the remainder (leading "_" included) is kept alongside.
# Rows of other datasets get NaN in both columns.
is_ctu13 = pcaps["dataset"] == "CTU-13"
pcaps["mcfp_id"] = pcaps.loc[is_ctu13, "name"].apply(lambda x: x.split("_")[0])
pcaps["mcfp_pcap"] = pcaps.loc[is_ctu13, "name"].apply(lambda x: x[x.find("_"):])
ids = pcaps.set_index("mcfp_id").copy()
ids["find"] = False
ids["mwname"] = None
ids["sha256"] = None
ids["md5"] = None
botnets = {}
# NOTE(review): root_page_dirs is not defined in any visible cell; presumably
# the folder names of the dataset root listing. Define it in an earlier cell
# so that Restart & Run All works.
for folder in root_page_dirs:
    folder = MCFPFolder(folder)
    if folder.type != MCFPFolderType.BOTNET:
        continue
    if folder.id not in ids.index:
        # No capture in the database refers to this folder.
        continue
    ids.at[folder.id, "find"] = True
    try:
        # Fetch the README once: every access of ``folder.readme`` builds a
        # new object and re-reads the file.
        readme = folder.readme
        if readme is None:
            print("%s: no README found" % folder)
            continue
        ids.at[folder.id, "mwname"] = readme.mwname
        ids.at[folder.id, "sha256"] = readme.sha256
        ids.at[folder.id, "md5"] = readme.md5
    except Exception as exc:
        # Best effort, as before, but the failure is reported instead of
        # being swallowed, and KeyboardInterrupt is no longer caught.
        print("%s: could not extract README metadata (%r)" % (folder, exc))

# One row per MD5; rows without an MD5 are dropped by groupby.
pcap_mw = ids.groupby(["md5"]).agg({
    "name": lambda s: s.to_list(),
    "id": lambda s: s.to_list(),
    "mwname": lambda s: s.to_list(),
    "sha256": lambda s: s.to_list(),
    "find": "sum"
})

# Collapse to a single value per group: the first non-null one in row order.
# Taking element 0 of list(set(...)) was order-dependent and could pick None
# even though a real value existed in the group.
pcap_mw["mwname"] = pcap_mw["mwname"].apply(_first_non_null)
pcap_mw["sha256"] = pcap_mw["sha256"].apply(_first_non_null)

pcap_mw = pcap_mw.reset_index()

pcap_mw


Unnamed: 0,md5,name,id,mwname,sha256,find
0,13fbc418d5a37bdc2c10da11a6ef46ae,[71_2014-04-07_capture-win19.fixed.pcap],[57],,,1
1,14010ce6f03e0a978693424d60e34ba9,[166-1_2016-04-29_win-3.pcap],[37],tinba,8006cbd1c70b2ed096af9c72d6fef2c3e9cb0a41685408...,1
2,1db5333a57f56c4b80bc213ed7675793,[170-1_capture-win6.pcap],[41],necurse,b3ca90d4b289d31bbdefbff7d9b42bfeb0f44f97e4738c...,1
3,24dcfdb1f46e4018500db101234f6cd7,[69_2014-04-07_capture-win17.pcap],[54],caphaw,9966bc568225c509c3e3b3ff9f548ba59f39908200e3e8...,1
4,3018e99857f31a59e0777396ae634a8f,"[226-2_2017-02-27_win16.fixed.pcap, 226-1_2017...","[44, 43]",worm.netsky,c8fffb2e737514c551b2d7bcaf8baa459564b059cab1a3...,2
5,35cf982449765a4f163bcf822e663f03,[125-1_2015-06-07_capture-win5.pcap],[34],,,1
6,43ecaeb983683f57af842c8993e242e6,[100_2014-12-20_capture-win5.fixed.pcap],[27],,,1
7,5aeb4b21066217ea08f77ba9390f31b5,[140-2_2015-10-27_capture-win11.pcap],[36],bunitu,002e9d6217b17923b69710792fd85765c41a0fc5406ce6...,1
8,6323b0f509920c6482d3c5737bb68c60,[16_2013-08-28_capture-win11.pcap],[26],virustotal,bab391b77129150445521fddc073997ca6c7cb3db35eb1...,1
9,6548d6013af8f8ccccf41cf0cd78372b,"[303-1_2017-08-12_capture-win2.pcap, 301-1_cap...","[48, 47]",sathurbot,20ae9e5f8f26635c627afce5eaeeb749af459f55138c80...,2


In [194]:
from sqlalchemy import text
from sqlalchemy.exc import IntegrityError
import psycopg2

def _report_unless_duplicate(err):
    """Print ``err`` unless it wraps a PostgreSQL unique-violation.

    Unique violations are treated as "already present" and ignored.
    """
    if not isinstance(err.orig, psycopg2.errors.UniqueViolation):
        print(err)


# Insert one malware row per MD5 group; already existing rows are skipped.
with database.engine.connect() as conn:
    for _, row in pcap_mw.iterrows():
        try:
            conn.execute(
                text("""
                INSERT INTO public.malware(
                name, sha256, md5)
                VALUES (:name, :sha256, :md5)
                RETURNING ID;
                """),
                {"name": row["mwname"], "sha256": row["sha256"], "md5": row["md5"]}
            )
            conn.commit()
        except IntegrityError as err:
            # The failed statement leaves the transaction aborted; roll back
            # so that the connection can be used for the next row.
            conn.rollback()
            _report_unless_duplicate(err)

# Passing the engine lets pandas open and close its own connection.
db_malwares = pd.read_sql("""SELECT * FROM malware""", database.engine).set_index("md5")

# Point every capture of a group at the malware row that has the group's MD5.
with database.engine.connect() as conn:
    for _, row in pcap_mw.iterrows():
        mwid = int(db_malwares.loc[row["md5"], "id"])
        for pcapid in row["id"]:
            try:
                conn.execute(
                    text("""
                    UPDATE PCAP SET MALWARE_ID=:mwid WHERE id=:id;
                    """),
                    {"id": int(pcapid), "mwid": mwid}
                )
                conn.commit()
            except IntegrityError as err:
                # Without the rollback every later UPDATE on this connection
                # would fail as well.
                conn.rollback()
                _report_unless_duplicate(err)


In [206]:

# Sanity check: every infected CTU-13 capture must be linked to the malware
# whose MD5 group (in pcap_mw) lists that capture's id.
# The engine is passed directly so that pandas closes the connections it opens.
malwares = pd.read_sql("""SELECT * FROM malware""", database.engine).set_index("id")
pcaps = pd.read_sql("""SELECT * FROM PCAP WHERE dataset='CTU-13' and infected is true""", database.engine)[["name", "malware_id", "id"]]
# Index once; the lookup table does not change inside the loop.
pcap_mw_by_md5 = pcap_mw.set_index("md5")
test = []
for _, pcap in pcaps.iterrows():
    md5 = malwares.loc[pcap["malware_id"], "md5"]
    mcfp_mw = pcap_mw_by_md5.loc[md5]
    test.append(pcap["id"] in mcfp_mw["id"])

all(test)

True