In [108]:
import common
import pandas as pd
from bs4 import BeautifulSoup
import re
import requests
import os, pathlib, enum
import json
from pathlib import Path

root_url = Path("https://mcfp.felk.cvut.cz/publicDatasets/")
directory_path = Path(f"mcfp/publicDataset/")

re_name = re.compile(r"name:\s+([\w\.\-\_]*)\b", re.MULTILINE | re.IGNORECASE)
re_li_md5 = re.compile(r"sha256:\s+\b([A-Fa-f0-9]{32})\b$", re.MULTILINE | re.IGNORECASE)
re_li_sha256 = re.compile(r"sha256:\s+\b([A-Fa-f0-9]{64})\b$", re.MULTILINE | re.IGNORECASE)
re_md5 = re.compile(r"([A-Fa-f0-9]{32})", re.MULTILINE | re.IGNORECASE)
re_sha256 = re.compile(r"([a-f0-9]{64})", re.MULTILINE | re.IGNORECASE)

def re_run(self, rgx, string):
    match = rgx.search(string)
    return match.group(1) if match else None

def scrape_table(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    table = soup.find('table')
    if table is None:
        return None
    table_data = []
    for row in table.find_all('tr'):
        columns = row.find_all('td')
        row_data = [col.text.strip() for col in columns]
        if row_data:
            table_data.append(row_data)
    return table_data

class File:
    def __init__(self, folder: MCFPFolder, name: str, size: str = "", ts: str = ""):
        self.folder = folder
        self.name = name
        self.ts = ts
        self.size = size
        self._content = None
        pass
    
    @property
    def content(self):
        if self._content:
            return self._content
        fpath = self.folder.dir.joinpath(self.name if len(self.name) else "index.html")
        url = self.folder.url.joinpath(self.name)
        if fpath.is_file():
            with open(fpath, "br") as fp:
                self._content = fp.read()
        else:
            response = requests.get(url)
            if response.status_code == 200:
                self.content = response.content
                with open(fpath, "bw") as fp:
                    fp.write(response.content)
        return self._content

    @property
    def txt(self):
        return self.content.decode("utf-8")
    
    def __str__(self):
        return self.name
    def __repr__(self):
        return self.name
                

class Readme(File):
    def __init__(self, folder: MCFPFolder, name: str, size: str = "", ts: str = ""):
        super().__init__(folder, name, size, ts)
        pass

    @property
    def lists(self):
        soup = BeautifulSoup(self.txt, 'html.parser')
        return [ li.text for li in soup.find_all('li') ]

    @property
    def sha256(self):
        match = re_li_sha256.search(self.txt)
        if match:
            return match.group(1)
        match = re_sha256.search(self.txt)
        if match:
            return match.group(1)
        # print("SHA256 not present.")
        return None
    
    @property
    def md5(self):
        match = re_li_md5.search(self.txt, re.IGNORECASE)
        if match:
            return match.group(1)
        match = re_md5.search(self.txt, re.IGNORECASE)
        if match:
            return match.group(1)
        # print("SHA256 not present.")
        return None

    @property
    def mwname(self):
        match = re_name.search(self.txt.lower(), re.IGNORECASE)
        if match:
            return match.group(1)
        # print("Name not present.")
        return None

class MCFPFolderType(enum.Enum):
    BOTNET = 0
    MIXED = 1
    NORMAL = 2
    OTHER = 3
    pass

    @staticmethod
    def translate(name):
        if 0 <= name.find("Botnet"): return MCFPFolderType.BOTNET
        if 0 <= name.find("Mixed"): return MCFPFolderType.MIXED
        if 0 <= name.find("Normal"): return MCFPFolderType.NORMAL
        return MCFPFolderType.OTHER

class MCFPFolder:
    def __init__(self, name):
        self.name = name
        self.url = root_url.joinpath(name)
        self.dir = directory_path.joinpath(name)
        self.dir.mkdir(exist_ok=True)
        self.htmlindex = File(self, "")
        self.type = MCFPFolderType.translate(self.name)
        if self.type == MCFPFolderType.BOTNET:
            self.id = self.name.replace("CTU-Malware-Capture-Botnet-", "").replace("/", "")
        elif self.type == MCFPFolderType.NORMAL:
            self.id = self.name.replace("CTU-Malware-Capture-Normal-", "").replace("/", "")
        elif self.type == MCFPFolderType.MIXED:
            self.id = self.name.replace("CTU-Mixed-Capture-", "").replace("/", "")
        else:
            self.id = self.name
        pass

    @property
    def table(self):
        if not self.htmlindex.txt:
                raise Exception("No htmlindex for %s" % self.name)
        table = scrape_table(self.htmlindex.txt)
        if not table:
            raise Exception("No table in htmlindex for %s" % self.name)
        return [ row for row in table ]

    @property
    def files(self):
        return [ File(self, row[1], ts=row[2], size=row[3]) for row in self.table if row[1].lower().find("parent") == -1]

    @property
    def readmes(self):
        return [ file for file in self.files if file.name.lower().find("readme") >= 0 ]
        
    @property
    def readme(self):
        if self.readmes:
            tmp = [ file for file in self.readmes if file.name.lower() == "readme.html" ]
            file = tmp[0] if tmp else self.readmes[0]
            return Readme(self, file.name, ts=file.ts, size=file.size)
        return None

    @property
    def pcaps(self):
        return [ file for file in self.files if file["name"].lower().endswith(".pcap") ]

    def __str__(self):
        return self.name
        

database = common.Database()

botnets = {}


In [165]:
pcaps = pd.read_sql("SELECT * FROM pcap WHERE infected is true", database.engine)

pcaps["mcfp_id"] = pcaps[pcaps["dataset"] == "CTU-13"]["name"].apply(lambda x: x.split("_")[0])
pcaps["mcfp_pcap"] = pcaps[pcaps["dataset"] == "CTU-13"]["name"].apply(lambda x: x[x.find("_"):])
ids = pcaps.set_index("mcfp_id").copy()
ids["find"] = False
ids["mwname"] = None
ids["sha256"] = None
ids["md5"] = None
botnets = {}
for folder in root_page_dirs:
    folder = MCFPFolder(folder)
    # if folder.type in [ MCFPFolderType.BOTNET, MCFPFolderType.MIXED ] and folder.readme and (not folder.readme.sha256 and not folder.readme.md5):
        # print("%s\n\tmwname: %s\n\tsha256: %s\n\t   md5: %s\n\n" % (folder, folder.readme.mwname, folder.readme.sha256, folder.readme.md5))
    if folder.type == MCFPFolderType.BOTNET:
        try:
            row = ids.loc[folder.id, "name"]
            ids.at[folder.id, "find"] = True
            ids.at[folder.id, "mwname"] = folder.readme.mwname
            ids.at[folder.id, "sha256"] = folder.readme.sha256
            ids.at[folder.id, "md5"] = folder.readme.md5
        except:
            # print("err")
            pass
        pass
    pass

pcap_mw = ids.groupby(["md5"]).agg({
    "name": lambda s: s.to_list(),
    "id": lambda s: s.to_list(),
    "mwname": lambda s: list(set(s.to_list())),
    "sha256": lambda s: list(set(s.to_list())),
    "find": "sum"
})

pcap_mw["mwname"] = pcap_mw["mwname"].apply(lambda n: n[0] if n[0] else None)
pcap_mw["sha256"] = pcap_mw["sha256"].apply(lambda n: n[0] if n[0] else None)

pcap_mw = pcap_mw.reset_index()

pcap_mw


Unnamed: 0,md5,name,id,mwname,sha256,find
0,13fbc418d5a37bdc2c10da11a6ef46ae,[71_2014-04-07_capture-win19.fixed.pcap],[57],,,1
1,14010ce6f03e0a978693424d60e34ba9,[166-1_2016-04-29_win-3.pcap],[37],tinba,8006cbd1c70b2ed096af9c72d6fef2c3e9cb0a41685408...,1
2,1db5333a57f56c4b80bc213ed7675793,[170-1_capture-win6.pcap],[41],necurse,b3ca90d4b289d31bbdefbff7d9b42bfeb0f44f97e4738c...,1
3,24dcfdb1f46e4018500db101234f6cd7,[69_2014-04-07_capture-win17.pcap],[54],caphaw,9966bc568225c509c3e3b3ff9f548ba59f39908200e3e8...,1
4,3018e99857f31a59e0777396ae634a8f,"[226-2_2017-02-27_win16.fixed.pcap, 226-1_2017...","[44, 43]",worm.netsky,c8fffb2e737514c551b2d7bcaf8baa459564b059cab1a3...,2
5,35cf982449765a4f163bcf822e663f03,[125-1_2015-06-07_capture-win5.pcap],[34],,,1
6,43ecaeb983683f57af842c8993e242e6,[100_2014-12-20_capture-win5.fixed.pcap],[27],,,1
7,5aeb4b21066217ea08f77ba9390f31b5,[140-2_2015-10-27_capture-win11.pcap],[36],bunitu,002e9d6217b17923b69710792fd85765c41a0fc5406ce6...,1
8,6323b0f509920c6482d3c5737bb68c60,[16_2013-08-28_capture-win11.pcap],[26],virustotal,bab391b77129150445521fddc073997ca6c7cb3db35eb1...,1
9,6548d6013af8f8ccccf41cf0cd78372b,"[303-1_2017-08-12_capture-win2.pcap, 301-1_cap...","[48, 47]",sathurbot,20ae9e5f8f26635c627afce5eaeeb749af459f55138c80...,2


In [194]:
from sqlalchemy import text
from sqlalchemy.exc import IntegrityError
import psycopg2

for _, row in pcap_mw.iterrows():
    with database.engine.connect() as conn:
        try:
            bo = conn.execute(
                text("""
                INSERT INTO public.malware(
                name, sha256, md5)
                VALUES (:name, :sha256, :md5)
                RETURNING ID;
                """),
                [{"name": row["mwname"], "sha256": row["sha256"], "md5": row["md5"] }]
            )
            conn.commit()
        except IntegrityError as e:
            try:
                raise e.orig
            except psycopg2.errors.UniqueViolation as e:
                pass
            except:
                print(e)

db_malwares = pd.read_sql("""SELECT * FROM malware""", database.engine.connect()).set_index("md5")
for _, row in pcap_mw.iterrows():
    with database.engine.connect() as conn:
        mwid = int(db_malwares.loc[row["md5"], "id"])
        for pcapid in row["id"]:
            try:
                conn.execute(
                    text("""
                    UPDATE PCAP SET MALWARE_ID=:mwid WHERE id=:id;
                    """),
                    [{"id": int(pcapid), "mwid": mwid }]
                )
                conn.commit()
            except IntegrityError as e:
                try:
                    raise e.orig
                except psycopg2.errors.UniqueViolation as e:
                    pass
                except:
                    print(e)


In [205]:

malwares = pd.read_sql("""SELECT * FROM malware""", database.engine.connect()).set_index("id")
pcaps = pd.read_sql("""SELECT * FROM PCAP WHERE dataset='CTU-13' and infected is true""", database.engine.connect())[["name", "malware_id", "id"]]
test = []
for _,pcap in pcaps.iterrows():
    md5 = malwares.loc[pcap["malware_id"], "md5"]
    mcfp_mw = pcap_mw.set_index("md5").loc[md5]
    test.append(pcap["id"] in mcfp_mw["id"])

all(test)

True

In [110]:
"""
Save all pages under `https://mcfp.felk.cvut.cz/publicDatasets/` and its subdirectories
"""

regex_getCTUIDs = r">(CTU-[\w\-\d]*)\/<"

directory_path.mkdir(exist_ok=True)

root_page = download_ifnotexists(root_url, directory_path.joinpath("index.html"))

table = scrape_table(root_page)

root_page_dirs = [ row[1] for row in table if row[1][-1]=="/"]
botnets = {}
for dirname in root_page_dirs:
    folder = MCFPFolder(dirname)
    if folder.type in [ MCFPFolderType.BOTNET, MCFPFolderType.MIXED ] and folder.readme and (not folder.readme.sha256 and not folder.readme.md5):
        print("%s\n\tmwname: %s\n\tsha256: %s\n\t   md5: %s\n\n" % (folder, folder.readme.mwname, folder.readme.sha256, folder.readme.md5))
    pass

for botnet_id in botnets:
    for filename_readme in botnets[botnet_id]["readmes"]["names"]:
        readme = download_ifnotexists(f"{root_url}/{botnet_id}/{filename_readme}", dirpath.joinpath(filename_readme))
        if readme:
            botnets[botnet_id]["files"]["readmes"]["content"].append(readme)
            pass
        pass
    pass

with open("capture.json", "w") as fp:
    json.dump(botnets, fp)

CTU-Malware-Capture-Botnet-25/
	mwname: None
	sha256: None
	   md5: None


CTU-Malware-Capture-Botnet-35-1/
	mwname: None
	sha256: None
	   md5: None


CTU-Malware-Capture-Botnet-73/
	mwname: win1
	sha256: None
	   md5: None


CTU-Malware-Capture-Botnet-102/
	mwname: this
	sha256: None
	   md5: None


CTU-Malware-Capture-Botnet-175-1/
	mwname: None
	sha256: None
	   md5: None


CTU-Malware-Capture-Botnet-205-1/
	mwname: http
	sha256: None
	   md5: None


CTU-Malware-Capture-Botnet-205-2/
	mwname: http
	sha256: None
	   md5: None


CTU-Malware-Capture-Botnet-321-1/
	mwname: None
	sha256: None
	   md5: None


CTU-Malware-Capture-Botnet-340-1/
	mwname: access
	sha256: None
	   md5: None


CTU-Malware-Capture-Botnet-344-1/
	mwname: access
	sha256: None
	   md5: None




In [16]:
with open("capture.json") as fp:
    botnets = json.load(fp)


regexs = []
regexs.append([ "probable name", re.compile(r"probable name:\s+(.*?)$", re.MULTILINE) ])
regexs.append([ "sha256", re.compile(r"sha256:\s+(.*?)$", re.MULTILINE) ])

for botnet_id in botnets:
    if not botnet_id.startswith("CTU"): continue
    print(botnet_id, botnets[botnet_id]["files"])
    if "readme.html" in botnets[botnet_id]["files"]["readmes"]:
        readme = botnets[capture]["readmes"]["readme.html"]
        soup = BeautifulSoup(readme, 'html.parser')
        for li in soup.find_all('li'):
            for regex in regexs:
                tmp = extract_first_group(regex, li.text.lower())
                if tmp:
                    print(tmp)
                pass
            pass
        pass
    break

CTU-13-Dataset/ {'readmes': ['<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">\n<html>\n <head>\n  <title>Index of /publicDatasets/CTU-13-Dataset</title>\n </head>\n <body>\n<h1>Index of /publicDatasets/CTU-13-Dataset</h1>\n  <table>\n   <tr><th valign="top"><img src="/icons/blank.gif" alt="[ICO]"></th><th><a href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last modified</a></th><th><a href="?C=S;O=A">Size</a></th><th><a href="?C=D;O=A">Description</a></th></tr>\n   <tr><th colspan="5"><hr></th></tr>\n<tr><td valign="top"><img src="/icons/back.gif" alt="[PARENTDIR]"></td><td><a href="/publicDatasets/">Parent Directory</a></td><td>&nbsp;</td><td align="right">  - </td><td>&nbsp;</td></tr>\n<tr><td valign="top"><img src="/icons/unknown.gif" alt="[   ]"></td><td><a href="CTU-13-Dataset.tar.bz2">CTU-13-Dataset.tar.bz2</a></td><td align="right">2014-11-01 16:32  </td><td align="right">1.9G</td><td>&nbsp;</td></tr>\n<tr><td valign="top"><img src="/icons/text.gif" alt="[TXT]"></td><

In [8]:
from IPython.display import Markdown
import numpy as np

# display(Markdown(pd.DataFrame(botnets).T["probable name"].value_counts().to_markdown()))
df = pd.DataFrame(botnets).T.fillna(np.NaN)

df[df["sha256"].isna()][["probable name", "sha256"]]

KeyError: 'sha256'

In [68]:
pcaps = pd.read_sql("SELECT * FROM PCAP", database.engine)

dfbackbone = []
for _, pcap in pcaps[pcaps["dataset"] == "CTU-13"].iterrows():
    mcfp_id = pcap["name"][:pcap["name"].find("_")]
    filename = pcap["name"][pcap["name"].find("_") + 1:].replace(".fixed", "").replace(".original", "")
    if pcap["infected"]:
        name = "CTU-Malware-Capture-Botnet-%s" % mcfp_id
        if name in botnets:
            if filename in botnets[name]["files"][".pcap"]:
                if len(botnets[name]["mw"]) == 2:
                    dfbackbone.append([pcap["id"], mcfp_id, filename, botnets[name]["mw"][0], botnets[name]["mw"][1]])
                else:
                    dfbackbone.append([pcap["id"], mcfp_id, filename, "-", "-"])
                pass
            else:
                print(filename, botnets[name]["files"][".pcap"], "\n")

malwares = pd.DataFrame(dfbackbone, columns=["pcaps_id", "MCFP_ID", "filename", "mwname", "mwsha256"])
malwares

353-1.5001-10000.pcap ['353-1.0-1000.pcap', '353-1.0-5000.pcap', '2018-05-07_capture.pcap'] 



Unnamed: 0,pcaps_id,MCFP_ID,filename,mwname,mwsha256
0,52,355-1,2018-05-07_capture-win6.pcap,simda,72e97227948b8a6254e2717c0318115042c1c070abb282...
1,44,226-2,2017-02-27_win16.pcap,worm.netsky,c8fffb2e737514c551b2d7bcaf8baa459564b059cab1a3...
2,42,225-1,capture_win1.pcap,tinba,4904b2bfb4becf349662ba0ff6f3ade860c7e9086a674e...
3,40,169-3,2016-08-03_win4.pcap,miuref,e12a2c2b633ac12cec3e0d32950dcd5011d2aba4a9b955...
4,43,226-1,2017-2-27_win5.pcap,worm.netsky,c8fffb2e737514c551b2d7bcaf8baa459564b059cab1a3...
5,48,303-1,2017-08-12_capture-win2.pcap,sathurbot,20ae9e5f8f26635c627afce5eaeeb749af459f55138c80...
6,26,16,2013-08-28_capture-win11.pcap,-,-
7,57,71,2014-04-07_capture-win19.pcap,-,-
8,30,10,2013-08-20_capture-win7.pcap,-,-
9,37,166-1,2016-04-29_win-3.pcap,tinba,8006cbd1c70b2ed096af9c72d6fef2c3e9cb0a41685408...


In [70]:
from sqlalchemy import text

# tmp = malwares[["pcaps_id", "mwname", "mwsha256"]].groupby("mwsha256").agg({
#         "pcaps_id": lambda x: x.drop_duplicates().to_list(),
#         "mwname": lambda x: ",".join(x.drop_duplicates().to_list())
#     }).reset_index(inplace=True)

print(tmp)

with database.engine.connect() as conn:
    for row in malwares.to_dict(orient="records"):
        # 10
        if row["filename"] in ['2013-08-20_capture-win7.pcap', '2013-08-20_capture-win10.pcap', '2013-08-20_capture-win7.pcap', '2013-08-20_capture-win9.pcap']:
            row["mwsha256"] = "bab391b77129150445521fddc073997ca6c7cb3db35eb1a7c45c65ca365370c3"
            row["mwname"] = "oleprn.dll"
            pass
        # 100
        if row["filename"] in ['2014-12-20_capture-win5.pcap']:
            row["mwsha256"] = "8bfa2d39be44de91958226983d983a07c1bcf75b007ab17b295215d08a63f032"
            row["mwname"] = "WINAPI32.EXE"
            pass
        # 110-6
        if row["filename"] in ['2015-06-07_capture-win9.pcap']:
            row["mwsha256"] = "517da0900570ec9e9793d9e32a8445b5910b95618c4e1ea63da799d4a785dcd9"
            row["mwname"] = "htbot.exe"
            pass
        # 111-3
        if row["filename"] in ['2015-04-22_capture-win10.pcap']:
            row["mwsha256"] = "517da0900570ec9e9793d9e32a8445b5910b95618c4e1ea63da799d4a785dcd9"
            row["mwname"] = "htbot.exe"
            pass
        # 125-1
        if row["filename"] in ['2015-06-07_capture-win5.pcap']:
            row["mwsha256"] = "ce0889dcae941653565fa2580ca1062173540a157c30173aca853fbee335df07"
            row["mwname"] = "Geodo"
            pass
        # 126-1
        if row["filename"] in ['2015-06-07_capture-win7.pcap']:
            row["mwsha256"] = "4f236c31c1a9867013656960d11108f315a0f00f698a72836141082c7d9fce80"
            row["mwname"] = "Geodo/Feodo"
            pass
        # 16
        if row["filename"] in ['2013-08-28_capture-win11.pcap']:
            row["mwsha256"] = "bab391b77129150445521fddc073997ca6c7cb3db35eb1a7c45c65ca365370c3"
            row["mwname"] = "oleprn.dll"
            pass
        # 5
        if row["filename"] in ['2013-08-20_capture-win14.pcap']:
            row["mwsha256"] = "80334036d44bfa4a2bdc0fa5b75d015513d00e38f26f043fb7f26fabcad0c2b7"
            row["mwname"] = "8219s.exe"
            pass
        # 6
        if row["filename"] in ['2013-08-20_capture-win6.pcap', '2013-08-20_capture-win11.pcap']:
            row["mwsha256"] = "dc991de6ae9989902c1e59d30050223de5a0c1327c2e54e0f6da8134e3f447ca"
            row["mwname"] = "3ab45z.exe"
            pass
        # 7
        if row["filename"] in ['2013-08-20_capture-win1.pcap']:
            row["mwsha256"] = "781a96f056e43d9f9765c38f24e557e4cbe41d9d0bf17d637ce20c33248f8899"
            row["mwname"] = "FzPfH6.exe"
            pass
        # 71
        if row["filename"] in ['2014-04-07_capture-win19.pcap']:
            row["mwsha256"] = "efe31de9456d2661a2026a4bac9ebf1061b74290010aef95f11bab8c21241c59"
            row["mwname"] = "salesforce_ssl_cert.zip"
            pass
        try:
            conn.execute(
                text("INSERT INTO public.malware(name, sha256, dga, info, mwclass, mwtype) VALUES (:name, :sha256);"),
                [{ "name": row["mwname"], "sha256": row["mwsha256"] }]
            )
            conn.commit()
        except:
            pass

with database.engine.connect() as conn:
    for pcap_id in row["pcaps_id"]:
        mw = pd.read_sql("""SELECT * FROM MALWARE WHERE SHA256 = '%s'""" % row["mwsha256"], database.engine)
        if mw["id"].shape[0] > 0:
            bo = conn.execute(
                text("UPDATE public.pcap SET malware_id = :mwid WHERE id = :pcapid;"),
                [{"pcapid": int(pcap_id), "mwid":  int(mw["id"].iloc[0])}]
            )
            conn.commit()
            pass
        pass
    pass

None


AttributeError: 'NoneType' object has no attribute 'iterrows'

In [66]:
# ukws = malwares[malwares["mwsha256"] == "-"].groupby("MCFP_ID").agg({"filename": lambda x: x.to_list()}).reset_index()
# for _, row in ukws.iterrows():
#     print(f"""
#     # {row["MCFP_ID"]}
#     if row["filename"] in {row["filename"]}:
#         row["mwsha256"] = ""
#         row["mwname"] = ""
#         pass
#     """, end="")
ukws = malwares[malwares["mwsha256"] == "-"].to_dict(orient="records")
for row in ukws:


with database.engine.connect() as conn:
    for row in ukws:
        try:
            bo = conn.execute(
                text("""
                INSERT INTO public.malware(
                name, sha256, dga, info, mwclass, mwtype)
                VALUES (:name, :sha256, :dga, :info, :mwclass, :mwtype)
                RETURNING ID;
                """),
                [{"name": row["mwname"], "sha256": row["mwsha256"], "dga": None, "info": None, "mwclass": None, "mwtype": None}]
            )
            conn.commit()
        except:
            pass

with database.engine.connect() as conn:
    for row in ukws:
        mw = pd.read_sql("""SELECT * FROM MALWARE WHERE SHA256 = '%s'""" % row["mwsha256"], database.engine)
        if row["id"].shape[0] > 0:
            bo = conn.execute(
                text("""
                UPDATE public.pcap
                SET malware_id = :mwid
                WHERE id = :pcapid
                """),
                [{"pcapid": int(pcap_id), "mwid":  int(mw["id"].iloc[0])}]
            )
            conn.commit()
            pass
        pass
    pass

[{'pcaps_id': 26,
  'MCFP_ID': '16',
  'filename': '2013-08-28_capture-win11.pcap',
  'mwname': 'oleprn.dll',
  'mwsha256': 'bab391b77129150445521fddc073997ca6c7cb3db35eb1a7c45c65ca365370c3'},
 {'pcaps_id': 57,
  'MCFP_ID': '71',
  'filename': '2014-04-07_capture-win19.pcap',
  'mwname': 'salesforce_ssl_cert.zip',
  'mwsha256': 'efe31de9456d2661a2026a4bac9ebf1061b74290010aef95f11bab8c21241c59'},
 {'pcaps_id': 30,
  'MCFP_ID': '10',
  'filename': '2013-08-20_capture-win7.pcap',
  'mwname': 'oleprn.dll',
  'mwsha256': 'bab391b77129150445521fddc073997ca6c7cb3db35eb1a7c45c65ca365370c3'},
 {'pcaps_id': 28,
  'MCFP_ID': '10',
  'filename': '2013-08-20_capture-win10.pcap',
  'mwname': 'oleprn.dll',
  'mwsha256': 'bab391b77129150445521fddc073997ca6c7cb3db35eb1a7c45c65ca365370c3'},
 {'pcaps_id': 56,
  'MCFP_ID': '6',
  'filename': '2013-08-20_capture-win6.pcap',
  'mwname': '3ab45z.exe',
  'mwsha256': 'dc991de6ae9989902c1e59d30050223de5a0c1327c2e54e0f6da8134e3f447ca'},
 {'pcaps_id': 29,
  'MCFP

In [None]:


def insert_malware(db: common.Database, name, sha256, dga=None, info=None, mwclass=None, mwtype=None):

    with database.engine.connect() as conn:
        conn.execute(
            text("""
            INSERT INTO public.malware(
            name, sha256, dga, info, mwclass, mwtype)
            VALUES (:name, :sha256, :dga, :info, :mwclass, :mwtype);
            """),
            [{"name": name, "sha256": sha256, "dga": dga, "info": info, "mwclass": mwclass, "mwtype": mwtype}]
        )
        conn.commit()
        pass




In [None]:
pcaps = pd.read_sql("SELECT * FROM PCAP", database.engine)
malwares = pd.read_sql("SELECT * FROM MALWARE WHERE dga!=0", database.engine)

In [None]:
pcaps[(pcaps["dataset"] == "CTU-13") & pcaps["infected"]]

In [None]:
readmes = {}
for _, pcap in pcaps[(pcaps["dataset"] == "CTU-13") & pcaps["infected"]].iterrows():
    mcfp_id = pcap["name"].split("_")[0]
    print(pcap["name"], pcap["name"].split("_")[0])
    readmes[mcfp_id] = download_readme("https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-%s/README.md", pcap["name"].split("_")[0])


In [None]:
for pcap in readmes:
    if readmes[pcap] is False:
        print(pcap)

In [None]:
malwares

In [None]:
pcaps[pcaps["dataset"] == "CTU-13"][["id", "malware_id"]].groupby("malware_id").count()

In [None]:
pcaps[(pcaps["dataset"] == "CTU-13") & pcaps["infected"]]["name"].str.split("_").apply(lambda x: x[0])

# 353-1

- Probable Name: Simda
- MD5: eb7c74c66f801abde07e0d1a72cbec79
- SHA1: cab0e0925479ae7f2dac4c0529e5811f1d12e563
- SHA256: 72e97227948b8a6254e2717c0318115042c1c070abb2825a0a9fdff1fbd98f1e
- Password of zip file: infected

In [None]:
import psycopg2

try:
    insert_malware(database, name="Simda", sha256="72e97227948b8a6254e2717c0318115042c1c070abb2825a0a9fdff1fbd98f1e")
except:
    print("not inserted")
    pass

In [None]:

with open("pcap_43.ipynb") as fp:
    json = fp.read()

print(json)

for id, pcap in pcaps.iterrows():
    