## Setup - Import modules and set up database connections

In [1]:
# import necessary modules and set up database connection

import json
import logging
import os

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


# Resolve the project root in a way that is safe to re-run.
# On a fresh kernel the working directory sits one level below the project
# root (settings are read from ../settings.json). After this cell has run once
# the working directory already IS the project root, so a blind second
# `os.chdir('..')` would climb too far and the settings file would not be found.
if os.path.isfile('settings.json'):
    project_root = os.getcwd()
else:
    project_root = os.path.abspath('..')

# Load settings from settings.json
settings_file = os.path.join(project_root, 'settings.json')
with open(settings_file) as f:
    settings = json.load(f)

# Database setup
DATABASE_URL = settings['sqlalchemy_database_uri']
engine = create_engine(DATABASE_URL, echo=False)
Session = sessionmaker(bind=engine)

# move current directory to the project root
# (presumably so that the project-local `models` module below is importable)
os.chdir(project_root)

from models import Sample, Tag, Analysis, sample_tag, Prototypes, Ingredient, Candidate
session = Session()

## Count all major entities

In [None]:
import os, json
import sys

# project root dir
# The setup cell at the top of the notebook chdir's into the project root, so
# the working directory may be the root itself or a directory one level below
# it. Look for settings.json in the working directory first and only fall back
# to the parent; always taking the parent breaks once the setup cell has run.
working_dir = os.getcwd()
if os.path.isfile(os.path.join(working_dir, 'settings.json')):
    project_root = working_dir
else:
    project_root = os.path.dirname(working_dir)
settings_file = os.path.join(project_root, 'settings.json')
with open(settings_file, 'r') as f:
    settings = json.load(f)

# Make the project-local `models` module importable even when the kernel's
# working directory is not the project root.
# NOTE(review): assumes `models` lives next to settings.json - confirm.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker
from models import Sample, Tag, Analysis, sample_tag, Ingredient, Candidate

# Database setup
# NOTE: this rebinds `engine`, `Session` and `session`; any session created by
# an earlier cell is replaced, not closed.
DATABASE_URL = settings['sqlalchemy_database_uri']
engine = create_engine(DATABASE_URL, echo=False)
Session = sessionmaker(bind=engine)
session = Session()

Number of samples in the database: 5944
Number of tags in the database: 11
Number of analyses in the database: 17878
Number of prototypes in the database: 0
Number of ingredients in the database: 0
Number of candidates in the database: 8


## Get tags and counts

In [3]:
# Build a {(tag key, tag value): number of samples} mapping and print it.
# expire_all() drops cached ORM state so the numbers reflect the current DB.
session.expire_all()

tags = session.query(Tag).all()

# One COUNT query per tag, restricted through the sample_tag association table.
tag_sample_counts = {
    (tag.key, tag.value): (
        session.query(Sample)
        .join(sample_tag)
        .filter(sample_tag.c.tag_id == tag.id)
        .count()
    )
    for tag in tags
}

print("\nTag Sample Counts:")
# NOTE: `tag_value` here is the whole (key, value) tuple used as the dict key.
for tag_value, sample_count in tag_sample_counts.items():
    print(f"Tag: {tag_value}, Sample Count: {sample_count}")


Tag Sample Counts:
Tag: ('family', 'bsi'), Sample Count: 3
Tag: ('tatic', 'scheduled_task'), Sample Count: 3
Tag: ('ttp', 'wmi'), Sample Count: 1
Tag: ('ttp', 'com'), Sample Count: 1
Tag: ('ttp', 'cmd'), Sample Count: 1
Tag: ('family', 'benign'), Sample Count: 5908
Tag: ('class', 'wmi'), Sample Count: 15
Tag: ('class', 'purple'), Sample Count: 1
Tag: ('class', 'cow'), Sample Count: 1
Tag: ('class', 'com'), Sample Count: 8
Tag: ('class', 'cmd'), Sample Count: 5


In [4]:
# List every sample associated with one specific (key, value) tag.
session.expire_all()

tag_key = 'class'
tag_value = 'wmi'

# Sample -> sample_tag -> Tag, then narrow down to the requested tag.
tagged_samples_query = (
    session.query(Sample)
    .join(sample_tag)
    .join(Tag)
    .filter(Tag.key == tag_key, Tag.value == tag_value)
)
samples_with_tag = tagged_samples_query.all()

print(f"\nSamples with tag ({tag_key}={tag_value}):")
for sample in samples_with_tag:
    print(f"Sample ID: {sample.sha256}, Name: {sample.filepath}")


Samples with tag (class=wmi):
Sample ID: 3e538fdd57f368e3cbd31c14a9e1e6880c81e94c93282871c903020471a14190, Name: /mnt/data/gimc/3e/3e53/3e538fdd57f368e3cbd31c14a9e1e6880c81e94c93282871c903020471a14190
Sample ID: f2a839f3eac858ddb450a162a9faa6fe54391fb0f0b0c715584cffe36db6e191, Name: /mnt/data/gimc/f2/f2a8/f2a839f3eac858ddb450a162a9faa6fe54391fb0f0b0c715584cffe36db6e191
Sample ID: b788d12fed5dd6ad3c0331ee21e8b4c7f568b7f38116457a83370875a1315971, Name: /mnt/data/gimc/b7/b788/b788d12fed5dd6ad3c0331ee21e8b4c7f568b7f38116457a83370875a1315971
Sample ID: 79a9409ff29b2e967161e01a0f27bcb0153a66a604e667120e30a8c09ca8deef, Name: /mnt/data/gimc/79/79a9/79a9409ff29b2e967161e01a0f27bcb0153a66a604e667120e30a8c09ca8deef
Sample ID: c38bf4cb95005533dd52991b059bfbc60a13f81590e04735bd8a5ace221ee14b, Name: /mnt/data/gimc/c3/c38b/c38bf4cb95005533dd52991b059bfbc60a13f81590e04735bd8a5ace221ee14b
Sample ID: 6a181382dbbf14cdab0262153bf0bcc85957f95b8d720ebe93295fe520b7cdd1, Name: /mnt/data/gimc/6a/6a18/6a181382

In [None]:
# Count the rows of each major entity table and report every count.
count_Candidates = session.query(Candidate).count()
count_samples = session.query(Sample).count()
count_Analyses = session.query(Analysis).count()
count_Tags = session.query(Tag).count()
count_Ingredients = session.query(Ingredient).count()

print(f"Number of candidates in the database: {count_Candidates}")
print(f"Number of samples in the database: {count_samples}")
print(f"Number of analyses in the database: {count_Analyses}")
# Tags and ingredients were queried above but previously never reported.
print(f"Number of tags in the database: {count_Tags}")
print(f"Number of ingredients in the database: {count_Ingredients}")


Analyses with tag (class=wmi):
Analysis ID: 17848, Sample ID: f2a839f3eac858ddb450a162a9faa6fe54391fb0f0b0c715584cffe36db6e191, Analysis Status: 2
Analysis ID: 17849, Sample ID: b788d12fed5dd6ad3c0331ee21e8b4c7f568b7f38116457a83370875a1315971, Analysis Status: 2
Analysis ID: 17859, Sample ID: 49fa19821dc17169120ff0160580ba0053ac882f263e5d94a5fa9dd26bb1eba3, Analysis Status: 2
Analysis ID: 17853, Sample ID: 79a9409ff29b2e967161e01a0f27bcb0153a66a604e667120e30a8c09ca8deef, Analysis Status: 2
Analysis ID: 17860, Sample ID: d078cb881b19d2e9ed54fe04985cf59f153ea14d33b20668817d44488915c019, Analysis Status: 2
Analysis ID: 17854, Sample ID: c38bf4cb95005533dd52991b059bfbc60a13f81590e04735bd8a5ace221ee14b, Analysis Status: 2
Analysis ID: 17865, Sample ID: 2e5ce94e324f3cda8e0da23f9b71387f7c4f13793c22eb9df180e61946c425f1, Analysis Status: 2
Analysis ID: 17856, Sample ID: 6a181382dbbf14cdab0262153bf0bcc85957f95b8d720ebe93295fe520b7cdd1, Analysis Status: 2
Analysis ID: 17861, Sample ID: 3d4049c24

In [6]:
# Tally analyses per status value with a single GROUP BY query.
session.expire_all()

from sqlalchemy import func

status_count_query = (
    session.query(Analysis.status, func.count(Analysis.id))
    .group_by(Analysis.status)
)
# Each row is a (status, number of analyses with that status) pair.
analysis_status_counts = status_count_query.all()

print("\nAnalysis Status Counts:")
for status, count in analysis_status_counts:
    print(f"Status: {status}, Count: {count}")


Analysis Status Counts:
Status: 2, Count: 17331
Status: 3, Count: 547


In [7]:
# Look up every analysis that belongs to the sample with the given SHA256.
session.expire_all()

sample_sha256 = 'f2a839f3eac858ddb450a162a9faa6fe54391fb0f0b0c715584cffe36db6e191'
analysis_for_sample = (
    session.query(Analysis)
    .join(Sample)
    .filter(Sample.sha256 == sample_sha256)
    .all()
)

print(f"\nAnalyses for sample with SHA256 {sample_sha256}:")
if analysis_for_sample:
    for analysis in analysis_for_sample:
        print(f"Analysis ID: {analysis.id}, Status: {analysis.status}")
else:
    print("No analyses found for this sample.")


Analyses for sample with SHA256 f2a839f3eac858ddb450a162a9faa6fe54391fb0f0b0c715584cffe36db6e191:
Analysis ID: 17848, Status: 2


In [8]:
# Fetch a single analysis by primary ID and print its status and sample.
session.expire_all()

analysis_id = 17849  # replace with desired analysis ID
analysis_query = session.query(Analysis).filter(Analysis.id == analysis_id)
analysis = analysis_query.first()  # None when no row matches

if not analysis:
    print(f"\nNo analysis found with ID {analysis_id}.")
else:
    print(f"\nAnalysis ID: {analysis.id}, Status: {analysis.status}, Sample ID: {analysis.sample}")


Analysis ID: 17849, Status: 2, Sample ID: b788d12fed5dd6ad3c0331ee21e8b4c7f568b7f38116457a83370875a1315971


In [9]:
# Print one summary line per candidate, including its 'class' tag if it has one.
session.expire_all()

candidates = session.query(Candidate).all()
print("\nCandidates:")
for candidate in candidates:
    # Collect all 'class' tag values; when several exist the last one wins.
    class_values = [t.value for t in candidate.tags if t.key == 'class']
    class_tag_value = class_values[-1] if class_values else None
    print(f"Candidate ID: {candidate.hash}, Status: {candidate.status}, F1: {candidate.F1}, F2: {candidate.F2}, F3: {candidate.F3}, Analysis ID: {candidate.analysis_id}, Error: {candidate.error_message}, Class Tag: {class_tag_value}")



Candidates:
Candidate ID: 7abec6d0e8b676015f472d274491b5a8d5ee0339ea4b53aa55c6a7e3694db9ee, Status: 3, F1: 0.2, F2: 0.3333333333333333, F3: 0.02028191275894642, Analysis ID: 17861, Error: None, Class Tag: None
Candidate ID: a29097919f40ba277abda67cdf616d59eda96649b3b2d8d50a8b4410c642ffc6, Status: 3, F1: 1.0, F2: 0.3333333333333333, F3: 0.02028191275894642, Analysis ID: 17862, Error: None, Class Tag: None
Candidate ID: 76b6cc1637d9e65dae68fd71f00883652bd7914f47016f7189fb451ec08e187e, Status: 3, F1: 0.5, F2: 1.0, F3: 0.989303469657898, Analysis ID: 17863, Error: Build VM timeout, Class Tag: None
Candidate ID: a4e381fdab1f71481eb33e09e9528800234c7da84a6d30f63f3339b20c03e71b, Status: 3, F1: 1.0, F2: 1.0, F3: 0.989303469657898, Analysis ID: 17866, Error: None, Class Tag: None
Candidate ID: 33b2547820326675e8356ec6cad137b52e1b57990e66e180b4089356c7753b43, Status: 3, F1: 1.0, F2: 1.0, F3: 0.9981027245521545, Analysis ID: 17872, Error: None, Class Tag: cmd
Candidate ID: 67f5f2678acff690d5dc4c

In [10]:
# # update status of a candidate to 0
# session.expire_all()

# candidate_hash = 'baa9f40b8d15fd64663654eb3c7f299fb33b904d6a8d3fd0918fbef8a162f86b'  # replace with actual candidate hash
# candidate = session.query(Candidate).filter(Candidate.hash == candidate_hash).first()
# if candidate:
#     candidate.status = 0  # replace with desired status
#     candidate.F1 = None
#     candidate.F2 = None
#     candidate.F3 = None
#     candidate.analysis_id = None
#     session.commit()
#     print(f"\nUpdated candidate {candidate_hash} to status '{candidate.status}'.")
# else:
#     print(f"\nNo candidate found with hash {candidate_hash}.")

In [11]:
# Show the stored fields of one candidate plus the start of its decoded code.
session.expire_all()

import base64

candidate_hash = 'a4e381fdab1f71481eb33e09e9528800234c7da84a6d30f63f3339b20c03e71b'  # replace with actual candidate hash
candidate = (
    session.query(Candidate)
    .filter(Candidate.hash == candidate_hash)
    .first()
)

if not candidate:
    print(f"\nNo candidate found with hash {candidate_hash}.")
else:
    print(f"\nCandidate Details for {candidate_hash}:")
    print(f"Hash: {candidate.hash}")
    print(f"Status: {candidate.status}")
    print(f"F1: {candidate.F1}")
    print(f"F2: {candidate.F2}")
    print(f"F3: {candidate.F3}")
    print(f"Analysis ID: {candidate.analysis_id}")
    # The code is stored base64-encoded; decode it to text (undecodable bytes
    # are dropped) and show only the first 500 characters as a preview.
    raw_code_bytes = base64.b64decode(candidate.code)
    decoded_code = raw_code_bytes.decode('utf-8', errors='ignore')
    print(f"Code Snippet: {decoded_code[:500]}")


Candidate Details for a4e381fdab1f71481eb33e09e9528800234c7da84a6d30f63f3339b20c03e71b:
Hash: a4e381fdab1f71481eb33e09e9528800234c7da84a6d30f63f3339b20c03e71b
Status: 3
F1: 1.0
F2: 1.0
F3: 0.989303469657898
Analysis ID: 17866
Code Snippet: #include <windows.h>
#include <comdef.h>
#include <Wbemidl.h>
#include <iostream>

#define INSTALL_NAME L"GIMCTestBSI"
#define TIMER_INTERVAL 2000  // 2 seconds in milliseconds

// Simple benign payload - writes timestamp to file to demonstrate persistence
const wchar_t* PAYLOAD_SCRIPT = 
    L"var fso = new ActiveXObject('Scripting.FileSystemObject');"
    L"var logFile = 'C:\\\\Users\\\\Public\\\\gimc_test.log';"
    L"var f = fso.OpenTextFile(logFile, 8, true);"
    L"var d = new Date();"
   


In [12]:
# Print the full decoded source of the candidate looked up in the previous cell.
# `decoded_code` is only assigned there when the candidate was found, so this
# cell depends on that lookup having succeeded.
print(decoded_code)

#include <windows.h>
#include <comdef.h>
#include <Wbemidl.h>
#include <iostream>

#define INSTALL_NAME L"GIMCTestBSI"
#define TIMER_INTERVAL 2000  // 2 seconds in milliseconds

// Simple benign payload - writes timestamp to file to demonstrate persistence
const wchar_t* PAYLOAD_SCRIPT = 
    L"var fso = new ActiveXObject('Scripting.FileSystemObject');"
    L"var logFile = 'C:\\\\Users\\\\Public\\\\gimc_test.log';"
    L"var f = fso.OpenTextFile(logFile, 8, true);"
    L"var d = new Date();"
    L"f.WriteLine('[' + d.toLocaleString() + '] WMI persistence triggered');"
    L"f.Close();";

class WMIPersistence {
private:
    IWbemLocator* pLoc;
    IWbemServices* pSvc;
    
public:
    WMIPersistence() : pLoc(NULL), pSvc(NULL) {}
    
    ~WMIPersistence() {
        if (pSvc) pSvc->Release();
        if (pLoc) pLoc->Release();
        CoUninitialize();
    }
    
    bool Initialize() {
        HRESULT hres;
        
        // Initialize COM
        hres = CoInitializeEx(0, COINIT_MULTI

In [13]:
# get sample (and its tags) from analysis ID
session.expire_all()

analysis_id = 5000  # replace with desired analysis ID
analysis = session.query(Analysis).filter(Analysis.id == analysis_id).first()
if not analysis:
    # Report a missing analysis explicitly instead of printing nothing.
    print(f"\nNo analysis found with ID {analysis_id}.")
else:
    # analysis.sample holds the sample's sha256, which is used as the lookup key
    sample = session.query(Sample).filter(Sample.sha256 == analysis.sample).first()
    if not sample:
        print(f"\nNo sample found for Analysis ID {analysis_id}.")
    else:
        print(f"\nSample for Analysis ID {analysis_id}:")
        print(f"Sample ID: {sample.sha256}, Name: {sample.filepath}")
        # get all tags for the sample
        # This must only run when a sample was found: dereferencing
        # `sample.sha256` on a missing sample would raise AttributeError.
        tags = session.query(Tag).join(sample_tag).join(Sample).filter(Sample.sha256 == sample.sha256).all()
        print(f"\nTags for Sample ID {sample.sha256}:")
        for tag in tags:
            print(f"Tag: {tag.key}={tag.value}")


Sample for Analysis ID 5000:
Sample ID: 9417a8191a87057481807e31fe64079c63d6468c3bd6b3d6aae0de3fac8f3baa, Name: /mnt/data/gimc/94/9417/9417a8191a87057481807e31fe64079c63d6468c3bd6b3d6aae0de3fac8f3baa

Tags for Sample ID 9417a8191a87057481807e31fe64079c63d6468c3bd6b3d6aae0de3fac8f3baa:
Tag: family=bsi
Tag: tatic=scheduled_task
Tag: ttp=com


In [14]:
# Load the tokenizer and the CNN classifier checkpoint used for inference below.
import torch
from classifier.models.cnn_nlp import CNN_NLP

# Use the first GPU when available, otherwise run on the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
from transformers import AutoTokenizer
classifier_path = '/mnt/data/gimc/classifier/model_data/cnn4bsi_checkpoint.pth'
tokenizer_path = '/mnt/data/gimc/classifier/model_data/mal-reformer'

TOKENIZER = AutoTokenizer.from_pretrained(tokenizer_path)
# Explicitly set the special tokens on the loaded tokenizer.
TOKENIZER.pad_token = "[PAD]"
TOKENIZER.cls_token = "[CLS]"
TOKENIZER.sep_token = "[SEP]"

# map_location remaps the stored tensors onto DEVICE, so a checkpoint that was
# saved from a GPU still loads on a CPU-only host.
checkpoint = torch.load(classifier_path, map_location=DEVICE)

# Model hyper-parameters.
# NOTE(review): these presumably have to match the configuration the
# checkpoint was trained with - confirm against the training setup.
vocab_size = 20000
embed_dim = 128
num_classes = 4
dropout = 0.5
MODEL = CNN_NLP(
    pretrained_embedding=None,
    freeze_embedding=False,
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    filter_sizes=[3, 4, 5],
    num_filters=[10, 10, 10],
    num_classes=num_classes,
    dropout=dropout
)
# Restore the weights from the last entry of the checkpoint's 'model_states'.
MODEL.load_state_dict(checkpoint['model_states'][-1])
MODEL.to(DEVICE)
MODEL.eval()  # inference mode: disables dropout

def mal_tokenizer(line: str) -> list:
    """
    Tokenize a line of text.

    The line is lower-cased, commas and backslashes are treated as
    separators, and the result is split on whitespace.

    Args:
        line: Raw text line to tokenize.

    Returns:
        List of lower-cased tokens; an empty list when the line contains
        no tokens.
    """
    line = line.lower()
    line = line.replace(',', ' ')
    # A single pass handles single and doubled backslashes alike: every
    # backslash becomes a space and split() collapses runs of whitespace,
    # so a separate replacement for '\\\\' could never match anything.
    line = line.replace('\\', ' ')
    return line.split()

# Tokenize the dynamic section of the current analysis report and classify it.
# Relies on `analysis`, `json`, `TOKENIZER`, `MODEL`, `DEVICE` and
# `mal_tokenizer` defined by earlier cells.
with open(analysis.report) as report_file:
    report = report_file.read()

dynamic_report = json.loads(report)['dynamic']

# Every event contributes the tokens of its "Operation, Path, Result" line.
dynamic_report_tokenized = [
    token
    for item in dynamic_report
    for token in mal_tokenizer(f"{item['Operation']}, {item['Path']}, {item['Result']}")
]

report_text = " ".join(dynamic_report_tokenized)

# Pad / truncate every report to one fixed sequence length.
MAX_SEQUENCE_LENGTH = 20480 * 2
inputs = TOKENIZER(
    report_text,
    padding='max_length',
    truncation=True,
    max_length=MAX_SEQUENCE_LENGTH,
    return_tensors='pt'
)
inputs = inputs.to(DEVICE)

input_ids = inputs['input_ids']

# Run inference without tracking gradients
with torch.no_grad():
    logits = MODEL(input_ids)
    # Softmax over the last dimension turns the logits into probabilities
    probs = torch.nn.functional.softmax(logits, dim=-1)

# Turn probs into a flat Python list of probabilities
probabilities = probs.cpu().numpy().flatten().tolist()
print("Class Probabilities:")
for i, prob in enumerate(probabilities):
    print(f"Class {i}: {prob:.4f}")

Class Probabilities:
Class 0: 0.0042
Class 1: 0.9948
Class 2: 0.0000
Class 3: 0.0010
