In [8]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import ViTForImageClassification, ViTFeatureExtractor
from mitreattack.navlayers import Layer
import requests

In [9]:
df = pd.read_csv('train1.csv')
df.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0000028988387b115f69f31a3bf04f09,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1735.0,0,7,0,,53447,...,36144.0,0,,0.0,0,0,0,0.0,10.0,0
1,000007535c3f730efa9ea0b7ef1bd645,win8defender,1.1.14600.4,4.13.17134.1,1.263.48.0,0,7,0,,53447,...,57858.0,0,,0.0,0,0,0,0.0,8.0,0
2,000007905a28d863f6d0d597892cd692,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1341.0,0,7,0,,53447,...,52682.0,0,,0.0,0,0,0,0.0,3.0,0
3,00000b11598a75ea8ba1beea8459149f,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1527.0,0,7,0,,53447,...,20050.0,0,,0.0,0,0,0,0.0,3.0,1
4,000014a5f00daa18e76b81417eeb99fc,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1379.0,0,7,0,,53447,...,19844.0,0,0.0,0.0,0,0,0,0.0,1.0,1


In [10]:
df.describe()

Unnamed: 0,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
count,100.0,100.0,100.0,9.0,100.0,100.0,100.0,100.0,100.0,90.0,...,98.0,100.0,36.0,99.0,100.0,100.0,100.0,98.0,98.0,100.0
mean,0.0,6.86,0.02,1198.0,45765.77,1.33,1.01,0.99,109.74,74544.133333,...,32817.857143,0.43,0.0,0.0,0.11,0.04,0.05,0.244898,7.285714,0.49
std,0.0,0.984937,0.140705,779.924516,14813.245567,0.472582,0.1,0.1,63.431442,48663.925945,...,20634.524979,0.49757,0.0,0.0,0.314466,0.196946,0.219043,0.432238,4.545305,0.502418
min,0.0,0.0,0.0,146.0,39.0,1.0,1.0,0.0,3.0,167.0,...,2109.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.0,7.0,0.0,614.0,46605.0,1.0,1.0,1.0,53.0,29826.0,...,14915.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
50%,0.0,7.0,0.0,1161.0,53447.0,1.0,1.0,1.0,100.0,71041.0,...,33096.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0
75%,0.0,7.0,0.0,1910.0,53447.0,2.0,1.0,1.0,164.0,117080.75,...,51037.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,1.0
max,0.0,7.0,1.0,2064.0,62773.0,2.0,2.0,1.0,214.0,162796.0,...,70432.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,15.0,1.0


In [2]:

# Function to parse bytes file and convert to numpy array
def parse_bytes_file(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    byte_values = []
    for line in lines:
        parts = line.strip().split()
        if len(parts) > 1:
            byte_values.extend(parts[1:])
    
    # Convert hexadecimal to integer, ignoring '??' values
    byte_values = [int(b, 16) if b != '??' else 0 for b in byte_values]
    return np.array(byte_values, dtype=np.uint8)

In [3]:

# Function to convert byte array to image
def bytes_to_image(byte_array, image_size):
    target_size = image_size[0] * image_size[1]
    
    # Truncate or pad the byte array to match the target size
    if len(byte_array) > target_size:
        byte_array = byte_array[:target_size]
    else:
        byte_array = np.pad(byte_array, (0, target_size - len(byte_array)), 'constant')
    
    byte_array = byte_array.reshape(image_size)
    byte_array = np.stack([byte_array]*3, axis=-1)  # Convert to 3 channels
    return Image.fromarray(byte_array)

In [4]:

# Process each file in the dataset directory
def process_directory(input_dir, output_dir, image_size):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for file_name in os.listdir(input_dir):
        if file_name.endswith('.bytes'):
            file_path = os.path.join(input_dir, file_name)
            byte_array = parse_bytes_file(file_path)
            image = bytes_to_image(byte_array, image_size)
            output_path = os.path.join(output_dir, f'{os.path.splitext(file_name)[0]}.png')
            image.save(output_path)
            print(f'Saved image to {output_path}')

In [5]:

# Example usage
input_dir = 'ransomware_dataset'  # Directory containing .bytes files
output_dir = 'ransomware_images'  # Directory to save the generated images
image_size = (256, 256)  # Resize to 256x256

process_directory(input_dir, output_dir, image_size)

def image_to_tensor(image_path, transform):
    image = Image.open(image_path)
    return transform(image)

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # 将图像调整为 224x224 以适应 ViT
    transforms.ToTensor(),
])

# Convert saved images to tensor
for file_name in os.listdir(output_dir):
    if file_name.endswith('.png'):
        image_path = os.path.join(output_dir, file_name)
        tensor_image = image_to_tensor(image_path, transform)
        # print(tensor_image.shape)  # Print tensor shape for verification

FileNotFoundError: [WinError 3] 系統找不到指定的路徑。: 'ransomware_dataset'

In [None]:

# Define custom dataset
class RansomwareDataset(Dataset):
    def __init__(self, img_dir, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        self.image_paths = [os.path.join(img_dir, img) for img in os.listdir(img_dir)]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert("RGB")  # Convert image to RGB
        label = 0  # Replace with actual label extraction logic
        if self.transform:
            image = self.transform(image)
        return image, label


In [None]:

# Example usage
img_dir = 'ransomware_images'
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])
dataset = RansomwareDataset(img_dir, transform)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Load model and feature extractor
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=2)  # Adjust num_labels as necessary
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

# Training loop (simplified)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
model.train()

for epoch in range(1):
    for images, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(images, labels=labels)  # Provide labels to compute loss
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

print("Training complete.")