## Dataset

What is cpe? https://www.acunetix.com/blog/articles/common-platform-enumeration-cpe-explained/

In [1]:
import pandas as pd
import numpy as np
import json
import datasets
np.random.seed(42)

datapath = '/mnt/data/sonia/honeygan/data/processed/data.csv'
df = pd.read_csv(datapath)
print(df.isna().sum())
df.head()

os                 0
ip_str             0
port               0
module             0
cpe           734580
cpe_count          0
category           0
os_generic         0
single_cpe    824511
dtype: int64


Unnamed: 0,os,ip_str,port,module,cpe,cpe_count,category,os_generic,single_cpe
0,Ubuntu,43.205.13.243,21,ftp,,0,file_sharing,ubuntu,
1,Ubuntu,43.205.13.243,22,ssh,['cpe:/a:openbsd:openssh:8.2p1 Ubuntu-4ubuntu0...,1,remote_access,ubuntu,cpe:/a:openbsd:openssh:8.2p1 Ubuntu-4ubuntu0.5
2,Ubuntu,43.205.13.243,80,http,['cpe:/a:igor_sysoev:nginx'],1,webserver,ubuntu,cpe:/a:igor_sysoev:nginx
3,Ubuntu,43.205.13.243,443,https,['cpe:/a:igor_sysoev:nginx'],1,webserver,ubuntu,cpe:/a:igor_sysoev:nginx
4,Windows (Build 6.3.9600),206.233.189.205,80,http,['cpe:/a:igor_sysoev:nginx'],1,webserver,windows,cpe:/a:igor_sysoev:nginx


In [6]:
dfs = df[(~df.single_cpe.isna())]
dfs['port_str'] = dfs.apply(lambda x: str(x['port']), axis=1)
dfs = dfs[['os_generic', 'ip_str', 'port_str', 'module']]
dfs.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfs['port_str'] = dfs.apply(lambda x: str(x['port']), axis=1)


Unnamed: 0,os_generic,ip_str,port_str,module
1,ubuntu,43.205.13.243,22,ssh
2,ubuntu,43.205.13.243,80,http
3,ubuntu,43.205.13.243,443,https
4,windows,206.233.189.205,80,http
6,windows,206.233.189.205,8081,https-simple-new


In [7]:
prompt = datasets.Dataset.from_dict({'prompt': ['a ', ' at IP ', ', port ', ', offering service ', '',]})
prompt[:]

{'prompt': ['a ', ' at IP ', ', port ', ', offering service ', '']}

In [8]:
data = datasets.Dataset.from_pandas(dfs)
data = data.remove_columns("__index_level_0__")
data[:5]

{'os_generic': ['ubuntu', 'ubuntu', 'ubuntu', 'windows', 'windows'],
 'ip_str': ['43.205.13.243',
  '43.205.13.243',
  '43.205.13.243',
  '206.233.189.205',
  '206.233.189.205'],
 'port_str': ['22', '80', '443', '80', '8081'],
 'module': ['ssh', 'http', 'https', 'http', 'https-simple-new']}

In [9]:
prompt_len = sum([len(chunk) for chunk in prompt['prompt']])
prompt_len

35

In [10]:
data = data.map(lambda x: {'length': sum([len(col) for col in x.values()]) + prompt_len})

Map:   0%|          | 0/219434 [00:00<?, ? examples/s]

In [11]:
split = data.train_test_split(test_size=0.1)
trainval, test = split['train'], split['test']
split = trainval.train_test_split(test_size=0.1)
train, val = split['train'], split['test']
print(len(train), len(val), len(test), len(train)/len(data), len(val)/len(data), len(test)/len(data))
ds = datasets.DatasetDict({
    'train': train,
    'eval': val,
    'test': test,
    'prompt': prompt})
ds

177741 19749 21944 0.809997539123381 0.08999972656926457 0.10000273430735437


DatasetDict({
    train: Dataset({
        features: ['os_generic', 'ip_str', 'port_str', 'module', 'length'],
        num_rows: 177741
    })
    eval: Dataset({
        features: ['os_generic', 'ip_str', 'port_str', 'module', 'length'],
        num_rows: 19749
    })
    test: Dataset({
        features: ['os_generic', 'ip_str', 'port_str', 'module', 'length'],
        num_rows: 21944
    })
    prompt: Dataset({
        features: ['prompt'],
        num_rows: 5
    })
})

## Save

In [20]:
ds.save_to_disk('/mnt/data/sonia/honeygan/apr23.dat')

Saving the dataset (0/1 shards):   0%|          | 0/177741 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19749 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/21944 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5 [00:00<?, ? examples/s]

In [2]:
import os
p='/mnt/data/sonia/honeygan/apr23.dat'

if 'dataset_dict.json' in os.listdir(p):
    dataset = datasets.DatasetDict({})
    for f in os.listdir(p):
        if f.endswith('.json'): continue
        dataset[f] = datasets.load_from_disk(os.path.join('/mnt/data/sonia/honeygan/apr23.dat', f))
        
dataset

DatasetDict({
    eval: Dataset({
        features: ['os_generic', 'ip_str', 'port_str', 'module', 'length'],
        num_rows: 19749
    })
    prompt: Dataset({
        features: ['prompt'],
        num_rows: 5
    })
    train: Dataset({
        features: ['os_generic', 'ip_str', 'port_str', 'module', 'length'],
        num_rows: 177741
    })
    test: Dataset({
        features: ['os_generic', 'ip_str', 'port_str', 'module', 'length'],
        num_rows: 21944
    })
})

In [3]:
dataset['prompt']['prompt']

['a ', ' at IP ', ', port ', ', offering service ', '']

In [24]:
feats = list(dataset['train'].features)
{k:v for k,v in zip(list(range(len(feats))), feats)}

{0: 'os_generic', 1: 'ip_str', 2: 'port_str', 3: 'module', 4: 'length'}

In [11]:
from transformers import AutoTokenizer
tok_path = '/mnt/data/zoo/llama2/llama2-7b-hf/'
tokenizer = AutoTokenizer.from_pretrained(
    tok_path,
    cache_dir=None,
    padding_side="right",
    use_fast=False, # Fast tokenizer giving issues.
    tokenizer_type='llama',
    trust_remote_code=True,
)

In [23]:
tokenizer.encode('<unk>')

[1, 0]

In [24]:
tokenizer.encode('Ubuntu<unk>43.205.13.243<unk>22<unk>cpe:/a:openbsd:openssh:8.2p1 Ubuntu-4ubuntu0.5')

[1,
 8294,
 0,
 29946,
 29941,
 29889,
 29906,
 29900,
 29945,
 29889,
 29896,
 29941,
 29889,
 29906,
 29946,
 29941,
 0,
 29906,
 29906,
 0,
 29883,
 412,
 8419,
 29874,
 29901,
 3150,
 29890,
 4928,
 29901,
 22156,
 845,
 29901,
 29947,
 29889,
 29906,
 29886,
 29896,
 8294,
 29899,
 29946,
 8767,
 29900,
 29889,
 29945]