## Dataset

What is cpe? https://www.acunetix.com/blog/articles/common-platform-enumeration-cpe-explained/

In [1]:
import pandas as pd
import numpy as np
import json
import datasets
np.random.seed(42)

datapath = '/mnt/data/sonia/honeygan/data/processed/data.csv'
df = pd.read_csv(datapath)
print(df.isna().sum())
df.head()

os                 0
ip_str             0
port               0
module             0
cpe           734580
cpe_count          0
category           0
os_generic         0
single_cpe    824511
dtype: int64


Unnamed: 0,os,ip_str,port,module,cpe,cpe_count,category,os_generic,single_cpe
0,Ubuntu,43.205.13.243,21,ftp,,0,file_sharing,ubuntu,
1,Ubuntu,43.205.13.243,22,ssh,['cpe:/a:openbsd:openssh:8.2p1 Ubuntu-4ubuntu0...,1,remote_access,ubuntu,cpe:/a:openbsd:openssh:8.2p1 Ubuntu-4ubuntu0.5
2,Ubuntu,43.205.13.243,80,http,['cpe:/a:igor_sysoev:nginx'],1,webserver,ubuntu,cpe:/a:igor_sysoev:nginx
3,Ubuntu,43.205.13.243,443,https,['cpe:/a:igor_sysoev:nginx'],1,webserver,ubuntu,cpe:/a:igor_sysoev:nginx
4,Windows (Build 6.3.9600),206.233.189.205,80,http,['cpe:/a:igor_sysoev:nginx'],1,webserver,windows,cpe:/a:igor_sysoev:nginx


In [2]:
dfs = df[(~df.single_cpe.isna())]
dfs.drop(['cpe', 'cpe_count', 'category'], axis=1, inplace=True)
print(dfs.isna().sum(), df.shape)
dfs.head()

os            0
ip_str        0
port          0
module        0
os_generic    0
single_cpe    0
dtype: int64 (1043945, 9)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfs.drop(['cpe', 'cpe_count', 'category'], axis=1, inplace=True)


Unnamed: 0,os,ip_str,port,module,os_generic,single_cpe
1,Ubuntu,43.205.13.243,22,ssh,ubuntu,cpe:/a:openbsd:openssh:8.2p1 Ubuntu-4ubuntu0.5
2,Ubuntu,43.205.13.243,80,http,ubuntu,cpe:/a:igor_sysoev:nginx
3,Ubuntu,43.205.13.243,443,https,ubuntu,cpe:/a:igor_sysoev:nginx
4,Windows (Build 6.3.9600),206.233.189.205,80,http,windows,cpe:/a:igor_sysoev:nginx
6,Windows (Build 6.3.9600),206.233.189.205,8081,https-simple-new,windows,cpe:/a:igor_sysoev:nginx


In [3]:
sents = dfs.apply(lambda x: f"a {x['os']} server visible at IP {x['ip_str']}, port {x['port']}, offering the service {x['single_cpe']}.", axis=1)
# sents = dfs.apply(lambda x: '<unk>'.join([x['os'], x['ip_str'], str(x['port']), x['single_cpe']]), axis=1)
sents[1]

'a Ubuntu server visible at IP 43.205.13.243, port 22, offering the service cpe:/a:openbsd:openssh:8.2p1 Ubuntu-4ubuntu0.5.'

In [4]:
prompts = len(sents) * ["a ~ server visible at IP ~, port ~, offering the service ~."]
prompts[1]

'a ~ server visible at IP ~, port ~, offering the service ~.'

In [5]:
data = datasets.Dataset.from_dict({'input': prompts, 'output':sents.to_list()})
data[5001]

{'input': 'a ~ server visible at IP ~, port ~, offering the service ~.',
 'output': 'a Windows (Build 10.0.14393) server visible at IP 155.93.175.136, port 25, offering the service cpe:/a:microsoft:exchange_server.'}

In [6]:
max([len(x) for x in data['output']]), sum([len(x) for x in data['output']])/len(data)

(201, 133.81417647219666)

In [9]:
test

'test'

In [14]:
split = data.train_test_split(test_size=0.1)
trainval, test = split['train'], split['test']
split = trainval.train_test_split(test_size=0.1)
train, val = split['train'], split['test']
print(len(train), len(val), len(test), len(train)/len(data), len(val)/len(data), len(test)/len(data))
ds = datasets.DatasetDict({
    'train': train,
    'val': val,
    'test': test})
ds

177741 19749 21944 0.809997539123381 0.08999972656926457 0.10000273430735437


DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 177741
    })
    val: Dataset({
        features: ['input', 'output'],
        num_rows: 19749
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 21944
    })
})

## Save

In [13]:
data.save_to_disk('/mnt/data/sonia/honeygan/cloze_apr18_pull.dat')

Saving the dataset (0/1 shards):   0%|          | 0/219434 [00:00<?, ? examples/s]

In [14]:
dfsent = pd.DataFrame(sents)
dfsent.to_csv('/mnt/data/sonia/honeygan/cloze_apr18_pull.csv', index=True)

In [9]:
reloaded_dataset = datasets.load_from_disk('/mnt/data/sonia/honeygan/llama_format_feb13.dat')
reloaded_dataset

Dataset({
    features: ['input', 'output'],
    num_rows: 219434
})

In [11]:
from transformers import AutoTokenizer
tok_path = '/mnt/data/zoo/llama2/llama2-7b-hf/'
tokenizer = AutoTokenizer.from_pretrained(
    tok_path,
    cache_dir=None,
    padding_side="right",
    use_fast=False, # Fast tokenizer giving issues.
    tokenizer_type='llama',
    trust_remote_code=True,
)

In [23]:
tokenizer.encode('<unk>')

[1, 0]

In [24]:
tokenizer.encode('Ubuntu<unk>43.205.13.243<unk>22<unk>cpe:/a:openbsd:openssh:8.2p1 Ubuntu-4ubuntu0.5')

[1,
 8294,
 0,
 29946,
 29941,
 29889,
 29906,
 29900,
 29945,
 29889,
 29896,
 29941,
 29889,
 29906,
 29946,
 29941,
 0,
 29906,
 29906,
 0,
 29883,
 412,
 8419,
 29874,
 29901,
 3150,
 29890,
 4928,
 29901,
 22156,
 845,
 29901,
 29947,
 29889,
 29906,
 29886,
 29896,
 8294,
 29899,
 29946,
 8767,
 29900,
 29889,
 29945]