In [19]:
import cudf

In [20]:
# Path of the serialized model that the inference section passes to torch.load.
MODEL_FILE = "placeholder_pii2"

# Hash file handed to the cuDF subword tokenizer in bert_cased_tokenize.
TOKENIZER_HASH_FILE = "bert-base-cased-hash.txt"

### EXAMPLE DATA

In [21]:
# Two example payload strings used as input for the rest of the notebook.
#   [0] text of an HTTP POST request (headers plus a JSON body that includes
#       password, e-mail and name fields).
#   [1] a short string made mostly of NUL/control characters surrounding the
#       text "snapshotStatusProducer".
# NOTE: the backslash-newline continuations below sit *inside* the string
# literal, so the leading spaces of every continuation line are part of the
# first string.
# NOTE(review): `\P` and `\}` near the end of the first string are not
# recognized escape sequences; Python keeps the backslash as-is (and newer
# versions warn about it) -- confirm this is intended.
pcap_data = ["POST /netq/user/v1/ HTTP/1.0\r\nHost: bing_bong-customermgmt:8080\r\nclaimed-opid: 0\r\nclaimed-user: \
            admin\r\nclaimed-role: admin\r\nclaimed-custid:0\r\nclaimed-namespace: NAN\r\nAuthorization: Bearer \
            eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJyb2xlIjoiYWRtaW4iLCJleHBpcmVzQXQiOjE2MTQyNjAwMDM2NTQsIm9waWQiOj\
            AsImN1c3RJZCI6MCwidXNlciI6ImFkbOIn0.bja4ZepgxxOurDiDYgwH5qRS9tO4bfF6nCjo7Ip1qE4\r\nUser-Agent: \
            ddde-http/10.1.7\r\nContent-Type: application/json\r\nContent-Length: 341\r\n\r\n{\"password\":\"Netq@123\",\
            \"id\":\"rachel@email.com\",\"email\":\"rachel@email.com\",\"first_name\":\"Rachel\",\"last_name\":\"Allen\"\
            ,\"role\":\"admin\",\"admin_password\":\"Pass@123\",\"default_workbench\":{\"workspace_id\":\"DEFAULT\",\"workbench_id\"\
            :\"DEFAULT\"},\"preferences\":{\"theme\":\"jade\",\"language\":\"en-initial\",\"date_format\":\"M/d/yy h:mm a\",\"timezone\":\"+0530\
            \POST /netq/user/v1/ HTTP/1.0\r\nHost: netq-app-customermgmt:8080\r\nclaimed-opid: 0\r\nclaimed-user: admin\r\nclaimed-role:\}",
            "\u0000\u0000\u0000&\u0000\u0003\u0000\t\u0000\u0000\u0000G\u0000\u0016snapshotStatusProducer\u0000\u0001\u0001\u0000\u0000\u0000"]


In [22]:
# Wrap the example strings in a cuDF Series so the cuDF string tokenizer can be used.
cudf_input = cudf.Series(data=pcap_data)

### PREPROCESSING

In [23]:
import cudf
import cupy
import torch
from torch.utils.dlpack import from_dlpack, to_dlpack

In [24]:
def bert_cased_tokenize(strings, max_seq_len, stride=None):
    """
    Tokenize a cudf.Series of strings with the cuDF subword tokenizer and
    return the padded result as torch tensors.

    Parameters
    ----------
    strings : cudf.Series
        Strings to tokenize.
    max_seq_len : int
        Number of columns of every output row; forwarded to the tokenizer as
        ``max_length``. (The previous version ignored this argument and
        always used 512.)
    stride : int, optional
        Forwarded to the tokenizer as ``stride``. When omitted it defaults to
        ``max_seq_len - 12`` (at least 1), which equals the formerly
        hard-coded 500 for ``max_seq_len=512``.

    Returns
    -------
    input_ids : torch.Tensor
        ``torch.long`` tensor of token ids, shape ``(-1, max_seq_len)``.
    attention_mask : torch.Tensor
        ``torch.long`` tensor of the padding mask, shape ``(-1, max_seq_len)``.
    meta_data
        Tokenizer metadata reshaped to ``(-1, 3)``; returned as produced by
        the tokenizer, without conversion to torch.
    """
    if stride is None:
        # Keep the 12-token gap between max length and stride that the
        # original hard-coded values (512 / 500) had.
        stride = max(1, max_seq_len - 12)

    token_ids, mask, meta_data = strings.str.subword_tokenize(
        TOKENIZER_HASH_FILE,
        max_length=max_seq_len,
        stride=stride,
        do_lower=False,
        do_truncate=True,
    )

    # Hand the cupy arrays to torch through DLPack. Casting to int64 first
    # makes torch receive torch.long directly; the original cast through a
    # float dtype and back, which yields the same values with extra copies.
    input_ids = from_dlpack(
        token_ids.reshape(-1, max_seq_len).astype(cupy.int64).toDlpack()
    )
    attention_mask = from_dlpack(
        mask.reshape(-1, max_seq_len).astype(cupy.int64).toDlpack()
    )

    return input_ids, attention_mask, meta_data.reshape(-1, 3)

In [25]:
# Run the cuDF subword tokenizer over the example strings (sequence length 512).
input_ids, masks, meta_data = bert_cased_tokenize(cudf_input, max_seq_len=512)

Two input strings; each one occupies a single row of the tensors (see the shapes and `meta_data` below).

In [26]:
input_ids.shape  # token-id tensor (on device)

torch.Size([2, 512])

In [27]:
masks.shape  # attention-mask tensor (on device)

torch.Size([2, 512])

In [28]:
meta_data  # tokenizer metadata, 3 columns per row. NOTE(review): returned by bert_cased_tokenize with no explicit host copy -- confirm whether this is a host or device array

array([[  0,   0, 501],
       [  1,   0,  11]], dtype=uint32)

### INFERENCE

In [29]:
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code -- only load MODEL_FILE from a trusted source.
# .eval() puts the module in evaluation mode before inference (it affects
# layers such as dropout / batch norm when the model has them) and returns
# the module itself, so the chained assignment still binds the model.
model = torch.load(MODEL_FILE).to('cuda').eval()

In [33]:
# Forward pass only: gradient tracking is switched off for inference.
with torch.no_grad():
    model_outputs = model(input_ids, token_type_ids=None, attention_mask=masks)
    # The first element of the model output holds the logits.
    logits = model_outputs[0]
    # Score every row with the sigmoid of logit column 1, then threshold at 0.5.
    column1_logits = logits[:, 1]
    probs = torch.sigmoid(column1_logits)
    preds = probs >= 0.5

In [34]:
probs  # sigmoid of logit column 1, one value per tokenized row

tensor([0.8865, 0.1480], device='cuda:0')

In [35]:
preds  # True where the corresponding probability is >= 0.5

tensor([ True, False], device='cuda:0')