# Tokenizer Training - MalReformer

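The new tokenizer is trained by loading the pretrained `google/reformer-crime-and-punishment` tokenizer with the `transformers` library and retraining it on our own corpus with `train_new_from_iterator`, so it keeps the same tokenization pipeline but learns a new vocabulary. The training data comes from the GIMC dataset.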

### Imports

In [1]:
# imports
%load_ext autoreload
%autoreload 2

import os
import random
import json

import torch

### Settings

In [2]:

# Read the project-wide settings (paths, database URI) from the repository root.
with open('../settings.json') as settings_file:
    settings = json.load(settings_file)

# Directory layout: <data_path>/classifier/model_data/tokenizer
DATA_DIR = os.path.join(settings['data_path'], 'classifier')
CHECKPOINT_DIR = os.path.join(DATA_DIR, "model_data")
CHECKPOINT_PREFIX = os.path.join(CHECKPOINT_DIR, "tokenizer")

# CHECKPOINT_DIR lives inside DATA_DIR, so creating it also creates DATA_DIR.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

db_uri = settings['sqlalchemy_database_uri']

# Target vocabulary size for the tokenizer trained in this notebook.
VOCAB_SIZE = 20_000

### Get Data from the Database

In [3]:
# Fetch the reports labelled 'benign' through the project helper.
# NOTE(review): the cell output says the reports are loaded "from file" even though
# the section title mentions the database — presumably a cached copy; confirm in utils.mal_data.
from utils.mal_data import get_mal_data
benign_reports = get_mal_data(['benign'])

Loading benign reports from file


In [4]:
from tqdm import tqdm

# Number of reports (JSON lines) stored in each shard file.
REPORTS_PER_FILE = 1000


def write_json_lines(records, path):
    """
    Write ``records`` to ``path`` in JSON Lines format (one JSON object per line).

    :param records: Iterable of JSON-serialisable objects.
    :param path: Destination file; an existing file is overwritten.
    """
    # Explicit UTF-8 so the shards use the same encoding they are read back with.
    with open(path, 'w', encoding="utf-8") as f:
        for record in records:
            json.dump(record, f)
            f.write("\n")


# make sure that CHECKPOINT_PREFIX exists
os.makedirs(CHECKPOINT_PREFIX, exist_ok=True)

# NOTE(review): shards left over from an earlier run with more reports are not
# removed here, and the following cell globs every *.json file in this directory.

json_data = []
file_count = 0
for report in tqdm(benign_reports, desc="Processing reports"):
    # report[0] is a sequence of strings that is flattened into one text;
    # every report handled here is benign, hence label 0.
    text = " ".join(report[0])
    label = 0

    # add to json_data
    json_data.append({"text": text, "label": label})

    # flush a full shard to disk and start a new one
    if len(json_data) == REPORTS_PER_FILE:
        write_json_lines(json_data, os.path.join(CHECKPOINT_PREFIX, f"json_{file_count}.json"))
        json_data = []
        file_count += 1

# save the last, partially filled shard
if json_data:
    write_json_lines(json_data, os.path.join(CHECKPOINT_PREFIX, f"json_{file_count}.json"))
    json_data = []
    file_count += 1

Processing reports: 100%|██████████| 9714/9714 [00:10<00:00, 926.64it/s] 


In [5]:
# get file names in CHECKPOINT_PREFIX
import glob

# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the order in which the training corpus is read reproducible across runs.
json_files = sorted(glob.glob(os.path.join(CHECKPOINT_PREFIX, "*.json")))
json_files

['/media/mike/data/gimc/classifier/model_data/tokenizer/json_0.json',
 '/media/mike/data/gimc/classifier/model_data/tokenizer/json_1.json',
 '/media/mike/data/gimc/classifier/model_data/tokenizer/json_2.json',
 '/media/mike/data/gimc/classifier/model_data/tokenizer/json_3.json',
 '/media/mike/data/gimc/classifier/model_data/tokenizer/json_4.json',
 '/media/mike/data/gimc/classifier/model_data/tokenizer/json_5.json',
 '/media/mike/data/gimc/classifier/model_data/tokenizer/json_6.json',
 '/media/mike/data/gimc/classifier/model_data/tokenizer/json_7.json',
 '/media/mike/data/gimc/classifier/model_data/tokenizer/json_8.json',
 '/media/mike/data/gimc/classifier/model_data/tokenizer/json_9.json']

### Create Iterator

In [6]:
import json
class LineIterator:
    """Re-iterable source of report texts stored in JSON Lines files."""

    def __init__(self, file_paths):
        """
        Remember which JSON Lines files to read; nothing is opened here.

        :param file_paths: Paths of the JSON Lines files. Each non-blank line
            must be a JSON object with a 'text' key.
        """
        self.file_paths = file_paths

    def __iter__(self):
        """
        Yield the 'text' value of every JSON line, file by file, in the order
        given by ``self.file_paths``.

        A fresh generator is created on every call, so the same instance can be
        iterated several times.

        :raises json.JSONDecodeError: if a non-blank line is not valid JSON.
        :raises KeyError: if a JSON object has no 'text' key.
        """
        for file_path in self.file_paths:
            with open(file_path, "r", encoding="utf-8") as file:
                for line in file:
                    # Skip blank lines (e.g. a stray trailing newline) instead
                    # of letting json.loads fail on them.
                    if not line.strip():
                        continue
                    yield json.loads(line)['text']

# Example usage: build the iterator over all shard files and peek at a
# 50-character window of the first report text.
file_paths = json_files
line_iterator = LineIterator(file_paths)
preview = next(iter(line_iterator), None)
if preview is not None:
    print(preview[1000:1050])

 heap name not found regopenkey hklm system curren


In [7]:

# Load the pretrained Reformer tokenizer; it is used below both as a baseline
# and as the object on which train_new_from_iterator is called.
from transformers import AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained("google/reformer-crime-and-punishment")

### Example tokenization of old tokenizer

In [8]:
# Tokenize the first report text with the pretrained tokenizer and display
# the first 200 tokens.
first_report_text = next(iter(line_iterator))
tokens = old_tokenizer.tokenize(first_report_text)
tokens[:200]

['▁p',
 'ro',
 'c',
 'ess',
 '▁st',
 'art',
 '▁su',
 'c',
 'c',
 'ess',
 '▁th',
 're',
 'ad',
 '▁c',
 're',
 'at',
 'e',
 '▁su',
 'c',
 'c',
 'ess',
 '▁l',
 'o',
 'ad',
 '▁',
 'im',
 'a',
 'ge',
 '▁c',
 ':',
 '▁u',
 's',
 'er',
 's',
 '▁u',
 's',
 'er',
 '▁p',
 'ro',
 'j',
 'e',
 'ct',
 's',
 '▁g',
 'im',
 'c',
 '▁s',
 'and',
 'b',
 'o',
 'x',
 '▁t',
 'est',
 's',
 'am',
 'p',
 'le',
 '.',
 'e',
 'x',
 'e',
 '▁su',
 'c',
 'c',
 'ess',
 '▁l',
 'o',
 'ad',
 '▁',
 'im',
 'a',
 'ge',
 '▁c',
 ':',
 '▁w',
 'in',
 'd',
 'ow',
 's',
 '▁s',
 'y',
 'st',
 'e',
 'm',
 '<unk>',
 '▁n',
 't',
 'd',
 'll',
 '.',
 'd',
 'll',
 '▁su',
 'c',
 'c',
 'ess',
 '▁c',
 're',
 'at',
 'e',
 'f',
 'i',
 'le',
 '▁c',
 ':',
 '▁w',
 'in',
 'd',
 'ow',
 's',
 '▁p',
 're',
 'f',
 'et',
 'ch',
 '▁t',
 'est',
 's',
 'am',
 'p',
 'le',
 '.',
 'e',
 'x',
 'e',
 '-',
 '<unk>',
 'a',
 '<unk>',
 'f',
 '.',
 'p',
 'f',
 '▁su',
 'c',
 'c',
 'ess',
 '▁qu',
 'er',
 'y',
 'st',
 'and',
 'ard',
 'in',
 'f',
 'or',
 'm',
 'ation',

### Train new tokenizer

In [9]:
# Train a new tokenizer on the report texts, starting from old_tokenizer and
# targeting a vocabulary of VOCAB_SIZE tokens.
# NOTE(review): min_frequency is an extra keyword argument beyond the corpus and
# vocabulary size; confirm that the trainer behind this tokenizer honours it.
tokenizer = old_tokenizer.train_new_from_iterator(line_iterator, VOCAB_SIZE, min_frequency=2)






### Example tokenization of new tokenizer

In [10]:
# Tokenize the first report text with the newly trained tokenizer and display
# the first 200 tokens.
first_report_text = next(iter(line_iterator))
tokens = tokenizer.tokenize(first_report_text)
tokens[:200]

['▁process',
 '▁start',
 '▁success',
 '▁thread',
 '▁create',
 '▁success',
 '▁load',
 '▁image',
 '▁c:',
 '▁users',
 '▁user',
 '▁projects',
 '▁gimc',
 '▁sandbox',
 '▁testsample.exe',
 '▁success',
 '▁load',
 '▁image',
 '▁c:',
 '▁windows',
 '▁system32',
 '▁ntdll.dll',
 '▁success',
 '▁createfile',
 '▁c:',
 '▁windows',
 '▁prefetch',
 '▁testsample.exe-37254a4f.pf',
 '▁success',
 '▁querystandardinformationfile',
 '▁c:',
 '▁windows',
 '▁prefetch',
 '▁testsample.exe-37254a4f.pf',
 '▁success',
 '▁readfile',
 '▁c:',
 '▁windows',
 '▁prefetch',
 '▁testsample.exe-37254a4f.pf',
 '▁success',
 '▁readfile',
 '▁c:',
 '▁windows',
 '▁prefetch',
 '▁testsample.exe-37254a4f.pf',
 '▁success',
 '▁closefile',
 '▁c:',
 '▁windows',
 '▁prefetch',
 '▁testsample.exe-37254a4f.pf',
 '▁success',
 '▁regopenkey',
 '▁hklm',
 '▁system',
 '▁currentcontrolset',
 '▁control',
 '▁session',
 '▁manager',
 '▁reparse',
 '▁regopenkey',
 '▁hklm',
 '▁system',
 '▁currentcontrolset',
 '▁control',
 '▁session',
 '▁manager',
 '▁success',
 '▁

### Save the tokenizer

In [11]:
# Persist the trained tokenizer under CHECKPOINT_DIR/mal-reformer.
# save_pretrained is the last expression so the cell displays its return value
# (the paths of the files it wrote).
tokenizer_path = os.path.join(CHECKPOINT_DIR, "mal-reformer")
tokenizer.save_pretrained(tokenizer_path)

('/media/mike/data/gimc/classifier/model_data/mal-reformer/tokenizer_config.json',
 '/media/mike/data/gimc/classifier/model_data/mal-reformer/special_tokens_map.json',
 '/media/mike/data/gimc/classifier/model_data/mal-reformer/tokenizer.json')