# Preprocess TRAM training data

In [1]:
import os
import sys

sys.path.append(os.path.abspath('../..'))

import pandas as pd
import json

from libs import mitre

## Load MITRE Attack

In [2]:
techniques = mitre.list_techniques()

tech_map = {}
for tech_file in techniques:
    tech = mitre.load_technique_file(tech_file)
    tech_map[tech['ID']] = tech['Name']
    
tech_map['T1521'] = 'Encrypted Channel'
tech_map['T1533'] = 'Data from Local System'
tech_map['T1053.001'] = 'At'

## Load Data

In [3]:
filepath = "../../data/mitre/tram_training_data.json"

with open(filepath, 'r') as f:
    content = json.loads(f.read())

## Convert to a DataFrame

In [4]:
content.keys()

dict_keys(['name', 'text', 'ml_model', 'created_on', 'updated_on', 'sentences'])

In [5]:
data = []

for sent in content['sentences']:
    if len(sent['mappings']) > 0:
        for mapping in sent['mappings']:
            row = {
                'text': sent['text'],
                'tech_id': mapping['attack_id'],
                'tech_name': tech_map[mapping['attack_id']],
                'label': 1
            }
        data.append(row)
    else:
        row = {
            'text': sent['text'],
            'tech_id': 'TNONE',
            'tech_name': 'N\A',
            'label': 0
        }
        data.append(row)

data = pd.DataFrame(data)
data.head()

Unnamed: 0,text,tech_id,tech_name,label
0,"From these reports, we know that the group use...",T1189,Drive-by Compromise,1
1,"We believe this access was abused, for example...",T1189,Drive-by Compromise,1
2,What does the resulting watering hole look lik...,T1189,Drive-by Compromise,1
3,This targeting of third party organizations to...,T1189,Drive-by Compromise,1
4,Online news outlets and general websites were ...,T1189,Drive-by Compromise,1


## Summary

In [6]:
print(f'No. of sentences: {len(data)}')
print(f'No. of techniques: {len(data["tech_id"].drop_duplicates())}')
print(f'\nDistribution of labels: \n{data["label"].value_counts()}')

No. of sentences: 12588
No. of techniques: 142

Distribution of labels: 
0    11063
1     1525
Name: label, dtype: int64


## Export

In [7]:
data.to_csv('../../data/tram_training_dataset.csv', index=False)