# Integration Test Data

Let's create a subset of our data for integration testing.

In [1]:
import os

In [3]:
# Move up to the parent directory (presumably the project root) so that the
# relative paths used in this notebook ("data/raw/...", "integrationtests/...")
# resolve. The guard makes the cell idempotent: once the input file is
# reachable, re-running the cell no longer climbs another directory level.
if not os.path.exists("data/raw/dga_domains.csv"):
    os.chdir("../")

In [14]:
import pandas as pd
from random import choice

In [5]:
# Load the full labelled domain dataset. The path is relative to the working
# directory set by the os.chdir cell above.
data_df = pd.read_csv("data/raw/dga_domains.csv")

In [8]:
# Keep the original column labels so the sampled frame can be restricted to
# exactly this set and order later on.
columns = data_df.columns
columns

Index(['host', 'domain', 'class', 'subclass'], dtype='object')

In [13]:
# Draw 5 rows from every subclass. random_state=42 is passed to each group's
# sample() call, so the same rows are selected on every run. The grouped
# result is then flattened to a plain 0..n-1 index and limited to the
# original columns.
sample_df = (
    data_df
    .groupby("subclass")
    .apply(lambda group: group.sample(n=5, random_state=42))
    .reset_index(drop=True)
    .loc[:, columns]
)
sample_df

Unnamed: 0,host,domain,class,subclass
0,cfiaroyruugfw.ru,cfiaroyruugfw,dga,cryptolocker
1,myhsrdqmogxxvvh.ru,myhsrdqmogxxvvh,dga,cryptolocker
2,dgrnntdplbrtg.ru,dgrnntdplbrtg,dga,cryptolocker
3,cgeoiyxoradbymu.ru,cgeoiyxoradbymu,dga,cryptolocker
4,qtnoktxfnnkkyv.ru,qtnoktxfnnkkyv,dga,cryptolocker
5,wspkvovhrsayqlrsaupbmrmjsg.ru,wspkvovhrsayqlrsaupbmrmjsg,dga,goz
6,tomrdijpzkhijrfauwzlljhyttsx.ru,tomrdijpzkhijrfauwzlljhyttsx,dga,goz
7,camjtovhzxfutokpfwslneqto.ru,camjtovhzxfutokpfwslneqto,dga,goz
8,rweulvobduttpzkbxsenfj.ru,rweulvobduttpzkbxsenfj,dga,goz
9,kbcejbpbduxyxrcqzxlxwdwclrqk.ru,kbcejbpbduxyxrcqzxlxwdwclrqk,dga,goz


In [16]:
def create_record(host, is_legit, rng=None):
    """Build one row with the dataset's schema: host, domain, class, subclass.

    Parameters
    ----------
    host : str
        Host name. Stored verbatim; no stripping or validation is applied.
    is_legit : bool
        Truthy -> class and subclass are both "legit".
        Falsy  -> class is "dga" and the subclass is picked pseudo-randomly
        from "newgoz", "goz" and "cryptolocker".
    rng : random.Random, optional
        Generator used to pick the DGA subclass. When omitted, a generator
        seeded with ``host`` is used, so a given host always receives the same
        subclass and the generated test data is stable across notebook runs.

    Returns
    -------
    dict
        Keys "host", "domain", "class", "subclass" (in that order).
    """
    # Local import: the notebook's import cell only brings in `choice`.
    from random import Random

    if is_legit:
        label, family = "legit", "legit"
    else:
        if rng is None:
            # A str seed is hashed deterministically, so the pick is
            # reproducible between kernels and machines.
            rng = Random(host)
        label = "dga"
        family = rng.choice(["newgoz", "goz", "cryptolocker"])

    return {
        "host": host,
        # Everything before the first dot, including any leading whitespace.
        "domain": host.split(".")[0],
        "class": label,
        "subclass": family,
    }

In [17]:
# Quick sanity check: a legitimate host should come back with class and
# subclass both set to "legit".
create_record("reddit.com", True)

{'host': 'reddit.com',
 'domain': 'reddit',
 'class': 'legit',
 'subclass': 'legit'}

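Let's add some intentionally tricky records (an embedded control character, surrounding whitespace, and a punctuation-only domain) alongside one ordinary legitimate host.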

In [22]:
# Hand-written edge cases, given as (host, is_legit) pairs and turned into
# records by create_record.
# NOTE(review): "\b" is Python's backspace escape (0x08), not a newline.
# Confirm this is the control character the "line break" case is meant to
# exercise.
additional_records = [
    create_record(host, is_legit)
    for host, is_legit in (
        ("reddit.com", True),
        ("line\bbreak.co.uk", True),
        ("   whitespace.com   ", True),
        ("?????.ru", False),
    )
]

In [24]:
# Preview the hand-written records as a table before they are combined with
# the sampled rows.
pd.DataFrame(additional_records)

Unnamed: 0,host,domain,class,subclass
0,reddit.com,reddit,legit,legit
1,linebreak.co.uk,linebreak,legit,legit
2,whitespace.com,whitespace,legit,legit
3,?????.ru,?????,dga,newgoz


In [26]:
# Stack the per-subclass sample and the hand-written edge cases into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# appended rows would reuse index labels that the sample already has.
test_data_df = pd.concat(
    [sample_df, pd.DataFrame(additional_records)],
    ignore_index=True,
)

In [27]:
# Show the combined frame for a final visual check before it is written out.
test_data_df

Unnamed: 0,host,domain,class,subclass
0,cfiaroyruugfw.ru,cfiaroyruugfw,dga,cryptolocker
1,myhsrdqmogxxvvh.ru,myhsrdqmogxxvvh,dga,cryptolocker
2,dgrnntdplbrtg.ru,dgrnntdplbrtg,dga,cryptolocker
3,cgeoiyxoradbymu.ru,cgeoiyxoradbymu,dga,cryptolocker
4,qtnoktxfnnkkyv.ru,qtnoktxfnnkkyv,dga,cryptolocker
5,wspkvovhrsayqlrsaupbmrmjsg.ru,wspkvovhrsayqlrsaupbmrmjsg,dga,goz
6,tomrdijpzkhijrfauwzlljhyttsx.ru,tomrdijpzkhijrfauwzlljhyttsx,dga,goz
7,camjtovhzxfutokpfwslneqto.ru,camjtovhzxfutokpfwslneqto,dga,goz
8,rweulvobduttpzkbxsenfj.ru,rweulvobduttpzkbxsenfj,dga,goz
9,kbcejbpbduxyxrcqzxlxwdwclrqk.ru,kbcejbpbduxyxrcqzxlxwdwclrqk,dga,goz


In [28]:
# Write the test set to CSV; index=False keeps the DataFrame index out of the
# file so it contains only the data columns.
test_data_df.to_csv("integrationtests/test_data.csv", index = False)