In [171]:
##### create datastore from blob storage
from azureml.core import Workspace, Datastore, Dataset
import json
import os
import pandas as pd
import time

In [172]:
# load secrets, needs to contain 'DOWNLOAD_STORAGE_CON_STR' and 'DOWNLOAD_STORAGE_CONTAINER'
with open("secrets.json") as f:
    secrets = json.load(f)

In [173]:
# find file to download
from azure.storage.blob import BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(secrets['DOWNLOAD_STORAGE_CON_STR'])
src_container = blob_service_client.get_container_client(secrets['DOWNLOAD_STORAGE_CONTAINER'])
[b.name for b in src_container.list_blobs('UNSW-NB15')]

['UNSW-NB15 - CSV Files.tar']

In [174]:
# download tar file
os.makedirs('downloaded_data', exist_ok=True)
dest_file = "./downloaded_data/UNSW.tar"
blob = src_container.get_blob_client('UNSW-NB15 - CSV Files.tar')
with open(file=dest_file, mode="wb") as f:
    f.write(blob.download_blob().readall())

In [175]:
# extract
!tar -xvf {dest_file}

UNSW-NB15 - CSV Files/
UNSW-NB15 - CSV Files/NUSW-NB15_GT.csv
UNSW-NB15 - CSV Files/NUSW-NB15_features.csv
UNSW-NB15 - CSV Files/The UNSW-NB15 description.pdf
UNSW-NB15 - CSV Files/UNSW-NB15_1.csv
UNSW-NB15 - CSV Files/UNSW-NB15_2.csv
UNSW-NB15 - CSV Files/UNSW-NB15_3.csv
UNSW-NB15 - CSV Files/UNSW-NB15_4.csv
UNSW-NB15 - CSV Files/UNSW-NB15_LIST_EVENTS.csv
UNSW-NB15 - CSV Files/a part of training and testing set/
UNSW-NB15 - CSV Files/a part of training and testing set/UNSW_NB15_testing-set.csv
UNSW-NB15 - CSV Files/a part of training and testing set/UNSW_NB15_training-set.csv


In [176]:
# list records in each file
!cd "UNSW-NB15 - CSV Files" && wc -l *.csv

   188914 NUSW-NB15_GT.csv
       50 NUSW-NB15_features.csv
   700001 UNSW-NB15_1.csv
   700001 UNSW-NB15_2.csv
   700001 UNSW-NB15_3.csv
   440044 UNSW-NB15_4.csv
      209 UNSW-NB15_LIST_EVENTS.csv
  2729220 total


In [177]:
# save the features file file
raw_data_dir = "UNSW-NB15 - CSV Files"
header = pd.read_csv(f'{raw_data_dir}/NUSW-NB15_features.csv', encoding='iso-8859-1', index_col = 'No.')
header.to_csv('data/features.csv')

In [178]:
# check head of the first csv
pd.read_csv(f'{raw_data_dir}/UNSW-NB15_1.csv', nrows = 4, header = None, names = header.Name.values)

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0


In [179]:
# now we are going to loop over all the csv files, and attach the headers
t0 = time.time()
for file_name in [f"UNSW-NB15_{i}.csv" for i in range(1,5)]:
    # read in data and attach header
    data = pd.read_csv(os.path.join(raw_data_dir, file_name), header = None, names = header.Name.values)
    # write out to data dir
    data.to_csv(os.path.join('data/files', file_name))
    print('written', file_name) 
print("Execution time:", time.time()-t0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


written UNSW-NB15_1.csv


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


written UNSW-NB15_2.csv
written UNSW-NB15_3.csv
written UNSW-NB15_4.csv


In [180]:
# clean up
!rm -rf "UNSW-NB15 - CSV Files"
!rm -rf "downloaded_data"

In [181]:
# list default datastore
ws = Workspace.from_config()
ds = ws.get_default_datastore()
ds

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-2c043982-3b30-4ed7-8a62-c79d650f0365",
  "account_name": "mlwnb15sstorage41f580322",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [182]:
# upload to datastore
ds.upload('data/files', target_path ='./data')

Uploading an estimated of 4 files
Target already exists. Skipping upload for data/UNSW-NB15_1.csv
Target already exists. Skipping upload for data/UNSW-NB15_2.csv
Target already exists. Skipping upload for data/UNSW-NB15_3.csv
Target already exists. Skipping upload for data/UNSW-NB15_4.csv
Uploaded 0 files


$AZUREML_DATAREFERENCE_7ea6f079b9e8406b8c7ef05b2683b0b1

In [183]:
# create tabulated dataset
tab_dataset = Dataset.Tabular.from_delimited_files([(ds, 'data/*.csv')])
tab_dataset = tab_dataset.register(workspace=ws, name='nb15_table') 

In [203]:
# dataset can eb read to pandas using the following
tab_dataset.to_pandas_dataframe()

Unnamed: 0,Column1,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,0,59.166.0.0,1390.0,149.171.126.6,53.0,udp,CON,0.001055,132,164,...,0.0,3,7,1,3,1,1,1,,0
1,1,59.166.0.0,33661.0,149.171.126.9,1024.0,udp,CON,0.036133,528,304,...,0.0,2,4,2,3,1,1,2,,0
2,2,59.166.0.6,1464.0,149.171.126.7,53.0,udp,CON,0.001119,146,178,...,0.0,12,8,1,2,2,1,1,,0
3,3,59.166.0.5,3593.0,149.171.126.5,53.0,udp,CON,0.001209,132,164,...,0.0,6,9,1,1,1,1,1,,0
4,4,59.166.0.3,49664.0,149.171.126.0,53.0,udp,CON,0.001169,146,178,...,0.0,7,9,1,1,1,1,1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2540042,440039,59.166.0.5,33094.0,149.171.126.7,43433.0,tcp,FIN,0.087306,320,1828,...,,1,2,3,3,1,1,3,,0
2540043,440040,59.166.0.7,20848.0,149.171.126.4,21.0,tcp,CON,0.365058,456,346,...,2.0,2,2,2,2,2,2,2,,0
2540044,440041,59.166.0.3,21511.0,149.171.126.9,21.0,tcp,CON,6.335154,1802,2088,...,2.0,2,2,4,2,2,2,2,,0
2540045,440042,59.166.0.9,35433.0,149.171.126.0,80.0,tcp,CON,2.200934,3498,166054,...,,1,1,2,4,2,2,2,,0
