In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing

## Dataset
1. can-train-and-test dataset
    - attack-free
    - DoS
    - accessory
    - force-neutral
    - rpm
    - standstill

In [2]:
## Load the raw training CSVs: two capture files ("-1" and "-2") per scenario.
## Each pair is read in file order and bound to the `<scenario>_1` / `<scenario>_2` names.
dataset_dir = './datasets/cantrainandtest/can-train-and-test/set_01/train_01/'

attack_free_1, attack_free_2 = (
    pd.read_csv(f"{dataset_dir}attack-free-{part}.csv") for part in (1, 2)
)

DoS_1, DoS_2 = (
    pd.read_csv(f"{dataset_dir}DoS-{part}.csv") for part in (1, 2)
)

accessory_1, accessory_2 = (
    pd.read_csv(f"{dataset_dir}accessory-{part}.csv") for part in (1, 2)
)

force_neutral_1, force_neutral_2 = (
    pd.read_csv(f"{dataset_dir}force-neutral-{part}.csv") for part in (1, 2)
)

rpm_1, rpm_2 = (
    pd.read_csv(f"{dataset_dir}rpm-{part}.csv") for part in (1, 2)
)

standstill_1, standstill_2 = (
    pd.read_csv(f"{dataset_dir}standstill-{part}.csv") for part in (1, 2)
)

In [3]:
## Stack the two captures of every scenario into one frame per scenario.
## Rows of file 1 come first, then file 2; the original row labels are kept,
## so index values repeat within each combined frame.
attack_free, DoS, accessory, force_neutral, rpm, standstill = (
    pd.concat(list(capture_pair))
    for capture_pair in (
        (attack_free_1, attack_free_2),
        (DoS_1, DoS_2),
        (accessory_1, accessory_2),
        (force_neutral_1, force_neutral_2),
        (rpm_1, rpm_2),
        (standstill_1, standstill_2),
    )
)

In [4]:
## Stack every per-scenario frame into one combined frame and display it.
## (This runs before the `attack` column is relabelled further down.)
can_train_and_test_ds = pd.concat(
    objs=[attack_free, DoS, accessory, force_neutral, rpm, standstill],
)
can_train_and_test_ds

Unnamed: 0,timestamp,arbitration_id,data_field,attack
0,1.672531e+09,0C1,3000000430000004,0
1,1.672531e+09,0C5,3000000430000004,0
2,1.672531e+09,184,000200000000,0
3,1.672531e+09,1C7,065CB9A200003F,0
4,1.672531e+09,1CD,0000000000,0
...,...,...,...,...
1267139,1.672532e+09,199,CFFF0FFFEFFE00FF,0
1267140,1.672532e+09,1A1,00000000000000,0
1267141,1.672532e+09,1E5,46000E8000FFF201,0
1267142,1.672532e+09,1C3,0696068F00000000,0


#### Change the values of the `attack` column in each dataset

In [5]:
## Give each scenario its own integer in the `attack` column.
## Accessory mode is just another attack-free mode, so its "0" (attack-free)
## rows become "1" to mark accessory mode. In the other four frames the rows
## labelled "1" are renumbered 2 (DoS), 3 (force-neutral), 4 (rpm), 5 (standstill);
## rows labelled "0" in those frames are left untouched.
for scenario_frame, old_label, new_label in (
    (accessory, 0, 1),
    (DoS, 1, 2),
    (force_neutral, 1, 3),
    (rpm, 1, 4),
    (standstill, 1, 5),
):
    scenario_frame['attack'] = scenario_frame['attack'].replace(old_label, new_label)

##### Merge all subsets into a single dataset

In [6]:
## Combine the per-scenario frames (attack-free first) into one frame.
merged_datasets = pd.concat(
    objs=[attack_free, accessory, DoS, force_neutral, rpm, standstill],
)

## Keep a view of the rows whose `attack` label equals 1.
filtered = merged_datasets.loc[merged_datasets['attack'].eq(1)]

merged_datasets

Unnamed: 0,timestamp,arbitration_id,data_field,attack
0,1.672531e+09,0C1,3000000430000004,0
1,1.672531e+09,0C5,3000000430000004,0
2,1.672531e+09,184,000200000000,0
3,1.672531e+09,1C7,065CB9A200003F,0
4,1.672531e+09,1CD,0000000000,0
...,...,...,...,...
1267139,1.672532e+09,199,CFFF0FFFEFFE00FF,0
1267140,1.672532e+09,1A1,00000000000000,0
1267141,1.672532e+09,1E5,46000E8000FFF201,0
1267142,1.672532e+09,1C3,0696068F00000000,0


##### One-hot encode the attack labels

In [7]:
# deprecated
# onv_ds = pd.get_dummies(merged_datasets, columns=['attack'], prefix='attack')

# onv_ds['attack_0'] = onv_ds['attack_0'].astype(int)
# onv_ds['attack_1'] = onv_ds['attack_1'].astype(int)
# onv_ds['attack_2'] = onv_ds['attack_2'].astype(int)
# onv_ds['attack_3'] = onv_ds['attack_3'].astype(int)
# onv_ds['attack_4'] = onv_ds['attack_4'].astype(int)
# onv_ds['attack_5'] = onv_ds['attack_5'].astype(int)

# onv_ds

In [8]:
# onv_ds.isnull().sum()

##### Save new dataset file

In [9]:
# onv_ds.to_csv("updated_dataset.csv", sep=',', index=False, encoding='utf-8')

#### Merge and process test datasets

In [10]:
## Known-vehicle / known-attack test set: load, merge, label-encode and save.
## NOTE: this cell rebinds `DoS`, `force_neutral`, `rpm` and `standstill`,
## which earlier cells used for the training frames.
dataset_dir = './datasets/cantrainandtest/can-train-and-test/set_01/test_01_known_vehicle_known_attack/'
kv_ka_output_dir = "./datasets/clean-data/test-data/kv-ka/"

# Read capture files "-3" and "-4" of every attack type, in file order.
DoS_3, DoS_4 = (
    pd.read_csv(f"{dataset_dir}DoS-{part}.csv") for part in (3, 4)
)
force_neutral_3, force_neutral_4 = (
    pd.read_csv(f"{dataset_dir}force-neutral-{part}.csv") for part in (3, 4)
)
rpm_3, rpm_4 = (
    pd.read_csv(f"{dataset_dir}rpm-{part}.csv") for part in (3, 4)
)
standstill_3, standstill_4 = (
    pd.read_csv(f"{dataset_dir}standstill-{part}.csv") for part in (3, 4)
)

# merge related datasets (file 3 rows first, then file 4)
DoS = pd.concat([DoS_3, DoS_4])
force_neutral = pd.concat([force_neutral_3, force_neutral_4])
rpm = pd.concat([rpm_3, rpm_4])
standstill = pd.concat([standstill_3, standstill_4])

# Label Encode Categorical Data
# NOTE(review): the encoder is re-fit for every frame and every column, so the
# same raw value can map to a different integer in each file (and in the
# training set, which is encoded separately). Confirm downstream code never
# compares these codes across files.
label_encoder = preprocessing.LabelEncoder()

for frame in (DoS, force_neutral, rpm, standstill):
    for column in ("arbitration_id", "data_field"):
        frame[column] = label_encoder.fit_transform(frame[column])

# save to file
# Create the output directory first: to_csv does not create missing folders
# and would otherwise fail on a fresh checkout.
os.makedirs(kv_ka_output_dir, exist_ok=True)

for name, frame in (
    ("DoS", DoS),
    ("force_neutral", force_neutral),
    ("rpm", rpm),
    ("standstill", standstill),
):
    frame.to_csv(f"{kv_ka_output_dir}{name}.csv", sep=',', index=False, encoding='utf-8')


In [41]:
## Unknown-vehicle / known-attack test set: load, merge and label-encode.
## NOTE: this cell rebinds `DoS`, `force_neutral`, `rpm` and `standstill`.
dataset_dir = './datasets/cantrainandtest/can-train-and-test/set_01/test_02_unknown_vehicle_known_attack/'

# Read capture files "-3" and "-4" of every attack type, in file order.
DoS_3, DoS_4 = (
    pd.read_csv(f"{dataset_dir}DoS-{part}.csv") for part in (3, 4)
)
force_neutral_3, force_neutral_4 = (
    pd.read_csv(f"{dataset_dir}force-neutral-{part}.csv") for part in (3, 4)
)
rpm_3, rpm_4 = (
    pd.read_csv(f"{dataset_dir}rpm-{part}.csv") for part in (3, 4)
)
standstill_3, standstill_4 = (
    pd.read_csv(f"{dataset_dir}standstill-{part}.csv") for part in (3, 4)
)

# merge related datasets (file 3 rows first, then file 4)
DoS = pd.concat((DoS_3, DoS_4))
force_neutral = pd.concat((force_neutral_3, force_neutral_4))
rpm = pd.concat((rpm_3, rpm_4))
standstill = pd.concat((standstill_3, standstill_4))

# Label Encode Categorical Data
# NOTE(review): the encoder is re-fit for every frame and every column, so the
# same raw value can map to a different integer in each file. Confirm
# downstream code never compares these codes across files.
label_encoder = preprocessing.LabelEncoder()

for frame in (DoS, force_neutral, rpm, standstill):
    for column in ("arbitration_id", "data_field"):
        frame[column] = label_encoder.fit_transform(frame[column])

# save to file (currently disabled; uncomment to write the encoded frames)
# for name, frame in (("DoS", DoS), ("force_neutral", force_neutral), ("rpm", rpm), ("standstill", standstill)):
#     frame.to_csv(f"./datasets/clean-data/test-data/uv-ka/{name}.csv", sep=',', index=False, encoding='utf-8')

In [15]:
## Known-vehicle / unknown-attack test set: load, merge and label-encode.
dataset_dir = './datasets/cantrainandtest/can-train-and-test/set_01/test_03_known_vehicle_unknown_attack/'

# Read capture files "-3" and "-4" of every attack type, in file order.
double_3, double_4 = (
    pd.read_csv(f"{dataset_dir}double-{part}.csv") for part in (3, 4)
)
fuzzing_3, fuzzing_4 = (
    pd.read_csv(f"{dataset_dir}fuzzing-{part}.csv") for part in (3, 4)
)
interval_3, interval_4 = (
    pd.read_csv(f"{dataset_dir}interval-{part}.csv") for part in (3, 4)
)
speed_3, speed_4 = (
    pd.read_csv(f"{dataset_dir}speed-{part}.csv") for part in (3, 4)
)
systematic_3, systematic_4 = (
    pd.read_csv(f"{dataset_dir}systematic-{part}.csv") for part in (3, 4)
)
triple_3, triple_4 = (
    pd.read_csv(f"{dataset_dir}triple-{part}.csv") for part in (3, 4)
)

# merge related datasets (file 3 rows first, then file 4)
double = pd.concat((double_3, double_4))
fuzzing = pd.concat((fuzzing_3, fuzzing_4))
interval = pd.concat((interval_3, interval_4))
speed = pd.concat((speed_3, speed_4))
systematic = pd.concat((systematic_3, systematic_4))
triple = pd.concat((triple_3, triple_4))

# Label Encode Categorical Data
# NOTE(review): the encoder is re-fit for every frame and every column, so the
# same raw value can map to a different integer in each file. Confirm
# downstream code never compares these codes across files.
label_encoder = preprocessing.LabelEncoder()

for frame in (double, fuzzing, interval, speed, systematic, triple):
    for column in ("arbitration_id", "data_field"):
        frame[column] = label_encoder.fit_transform(frame[column])

# save to file (currently disabled; uncomment to write the encoded frames)
# for name, frame in (("double", double), ("fuzzing", fuzzing), ("interval", interval),
#                     ("speed", speed), ("systematic", systematic), ("triple", triple)):
#     frame.to_csv(f"./datasets/clean-data/test-data/kv-ua/{name}.csv", sep=',', index=False, encoding='utf-8')

In [16]:
## Unknown-vehicle / unknown-attack test set: load, merge and label-encode.
dataset_dir = './datasets/cantrainandtest/can-train-and-test/set_01/test_04_unknown_vehicle_unknown_attack/'

# Read capture files "-3" and "-4" of every attack type, in file order.
double_3, double_4 = (
    pd.read_csv(f"{dataset_dir}double-{part}.csv") for part in (3, 4)
)
fuzzing_3, fuzzing_4 = (
    pd.read_csv(f"{dataset_dir}fuzzing-{part}.csv") for part in (3, 4)
)
interval_3, interval_4 = (
    pd.read_csv(f"{dataset_dir}interval-{part}.csv") for part in (3, 4)
)
speed_3, speed_4 = (
    pd.read_csv(f"{dataset_dir}speed-{part}.csv") for part in (3, 4)
)
systematic_3, systematic_4 = (
    pd.read_csv(f"{dataset_dir}systematic-{part}.csv") for part in (3, 4)
)
triple_3, triple_4 = (
    pd.read_csv(f"{dataset_dir}triple-{part}.csv") for part in (3, 4)
)

# merge related datasets (file 3 rows first, then file 4)
double = pd.concat((double_3, double_4))
fuzzing = pd.concat((fuzzing_3, fuzzing_4))
interval = pd.concat((interval_3, interval_4))
speed = pd.concat((speed_3, speed_4))
systematic = pd.concat((systematic_3, systematic_4))
triple = pd.concat((triple_3, triple_4))

# Label Encode Categorical Data
# NOTE(review): the encoder is re-fit for every frame and every column, so the
# same raw value can map to a different integer in each file. Confirm
# downstream code never compares these codes across files.
label_encoder = preprocessing.LabelEncoder()

for frame in (double, fuzzing, interval, speed, systematic, triple):
    for column in ("arbitration_id", "data_field"):
        frame[column] = label_encoder.fit_transform(frame[column])

# save to file (currently disabled; uncomment to write the encoded frames)
# for name, frame in (("double", double), ("fuzzing", fuzzing), ("interval", interval),
#                     ("speed", speed), ("systematic", systematic), ("triple", triple)):
#     frame.to_csv(f"./datasets/clean-data/test-data/uv-ua/{name}.csv", sep=',', index=False, encoding='utf-8')

#### Label Encode columns with categorical data

###### Label Encode Training Dataset

In [42]:
# Replace the string-valued columns of the merged training frame with integer
# codes (in place), save the result, then preview the first rows.
# NOTE(review): the encoder is fit on this frame only; the test-set cells fit
# their own encoders, so codes are not guaranteed to match between the
# training file and the test files. Confirm this is intended.
label_encoder = preprocessing.LabelEncoder()

# Update categorical columns; the encoder is re-fit for each column.
for column in ("arbitration_id", "data_field"):
    merged_datasets[column] = label_encoder.fit_transform(merged_datasets[column])

# save new training dataset
merged_datasets.to_csv("updated_dataset.csv", sep=',', index=False, encoding='utf-8')

# The preview must be the cell's final expression: Jupyter only renders the
# last expression of a cell, so a bare slice in the middle is never shown.
merged_datasets[:10]


In [9]:
## Write a copy of the saved kv-ka DoS test file without its `attack` column.
DoS_df = pd.read_csv("./datasets/clean-data/test-data/kv-ka/DoS.csv")

# drop() returns a new frame; DoS_df itself keeps the `attack` column.
new_DoS_df = DoS_df.drop("attack", axis=1)

new_DoS_df.to_csv(
    "./datasets/clean-data/test-data/kv-ka/new_DoS.csv",
    sep=',',
    index=False,
    encoding='utf-8',
)
