In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the "normal only" capture: each line is stripped of trailing whitespace
# and split on tabs into a list of string fields.
with open('./data/CAN traffic (normal only).txt', 'r') as f:
  normal_data_list = list(map(lambda row: row.rstrip().split("\t"), f))

In [3]:
# Peek at the first three parsed records.
normal_data_list[:3]

[['1597707827', '260', '8', '06 25 05 30 FF CF 71 55', 'Normal'],
 ['1597707827', '329', '8', '4A C5 7E 8C 31 2D 01 10', 'Normal'],
 ['1597707827', '38D', '8', '00 00 49 00 90 7F FE 01', 'Normal']]

In [4]:
# Load the "attack included" capture the same way: strip trailing whitespace,
# then split each line on tabs into a list of string fields.
with open('./data/CAN traffic (attack included).txt', 'r') as f:
  abnormal_data_list = list(map(lambda row: row.rstrip().split("\t"), f))

In [5]:
# Peek at the first three parsed records of the attack capture.
abnormal_data_list[:3]

[['1597759710', '153', '8', '20 A1 10 FF 00 FF 50 1F', 'Normal', 'Normal'],
 ['1597759710', '220', '8', '13 24 7F 60 05 FF BF 10', 'Normal', 'Normal'],
 ['1597759710', '507', '4', '08 00 00 01', 'Normal', 'Normal']]

In [6]:
# Wrap the parsed records in DataFrames. The attack capture carries one extra
# label column (SubClass) compared with the normal-only capture.
normal_data_df = pd.DataFrame(
  data=normal_data_list,
  columns=['Timestamp', 'Arbitration ID', 'DLC', 'data', 'Class'],
)
abnormal_data_df = pd.DataFrame(
  data=abnormal_data_list,
  columns=['Timestamp', 'Arbitration ID', 'DLC', 'data', 'Class', 'SubClass'],
)

In [7]:
# Show the first three rows of the normal-only frame.
normal_data_df.head(n=3)

Unnamed: 0,Timestamp,Arbitration ID,DLC,data,Class
0,1597707827,260,8,06 25 05 30 FF CF 71 55,Normal
1,1597707827,329,8,4A C5 7E 8C 31 2D 01 10,Normal
2,1597707827,38D,8,00 00 49 00 90 7F FE 01,Normal


In [8]:
# Show the first three rows of the attack-included frame.
abnormal_data_df.head(n=3)

Unnamed: 0,Timestamp,Arbitration ID,DLC,data,Class,SubClass
0,1597759710,153,8,20 A1 10 FF 00 FF 50 1F,Normal,Normal
1,1597759710,220,8,13 24 7F 60 05 FF BF 10,Normal,Normal
2,1597759710,507,4,08 00 00 01,Normal,Normal


**Using Arbitration ID**

In [9]:
# Arbitration IDs are stored as hexadecimal strings (e.g. '38D'); parse each
# one as a base-16 integer. Note: despite the `_df` suffix, the results are Series.
normal_ArbID_df = normal_data_df['Arbitration ID'].map(lambda arb_id: int(arb_id, 16))
abnormal_ArbID_df = abnormal_data_df['Arbitration ID'].map(lambda arb_id: int(arb_id, 16))


In [10]:
# Display the integer arbitration IDs parsed from the normal-only capture.
normal_ArbID_df

0          608
1          809
2          909
3         1056
4         1057
          ... 
179341     913
179342     608
179343    1057
179344     304
179345     320
Name: Arbitration ID, Length: 179346, dtype: int64

In [11]:
# Display the integer arbitration IDs parsed from the attack-included capture.
abnormal_ArbID_df

0          339
1          544
2         1287
3          854
4          832
          ... 
806385     870
806386     871
806387     872
806388    1151
806389    1225
Name: Arbitration ID, Length: 806390, dtype: int64

In [12]:
# NOTE(review): this cell repeats the previous cell's display of
# abnormal_ArbID_df; it looks redundant and could probably be removed.
abnormal_ArbID_df

0          339
1          544
2         1287
3          854
4          832
          ... 
806385     870
806386     871
806387     872
806388    1151
806389    1225
Name: Arbitration ID, Length: 806390, dtype: int64

In [13]:
# Convert both ID Series to NumPy arrays: the normal-only IDs become the
# training sequence, the attack-included IDs become the test sequence.
train_arbID_seq, test_arbID_seq = (
  id_series.to_numpy() for id_series in (normal_ArbID_df, abnormal_ArbID_df)
)

In [14]:
# Report the number of messages in the training and test sequences, one per line.
print(len(train_arbID_seq), len(test_arbID_seq), sep="\n")

179346
806390


In [15]:
def get_split_arbid_seq_by_wnd(arbidseq, window_size=5):
  """Return every sliding window of length `window_size` over a 1-D sequence.

  Row ``i`` of the result equals ``arbidseq[i:i + window_size]``, so
  consecutive rows overlap by ``window_size - 1`` elements.

  Parameters
  ----------
  arbidseq : array_like
      One-dimensional sequence (NumPy array, list, ...).
  window_size : int, default 5
      Number of consecutive elements per window; must be at least 1.

  Returns
  -------
  numpy.ndarray
      Read-only view of shape ``(len(arbidseq) - window_size + 1, window_size)``
      that shares memory with the input. Copy it (e.g. ``.astype(...)`` or
      ``.copy()``) before modifying.

  Raises
  ------
  ValueError
      If the input is not one-dimensional, if ``window_size`` is smaller than 1,
      or if the sequence is too short for the requested window.
  """
  arbidseq = np.asarray(arbidseq)
  if arbidseq.ndim != 1:
    raise ValueError(f"arbidseq must be one-dimensional, got {arbidseq.ndim} dimension(s)")
  if window_size < 1:
    raise ValueError(f"window_size must be at least 1, got {window_size}")

  n_windows = arbidseq.size - window_size + 1
  if n_windows < 0:
    raise ValueError(
      f"window_size ({window_size}) is too large for a sequence of length {arbidseq.size}"
    )

  # Both axes advance by one element: the row axis slides the window start,
  # the column axis walks inside the window.
  step = arbidseq.strides[0]
  # The rows alias overlapping memory, so the view is made read-only: a write
  # through it would silently change several windows at once.
  splited = np.lib.stride_tricks.as_strided(
    arbidseq,
    shape=(n_windows, window_size),
    strides=(step, step),
    writeable=False,
  )
  return splited

In [16]:
# Cut both ID sequences into overlapping windows of 5 consecutive IDs
# (5 is the helper's default window size, spelled out here).
tr_set = get_split_arbid_seq_by_wnd(train_arbID_seq, window_size=5)
te_set = get_split_arbid_seq_by_wnd(test_arbID_seq, window_size=5)


In [17]:
import hmmlearn.hmm as hmm

In [18]:
# Record how many windows each set contains, then print train followed by test.
tr_set_size, te_set_size = len(tr_set), len(te_set)
print(tr_set_size)
print(te_set_size)

179342
806386


In [19]:
# Cast both window sets to the platform integer type; astype returns new
# arrays rather than views of the original sequences.
tr_set, te_set = tr_set.astype(int), te_set.astype(int)

In [20]:
# Two-state Gaussian HMM. random_state pins the model's random initialisation
# so that re-running the notebook reproduces the same fit and scores.
h = hmm.GaussianHMM(n_components=2, random_state=0)
print('--------------------')
h.fit(tr_set)  # train on the windows built from the normal-only capture
print(h.score(tr_set))  # log-likelihood (forward probability) of the training windows
# Test log-likelihood divided by the test/train size ratio, i.e. rescaled to
# the training-set length so it can be compared with the value above.
print(h.score(te_set)/(te_set_size/tr_set_size))
print('--------------------')
print(h.startprob_)  # learned initial state distribution
print(h.transmat_)  # learned state transition matrix

--------------------
-6265544.540709559
-6465855.056967285
--------------------
[1.00000000e+00 2.59528284e-10]
[[0.84543979 0.15456021]
 [0.08668545 0.91331455]]


In [21]:
import matplotlib.pyplot as plt

In [22]:
# Three-state Gaussian HMM. random_state pins the model's random initialisation
# so that re-running the notebook reproduces the same fit and scores.
h2 = hmm.GaussianHMM(n_components=3, random_state=0)
print('--------------------')
h2.fit(tr_set)  # train on the windows built from the normal-only capture
print(h2.score(tr_set))  # log-likelihood (forward probability) of the training windows
# Test log-likelihood divided by the test/train size ratio, i.e. rescaled to
# the training-set length so it can be compared with the value above.
print(h2.score(te_set)/(te_set_size/tr_set_size))
print('--------------------')
print(h2.startprob_)  # learned initial state distribution
print(h2.transmat_)  # learned state transition matrix

--------------------
-6220997.719996078
-6492987.873545902
--------------------
[1.00000000e+00 1.66882579e-14 3.86265096e-44]
[[8.41959630e-01 1.21726920e-16 1.58040370e-01]
 [2.37700970e-01 6.74611898e-01 8.76871320e-02]
 [8.78668644e-14 4.51479497e-01 5.48520503e-01]]


In [23]:
# Four-state Gaussian HMM. random_state pins the model's random initialisation
# so that re-running the notebook reproduces the same fit and scores.
h3 = hmm.GaussianHMM(n_components=4, random_state=0)
print('--------------------')
h3.fit(tr_set)  # train on the windows built from the normal-only capture
print(h3.score(tr_set))  # log-likelihood (forward probability) of the training windows
# Test log-likelihood divided by the test/train size ratio, i.e. rescaled to
# the training-set length so it can be compared with the value above.
print(h3.score(te_set)/(te_set_size/tr_set_size))
print('--------------------')
print(h3.startprob_)  # learned initial state distribution
print(h3.transmat_)  # learned state transition matrix

--------------------
-6149946.600655656
-6498707.382895833
--------------------
[1.75257988e-25 9.99999958e-01 4.15958579e-08 8.12321652e-52]
[[5.50347213e-01 3.24990099e-04 1.18629901e-17 4.49327797e-01]
 [1.62389794e-01 8.37610206e-01 9.18676277e-20 7.73926711e-27]
 [1.54964122e-01 3.27651958e-01 4.04345235e-01 1.13038685e-01]
 [2.82249278e-03 3.43073659e-06 4.83423635e-01 5.13750441e-01]]
