In [162]:
import pandas as pd
import numpy as np

In [163]:
with open('./data/CAN traffic (normal only).txt', 'r') as f:
  normal_data_list = [line.rstrip().split("\t") for line in f]

In [164]:
normal_data_list[0:3]

[['1597707827', '260', '8', '06 25 05 30 FF CF 71 55', 'Normal'],
 ['1597707827', '329', '8', '4A C5 7E 8C 31 2D 01 10', 'Normal'],
 ['1597707827', '38D', '8', '00 00 49 00 90 7F FE 01', 'Normal']]

In [165]:
with open('./data/CAN traffic (attack included).txt', 'r') as f:
  abnormal_data_list = [line.rstrip().split("\t") for line in f]

In [166]:
abnormal_data_list[0:3]

[['1597759710', '153', '8', '20 A1 10 FF 00 FF 50 1F', 'Normal', 'Normal'],
 ['1597759710', '220', '8', '13 24 7F 60 05 FF BF 10', 'Normal', 'Normal'],
 ['1597759710', '507', '4', '08 00 00 01', 'Normal', 'Normal']]

In [167]:
normal_data_df = pd.DataFrame(normal_data_list, columns=['Timestamp', 'Arbitration ID', 'DLC', 'data', 'Class'])
abnormal_data_df = pd.DataFrame(abnormal_data_list, columns=['Timestamp', 'Arbitration ID', 'DLC', 'data', 'Class', 'SubClass'])

In [168]:
normal_data_df.head(3)

Unnamed: 0,Timestamp,Arbitration ID,DLC,data,Class
0,1597707827,260,8,06 25 05 30 FF CF 71 55,Normal
1,1597707827,329,8,4A C5 7E 8C 31 2D 01 10,Normal
2,1597707827,38D,8,00 00 49 00 90 7F FE 01,Normal


In [169]:
abnormal_data_df.head(3)

Unnamed: 0,Timestamp,Arbitration ID,DLC,data,Class,SubClass
0,1597759710,153,8,20 A1 10 FF 00 FF 50 1F,Normal,Normal
1,1597759710,220,8,13 24 7F 60 05 FF BF 10,Normal,Normal
2,1597759710,507,4,08 00 00 01,Normal,Normal


**Using Arbitration ID**

In [170]:
normal_ArbID_df = normal_data_df['Arbitration ID'].apply(int, base=16)
abnormal_ArbID_df = abnormal_data_df['Arbitration ID'].apply(int, base=16)


In [171]:
normal_ArbID_df

0          608
1          809
2          909
3         1056
4         1057
          ... 
179341     913
179342     608
179343    1057
179344     304
179345     320
Name: Arbitration ID, Length: 179346, dtype: int64

In [172]:
abnormal_ArbID_df

0          339
1          544
2         1287
3          854
4          832
          ... 
806385     870
806386     871
806387     872
806388    1151
806389    1225
Name: Arbitration ID, Length: 806390, dtype: int64

In [173]:
abnormal_ArbID_df

0          339
1          544
2         1287
3          854
4          832
          ... 
806385     870
806386     871
806387     872
806388    1151
806389    1225
Name: Arbitration ID, Length: 806390, dtype: int64

In [174]:
train_arbID_seq = normal_ArbID_df.to_numpy()
test_arbID_seq = abnormal_ArbID_df.to_numpy()

In [175]:
print(len(train_arbID_seq))
print(len(test_arbID_seq))

179346
806390


In [176]:
def get_split_arbid_seq_by_wnd(arbidseq, window_size=3):
  splited = np.lib.stride_tricks.as_strided(arbidseq,shape=(np.size(arbidseq)-window_size+1,window_size),strides=(arbidseq.strides[0],arbidseq.strides[0]))
  return splited

In [177]:
tr_set = get_split_arbid_seq_by_wnd(train_arbID_seq)
te_set = get_split_arbid_seq_by_wnd(test_arbID_seq)


In [178]:
import hmmlearn.hmm as hmm

In [179]:
tr_set_size = len(tr_set)
print(tr_set_size)
te_set_size = len(te_set)
print(te_set_size)

179344
806388


In [180]:
tr_set=tr_set.astype(int)
te_set=te_set.astype(int)

In [181]:
h = hmm.GaussianHMM(2) # 모델 파라미터 설정, 모델링 객체 생성
print('--------------------')
h.fit(tr_set) # 학습진행
print(h.score(tr_set)) # log(foward prob)
print(h.score(te_set)/(te_set_size/tr_set_size))
print('--------------------')
print(h.startprob_)
print(h.transmat_)

--------------------
-3757076.5243507484
-3868908.5374412243
--------------------
[5.97891230e-05 9.99940211e-01]
[[0.86953736 0.13046264]
 [0.14743516 0.85256484]]


In [182]:
import matplotlib.pyplot as plt

In [183]:
h2 = hmm.GaussianHMM(3) # 모델 파라미터 설정, 모델링 객체 생성
print('--------------------')
h2.fit(tr_set) # 학습진행
print(h2.score(tr_set)) # log(foward prob)
print(h2.score(te_set)/(te_set_size/tr_set_size))
print('--------------------')
print(h2.startprob_)
print(h2.transmat_)

--------------------
-3701393.4203720903
-17596917.393672377
--------------------
[9.54840830e-01 4.51591703e-02 3.90190581e-40]
[[8.50173202e-01 8.09834581e-16 1.49826798e-01]
 [3.44257734e-01 5.09236490e-01 1.46505776e-01]
 [4.98930157e-09 5.11685790e-01 4.88314205e-01]]


In [184]:
h3 = hmm.GaussianHMM(4) # 모델 파라미터 설정, 모델링 객체 생성
print('--------------------')
h3.fit(tr_set) # 학습진행
print(h3.score(tr_set)) # log(foward prob)
print(h3.score(te_set)/(te_set_size/tr_set_size))
print('--------------------')
print(h3.startprob_)
print(h3.transmat_)

--------------------
-3631065.8397680894
-3910429.3196319584
--------------------
[9.93450638e-01 6.54936159e-03 6.07534155e-58 2.21400727e-48]
[[8.53725994e-01 1.83373589e-18 1.90773625e-15 1.46274006e-01]
 [3.93895937e-01 4.63533009e-01 8.19316619e-03 1.34377888e-01]
 [7.14624362e-13 9.99842962e-01 3.40782924e-06 1.53629962e-04]
 [5.64256704e-05 6.42329046e-10 9.99943561e-01 1.24406179e-08]]
