In [21]:
import numpy as np
import pandas as pd
import sklearn as sk

In [22]:
dataloc = "data"
carhacking = "Car_Hacking_Challenge_Dataset_rev20Mar2021"
prelim = "0_Preliminary"
training = "0_Training"
filename_1 = "Pre_train_D_1.csv"

In [23]:
import os

prelim_train_dir = os.path.join("..", dataloc, "raw", carhacking, prelim, training)
csv1 = os.path.join(prelim_train_dir, filename_1)

In [24]:
df = pd.read_csv(csv1)
df.head()

Unnamed: 0,Timestamp,Arbitration_ID,DLC,Data,Class,SubClass
0,1597760000.0,153,8,20 A1 10 FF 00 FF 50 1F,Normal,Normal
1,1597760000.0,220,8,13 24 7F 60 05 FF BF 10,Normal,Normal
2,1597760000.0,507,4,08 00 00 01,Normal,Normal
3,1597760000.0,356,8,00 00 00 80 16 00 00 00,Normal,Normal
4,1597760000.0,340,8,FC 03 00 E4 B7 21 FA 3C,Normal,Normal


In [25]:
df[["d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"]] = df.Data.str.split(" ", expand=True)
df.head()

Unnamed: 0,Timestamp,Arbitration_ID,DLC,Data,Class,SubClass,d1,d2,d3,d4,d5,d6,d7,d8
0,1597760000.0,153,8,20 A1 10 FF 00 FF 50 1F,Normal,Normal,20,A1,10,FF,00,FF,50,1F
1,1597760000.0,220,8,13 24 7F 60 05 FF BF 10,Normal,Normal,13,24,7F,60,05,FF,BF,10
2,1597760000.0,507,4,08 00 00 01,Normal,Normal,08,00,00,01,,,,
3,1597760000.0,356,8,00 00 00 80 16 00 00 00,Normal,Normal,00,00,00,80,16,00,00,00
4,1597760000.0,340,8,FC 03 00 E4 B7 21 FA 3C,Normal,Normal,FC,03,00,E4,B7,21,FA,3C


In [None]:
df["d4"].isna().sum()

In [None]:
df[df["Class"]=="Attack"].shape

In [None]:
df[df["Class"]=="Attack"]

In [26]:
df["d1_int"] = df.apply(lambda x: 999 if x["d1"] is None else int(x["d1"], 16), axis=1)
df["d2_int"] = df.apply(lambda x: 999 if x["d2"] is None else int(x["d2"], 16), axis=1)
df["d3_int"] = df.apply(lambda x: 999 if x["d3"] is None else int(x["d3"], 16), axis=1)
df["d4_int"] = df.apply(lambda x: 999 if x["d4"] is None else int(x["d4"], 16), axis=1)

df["d5_int"] = df.apply(lambda x: 999 if x["d5"] is None else int(x["d5"], 16), axis=1)
df["d6_int"] = df.apply(lambda x: 999 if x["d6"] is None else int(x["d6"], 16), axis=1)
df["d7_int"] = df.apply(lambda x: 999 if x["d7"] is None else int(x["d7"], 16), axis=1)
df["d8_int"] = df.apply(lambda x: 999 if x["d8"] is None else int(x["d8"], 16), axis=1)

In [27]:
df.shape

(806390, 22)

In [None]:
df.head()

In [28]:
X = df[["d1_int", "d2_int", "d3_int", "d4_int", "d5_int", "d6_int", "d7_int", "d8_int"]].to_numpy()
X

array([[ 32, 161,  16, ..., 255,  80,  31],
       [ 19,  36, 127, ..., 255, 191,  16],
       [  8,   0,   0, ..., 999, 999, 999],
       ...,
       [  0,   0,   0, ...,  40,  11,  66],
       [  4, 127, 255, ..., 123,   0,  38],
       [  0,   0,   0, ...,   0,   0,   0]], dtype=int64)

In [None]:
X.shape

In [29]:
import math

def build_nonoverlapping_sequence(X, seq_num=4):
    seq_len = X.shape[1] * seq_num
    n = math.floor(X.shape[0] / seq_len)
    r = X.shape[0] % seq_len
    if r != 0:
        # Cut off not divisible part
        seqs = X[:-r].reshape(-1,32)
    else:
        seqs = X.reshape(-1,32)

    return seqs

In [None]:
math.floor(X.shape[0] / (X.shape[1] * 32))

In [30]:
Xnew = build_nonoverlapping_sequence(X, 32)
Xnew.shape

(201536, 32)

In [None]:
Xnew.shape

In [None]:
Xnew.descr()

In [38]:
def generate_model_file_path(embedding_root: str = None, embedding_for: str = None,
                             embedding_model='skipgram', embedding_wordNgrams: int = 1,
                             embedding_dim: int = 100, train_seq_len: int = 10, embedding_version: float = 1.0,
                             embedding_type='fasttext'):
    filename = f"{embedding_for}_{embedding_type}_{embedding_model}_" \
               f"{embedding_wordNgrams}wordNgram_{embedding_dim}dim_{train_seq_len}trainseq_v{embedding_version}.bin"
    print(os.path.join(embedding_root, filename))
    return os.path.join(embedding_root, filename)

In [31]:
from abc import ABC, abstractmethod

class UnsupervisedEmbedding(ABC):
    def __init__(self, embedding_root: str = None, embedding_for: str = None,
                 embedding_dim: int = 100, embedding_version: float = 1.0):
        self.embedding_root = embedding_root
        self.embedding_for = embedding_for
        self.embedding_dim = embedding_dim
        self.embedding_version = embedding_version

    @abstractmethod
    def fit(self, X):
        pass

In [32]:
import fasttext

In [34]:
class FastTextEmbedding(UnsupervisedEmbedding):
    def __init__(self, embedding_root: str = None, embedding_for: str = None,
                 embedding_model='skipgram', embedding_wordNgrams: int = 1,
                 embedding_dim: int = 100, embedding_version: float = 1.0,
                 epochs: int = 10, minCount: int = 1, maxn: int = 0):
        super().__init__(embedding_root, embedding_for, embedding_dim, embedding_version)
        self.embedding_model = embedding_model

        # Can use wordN grams by setting 2
        # https://fasttext.cc/docs/en/supervised-tutorial.html
        self.embedding_wordNgrams = embedding_wordNgrams

        self.epochs = epochs
        self.minCount = minCount
        self.maxn = maxn

        self.embedding_config = {
            'embedding_root': self.embedding_root,
            'embedding_for': self.embedding_for, 'embedding_model': self.embedding_model,
            "embedding_wordNgrams": self.embedding_wordNgrams, 'embedding_dim': self.embedding_dim,
            'embedding_version': self.embedding_version, 'embedding_type': 'fasttext'
        }

    def fit(self, X):
        seq_len = X.shape[1]
        # if X is not None:
        #     seq_len = len(_tokenize_by_spaces(X[0])) 
        # print(f"Calc BBBBBBBBBB {seq_len} , X.shape[1] = {X.shape[1]}")


        data_temp_file_path = FastTextEmbedding.generate_temp_seq_storage_file_path(self.embedding_for)
        np.savetxt(data_temp_file_path, X.astype(int), fmt='%i')

        # Create embeddings for event id https://fasttext.cc/docs/en/python-module.html
        fasttext_model = fasttext.train_unsupervised(data_temp_file_path,
                                                     model=self.embedding_model,
                                                     dim=self.embedding_dim,
                                                     wordNgrams=self.embedding_wordNgrams,
                                                     epoch=self.epochs, minCount=self.minCount, maxn=self.maxn)
        cfg_copy = self.embedding_config.copy()
        cfg_copy["train_seq_len"] = seq_len
        model_file_path = generate_model_file_path(**cfg_copy)
        fasttext_model.save_model(model_file_path)

        os.remove(data_temp_file_path)

        # print(fasttext_model.get_words())
        # word_embeddings = model.get_output_matrix()
        # print(word_embeddings)

        return fasttext_model
    
    @staticmethod
    def generate_temp_seq_storage_file_path(embedding_for=None):
        return embedding_for + '_eventid_token_seq.txt'

In [39]:
fastText_embedding_cfg = {
    'embedding_root': "c:/carhacking/features",
    'embedding_for': "Car_Hacking_Challenge_Dataset_rev20Mar2021", 'embedding_model': "skipgram",
    "embedding_wordNgrams": 3, 'embedding_dim': 100,
    'embedding_version': "1.0"
}

fasttext_embedding = FastTextEmbedding(**fastText_embedding_cfg)
fasttext_model = fasttext_embedding.fit(Xnew) # fits and saves model

c:/carhacking/features\Car_Hacking_Challenge_Dataset_rev20Mar2021_fasttext_skipgram_3wordNgram_100dim_32trainseq_v1.0.bin


ValueError: c:/carhacking/features\Car_Hacking_Challenge_Dataset_rev20Mar2021_fasttext_skipgram_3wordNgram_100dim_32trainseq_v1.0.bin cannot be opened for saving!