In [None]:
import numpy as np
import pandas as pd
import sklearn as sk

In [None]:
# Folder and file names that locate the first preliminary training capture,
# kept as separate pieces so the path can be assembled with os.path.join.
carhacking = "Car_Hacking_Challenge_Dataset_rev20Mar2021"  # dataset folder
dataloc = "data"                                           # top-level data folder
prelim, training = "0_Preliminary", "0_Training"           # nested sub-folders
filename_0 = "Pre_train_D_0.csv"                           # CSV file to load

In [None]:
import os

# ../<dataloc>/raw/<carhacking>/<prelim>/<training>/<filename_0>
_train_dir_parts = (os.pardir, dataloc, "raw", carhacking, prelim, training)
prelim_train_dir = os.path.join(*_train_dir_parts)
csv0 = os.path.join(prelim_train_dir, filename_0)

In [None]:
# Read the selected training CSV and preview its first rows.
df = pd.read_csv(filepath_or_buffer=csv0)
df.head(5)

In [None]:
# Break the space-separated "Data" string into one column per token (d1..d8).
# The assignment needs the split to yield exactly eight columns.
byte_cols = [f"d{i}" for i in range(1, 9)]
df[byte_cols] = df["Data"].str.split(" ", expand=True)
df.head(5)

In [None]:
# Missing-token count for every split column in a single table (this replaces
# eight copy-pasted one-column cells). A column is missing for a row when that
# row's "Data" string had fewer tokens than the column's position.
df[["d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"]].isna().sum()

In [None]:
# (rows, columns) of the subset whose Class label is "Attack".
df.loc[df["Class"].eq("Attack")].shape

In [None]:
def _hex_token_to_int(token, missing=999):
    """Parse one hexadecimal token (e.g. "1F") to an int.

    Returns `missing` when the token is absent. Both None and NaN count as
    absent, so a row with no payload at all does not crash int().
    """
    if pd.isna(token):
        return missing
    return int(token, 16)


# One column-wise pass per token column instead of eight row-wise
# DataFrame.apply(axis=1) calls. The sentinel 999 lies outside the 0..255 range
# a two-digit hex token can produce, so it cannot collide with a real value.
for _col in ["d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"]:
    df[f"{_col}_int"] = df[_col].map(_hex_token_to_int)

In [None]:
# (rows, columns) of the subset where the 7th token was missing (sentinel 999).
df.loc[df["d7_int"].eq(999)].shape

In [None]:
# Column dtypes and non-null counts, including the added d*_int columns.
df.info()

In [None]:
# Feature matrix: one row per CSV row, one column per integer token (d1..d8).
int_cols = [f"d{i}_int" for i in range(1, 9)]
X = df[int_cols].to_numpy()
X

In [None]:
# (number of rows, 8) - X was built from the eight d*_int columns.
X.shape

In [None]:
import math

def build_nonoverlapping_sequence(X, seq_num=4):
    """Join every `seq_num` consecutive rows of `X` into one flat sequence.

    Rows are taken in order and never shared between sequences. Trailing rows
    that cannot fill a complete sequence are dropped.

    Parameters
    ----------
    X : array-like of shape (n_rows, n_cols)
        Two-dimensional input; each row contributes `n_cols` values.
    seq_num : int, default 4
        Number of consecutive rows concatenated into one output sequence.

    Returns
    -------
    numpy.ndarray of shape (n_rows // seq_num, n_cols * seq_num)
        Empty (zero rows) when `X` has fewer than `seq_num` rows.

    Raises
    ------
    ValueError
        If `X` is not two-dimensional or `seq_num` is smaller than 1.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
    if seq_num < 1:
        raise ValueError(f"seq_num must be >= 1, got {seq_num}")

    n_rows, n_cols = X.shape
    n_seqs = n_rows // seq_num
    seq_len = n_cols * seq_num

    # Only whole sequences are kept. The explicit target shape (instead of -1)
    # also makes the zero-sequence case reshape cleanly.
    return X[: n_seqs * seq_num].reshape(n_seqs, seq_len)

In [None]:
# Size bookkeeping for the windowing step, shown as one labelled table instead
# of five separate scratch cells (same quantities as before).
n_rows, n_cols = X.shape
pd.Series({
    "rows in X": n_rows,
    "columns in X": n_cols,
    "columns * 32": n_cols * 32,
    "rows // (columns * 32)": n_rows // (n_cols * 32),
    "rows % 32": n_rows % 32,
})

In [None]:
# 4 rows x 8 columns = 32 tokens per training sequence, which is the
# train_seq_len=32 the saved embedding is later loaded with. seq_num is a
# number of rows, so the previous value of 32 did not describe that layout.
Xnew = build_nonoverlapping_sequence(X, seq_num=4)
Xnew.shape

In [None]:
# Preview the windowed data and check the layout with assertions rather than
# by comparing individually displayed rows: every sequence must be a run of
# consecutive rows of X laid end to end.
print(Xnew.shape)
print(Xnew[:2])

rows_per_seq = Xnew.shape[1] // X.shape[1]
print(X[: 2 * rows_per_seq])

assert np.array_equal(Xnew[0], X[:rows_per_seq].reshape(-1))
assert np.array_equal(Xnew[1], X[rows_per_seq: 2 * rows_per_seq].reshape(-1))

In [None]:
from abc import ABC, abstractmethod

class UnsupervisedEmbedding(ABC):
    """Abstract base for embedding trainers.

    Stores the settings shared by every embedding (output location, dataset
    name, vector size, version) and requires subclasses to implement `fit`.
    """

    def __init__(self, embedding_root: str = None, embedding_for: str = None,
                 embedding_dim: int = 100, embedding_version: float = 1.0):
        """Record the shared settings on the instance without validating them."""
        (self.embedding_root,
         self.embedding_for,
         self.embedding_dim,
         self.embedding_version) = (embedding_root,
                                    embedding_for,
                                    embedding_dim,
                                    embedding_version)

    @abstractmethod
    def fit(self, X):
        """Train the embedding on `X`; concrete subclasses must override this."""

In [None]:
import fasttext

In [None]:
def generate_model_file_path(embedding_root: str = None, embedding_for: str = None,
                             embedding_model='skipgram', embedding_wordNgrams: int = 1,
                             embedding_dim: int = 100, train_seq_len: int = 10, embedding_version: float = 1.0,
                             embedding_type='fasttext'):
    """Return the path of the ``.bin`` model file for the given settings.

    The file name encodes every setting, joined by underscores:
    ``<for>_<type>_<model>_<N>wordNgram_<D>dim_<L>trainseq_v<version>.bin``.
    The resulting path is also printed.
    """
    name_parts = [
        f"{embedding_for}",
        f"{embedding_type}",
        f"{embedding_model}",
        f"{embedding_wordNgrams}wordNgram",
        f"{embedding_dim}dim",
        f"{train_seq_len}trainseq",
        f"v{embedding_version}",
    ]
    model_path = os.path.join(embedding_root, "_".join(name_parts) + ".bin")
    print(model_path)
    return model_path

In [None]:
class FastTextEmbedding(UnsupervisedEmbedding):
    """Train an unsupervised fastText embedding on integer token sequences.

    The constructor only records settings. `fit` writes the sequences to a
    temporary text file, trains the model, and saves it under the path built
    by `generate_model_file_path`.
    """

    def __init__(self, embedding_root: str = None, embedding_for: str = None,
                 embedding_model='skipgram', embedding_wordNgrams: int = 1,
                 embedding_dim: int = 100, embedding_version: float = 1.0,
                 epochs: int = 10, minCount: int = 1, maxn: int = 0):
        """Store the training and naming settings.

        `embedding_model`, `embedding_wordNgrams`, `embedding_dim`, `epochs`,
        `minCount` and `maxn` are forwarded to `fasttext.train_unsupervised`
        by `fit`. The remaining arguments only affect where the model is saved.
        """
        super().__init__(embedding_root, embedding_for, embedding_dim, embedding_version)
        self.embedding_model = embedding_model

        # Can use wordN grams by setting 2
        # https://fasttext.cc/docs/en/supervised-tutorial.html
        self.embedding_wordNgrams = embedding_wordNgrams

        self.epochs = epochs
        self.minCount = minCount
        self.maxn = maxn

        # Keyword arguments for generate_model_file_path; fit() adds
        # "train_seq_len" once the input width is known.
        self.embedding_config = {
            'embedding_root': self.embedding_root,
            'embedding_for': self.embedding_for, 'embedding_model': self.embedding_model,
            "embedding_wordNgrams": self.embedding_wordNgrams, 'embedding_dim': self.embedding_dim,
            'embedding_version': self.embedding_version, 'embedding_type': 'fasttext'
        }

    def fit(self, X):
        """Train on `X`, save the model to disk, and return it.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_sequences, seq_len)
            Each row is written as one line of space-separated integers and
            used as the training text.

        Returns
        -------
        The trained fastText model object (also saved as a ``.bin`` file).
        """
        seq_len = X.shape[1]

        data_temp_file_path = FastTextEmbedding.generate_temp_seq_storage_file_path(self.embedding_for)
        try:
            np.savetxt(data_temp_file_path, X.astype(int), fmt='%i')

            # Create embeddings for event id https://fasttext.cc/docs/en/python-module.html
            fasttext_model = fasttext.train_unsupervised(data_temp_file_path,
                                                         model=self.embedding_model,
                                                         dim=self.embedding_dim,
                                                         wordNgrams=self.embedding_wordNgrams,
                                                         epoch=self.epochs, minCount=self.minCount, maxn=self.maxn)
        finally:
            # Remove the temporary training text even when writing or training
            # fails, so an aborted run does not leave it behind.
            if os.path.exists(data_temp_file_path):
                os.remove(data_temp_file_path)

        cfg_copy = self.embedding_config.copy()
        cfg_copy["train_seq_len"] = seq_len
        model_file_path = generate_model_file_path(**cfg_copy)

        # Create the output folder if needed so a finished training run is not
        # lost to a missing directory at save time.
        model_dir = os.path.dirname(model_file_path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        fasttext_model.save_model(model_file_path)

        return fasttext_model

    @staticmethod
    def generate_temp_seq_storage_file_path(embedding_for=None):
        """Return the name of the temporary training text file for `embedding_for`.

        The name is relative, so the file is created in the current working
        directory. `embedding_for` must be a string.
        """
        return embedding_for + '_eventid_token_seq.txt'

In [None]:
# Tokens per original row of X.
print(X.shape[1])

# NOTE(review): embedding_root is an absolute path on one machine. The same
# value is repeated in the later cells that load the model, so change them
# together.
fastText_embedding_cfg = {
    'embedding_root': "C:/Users/jvana/carhacking/features",
    'embedding_for': "Car_Hacking_Challenge_Dataset_rev20Mar2021", 'embedding_model': "skipgram",
    "embedding_wordNgrams": 3, 'embedding_dim': 100,
    'embedding_version': "1.0"
}

fasttext_embedding = FastTextEmbedding(**fastText_embedding_cfg)
fasttext_model = fasttext_embedding.fit(Xnew) # fits and saves model

# The vocabulary is inspected only after the model exists; previously these
# lookups ran before fasttext_model was assigned and failed on a fresh kernel.
fasttext_model.words

Fit fastText model

In [None]:
def fast_text_from_model_file(embedding_root: str = None, embedding_for: str = None,
                              embedding_model='skipgram', embedding_wordNgrams: int = 1,
                              embedding_dim: int = 100, train_seq_len: int = 10, embedding_version: float = 1.0):
    """Load the fastText model stored under the path these settings map to.

    The path comes from `generate_model_file_path` with the embedding type
    fixed to 'fasttext'. The loaded model object is returned.
    """
    path_settings = dict(
        embedding_root=embedding_root,
        embedding_for=embedding_for,
        embedding_model=embedding_model,
        embedding_wordNgrams=embedding_wordNgrams,
        embedding_dim=embedding_dim,
        train_seq_len=train_seq_len,
        embedding_version=embedding_version,
        embedding_type='fasttext',
    )
    return fasttext.load_model(generate_model_file_path(**path_settings))

Load embeddings

In [None]:
# train_seq_len is part of the model file name, so it must equal the width of
# the sequences the model was fitted on.
fastText_embedding_cfg = {
    'embedding_root': "C:/Users/jvana/carhacking/features",
    'embedding_for': "Car_Hacking_Challenge_Dataset_rev20Mar2021", 'embedding_model': "skipgram",
    "embedding_wordNgrams": 3, 'embedding_dim': 100,
    'embedding_version': "1.0", "train_seq_len": 32
}

fasttext_model = fast_text_from_model_file(**fastText_embedding_cfg)

# Tokens are the decimal strings of byte values, so the lookup must cover
# 0..255 inclusive (0xFF == 255). np.arange(0, 255) stopped one short and
# never fetched the vector for 255.
word_embeddings = np.array([fasttext_model.get_word_vector(str(word_token))
                            for word_token in range(256)])

#TODO: Extract normalized embeddings and do DBSCAN clustering

In [None]:
# Embedding vector of the token "1".
words = fasttext_model.get_word_vector("1")
words

In [None]:
# Embedding vector of the token "2" (overwrites the previous `words`).
words = fasttext_model.get_word_vector("2")
words

In [None]:
# Shape, the most recently looked-up single vector, and the full embedding
# matrix in one cell (the shape was previously shown twice).
print(word_embeddings.shape)
print(words)
word_embeddings

In [None]:
# Scale each embedding (row) to unit L2 length. Calling np.linalg.norm without
# an axis returns a single norm for the whole matrix, which only rescales all
# rows by the same constant instead of normalising them.
row_norms = np.linalg.norm(word_embeddings, axis=1, keepdims=True)

# Guard against division by zero: an all-zero row stays all-zero.
# NOTE(review): presumably tokens absent from training can yield such rows - confirm.
row_norms[row_norms == 0] = 1.0

normalized_word_embeddings = word_embeddings / row_norms
normalized_word_embeddings

In [None]:
from sklearn.cluster import DBSCAN

# DBSCAN hyper-parameters; both are starting values meant to be tuned.
min_samples = 5  # Adjust this value
eps = 0.5  # Adjust this value

# Cluster the normalised embeddings; one label per embedding row.
dbscan = DBSCAN(min_samples=min_samples, eps=eps)
labels = dbscan.fit(normalized_word_embeddings).labels_
labels.shape

In [None]:
# Raw embedding matrix followed by the cluster label assigned to each row,
# shown in one cell instead of two.
print(word_embeddings)
labels

Save the embeddings in word2vec format. This is compact, and we don't need fastText's subword-token embeddings, which bloat its proprietary `.bin` feature storage.

In [None]:
def generate_word2vec_model_file_path(embedding_root: str = None, embedding_for: str = None,
                                      embedding_model='skipgram', embedding_wordNgrams: int = 1,
                                      embedding_dim: int = 100, train_seq_len: int = 10,
                                      embedding_version: float = 1.0,
                                      embedding_type='fasttext'):
    """Return the path of the ``.word2vec`` export for the given settings.

    The file name encodes every setting:
    ``<for>_<type>_<model>_<N>wordNgram_<D>dim_<L>trainseq_v<version>.word2vec``.
    """
    stem = (f"{embedding_for}_{embedding_type}_{embedding_model}"
            f"_{embedding_wordNgrams}wordNgram_{embedding_dim}dim"
            f"_{train_seq_len}trainseq_v{embedding_version}")
    return os.path.join(embedding_root, f"{stem}.word2vec")

In [None]:
from gensim.models import Word2Vec, KeyedVectors

# Same settings as the saved fastText model; they determine the file name of
# the word2vec export.
fastText_embedding_cfg = dict(
    embedding_root="C:/Users/jvana/carhacking/features",
    embedding_for="Car_Hacking_Challenge_Dataset_rev20Mar2021",
    embedding_model="skipgram",
    embedding_wordNgrams=3,
    embedding_dim=100,
    embedding_version="1.0",
    train_seq_len=32,
)

model_file_path = generate_word2vec_model_file_path(**fastText_embedding_cfg)

# TODO: load the export and read back its keys, e.g.
#   wv = KeyedVectors.load_word2vec_format(model_file_path)
#   wv_eventids = wv.index_to_key

In [None]:
print(model_file_path)

# Nothing in the visible notebook writes the word2vec file, so loading it
# failed on a fresh run. Export it from the trained fastText model when it is
# missing; an existing file is left untouched.
if not os.path.exists(model_file_path):
    vocab = list(fasttext_model.words)
    export = KeyedVectors(vector_size=fasttext_model.get_dimension())
    export.add_vectors(vocab, np.array([fasttext_model.get_word_vector(w) for w in vocab]))
    export.save_word2vec_format(model_file_path)

wv = KeyedVectors.load_word2vec_format(model_file_path)