In [2]:
import numpy as np
import pandas as pd
import sklearn as sk

### 1. Load Dataframe and basic pre processing

In [3]:
dataloc = "data"
carhacking = "Car_Hacking_Challenge_Dataset_rev20Mar2021"
prelim = "0_Preliminary"
training = "0_Training"
filename_0 = "Pre_train_D_1.csv"

In [4]:
import os

prelim_train_dir = os.path.join("..", dataloc, "raw", carhacking, prelim, training)
csv0 = os.path.join(prelim_train_dir, filename_0)

In [5]:
df = pd.read_csv(csv0)
df.head()

Unnamed: 0,Timestamp,Arbitration_ID,DLC,Data,Class,SubClass
0,1597760000.0,153,8,20 A1 10 FF 00 FF 50 1F,Normal,Normal
1,1597760000.0,220,8,13 24 7F 60 05 FF BF 10,Normal,Normal
2,1597760000.0,507,4,08 00 00 01,Normal,Normal
3,1597760000.0,356,8,00 00 00 80 16 00 00 00,Normal,Normal
4,1597760000.0,340,8,FC 03 00 E4 B7 21 FA 3C,Normal,Normal


In [6]:
df[["d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"]] = df.Data.str.split(" ", expand=True)
df.head()

Unnamed: 0,Timestamp,Arbitration_ID,DLC,Data,Class,SubClass,d1,d2,d3,d4,d5,d6,d7,d8
0,1597760000.0,153,8,20 A1 10 FF 00 FF 50 1F,Normal,Normal,20,A1,10,FF,00,FF,50,1F
1,1597760000.0,220,8,13 24 7F 60 05 FF BF 10,Normal,Normal,13,24,7F,60,05,FF,BF,10
2,1597760000.0,507,4,08 00 00 01,Normal,Normal,08,00,00,01,,,,
3,1597760000.0,356,8,00 00 00 80 16 00 00 00,Normal,Normal,00,00,00,80,16,00,00,00
4,1597760000.0,340,8,FC 03 00 E4 B7 21 FA 3C,Normal,Normal,FC,03,00,E4,B7,21,FA,3C


In [7]:
df[["d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"]]

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8
0,20,A1,10,FF,00,FF,50,1F
1,13,24,7F,60,05,FF,BF,10
2,08,00,00,01,,,,
3,00,00,00,80,16,00,00,00
4,FC,03,00,E4,B7,21,FA,3C
...,...,...,...,...,...,...,...,...
806385,3B,28,0B,3B,30,00,01,
806386,00,00,00,00,05,00,00,00
806387,00,00,00,00,01,28,0B,42
806388,04,7F,FF,FF,00,7B,00,26


In [8]:
df["d1_int"] = df.apply(lambda x: 999 if x["d1"] is None else int(x["d1"], 16), axis=1)
df["d2_int"] = df.apply(lambda x: 999 if x["d2"] is None else int(x["d2"], 16), axis=1)
df["d3_int"] = df.apply(lambda x: 999 if x["d3"] is None else int(x["d3"], 16), axis=1)
df["d4_int"] = df.apply(lambda x: 999 if x["d4"] is None else int(x["d4"], 16), axis=1)

df["d5_int"] = df.apply(lambda x: 999 if x["d5"] is None else int(x["d5"], 16), axis=1)
df["d6_int"] = df.apply(lambda x: 999 if x["d6"] is None else int(x["d6"], 16), axis=1)
df["d7_int"] = df.apply(lambda x: 999 if x["d7"] is None else int(x["d7"], 16), axis=1)
df["d8_int"] = df.apply(lambda x: 999 if x["d8"] is None else int(x["d8"], 16), axis=1)

In [9]:
X = df[["d1_int", "d2_int", "d3_int", "d4_int", "d5_int", "d6_int", "d7_int", "d8_int"]].to_numpy()
X

array([[ 32, 161,  16, ..., 255,  80,  31],
       [ 19,  36, 127, ..., 255, 191,  16],
       [  8,   0,   0, ..., 999, 999, 999],
       ...,
       [  0,   0,   0, ...,  40,  11,  66],
       [  4, 127, 255, ..., 123,   0,  38],
       [  0,   0,   0, ...,   0,   0,   0]], dtype=int64)

### 2. Create fastText embeddings

In [10]:
import math

def build_nonoverlapping_sequence(X, seq_num=4):
    seq_len = X.shape[1] * seq_num
    print(seq_len)
    n = math.floor(X.shape[0] / seq_len)
    r = X.shape[0] % seq_len
    if r != 0:
        # Cut off not divisible part
        seqs = X[:-r].reshape(-1,seq_len)
    else:
        seqs = X.reshape(-1,seq_len)

    return seqs

In [11]:
Xnew = build_nonoverlapping_sequence(X, 4)
Xnew.shape

32


(201592, 32)

In [12]:
from abc import ABC, abstractmethod

class UnsupervisedEmbedding(ABC):
    def __init__(self, embedding_root: str = None, embedding_for: str = None,
                 embedding_dim: int = 100, embedding_version: float = 1.0):
        self.embedding_root = embedding_root
        self.embedding_for = embedding_for
        self.embedding_dim = embedding_dim
        self.embedding_version = embedding_version

    @abstractmethod
    def fit(self, X):
        pass

In [13]:
import fasttext

In [14]:
def generate_model_file_path(embedding_root: str = None, embedding_for: str = None,
                             embedding_model='skipgram', embedding_wordNgrams: int = 1,
                             embedding_dim: int = 100, train_seq_len: int = 10, embedding_version: float = 1.0,
                             embedding_type='fasttext'):
    filename = f"{embedding_for}_{embedding_type}_{embedding_model}_" \
               f"{embedding_wordNgrams}wordNgram_{embedding_dim}dim_{train_seq_len}trainseq_v{embedding_version}.bin"
    print(os.path.join(embedding_root, filename))
    return os.path.join(embedding_root, filename)

In [15]:
class FastTextEmbedding(UnsupervisedEmbedding):
    def __init__(self, embedding_root: str = None, embedding_for: str = None,
                 embedding_model='skipgram', embedding_wordNgrams: int = 1,
                 embedding_dim: int = 100, embedding_version: float = 1.0,
                 epochs: int = 10, minCount: int = 1, maxn: int = 0):
        super().__init__(embedding_root, embedding_for, embedding_dim, embedding_version)
        self.embedding_model = embedding_model

        # Can use wordN grams by setting 2
        # https://fasttext.cc/docs/en/supervised-tutorial.html
        self.embedding_wordNgrams = embedding_wordNgrams

        self.epochs = epochs
        self.minCount = minCount
        self.maxn = maxn

        self.embedding_config = {
            'embedding_root': self.embedding_root,
            'embedding_for': self.embedding_for, 'embedding_model': self.embedding_model,
            "embedding_wordNgrams": self.embedding_wordNgrams, 'embedding_dim': self.embedding_dim,
            'embedding_version': self.embedding_version, 'embedding_type': 'fasttext'
        }

    def fit(self, X):
        seq_len = X.shape[1]
        # if X is not None:
        #     seq_len = len(_tokenize_by_spaces(X[0])) 
        # print(f"Calc BBBBBBBBBB {seq_len} , X.shape[1] = {X.shape[1]}")


        data_temp_file_path = FastTextEmbedding.generate_temp_seq_storage_file_path(self.embedding_for)
        np.savetxt(data_temp_file_path, X.astype(int), fmt='%i')

        # Create embeddings for event id https://fasttext.cc/docs/en/python-module.html
        fasttext_model = fasttext.train_unsupervised(data_temp_file_path,
                                                     model=self.embedding_model,
                                                     dim=self.embedding_dim,
                                                     wordNgrams=self.embedding_wordNgrams,
                                                     epoch=self.epochs, minCount=self.minCount, maxn=self.maxn)
        cfg_copy = self.embedding_config.copy()
        cfg_copy["train_seq_len"] = seq_len
        model_file_path = generate_model_file_path(**cfg_copy)
        fasttext_model.save_model(model_file_path)

        os.remove(data_temp_file_path)

        # print(fasttext_model.get_words())
        # word_embeddings = model.get_output_matrix()
        # print(word_embeddings)

        return fasttext_model
    
    @staticmethod
    def generate_temp_seq_storage_file_path(embedding_for=None):
        return embedding_for + '_eventid_token_seq.txt'

##### 2.1 Fit fastText model

In [16]:
fastText_embedding_cfg = {
    'embedding_root': "C:/Users/jvana/carhacking/features",
    'embedding_for': "Car_Hacking_Challenge_Dataset_rev20Mar2021", 'embedding_model': "skipgram",
    "embedding_wordNgrams": 3, 'embedding_dim': 100,
    'embedding_version': "1.0"
}

fasttext_embedding = FastTextEmbedding(**fastText_embedding_cfg)
fasttext_model = fasttext_embedding.fit(Xnew) # fits and saves model

C:/Users/jvana/carhacking/features\Car_Hacking_Challenge_Dataset_rev20Mar2021_fasttext_skipgram_3wordNgram_100dim_32trainseq_v1.0.bin


##### 2.2 Load fastText embeddings and test

In [17]:
def fast_text_from_model_file(embedding_root: str = None, embedding_for: str = None,
                              embedding_model='skipgram', embedding_wordNgrams: int = 1,
                              embedding_dim: int = 100, train_seq_len: int = 10, embedding_version: float = 1.0):
    model_file_path = generate_model_file_path(embedding_root, embedding_for, embedding_model,
                                               embedding_wordNgrams, embedding_dim, train_seq_len,
                                               embedding_version, 'fasttext')
    model = fasttext.load_model(model_file_path)
    return model

Load embeddings

In [18]:
fastText_embedding_cfg = {
    'embedding_root': "C:/Users/jvana/carhacking/features",
    'embedding_for': "Car_Hacking_Challenge_Dataset_rev20Mar2021", 'embedding_model': "skipgram",
    "embedding_wordNgrams": 3, 'embedding_dim': 100,
    'embedding_version': "1.0", "train_seq_len": 32
}

fasttext_model = fast_text_from_model_file(**fastText_embedding_cfg)
word_embeddings = np.array([fasttext_model.get_word_vector(str(word_token))
                            for word_token in np.arange(0,256)])

C:/Users/jvana/carhacking/features\Car_Hacking_Challenge_Dataset_rev20Mar2021_fasttext_skipgram_3wordNgram_100dim_32trainseq_v1.0.bin




### 3. Save embeddings in word2vec format

This is compact and we dont need fancy fasttext sub word token embeddings that bloats its propreitory feature storage in bin file  

In [19]:
def generate_word2vec_model_file_path(embedding_root: str = None, embedding_for: str = None,
                             embedding_model='skipgram', embedding_wordNgrams: int = 1,
                             embedding_dim: int = 100, train_seq_len: int = 10, embedding_version: float = 1.0,
                             embedding_type='fasttext'):
    filename = f"{embedding_for}_{embedding_type}_{embedding_model}_" \
               f"{embedding_wordNgrams}wordNgram_{embedding_dim}dim_{train_seq_len}trainseq_v{embedding_version}.word2vec"
    return os.path.join(embedding_root, filename)

In [21]:
key_vals = [(key, value) for key, value in zip(np.arange(0,256).tolist(), word_embeddings)]
word_embeddings_dict = dict(key_vals)
word_embeddings_dict[255]

array([-0.1652192 , -0.03188491,  0.0437138 , -0.15407199,  0.0819129 ,
        0.23403224, -0.12552513,  0.05796237, -0.07279262, -0.22351351,
       -0.01057801,  0.11908747,  0.07217024,  0.31627265, -0.10252567,
        0.05768141,  0.23584728, -0.10948078,  0.17642309,  0.39414752,
       -0.18936859, -0.24655008,  0.1652826 , -0.05353741,  0.19300675,
        0.14169793,  0.04712588, -0.19885758, -0.11150912,  0.10134937,
        0.04163514,  0.06596374,  0.03247613,  0.07225998,  0.07003564,
        0.39564037,  0.2816025 , -0.13472864, -0.07672402,  0.3366484 ,
        0.20323402,  0.0333915 , -0.30564266,  0.12292032,  0.15675554,
        0.0815374 , -0.15814464, -0.31744534, -0.19032525, -0.12156282,
       -0.17795318, -0.35717875,  0.09650101,  0.00425808,  0.14139764,
       -0.09368326,  0.08010682,  0.04709674,  0.11535004,  0.01419058,
        0.16525435,  0.29682907,  0.1988397 , -0.19170538,  0.18227738,
       -0.44902086, -0.03545482,  0.34411696,  0.21893562, -0.11

In [22]:
fastText_embedding_cfg = {
    'embedding_root': "C:/Users/jvana/carhacking/features",
    'embedding_for': "Car_Hacking_Challenge_Dataset_rev20Mar2021", 'embedding_model': "skipgram",
    "embedding_wordNgrams": 3, 'embedding_dim': 100,
    'embedding_version': "1.0", "train_seq_len": 32
}
output_file_path = generate_word2vec_model_file_path(**fastText_embedding_cfg)

# Write the word embeddings to the text file in Word2Vec format
with open(output_file_path, "w", encoding="utf-8") as f:
    # Write the header containing the vocabulary size and vector dimension
    f.write(f"{len(word_embeddings_dict)} {len(word_embeddings_dict[0])}\n")
    
    # Write each word and its corresponding vector
    for word, vec in word_embeddings_dict.items():
        vec_str = " ".join(str(v) for v in vec)
        f.write(f"{word} {vec_str}\n")

### 3. Load embeddings from word2vec format text file

In [23]:
from gensim.models import Word2Vec, KeyedVectors

fastText_embedding_cfg = {
    'embedding_root': "C:/Users/jvana/carhacking/features",
    'embedding_for': "Car_Hacking_Challenge_Dataset_rev20Mar2021", 'embedding_model': "skipgram",
    "embedding_wordNgrams": 3, 'embedding_dim': 100,
    'embedding_version': "1.0", "train_seq_len": 32
}

model_file_path = generate_word2vec_model_file_path(**fastText_embedding_cfg)

keyed_word_vectors = KeyedVectors.load_word2vec_format(model_file_path)

### 4. Create a new dataframe with these columns

In [24]:
df[["Arbitration_ID", "DLC"]].nunique(axis=0)

Arbitration_ID    81
DLC                6
dtype: int64

In [25]:
df[["Arbitration_ID", "DLC"]].to_dict(orient='records')

[{'Arbitration_ID': '153', 'DLC': 8},
 {'Arbitration_ID': '220', 'DLC': 8},
 {'Arbitration_ID': '507', 'DLC': 4},
 {'Arbitration_ID': '356', 'DLC': 8},
 {'Arbitration_ID': '340', 'DLC': 8},
 {'Arbitration_ID': '366', 'DLC': 7},
 {'Arbitration_ID': '367', 'DLC': 8},
 {'Arbitration_ID': '368', 'DLC': 8},
 {'Arbitration_ID': '470', 'DLC': 8},
 {'Arbitration_ID': '453', 'DLC': 5},
 {'Arbitration_ID': '485', 'DLC': 4},
 {'Arbitration_ID': '568', 'DLC': 8},
 {'Arbitration_ID': '490', 'DLC': 8},
 {'Arbitration_ID': '164', 'DLC': 4},
 {'Arbitration_ID': '386', 'DLC': 8},
 {'Arbitration_ID': '130', 'DLC': 8},
 {'Arbitration_ID': '140', 'DLC': 8},
 {'Arbitration_ID': '251', 'DLC': 8},
 {'Arbitration_ID': '2B0', 'DLC': 6},
 {'Arbitration_ID': '260', 'DLC': 8},
 {'Arbitration_ID': '329', 'DLC': 8},
 {'Arbitration_ID': '47F', 'DLC': 8},
 {'Arbitration_ID': '153', 'DLC': 8},
 {'Arbitration_ID': '220', 'DLC': 8},
 {'Arbitration_ID': '38D', 'DLC': 8},
 {'Arbitration_ID': '356', 'DLC': 8},
 {'Arbitrati

In [26]:
from sklearn.feature_extraction import FeatureHasher

n_features = 10
hasher = FeatureHasher(n_features=n_features)

# Transform the data to hashed features
hashed_features = hasher.transform(df[["Arbitration_ID", "DLC"]].to_dict(orient='records'))

# Convert the hashed features to a dense array
hashed_features = hashed_features.toarray()

# Print the hashed features
print(hashed_features)

[[ 0.  0.  0. ...  0.  0.  0.]
 [ 0. -1.  0. ...  0.  0.  0.]
 [ 0.  0.  1. ...  0.  0.  0.]
 ...
 [ 0.  1.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ... -1.  0.  0.]
 [ 0.  0.  0. ...  0.  1.  0.]]


Prepocessing steps to create word embeddings for each record

In [None]:
# Use d1_int up to d8_int columns, 
# get their corresponding word embeddings for the decimal equivalent of hex
# add and average. This average will be out sentence embedding
# start filling a new np array with one row for each record
# finally np hstack the hashed features, sentence embedding and label encoded target column (normal, abnormal) 
# Apply randomforest on this new dataset

In [27]:
keyed_word_vectors.get_vector(df[["d1_int", "d2_int", "d3_int", "d4_int", "d4_int", "d5_int", "d6_int", "d7_int", "d8_int"]].iloc[0][0])

array([-0.44071892, -0.06461801, -0.12380028, -0.05893249,  0.25712273,
        0.01981363, -0.00225604, -0.08023099, -0.13779256,  0.12798232,
       -0.08988129,  0.18418936,  0.09859187, -0.12060867, -0.26170507,
       -0.05101939,  0.12887385, -0.1995803 ,  0.16557561,  0.01948609,
       -0.28932226, -0.14013599,  0.17534825,  0.06228806,  0.24495521,
        0.11318952,  0.09538978,  0.00824149,  0.01745899,  0.43113112,
       -0.04056255,  0.06332417,  0.04904675, -0.15028216,  0.08903732,
        0.19265382,  0.1830414 ,  0.00798411, -0.03045711,  0.31048122,
        0.35758886,  0.04317192, -0.07632869, -0.0365347 ,  0.14789513,
       -0.05716055, -0.12454859, -0.12554629,  0.00908867, -0.08281502,
       -0.40355748,  0.04221542,  0.04964033, -0.00585803, -0.09431889,
        0.06529701, -0.19129498,  0.06578472,  0.11864228, -0.15444924,
       -0.218076  ,  0.1377165 ,  0.18516701,  0.19497883,  0.0155937 ,
       -0.43425876,  0.20924444, -0.07114529, -0.03649195, -0.05

In [28]:
data_columns = ["d1_int", "d2_int", "d3_int", "d4_int", "d5_int", "d6_int", "d7_int", "d8_int"]

In [29]:
data_vec = keyed_word_vectors.get_vector(df[data_columns].iloc[806389][0])
print(data_vec)

[-3.51759009e-02 -7.71026015e-02  8.52566734e-02  7.42760524e-02
 -2.68836990e-02  1.40283406e-02 -8.49538445e-02 -1.22018587e-02
  4.70627472e-02 -2.09955052e-02 -5.98548055e-02  8.78131464e-02
 -5.64265028e-02  3.24415490e-02 -1.62869141e-01  8.81173089e-02
  1.69892058e-01  3.84085551e-02  5.35612516e-02  1.13660455e-01
 -1.54461175e-01  1.43341655e-02  8.90775993e-02 -2.36041099e-03
 -9.34261479e-04  5.36624566e-02  8.28749910e-02 -2.02629089e-01
 -1.05085358e-01 -4.80466057e-03  2.32724371e-04  1.34992480e-01
  1.90154500e-02 -2.28053808e-01 -1.19375288e-01  1.33301422e-01
 -1.25737386e-02 -1.90504581e-01  2.97479797e-02  1.47619948e-01
  1.10412359e-01  1.46186367e-01 -2.96756178e-01 -3.89030166e-02
  3.45474780e-02  2.23940238e-02  1.42181218e-01  1.44127905e-02
  7.54038095e-02 -1.23313010e-01 -1.00208014e-01 -7.06055611e-02
 -4.56126295e-02  5.90718053e-02  7.73450434e-02 -5.37408702e-02
  8.45014453e-02  1.24546088e-01  1.18975103e-01  3.47409025e-02
  2.40828414e-02 -2.08522

In [30]:
data_columns = ["d1_int", "d2_int", "d3_int", "d4_int", "d5_int", "d6_int", "d7_int", "d8_int"]

### Sentence embedding: Calculate the average of word_embeddings for 8 columns of databyte

Average Sentence embedding is stored in an array

In [31]:
import numpy as np

# Assuming you have loaded your KeyedVectors object into 'keyed_word_vectors'
data_columns = ["d1_int", "d2_int", "d3_int", "d4_int", "d5_int", "d6_int", "d7_int", "d8_int"]

# Initialize an empty list to store the average sentence embeddings
num_records = len(df)
all_avg_sentence_embeddings = []
resulting_embeddings = np.empty((num_records, len(data_columns), 100))
# Iterate over the DataFrame in chunks of 1000 records (you can adjust the chunk size)
chunk_size = 1000

for start in range(0, num_records, chunk_size):
    end = min(start + chunk_size, num_records)
    chunk = df[start:end]

    # Process the current chunk using your code
    df_packets_int = chunk[data_columns]
    flattened_packets = df_packets_int.values.flatten()
    # Generate embeddings and reshape
    word_embeddings_array = np.array([keyed_word_vectors[word] if value != 999 else np.zeros(100) for word, value in zip(flattened_packets, flattened_packets)])
    #print("Length of the array is ", len(word_embeddings_array*100))
    #if len(word_embeddings_array)*100 >= 800000:
    #reshaped_embeddings = word_embeddings_array.reshape((chunk_size, len(data_columns), 100))
    reshaped_embeddings = word_embeddings_array.reshape((end - start, len(data_columns), 100))
    # Calculate average sentence embeddings for the current chunk
    avg_sentence_embeddings = np.mean(reshaped_embeddings, axis=1)

    #resulting_embeddings[start:end] = avg_sentence_embeddings
    # Append the average sentence embeddings to the list
    all_avg_sentence_embeddings.append(avg_sentence_embeddings)

# Combine all chunks into a single NumPy array
resulting_embeddings = np.vstack(all_avg_sentence_embeddings)


In [32]:
import pandas as pd

# Assuming you have resulting_embeddings as a numpy array
# Create a DataFrame from resulting_embeddings
embedding_df = pd.DataFrame(resulting_embeddings.reshape(-1, 100), columns=[f'embedding_{i}' for i in range(100)])


In [40]:
hashed_df = pd.DataFrame(hashed_features, columns=[f'hashed_feature_{i}' for i in range(n_features)])

In [41]:
hashed_df

Unnamed: 0,hashed_feature_0,hashed_feature_1,hashed_feature_2,hashed_feature_3,hashed_feature_4,hashed_feature_5,hashed_feature_6,hashed_feature_7,hashed_feature_8,hashed_feature_9
0,0.0,0.0,0.0,-8.0,0.0,0.0,-1.0,0.0,0.0,0.0
1,0.0,-1.0,0.0,-8.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,-4.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,-8.0,0.0,0.0,0.0,-1.0,0.0,0.0
4,0.0,0.0,1.0,-8.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
806385,0.0,0.0,0.0,-7.0,0.0,0.0,0.0,0.0,0.0,1.0
806386,0.0,0.0,1.0,-8.0,0.0,0.0,0.0,0.0,0.0,0.0
806387,0.0,1.0,0.0,-8.0,0.0,0.0,0.0,0.0,0.0,0.0
806388,0.0,0.0,0.0,-8.0,0.0,0.0,0.0,-1.0,0.0,0.0


In [39]:
print(hashed_features[2])

[ 0.  0.  1. -4.  0.  0.  0.  0.  0.  0.]


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 806390 entries, 0 to 806389
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Timestamp       806390 non-null  float64
 1   Arbitration_ID  806390 non-null  object 
 2   DLC             806390 non-null  int64  
 3   Data            806390 non-null  object 
 4   Class           806390 non-null  object 
 5   SubClass        806390 non-null  object 
 6   d1              806390 non-null  object 
 7   d2              806390 non-null  object 
 8   d3              805156 non-null  object 
 9   d4              805156 non-null  object 
 10  d5              743095 non-null  object 
 11  d6              727685 non-null  object 
 12  d7              695249 non-null  object 
 13  d8              663856 non-null  object 
 14  d1_int          806390 non-null  int64  
 15  d2_int          806390 non-null  int64  
 16  d3_int          806390 non-null  int64  
 17  d4_int    

In [42]:
# Concatenate the original DataFrame (df) and the new DataFrame (embedding_df) horizontally
df_new = pd.concat([df, hashed_df, embedding_df], axis=1)

# Now, df contains the original columns plus the 100 new columns from resulting_embeddings

In [43]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 806390 entries, 0 to 806389
Columns: 132 entries, Timestamp to embedding_99
dtypes: float64(111), int64(9), object(12)
memory usage: 812.1+ MB


In [44]:
df_new.shape

(806390, 132)

In [47]:
df.columns

Index(['Timestamp', 'Arbitration_ID', 'DLC', 'Data', 'Class', 'SubClass', 'd1',
       'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd1_int', 'd2_int', 'd3_int',
       'd4_int', 'd5_int', 'd6_int', 'd7_int', 'd8_int'],
      dtype='object')

In [53]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

In [54]:
target_variable = 'Class'
labenc = LabelEncoder()
df[target_variable] = 1-labenc.fit_transform(df[target_variable])

In [55]:

df[target_variable]

0         1
1         1
2         1
3         1
4         1
         ..
806385    1
806386    1
806387    1
806388    1
806389    1
Name: Class, Length: 806390, dtype: int32

In [58]:
df

Unnamed: 0,Timestamp,Arbitration_ID,DLC,Data,Class,SubClass,d1,d2,d3,d4,...,d7,d8,d1_int,d2_int,d3_int,d4_int,d5_int,d6_int,d7_int,d8_int
0,1.597760e+09,153,8,20 A1 10 FF 00 FF 50 1F,1,Normal,20,A1,10,FF,...,50,1F,32,161,16,255,0,255,80,31
1,1.597760e+09,220,8,13 24 7F 60 05 FF BF 10,1,Normal,13,24,7F,60,...,BF,10,19,36,127,96,5,255,191,16
2,1.597760e+09,507,4,08 00 00 01,1,Normal,08,00,00,01,...,,,8,0,0,1,999,999,999,999
3,1.597760e+09,356,8,00 00 00 80 16 00 00 00,1,Normal,00,00,00,80,...,00,00,0,0,0,128,22,0,0,0
4,1.597760e+09,340,8,FC 03 00 E4 B7 21 FA 3C,1,Normal,FC,03,00,E4,...,FA,3C,252,3,0,228,183,33,250,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806385,1.597760e+09,366,7,3B 28 0B 3B 30 00 01,1,Normal,3B,28,0B,3B,...,01,,59,40,11,59,48,0,1,999
806386,1.597760e+09,367,8,00 00 00 00 05 00 00 00,1,Normal,00,00,00,00,...,00,00,0,0,0,0,5,0,0,0
806387,1.597760e+09,368,8,00 00 00 00 01 28 0B 42,1,Normal,00,00,00,00,...,0B,42,0,0,0,0,1,40,11,66
806388,1.597760e+09,47F,8,04 7F FF FF 00 7B 00 26,1,Normal,04,7F,FF,FF,...,00,26,4,127,255,255,0,123,0,38


In [60]:
print(df_new.shape)

(806390, 132)


In [61]:
df_new.columns

Index(['Timestamp', 'Arbitration_ID', 'DLC', 'Data', 'Class', 'SubClass', 'd1',
       'd2', 'd3', 'd4',
       ...
       'embedding_90', 'embedding_91', 'embedding_92', 'embedding_93',
       'embedding_94', 'embedding_95', 'embedding_96', 'embedding_97',
       'embedding_98', 'embedding_99'],
      dtype='object', length=132)