In [129]:
import numpy as np
import pandas as pd
from keras.src.utils import to_categorical
from numpy.lib.stride_tricks import sliding_window_view
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [130]:
# Load the training split and the separate evaluation split from CSV.
train_csv = '../../data_v2/p2p_r4_train.csv'
test_csv = '../../data_v2/p2p_r4_il.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [131]:
# Context kept around each labelled row: `bw` rows before it and `fw` rows
# after it (the label slices further down use [bw:-fw] to line up with this).
bw, fw = 5, 10
# Full window length: backward context + the labelled row itself + forward context.
ws = bw + 1 + fw

## Preprocess Data

### Fill NA

In [132]:
# Categorical input columns that get one-hot encoded further down.
feature_cols = ["method_call", "selective_file_data", "origin_file_data"]

# Missing values become the empty string so they are treated as their own
# category; both frames are updated in place.
for frame in (df_train, df_test):
    frame[feature_cols] = frame[feature_cols].fillna("")

### One-Hot Encode Labels

In [133]:
# Fit the label vocabulary once, on the training split, and reuse it for both
# splits. Encoding each split with its own freshly fitted encoder can assign
# different columns to the same class (or produce a different number of
# columns) whenever the two splits do not contain exactly the same labels.
le = LabelEncoder().fit(df_train["task_position"])
n_label_classes = len(le.classes_)

# Each window of `ws` rows is labelled by the row at offset `bw` inside it, so
# the first `bw` and the last `fw` labels have no window and are dropped.
# The stop index is written as `len - fw` because `[bw:-fw]` would return an
# empty array when fw == 0.
# Note: `le.transform` raises ValueError if the test split contains a label
# that never occurs in the training split.
train_labels = to_categorical(
    le.transform(df_train["task_position"]), num_classes=n_label_classes
)[bw:len(df_train) - fw]
test_labels = to_categorical(
    le.transform(df_test["task_position"]), num_classes=n_label_classes
)[bw:len(df_test) - fw]

In [134]:
def sliding_window(observation, window_length):
    """Return every run of ``window_length`` consecutive rows of a 2-D array.

    The result is a view of shape
    ``(n_rows - window_length + 1, window_length, n_cols)``.
    """
    n_cols = observation.shape[1]
    # Each window spans all columns, so the view carries a length-1 axis at
    # position 1 that is dropped before returning.
    windows = sliding_window_view(observation, window_shape=(window_length, n_cols))
    return windows.squeeze(axis=1)


def fit_feature_encoders(features):
    """Fit one LabelEncoder per feature column and return them as a list."""
    return [LabelEncoder().fit(ele) for ele in features]


def _one_hot_with_encoder(values, encoder):
    """One-hot encode ``values`` against the fixed vocabulary of ``encoder``.

    Values that the encoder was not fitted on are encoded as an all-zero row
    instead of raising, so the output width always equals
    ``len(encoder.classes_)``.
    """
    values = np.asarray(values)
    n_classes = len(encoder.classes_)
    known = np.isin(values, encoder.classes_)
    if known.all():
        return to_categorical(encoder.transform(values), num_classes=n_classes)
    # Temporarily substitute a known class for unseen values so `transform`
    # succeeds, then blank out those rows.
    placeholder = np.where(known, values, encoder.classes_[0])
    one_hot = to_categorical(encoder.transform(placeholder), num_classes=n_classes)
    one_hot[~known] = 0
    return one_hot


def encode_features(features, window_size, encoders=None):
    """One-hot encode each categorical column and cut it into sliding windows.

    Parameters
    ----------
    features : sequence of array-like
        One 1-D column of categorical values per feature, all of equal length.
    window_size : int
        Number of consecutive rows per window.
    encoders : sequence of LabelEncoder, optional
        Already fitted encoders, one per entry of ``features``. When given,
        each encoder's vocabulary fixes the one-hot width of its column and
        unseen values become all-zero rows. When omitted, a fresh encoder is
        fitted on every column, which is the original behaviour.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows - window_size + 1, window_size, total_width)``
        where ``total_width`` is the sum of the one-hot widths of all columns.
    """
    if encoders is None:
        encoders = fit_feature_encoders(features)
    features_encoded = [_one_hot_with_encoder(ele, enc) for ele, enc in zip(features, encoders)]
    features_window = [sliding_window(ele, window_size) for ele in features_encoded]
    return np.concatenate(features_window, axis=2)


# The vocabularies are learned from the training split only and shared with
# the test split. Fitting separate encoders per split would give the two
# feature tensors different widths and column meanings, so a model trained on
# one could not be evaluated on the other.
feature_encoders = fit_feature_encoders([df_train[col] for col in feature_cols])

train_features = encode_features([df_train[col] for col in feature_cols], ws, feature_encoders)
test_features = encode_features([df_test[col] for col in feature_cols], ws, feature_encoders)

In [135]:
# Number of one-hot label columns, read from the data instead of hard-coding 3
# so this cell keeps working if the number of classes changes.
n_label_cols = train_labels.shape[1]

# flatten every window to a single row
f = train_features.reshape((train_features.shape[0], -1))
# append the one-hot label so identical windows with different labels stay distinct
j = np.concatenate([f, train_labels], axis=1)
# keep only unique (window, label) rows; np.unique also returns them sorted
unique = np.unique(j, axis=0)

# split the joined rows back into labels and features, restoring the
# (window, timestep, feature) layout of the features
unique_labels = unique[:, -n_label_cols:]
unique_features = unique[:, :-n_label_cols].reshape(
    unique.shape[0], train_features.shape[1], train_features.shape[2]
)

In [136]:
# Hold out part of the de-duplicated windows (scikit-learn's default split size).
# A fixed random_state makes the split, and everything trained on it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(unique_features, unique_labels, random_state=42)

In [137]:
# Public import path; `keras.src` is Keras' private implementation package.
from keras.layers import Bidirectional, LSTM, Dense
from keras import Sequential, Input

# Layer sizes are read from the training arrays so the network always matches
# the data: input is (timesteps, one-hot width), output is one unit per class.
n_timesteps, n_input_features = X_train.shape[1], X_train.shape[2]
n_output_classes = y_train.shape[1]

model = Sequential([
    Input((n_timesteps, n_input_features)),
    Bidirectional(LSTM(units=32, dropout=0.3, recurrent_dropout=0.3)),
    Dense(n_output_classes, activation="softmax")
])

# `weighted_metrics` so accuracy takes the per-sample weights passed to fit() into account.
model.compile(loss='categorical_crossentropy', optimizer='adam', weighted_metrics=["acc"])

2024-01-06 10:59:27.510386: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-01-06 10:59:27.510523: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-01-06 10:59:27.510533: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-01-06 10:59:27.511152: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-06 10:59:27.512051: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [139]:
import keras
from sklearn.utils import compute_sample_weight
from tqdm.keras import TqdmCallback

callbacks = [
    # Progress bars instead of Keras' own logging (fit() runs with verbose=0).
    TqdmCallback(verbose=1),
    # Halve the learning rate when val_loss stops improving, down to 1e-4.
    # NOTE(review): this uses the same patience as EarlyStopping below, so
    # after the warm-up a plateau may trigger the LR cut and the stop at about
    # the same epoch - consider a shorter patience here.
    keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=20, min_lr=0.0001
    ),
    # Stop once val_loss has not improved for 20 epochs (ignoring the first 30).
    # restore_best_weights=True rolls the model back to its best epoch; without
    # it (and without a checkpoint) the model would keep the weights from
    # `patience` epochs after the best one.
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=20,
        verbose=1,
        start_from_epoch=30,
        restore_best_weights=True,
    ),
]

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    weighted_metrics=["acc"]
)

# Per-sample weights inversely proportional to class frequency, computed from
# the integer class index of each one-hot label.
weights = compute_sample_weight('balanced', np.argmax(y_train, axis=1))

hist = model.fit(
    X_train,
    y_train,
    batch_size=16,
    # Validation uses a 20% slice of X_train rather than the separate test frame.
    validation_split=0.2,
    callbacks=callbacks,
    sample_weight=weights,
    epochs=256,
    verbose=0,
)

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

2024-01-06 11:01:10.871336: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.

KeyboardInterrupt

