### Data preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [2]:
trainingData = pd.read_csv('../../data/trainData.csv', header=None)
# AATI = Average Access Time Interval
trainingData.columns = ["timestamp", "sector_id", "# of blocks"]
trainingData.head()

# trainingData.Frequency.hist(bins=15)

Unnamed: 0,timestamp,sector_id,# of blocks
0,1.652817,7487488,2048
1,1.652824,7489536,2048
2,1.65283,7491584,2048
3,1.652836,7493632,2048
4,1.652842,7495680,2048


In [3]:
testData = pd.read_csv("../../data/testData.csv", header=None)
testData.columns = ["timestamp", "sector_id", "# of blocks"]
testData.head()

print(testData.head())

   timestamp  sector_id  # of blocks
0   0.000000     303567            7
1   0.000000      55590            6
2   0.026214     303574            7
3   0.026214     240840            6
4   0.117964     303581            7


In [4]:
testDataLabel = pd.read_csv("../../data/lableing/testDataLabeled.csv", header=None)
testDataLabel.columns = ["sector_id", "frequency", "AATI", "# of blocks","hot/cold"]
print(testDataLabel.head())

   sector_id  frequency      AATI  # of blocks  hot/cold
0     753921      90736  0.459198       544416         1
1     240840      48735  0.886414       292465         1
2     836706      31787  1.296780       195293         1
3     837306      31704  1.299350       192217         1
4     700132      31288  1.156710       247313         1


In [5]:
trainDataLabel = pd.read_csv("../../data/lableing/trainDataLabeled.csv", header=None)
trainDataLabel.columns = ["sector_id", "frequency", "AATI", "# of blocks","hot/cold"]
print(trainDataLabel.head())

   sector_id  frequency     AATI  # of blocks  hot/cold
0       8488        966  36.0808         7728         1
1     205888        948  36.7295         7584         1
2     206064        948  36.7666         7584         1
3      74328        947  36.7683         7576         1
4      74408        945  36.8834         7560         1


In [6]:
# Constants
trainLabelSize = trainDataLabel["sector_id"].size
print("trainDataLabel size:", trainLabelSize)

trainDataLabel size: 1586700


In [7]:
trainDataLabel.dtypes

sector_id        int64
frequency        int64
AATI           float64
# of blocks      int64
hot/cold         int64
dtype: object

In [8]:
trainingData.dtypes

timestamp      float64
sector_id        int64
# of blocks      int64
dtype: object

In [9]:
trainingYLabelMap = {}

for i in range(trainLabelSize):
    trainingYLabelMap[trainDataLabel["sector_id"][i]] = trainDataLabel["hot/cold"][i]

trainingData["hot/cold"] = [trainingYLabelMap[sectorId] if sectorId in trainingYLabelMap else np.nan for sectorId in trainingData["sector_id"]]


In [10]:
testLabelSize = testDataLabel["sector_id"].size
testYLabelMap = {}

for i in range(testLabelSize):
    testYLabelMap[testDataLabel["sector_id"][i]] = testDataLabel["hot/cold"][i]

testData["hot/cold"] = [testYLabelMap[sectorId] if sectorId in testYLabelMap else np.nan for sectorId in testData["sector_id"]]

In [23]:
assert not np.any(np.isnan())

In [21]:
assert not np.any(np.isnan(testData["hot/cold"]))

In [11]:
trainDataLabel["hot/cold"].value_counts()

0    1354099
1     232601
Name: hot/cold, dtype: int64

In [12]:
trainingData["hot/cold"].value_counts()


1    6057980
0    3628281
Name: hot/cold, dtype: int64

In [13]:
testData["hot/cold"].value_counts()

1    2070048
0    2029306
Name: hot/cold, dtype: int64

In [14]:
# Normalizing number of blocks
trainBlocksMean = trainingData["# of blocks"].mean()
trainBlocksStd = trainingData["# of blocks"].std()

trainingData["# of blocks"] = (trainingData["# of blocks"] - trainBlocksMean) / trainBlocksStd

testBlocksMean = testData["# of blocks"].mean()
testBlocksStd = testData["# of blocks"].std()

testData["# of blocks"] = (testData["# of blocks"] - testBlocksMean) / testBlocksStd

In [36]:
# Try univariate LSTM first
features = trainingData["sector_id"].to_numpy()
target = trainingData[["hot/cold"]].to_numpy()

In [16]:
testFeatures = testData["sector_id"].to_numpy().tolist()
testTarget = testData["hot/cold"].to_numpy()

In [53]:
#Custom window function
def makeWindow(features, windowLength=1000, step=1):
    res = []
    i = 0
    while len(features) - windowLength >= i:
        res.append(features[i:i+windowLength])
        i += step
    
    return res

In [65]:
windowedFeatures = makeWindow(features, windowLength=1000, step=10)

In [66]:
windowedFeatures[0:2]

[array([7487488, 7489536, 7491584, 7493632, 7495680, 7497728, 7499776,
        7501824, 7503872, 7505920]),
 array([7497728, 7499776, 7501824, 7503872, 7505920, 7507968, 7510016,
        7512064, 7514112, 7516160])]

### Model Implementation

In [17]:
import tensorflow as tf

2022-07-28 16:45:19.570238: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-28 16:45:20.158676: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-07-28 16:45:21.786423: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-07-28 16:45:21.786585: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [18]:
normalizationLayer = tf.keras.layers.Normalization(axis=None)
normalizationLayer.adapt(features)

2022-07-28 16:45:23.320503: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-28 16:45:23.420546: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-28 16:45:23.420881: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-28 16:45:23.422297: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

KeyboardInterrupt: 

In [None]:
normalizedTrainInput = normalizationLayer(features)

In [None]:
# Note: I basically concluded that large sector_ids are not possible to embed(preprocess).
# However, I thought of a way to represent sector_ids in tensors and that is one hot encoding and deep neural networks (aka. Dense, Fully connected)
# It could be wrong and I want more research on 
#   **representing numbers in one-hot encoding and RNN**
#   **Is it possible to feed LSTM large numbers**
# also checkout tf.data.Dataset.grouping_window() and tf.data.Dataset.window() functions 
# i think they can be used to generate windows. gl
ds = tf.data.Dataset.from_tensor_slices((features, target))
ds = ds.window(10, shift=1, drop_remainder=True)

In [None]:
# Operations on window
count = 0

def to_numpy(ds):
    return list(ds.as_numpy_iterator())

for window in ds:
    if count == 5:
        break
    count += 1

In [27]:
maxSectorNumber = np.amax(trainingData["sector_id"])
maxBlocks = np.amax(trainingData["# of blocks"])
print("maxSectorNumber:", maxSectorNumber)
print("maxBlocks:", maxBlocks)

maxSectorNumber: 1000213824
maxBlocks: 0.6759364762275851


In [None]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [29]:
def addressToTensor(address):
    addressSize = len(str(address)) 
    fill = [[0] * 10 for i in range(10 - addressSize)]
    arr = [[1 if j == int(char) else 0 for j in range(10)] for i, char in enumerate(str(address))]

    tensor = tf.Variable(fill + arr)
    
    return tensor



In [30]:
addressToTensor(10002138)

<tf.Variable 'Variable:0' shape=(10, 10) dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]], dtype=int32)>

In [31]:
tensorTrainingData = []

for i in range(10000):
    tensorTrainingData.append(addressToTensor(features[i]))

tensorTrainingData = tf.Variable(tensorTrainingData )

In [32]:
tensorTestData = []

for i in range(10000):
    tensorTestData.append(addressToTensor(testFeatures[i]))

tensorTestData = tf.Variable(tensorTestData)

In [71]:
model = tf.keras.Sequential()

# embedding the sector number from 0-1
# input_shape=(number of time steps, number of features)
model.add(tf.keras.layers.Embedding(input_dim=trainLabelSize, output_dim=128, input_length=1000))
model.add(tf.keras.layers.LSTM(64,activation='relu', input_shape=(10, 10)))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(1,activation='sigmoid'))

2022-07-28 20:53:57.350907: W tensorflow/core/common_runtime/bfc_allocator.cc:479] Allocator (GPU_0_bfc) ran out of memory trying to allocate 774.76MiB (rounded to 812390400)requested by op StatelessRandomUniformV2
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2022-07-28 20:53:57.350946: I tensorflow/core/common_runtime/bfc_allocator.cc:1033] BFCAllocator dump for GPU_0_bfc
2022-07-28 20:53:57.350962: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Bin (256): 	Total Chunks: 52, Chunks in use: 52. 13.0KiB allocated for chunks. 13.0KiB in use in bin. 1.3KiB client-requested in use in bin.
2022-07-28 20:53:57.350987: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2022-07-28 20:53:57.350995: I

ResourceExhaustedError: {{function_node __wrapped__StatelessRandomUniformV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[1586700,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:StatelessRandomUniformV2]

In [67]:
#Optimizer 
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
opt = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(optimizer=opt, loss=loss_fn)

In [None]:
tensorTrainingData.shape

TensorShape([10000, 10, 10])

In [33]:
#One hot encoded shit
train_x = np.asarray(tensorTrainingData)
test_x = np.asarray(tensorTestData)

In [68]:
features[:10]

array([7487488, 7489536, 7491584, 7493632, 7495680, 7497728, 7499776,
       7501824, 7503872, 7505920])

In [69]:
model.fit(features[:10000], target[:10000], epochs=10, verbose=1)

Epoch 1/10


TypeError: in user code:

    File "/home/skele/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/home/skele/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/skele/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/home/skele/.local/lib/python3.10/site-packages/keras/engine/training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/skele/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "/home/skele/.local/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/skele/.local/lib/python3.10/site-packages/keras/losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "/home/skele/.local/lib/python3.10/site-packages/keras/losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/skele/.local/lib/python3.10/site-packages/keras/losses.py", line 2084, in sparse_categorical_crossentropy
        return backend.sparse_categorical_crossentropy(
    File "/home/skele/.local/lib/python3.10/site-packages/keras/backend.py", line 5586, in sparse_categorical_crossentropy
        epsilon_ = _constant_to_tensor(epsilon(), output.dtype.base_dtype)
    File "/home/skele/.local/lib/python3.10/site-packages/keras/backend.py", line 985, in _constant_to_tensor
        return tf.constant(x, dtype=dtype)

    TypeError: Expected int64, but got 1e-07 of type 'float'.


In [None]:
y_hat = model.predict(test_x, verbose=1)
y_hat.shape



(10000, 1)

In [None]:
correct = 0
for i in range(10000):
    if y_hat[i] == testTarget[i]:
        correct += 1

print(f"Accuracy: {correct/10000}")

Accuracy: 0.0


In [None]:
y_hat[0:100]

array([[nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
      