### Data preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [2]:
trainingData = pd.read_csv('../../data/trainData.csv', header=None)
# AATI = Average Access Time Interval
trainingData.columns = ["timestamp", "sector_id", "# of blocks"]
trainingData.head()

# trainingData.Frequency.hist(bins=15)

Unnamed: 0,timestamp,sector_id,# of blocks
0,1.652817,7487488,2048
1,1.652824,7489536,2048
2,1.65283,7491584,2048
3,1.652836,7493632,2048
4,1.652842,7495680,2048


In [3]:
testData = pd.read_csv("../../data/testData.csv", header=None)
testData.columns = ["timestamp", "sector_id", "# of blocks"]
testData.head()

print(testData.head())

   timestamp  sector_id  # of blocks
0   0.000000     303567            7
1   0.000000      55590            6
2   0.026214     303574            7
3   0.026214     240840            6
4   0.117964     303581            7


In [4]:
testDataLabel = pd.read_csv("../../data/lableing/testDataLabeled.csv", header=None)
testDataLabel.columns = ["sector_id", "frequency", "AATI", "# of blocks","hot/cold"]
print(testDataLabel.head())

   sector_id  frequency      AATI  # of blocks  hot/cold
0     753921      90736  0.459198       544416         1
1     240840      48735  0.886414       292465         1
2     836706      31787  1.296780       195293         1
3     837306      31704  1.299350       192217         1
4     700132      31288  1.156710       247313         1


In [5]:
trainDataLabel = pd.read_csv("../../data/lableing/trainDataLabeled.csv", header=None)
trainDataLabel.columns = ["sector_id", "frequency", "AATI", "# of blocks","hot/cold"]
print(trainDataLabel.head())

   sector_id  frequency     AATI  # of blocks  hot/cold
0       8488        966  36.0808         7728         1
1     205888        948  36.7295         7584         1
2     206064        948  36.7666         7584         1
3      74328        947  36.7683         7576         1
4      74408        945  36.8834         7560         1


In [6]:
# Constants
trainLabelSize = trainDataLabel["sector_id"].size
print("trainDataLabel size:", trainLabelSize)

trainDataLabel size: 1586700


In [7]:
trainDataLabel.dtypes

sector_id        int64
frequency        int64
AATI           float64
# of blocks      int64
hot/cold         int64
dtype: object

In [8]:
trainingData.dtypes

timestamp      float64
sector_id        int64
# of blocks      int64
dtype: object

In [9]:
trainingYLabelMap = {}

for i in range(trainLabelSize):
    trainingYLabelMap[trainDataLabel["sector_id"][i]] = trainDataLabel["hot/cold"][i]

trainingData["hot/cold"] = [trainingYLabelMap[sectorId] if sectorId in trainingYLabelMap else np.nan for sectorId in trainingData["sector_id"]]


In [10]:
testLabelSize = testDataLabel["sector_id"].size
testYLabelMap = {}

for i in range(testLabelSize):
    testYLabelMap[testDataLabel["sector_id"][i]] = testDataLabel["hot/cold"][i]

testData["hot/cold"] = [testYLabelMap[sectorId] if sectorId in testYLabelMap else np.nan for sectorId in testData["sector_id"]]

In [11]:
trainDataLabel["hot/cold"].value_counts()

0    1354099
1     232601
Name: hot/cold, dtype: int64

In [12]:
trainingData["hot/cold"].value_counts()


1    6057980
0    3628281
Name: hot/cold, dtype: int64

In [13]:
testData["hot/cold"].value_counts()

1    2070048
0    2029306
Name: hot/cold, dtype: int64

In [15]:
# Normalizing number of blocks
trainBlocksMean = trainingData["# of blocks"].mean()
trainBlocksStd = trainingData["# of blocks"].std()

trainingData["# of blocks"] = (trainingData["# of blocks"] - trainBlocksMean) / trainBlocksStd

testBlocksMean = testData["# of blocks"].mean()
testBlocksStd = testData["# of blocks"].std()

testData["# of blocks"] = (testData["# of blocks"] - testBlocksMean) / testBlocksStd

In [16]:
# Try univariate LSTM first
features = trainingData[["sector_id"]].to_numpy().tolist()
target = trainingData[["hot/cold"]].to_numpy().tolist()

In [17]:
features[0:5]


[[7487488.0, 0.6759364762275851],
 [7489536.0, 0.6759364762275851],
 [7491584.0, 0.6759364762275851],
 [7493632.0, 0.6759364762275851],
 [7495680.0, 0.6759364762275851]]

In [18]:
target[0:11]

[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]]

### Model Implementation

In [19]:
import tensorflow as tf

2022-07-25 03:02:01.034409: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-25 03:02:01.231696: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-07-25 03:02:01.870626: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-07-25 03:02:01.870735: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [20]:
# Note: I basically concluded that large sector_ids are not possible to embed(preprocess).
# However, I thought of a way to represent sector_ids in tensors and that is one hot encoding and deep neural networks (aka. Dense, Fully connected)
# It could be wrong and I want more research on 
#   **representing numbers in one-hot encoding and RNN**
#   **Is it possible to feed LSTM large numbers**
# also checkout tf.data.Dataset.grouping_window() and tf.data.Dataset.window() functions 
# i think they can be used to generate windows. gl
ds = tf.data.Dataset.from_tensor_slices((features, target))
dsn_features

2022-07-25 03:03:03.654093: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-25 03:03:03.712273: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-25 03:03:03.712688: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-25 03:03:03.714228: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

<TensorSliceDataset element_spec=(TensorSpec(shape=(2,), dtype=tf.float32, name=None), TensorSpec(shape=(1,), dtype=tf.int32, name=None))>

In [22]:
maxSectorNumber = np.amax(trainingData["sector_id"])
maxBlocks = np.amax(trainingData["# of blocks"])
print("maxSectorNumber:", maxSectorNumber)
print("maxBlocks:", maxBlocks)

maxSectorNumber: 1000213824
maxBlocks: 0.6759364762275851


In [23]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [24]:
model = tf.keras.Sequential()

# embedding the sector number from 0-1
model.add(tf.keras.layers.Embedding(input_dim=maxSectorNumber+1,output_dim=64))

model.add(tf.keras.layers.LSTM)

NameError: name 'Sequential' is not defined