In [1]:
from pathlib import Path
from sklearn.model_selection import train_test_split
from utils import QmdlLogsHelper
import numpy as np
import sys; print(f"byte_order: {sys.byteorder}")
import struct

byte_order: little


In [2]:
# Location of the QMDL capture: project root, capture session directory, file name.
path_project = "/disk/sean/5glogs"
path_logs = "sa_log/nr-airondiag_Thu_Apr_18_17-44-36_2024/diag_Thu_Apr_18_17-44-36_2024"
path_file = "qmdl_1.qmdl"
# Path() joins all of its segments, so a single call builds the full path.
qmdl_logs_path = Path(path_project, path_logs, path_file)
# Helper from the local `utils` module; later cells query it for the logs matrix and labels.
logs_helper = QmdlLogsHelper(qmdl_logs_path)

In [3]:
# Inspect the logs matrix returned by the helper: shape, container type and the first three rows.
print(logs_helper.get_logs_array().shape, type(logs_helper.get_logs_array()))
print(logs_helper.get_logs_array()[:3])

(287764, 5480) <class 'numpy.ndarray'>
[[0.07450981 0.07058824 0.41568628 ... 0.         0.         0.        ]
 [0.2627451  0.15686275 0.         ... 0.         0.         0.        ]
 [0.6156863  0.27058825 0.7529412  ... 0.         0.         0.        ]]


In [4]:
# One label per log entry; presumably 1 marks a detach request — TODO confirm in utils.
labels_array = logs_helper.get_detach_request_labels_array()
# labels_array = labels_array[:50000]
# Summary: number of positives, indices of the last five positives, and the array shape.
print(
    np.sum(labels_array),
    np.nonzero(labels_array == 1)[0][-5:],
    labels_array.shape,
)

261 [266669 269336 270887 272919 280668] (287764,)


In [5]:
logs_array = logs_helper.get_logs_array()
LOGS_CHUNK_SIZE = 100
# Window labels are read from labels_array by log index, so both arrays must be aligned.
assert len(labels_array) == len(logs_array), "labels_array and logs_array must have the same length"

# Each row of `data` is [start, stop, n_positive] for the window logs_array[start:stop], stride 1.
# The number of windows is len - chunk, i.e. the last full window (start == len - chunk) is left
# out on purpose: this keeps the same row count as the original range() bound.
n_chunks = len(logs_array) - LOGS_CHUNK_SIZE
chunk_starts = np.arange(n_chunks, dtype=np.uint32)
# Vectorised per-window label sums: a strided view over labels_array, summed along the window
# axis, replaces a Python loop that allocated one tiny array per window.
chunk_label_sums = np.lib.stride_tricks.sliding_window_view(
    labels_array, LOGS_CHUNK_SIZE
)[:n_chunks].sum(axis=1, dtype=np.uint32)
data = np.column_stack(
    [chunk_starts, chunk_starts + np.uint32(LOGS_CHUNK_SIZE), chunk_label_sums]
)
print(len(data), data[:3], type(data[0][0]))

287664 [[  0 100   0]
 [  1 101   0]
 [  2 102   0]] <class 'numpy.uint32'>


In [6]:
# Check the scalar type of the logs matrix entries.
type(logs_array[0][0])

numpy.float32

In [6]:
# Build the window table through the helper instead; note this rebinds `data`.
# The printed length, first rows and scalar type are the quantities to compare with the manual build.
data = logs_helper.get_dataset(chunk_size=100)
print(len(data), data[:3], type(data[0][0]))

287664 [[  0 100   0]
 [  1 101   0]
 [  2 102   0]] <class 'numpy.uint32'>


In [7]:
# Split the windows on column 2 (appears to be the per-window positive count):
# zero -> negatives, anything above zero -> positives. Boolean masks return copies.
data_label0 = data[data[:, 2] == 0]
data_label1 = data[data[:, 2] > 0]
# data_label2 = data[np.where(data[:,2]==2)[0]]
# data_label3 = data[np.where(data[:,2]==3)[0]]
# assert len(data) == sum([len(data[np.where(data[:,2]==i)[0]]) for i in range(4)])
# The two groups must partition the table exactly.
assert len(data_label0) + len(data_label1) == len(data)
print(f"{len(data)} == {len(data_label0) + len(data_label1)} ( = {len(data_label0)} + {len(data_label1)} )")

287664 == 287664 ( = 262690 + 24974 )


In [8]:
# Thin out the negatives: keep only windows whose start index (column 0) is a multiple of 10.
starts_on_tenth = data_label0[:, 0] % 10 == 0
data_label0_0 = data_label0[starts_on_tenth]
len(data_label0_0)

26269

In [9]:
# Distribution of the per-window positive count among the positive windows.
# The assert encodes the assumption that no window holds more than 3 positives.
positives_per_window = data_label1[:, 2]
assert np.count_nonzero(positives_per_window > 0) == sum(
    np.count_nonzero(positives_per_window == i) for i in range(1, 4)
)
for i in range(4):
    print(f"label={i}, data_count={np.count_nonzero(positives_per_window == i)}")

label=0, data_count=0
label=1, data_count=23952
label=2, data_count=918
label=3, data_count=104


In [11]:
# Take a fresh copy of the positive windows (mask indexing copies), then collapse every
# positive count to the single class label 1 so the task becomes binary.
data_label1_1 = data[data[:, 2] > 0]
print(len(data_label1_1))
data_label1_1[:, 2] = 1
for i in range(4):
    print(f"label={i}, data_count={np.count_nonzero(data_label1_1[:, 2] == i)}")

24974
label=0, data_count=0
label=1, data_count=24974
label=2, data_count=0
label=3, data_count=0


In [12]:
# Stack the thinned negatives on top of the binarised positives, then split 90/10 with
# class stratification. Features are the [start, stop] index pair, the target is column 2.
# NOTE(review): the windows span 100 logs but start 1 (positives) or 10 (negatives) logs apart,
# so a random split places overlapping windows in both train and test — confirm this is acceptable.
data_balanced = np.vstack((data_label0_0, data_label1_1))
x, y = data_balanced[:, 0:2], data_balanced[:, 2]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)
print(len(x_train), len(x_test), sum(y_train), sum(y_test))

46118 5125 22476 2498


In [12]:
# One training sample: a pair of log indices (columns 0 and 1 of the window table), not log contents.
x_train[0]

array([180582, 180682], dtype=uint32)

In [13]:
# Seeded Generator check: draw 5 distinct integers from range(100) without replacement.
rng = np.random.default_rng(seed=42)
rng.choice(100, size=5, replace=False)

array([75, 43, 64,  8, 99])

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F

LOG_LENGTH = 5480 # len(max(logs_list, key=len)) == 5480

class Classifier(nn.Module):
	"""Sequence classifier over a chunk of fixed-length logs.

	Pipeline: linear projection of every log from LOG_LENGTH to d_model,
	one Transformer encoder layer over the chunk, mean pooling over the
	chunk axis, and a small MLP head producing n_signals logits.

	Args:
		d_model: width of the projected features and of the encoder layer.
		n_signals: number of output classes (size of the last dimension of the output).
		dropout: dropout probability used inside the Transformer encoder layer.
	"""

	def __init__(self, d_model=80, n_signals=2, dropout=0.1):
		super().__init__()
		# Project the dimension of features from that of input into d_model.
		self.prenet = nn.Linear(LOG_LENGTH, d_model)
		# TODO:
		#   Change Transformer to Conformer.
		#   https://arxiv.org/abs/2005.08100
		# `dropout` is forwarded explicitly; previously the argument was accepted but ignored,
		# so the layer always ran with its own default.
		self.encoder_layer = nn.TransformerEncoderLayer(
			d_model=d_model, dim_feedforward=256, nhead=2, dropout=dropout
		)
		# self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)

		# Project the pooled features from d_model to one logit per class (n_signals).
		self.pred_layer = nn.Sequential(
			nn.Linear(d_model, d_model),
			nn.Sigmoid(),
			nn.Linear(d_model, n_signals),
		)

	def forward(self, logs):
		"""
		args:
			logs: (batch size, length, LOG_LENGTH), where length is the number of logs per chunk
		return:
			out: (batch size, n_signals) unnormalised class scores
		"""
		# out: (batch size, length, d_model)
		out = self.prenet(logs)
		# out: (length, batch size, d_model)
		out = out.permute(1, 0, 2)
		# The encoder layer expects features in the shape of (length, batch size, d_model).
		out = self.encoder_layer(out)
		# out: (batch size, length, d_model)
		out = out.transpose(0, 1)
		# mean pooling over the length axis -> (batch size, d_model)
		stats = out.mean(dim=1)

		# out: (batch size, n_signals)
		out = self.pred_layer(stats)
		return out

In [27]:
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR
import math
import torch

def get_cosine_schedule_with_warmup(
	optimizer: Optimizer,
	num_warmup_steps: int,
	num_training_steps: int,
	num_cycles: float = 0.5,
	last_epoch: int = -1,
):
	"""
	Build a LambdaLR schedule with linear warmup followed by cosine decay.

	The multiplier applied to the optimizer's initial lr rises linearly from 0 to 1 over
	the warmup steps, then follows a cosine curve from 1 towards 0 over the remaining
	training steps.

	Args:
		optimizer (:class:`~torch.optim.Optimizer`):
		The optimizer whose learning rate is scheduled.
		num_warmup_steps (:obj:`int`):
		Length of the linear warmup phase, in steps.
		num_training_steps (:obj:`int`):
		Total number of training steps (warmup included).
		num_cycles (:obj:`float`, `optional`, defaults to 0.5):
		Number of cosine waves over the decay phase; 0.5 is a single half-cosine
		going from the maximum down to 0.
		last_epoch (:obj:`int`, `optional`, defaults to -1):
		Index of the last epoch, used when resuming training.

	Return:
		:obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
	"""
	def lr_lambda(current_step):
		if current_step >= num_warmup_steps:
			# Decay phase: fraction of the post-warmup budget already consumed.
			steps_done = float(current_step - num_warmup_steps)
			steps_budget = float(max(1, num_training_steps - num_warmup_steps))
			progress = steps_done / steps_budget
			cosine = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
			# Clamp at zero so the multiplier never goes negative.
			return max(0.0, 0.5 * (1.0 + cosine))
		# Warmup phase: linear ramp (the max() guards against a zero-length warmup).
		return float(current_step) / float(max(1, num_warmup_steps))

	return LambdaLR(optimizer, lr_lambda, last_epoch)

In [16]:
import torch

def model_fn(batch, model, criterion, device):
	"""Run one batch through the model and score it.

	args:
		batch: pair (inputs, labels); both are moved to `device`.
		model: callable producing per-class scores of shape (batch size, n classes).
		criterion: loss callable taking (scores, labels).
		device: target device for the batch tensors.
	return:
		(loss, accuracy): the criterion value and the fraction of correct
		argmax predictions, both as tensors.
	"""
	inputs, targets = batch
	inputs, targets = inputs.to(device), targets.to(device)

	scores = model(inputs)

	# The predicted class is the index of the highest score along dim 1.
	hits = scores.argmax(1) == targets
	return criterion(scores, targets), hits.float().mean()

In [17]:
from tqdm import tqdm
import torch

def valid(dataloader, model, criterion, device): 
	"""Validate on validation set.

	args:
		dataloader: loader yielding (inputs, labels) batches; `dataloader.dataset` must support len().
		model: model to evaluate; switched to eval mode for the pass and back to train mode afterwards.
		criterion: loss callable passed through to model_fn.
		device: device the batches are moved to by model_fn.
	return:
		Mean of the per-batch accuracies as a float; 0.0 when the loader yields no batch.
	"""

	model.eval()
	running_loss = 0.0
	running_accuracy = 0.0
	n_batches = 0
	pbar = tqdm(total=len(dataloader.dataset), ncols=0, desc="Valid", unit=" uttr")

	try:
		# A single no_grad context covers the whole pass.
		with torch.no_grad():
			for batch in dataloader:
				loss, accuracy = model_fn(batch, model, criterion, device)
				running_loss += loss.item()
				running_accuracy += accuracy.item()
				n_batches += 1

				# Advance by the actual number of samples in this batch: the last batch may be
				# smaller than dataloader.batch_size, and batch_size is None with a batch_sampler.
				pbar.update(len(batch[0]))
				pbar.set_postfix(
					loss=f"{running_loss / n_batches:.2f}",
					accuracy=f"{running_accuracy / n_batches:.2f}",
				)
	finally:
		# Always release the progress bar and hand the model back in train mode,
		# even if a batch raised.
		pbar.close()
		model.train()

	# Guard against an empty loader instead of dividing by zero.
	return running_accuracy / n_batches if n_batches else 0.0

In [None]:
from data import QmdlDataset
from tqdm import tqdm
from torch.optim import AdamW
from torch.utils.data import DataLoader, random_split
import torch
import torch.nn as nn

def parse_args():
	"""Return the run configuration as a dict whose keys match main()'s parameters."""
	return dict(
		data_dir="./Dataset",
		save_path="model.ckpt",
		batch_size=32,
		n_workers=8,
		valid_steps=2000,
		warmup_steps=1000,
		save_steps=10000,
		total_steps=70000,
	)

def main(
	data_dir,
	save_path,
	batch_size,
	n_workers,
	valid_steps,
	warmup_steps,
	total_steps,
	save_steps,
):
    """Build the train/test loaders and print one test batch.

    Only `batch_size` is used by the active code; the remaining parameters are
    accepted but not referenced here. The dataset location comes from the
    notebook-level `qmdl_logs_path`, not from `data_dir`.
    """
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"[Info]: Use {device} now!")

    # One unshuffled loader per split; dl_train is built but not consumed yet.
    loaders = {
        split: DataLoader(
            QmdlDataset(qmdl_logs_path, split=split),
            batch_size=batch_size,
            shuffle=False,
        )
        for split in ('train', 'test')
    }
    dl_train, dl_test = loaders['train'], loaders['test']

    # Pull a single batch from the test loader to check shapes and labels.
    test_features, test_labels = next(iter(dl_test))
    print(test_features.shape, test_labels.shape, test_labels)
	# train_loader, valid_loader, speaker_num = get_dataloader(data_dir, batch_size, n_workers)
	# train_iterator = iter(train_loader)
	# print(f"[Info]: Finish loading data!",flush = True)

	# model = Classifier(n_spks=speaker_num).to(device)
	# criterion = nn.CrossEntropyLoss()
	# optimizer = AdamW(model.parameters(), lr=1e-3)
	# scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
	# print(f"[Info]: Finish creating model!",flush = True)

	# best_accuracy = -1.0
	# best_state_dict = None

	# pbar = tqdm(total=valid_steps, ncols=0, desc="Train", unit=" step")

	# for step in range(total_steps):
	# 	# Get data
	# 	try:
	# 		batch = next(train_iterator)
	# 	except StopIteration:
	# 		train_iterator = iter(train_loader)
	# 		batch = next(train_iterator)

	# 	loss, accuracy = model_fn(batch, model, criterion, device)
	# 	batch_loss = loss.item()
	# 	batch_accuracy = accuracy.item()

	# 	# Updata model
	# 	loss.backward()
	# 	optimizer.step()
	# 	scheduler.step()
	# 	optimizer.zero_grad()

	# 	# Log
	# 	pbar.update()
	# 	pbar.set_postfix(
	# 		loss=f"{batch_loss:.2f}",
	# 		accuracy=f"{batch_accuracy:.2f}",
	# 		step=step + 1,
	# 	)

	# 	# Do validation
	# 	if (step + 1) % valid_steps == 0:
	# 		pbar.close()

	# 		valid_accuracy = valid(valid_loader, model, criterion, device)

	# 		# keep the best model
	# 		if valid_accuracy > best_accuracy:
	# 			best_accuracy = valid_accuracy
	# 			best_state_dict = model.state_dict()

	# 		pbar = tqdm(total=valid_steps, ncols=0, desc="Train", unit=" step")

	# 	# Save the best model so far.
	# 	if (step + 1) % save_steps == 0 and best_state_dict is not None:
	# 		torch.save(best_state_dict, save_path)
	# 		pbar.write(f"Step {step + 1}, best model saved. (accuracy={best_accuracy:.4f})")

	# pbar.close()

# Entry point: call main() with the default configuration from parse_args().
if __name__ == "__main__":
	main(**parse_args())

[Info]: Use cuda now!


In [39]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[Info]: Use {device} now!")
# Fixed sample of 32 two-class logit rows with their integer targets,
# used to check the loss computation by hand.
outs = torch.tensor([[-0.4479,  0.0522],
        [-0.4415,  0.0304],
        [-0.4452,  0.0278],
        [-0.4706,  0.0399],
        [-0.4554,  0.0350],
        [-0.4750,  0.0309],
        [-0.4432,  0.0390],
        [-0.4480,  0.0473],
        [-0.4636,  0.0588],
        [-0.4737,  0.0599],
        [-0.4384,  0.0442],
        [-0.4327,  0.0208],
        [-0.4271,  0.0200],
        [-0.4546,  0.0381],
        [-0.3925, -0.0133],
        [-0.3263, -0.0047],
        [-0.4414,  0.0381],
        [-0.4638,  0.0415],
        [-0.4415,  0.0392],
        [-0.4728,  0.0528],
        [-0.4769,  0.0532],
        [-0.4473,  0.0420],
        [-0.4592,  0.0274],
        [-0.4665,  0.0579],
        [-0.4809,  0.0543],
        [-0.4532,  0.0338],
        [-0.4379,  0.0169],
        [-0.4772,  0.0332],
        [-0.4451,  0.0540],
        [-0.3530,  0.0109],
        [-0.4492,  0.0408],
        [-0.4688,  0.0467]], dtype=torch.float)
targets = torch.tensor([1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
        0, 1, 1, 1, 1, 0, 1, 0], dtype=torch.long)
# Tensor.to() returns a new tensor rather than moving in place, so the result must be
# rebound; without the assignment both tensors silently stay on the CPU.
outs = outs.to(device)
targets = targets.to(device)
print(outs.dtype, targets.dtype)

[Info]: Use cuda now!
torch.float32 torch.int64


In [40]:
import torch.nn as nn
# Cross-entropy of the sample logits (`outs`) against the integer class targets.
criterion = nn.CrossEntropyLoss()
criterion(outs, targets)

tensor(0.6426)

In [9]:
# Pack 1023 (0x03FF) as a signed 16-bit integer in big-endian, native and little-endian order.
# Native equals little-endian on this machine (sys.byteorder printed "little" in the first cell).
print(struct.pack('>h', 1023)) # b'\x03\xff'
print(struct.pack('=h', 1023)) # native, b'\xff\x03'
print(struct.pack('<h', 1023)) # little, b'\xff\x03'

b'\x03\xff'
b'\xff\x03'
b'\xff\x03'


In [10]:
# Size in bytes of the struct format '<c' (a single char).
struct.calcsize('<c')

1

In [8]:
# Decode the first two bytes as a little-endian signed 16-bit integer; unpack always returns a tuple.
# NOTE(review): `data` must be a bytes-like object here (the bytes prefix taken from `bdata` in the
# file-reading cell), not the ndarray window table of the same name — confirm the execution order.
a = struct.unpack('<h', data[:2])
print(a, type(a))

(4627,) <class 'tuple'>


In [9]:
# Read two bytes as little-endian and write the value back as big-endian, i.e. swap the byte pair.
struct.pack('>h', struct.unpack('<h', data[:2])[0])

b'\x12\x13'

In [10]:
# Number of bytes held in `bdata`.
len(bdata)

24737113

In [11]:
# Show the first eight bytes as consecutive (even offset, odd offset) pairs.
head = bdata[:8]
for i in range(0, len(head), 2):
    print(head[i], head[i + 1])

19 18
106 125
94 67
40 126


In [12]:
# First eight bytes, then only the bytes at even offsets among them.
print(bdata[:8], bdata[:8:2])

b'\x13\x12j}^C(~' b'\x13j^('


In [13]:
# Swap each adjacent byte pair among the first eight bytes, printing the pairs as we go.
bd2 = []
for first, second in zip(bdata[0:8:2], bdata[1:8:2]):
    print(first, second)
    bd2 += [second, first]

19 18
106 125
94 67
40 126


In [14]:
# The swapped pairs as a list of ints, and the same values converted back to a bytes object.
print(bd2, bytes(bd2), type(bytes(bd2)))

[18, 19, 125, 106, 67, 94, 126, 40] b'\x12\x13}jC^~(' <class 'bytes'>


In [15]:
# https://blog.finxter.com/python-read-binary-file/

In [33]:
# Swap every adjacent byte pair of the whole buffer. Extended-slice assignment on a bytearray
# does this in bulk instead of looping over each byte in Python.
# If the buffer has an odd length, the trailing byte has no partner and is kept unchanged.
_paired_len = len(bdata) - (len(bdata) % 2)
_swapped = bytearray(bdata)
_swapped[0:_paired_len:2] = bdata[1:_paired_len:2]
_swapped[1:_paired_len:2] = bdata[0:_paired_len:2]
# Keep bd3 a list of ints, as the following cells display and convert it that way.
bd3 = list(_swapped)

In [34]:
# The swapped sequence must be exactly as long as the input buffer.
len(bd3), len(bdata)

(24737113, 24737113)

In [36]:
# Compare the tails: the last full pair is swapped and the final unpaired byte is carried over as is.
bd3[-3:], bdata[-3:]

([131, 97, 126], b'a\x83~')

In [38]:
# Persist the byte-swapped buffer; write_bytes opens the file in binary mode, writes and closes it.
Path("qmdl_1_swapped.qmdl").write_bytes(bytes(bd3))

In [14]:
# Load the byte-swapped file back; only the read needs the file handle.
with open("qmdl_1_swapped.qmdl", "rb") as f:
    bdata = f.read()

# Keep a 10-byte prefix in `data` and show it as a bytes repr, with its type, and as hex.
data = bdata[:10]
print(f"{bdata[:10]}")
print(f"{data}", type(data))
print(f"{bdata[:10].hex()}")

b'\x12\x13}jC^~(E\x9d'
b'\x12\x13}jC^~(E\x9d' <class 'bytes'>
12137d6a435e7e28459d
