### Dataset

In [1]:
from datasets import load_dataset
from huggingface_hub import login

In [2]:
login(token='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')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/TZholaman/.cache/huggingface/token
Login successful


In [3]:
train_ds = load_dataset('rokset3/keystrokes136M_normalized_features', split='train')
test_ds = load_dataset('rokset3/keystrokes136M_normalized_features', split='test')

Found cached dataset parquet (/home/TZholaman/.cache/huggingface/datasets/rokset3___parquet/rokset3--keystrokes136M_normalized_features-4ef0b7f2d56e4af1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/TZholaman/.cache/huggingface/datasets/rokset3___parquet/rokset3--keystrokes136M_normalized_features-4ef0b7f2d56e4af1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


### Creating Custom Triplet Loss Dataset Class

In [4]:
import time
import torch
import random
import datasets
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from einops import rearrange, repeat
from torch.utils.data import Dataset, DataLoader


In [5]:
sample = train_ds.select(range(0,10000))

In [6]:
sample_df = sample.to_pandas()

In [7]:
sample_df.head(1)

Unnamed: 0,participant_id,section_id,keycode_ids,hl,il,pl,rl
0,100001,1090979,"[0.06274509803921569, 0.3411764705882353, 0.25...","[0.151, 0.12, 0.144, 0.144, 0.136, 0.128, 0.2,...","[-0.151, -0.08, 0.056, -0.072, -0.032, 0.048, ...","[0.0, 0.071, 0.176, 0.072, 0.112, 0.184, 0.104...","[0.0, 0.04, 0.2, 0.072, 0.104, 0.176, 0.176, 0..."


### Lets test model & losses 

In [9]:
from classes.losses import TripletLoss
from classes.models import LSTM, BNLSTMCell
from classes.datasets import CustomTripletLossDatasetFromPandasDataFrame

def num_params(model):
    pp=0
    for p in list(model.parameters()):
        nn=1
        for s in list(p.size()):
            nn = nn*s
        pp += nn
    return pp


In [8]:
train_ds = CustomTripletLossDatasetFromPandasDataFrame(sample_df,
                                                       128)
train_dl = DataLoader(train_ds, batch_size=2048, shuffle=True)

In [23]:
test_batch = next(iter(train_dl))
anchor = model(test_batch['anchor_features'])[0][-1]
positive = model(test_batch['positive_features'])[0][-1]
negative = model(test_batch['negative_features'])[0][-1]

In [24]:
loss = TripletLoss()

In [25]:
loss(anchor, positive, negative)

tensor(1.0625, grad_fn=<MeanBackward0>)

### Lets train model on this sample data

In [26]:
model = LSTM(cell_class=BNLSTMCell,
             input_size=5,
             hidden_size=128,
             num_layers=2,
             use_bias=True,
             batch_first=True,
             dropout=0.2,
             max_length=128)


epochs = 10
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
criterion = TripletLoss()
model.train()
device = 'cuda:0'
model = model.cuda()


for epoch in tqdm(range(epochs), desc="Epochs"):
    running_loss = []
    for step, batch in enumerate(tqdm(train_dl, desc="Training", leave=False)):
        anchor_img = batch['anchor_features'].to(device)
        positive_img = batch['positive_features'].to(device)
        negative_img = batch['negative_features'].to(device)
        
        optimizer.zero_grad()
        anchor_out = model(anchor_img)[0][:-1]
        positive_out = model(positive_img)[0][:-1]
        negative_out = model(negative_img)[0][:-1]
        
        loss = criterion(anchor_out, positive_out, negative_out)
        loss.backward()
        optimizer.step()
        
        running_loss.append(loss.cpu().detach().numpy())
    print("Epoch: {}/{} - Loss: {:.4f}".format(epoch+1, epochs, np.mean(running_loss)))

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Training:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 1/10 - Loss: 1.0401


Training:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 2/10 - Loss: 1.0296


Training:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 3/10 - Loss: 1.0025


Training:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 4/10 - Loss: 1.0090


Training:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 5/10 - Loss: 0.9694


Training:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 6/10 - Loss: 0.9404


Training:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 7/10 - Loss: 0.8981


Training:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 8/10 - Loss: 0.9566


Training:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 9/10 - Loss: 1.0747


Training:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 10/10 - Loss: 0.8056


### Lets test trainer

In [9]:
from classes.trainers import TripletLossTrainer

In [10]:
model = LSTM(cell_class=BNLSTMCell,
             input_size=5,
             hidden_size=128,
             num_layers=2,
             use_bias=True,
             batch_first=True,
             dropout=0.2,
             max_length=128)


epochs = 10
device = 'cuda:0'
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
train_dl = train_dl
val_dl = train_dl
config = dict(
    batch_size = 2048,
    log_to = 'wandb',
    project_name = 'GoNet',
    entity = 'zholamantemirlan',
    experiment_name = 'test_run',
    model_name = 'GoNet_128_2',
    save_dir = 'runs',
    epochs = 10
)


In [11]:
trainer = TripletLossTrainer(model,
                             optimizer,
                             train_dl,
                             val_dl,
                             device,
                             config)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mzholamantemirlan[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
trainer.train()

### Custom Bucketing collate fn

In [1]:
from datasets import load_dataset
from huggingface_hub import login
login(token='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')
train_ds = load_dataset('rokset3/keystrokes136M_normalized_features', split='train')
test_ds = load_dataset('rokset3/keystrokes136M_normalized_features', split='test')
sample = train_ds.select(range(0,10000))
sample_df = sample.to_pandas()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/TZholaman/.cache/huggingface/token
Login successful


Found cached dataset parquet (/home/TZholaman/.cache/huggingface/datasets/rokset3___parquet/rokset3--keystrokes136M_normalized_features-4ef0b7f2d56e4af1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/TZholaman/.cache/huggingface/datasets/rokset3___parquet/rokset3--keystrokes136M_normalized_features-4ef0b7f2d56e4af1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [2]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence, pad_packed_sequence

In [33]:
def custom_collate_fn(data, n_features=5):
    '''
    data = List[dict]
    '''
    (_,
     _,
     _,
     anchor_len,
     positive_len,
     negative_len) = zip(*data)
    
    anchor_max_len = max(anchor_len)
    positive_max_len = max(positive_len)
    negative_max_len = max(negative_len)
    
    n_features = n_features
    
    anchor_features = torch.zeros((len(data), n_features, anchor_max_len))
    positive_features = torch.zeros((len(data), n_features, positive_max_len))
    negative_features = torch.zeros((len(data),  n_features, negative_max_len))
    
    anchor_pos, positive_pos, negative_pos = 0, 1, 2
    #for i in range(len(data)):
    #    j, k = data[i][anchor_pos].size(0), data[i][anchor_pos].size(1)
    #    anchor_features[i] = torch.cat([data[i][achor_pos], torch.zeros((anchor_max_len - j, k))])
     
    anchor_features = _reshape_features(data, anchor_pos, anchor_features, anchor_max_len)
    positive_features = _reshape_features(data, positive_pos, positive_features, positive_max_len)
    negative_features = _reshape_features(data, negative_pos, negative_features, negative_max_len)
    
    
    anchor_len = torch.tensor(anchor_len)
    positive_len = torch.tensor(positive_len)
    negative_len = torch.tensor(negative_len)
    
    return (
        anchor_features.float(),
        positive_features.float(),
        negative_features.float(),
        anchor_len.long(),
        positive_len.long(),
        negative_len.long(),
    )

def _reshape_features(data, pos, features, max_len):
    for i in range(len(data)):
        j, k = data[i][pos].size(0), data[i][pos].size(1)
        features[i] = torch.cat([data[i][pos], torch.zeros((j, max_len - k))], dim=-1)
    
    return features
    
    
    
    

In [34]:
import numpy as np
from torch.utils.data import DataLoader
from classes.datasets import BetterTripletLossDatasetFromPandasDataFrame
import torch


In [35]:
ds = BetterTripletLossDatasetFromPandasDataFrame(sample_df, max_length=128)

INFO: Pandarallel will run on 52 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [36]:
ds[0][0].shape

torch.Size([5, 49])

In [37]:
dl = DataLoader(ds, batch_size=64, collate_fn=custom_collate_fn)

In [40]:
a, p, n, _, _, _ = next(iter(dl))

In [45]:
a.shape

torch.Size([64, 5, 76])

In [39]:
import numpy as np
np.concatenate(sample_df.iloc[10][['hl', 'rl']].values, axis=1)

AxisError: axis 1 is out of bounds for array of dimension 1

In [25]:
arr = np.array([[1, 2],
               [3, 4]])

In [27]:
arr.dtype

dtype('int64')

In [28]:
arr.astype(np.float32)

array([[1., 2.],
       [3., 4.]], dtype=float32)

In [29]:
arr

array([[1, 2],
       [3, 4]])