In [1]:
import torch
import torch.nn as nn
from tqdm import tqdm
import torch.nn.functional as F  
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import Transformer
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


device: cpu


In [2]:
# Load data
import pickle
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle can run arbitrary code while loading: only point this at trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Sequences and their labels are stored in two parallel pickle files.
data = _read_pickle("./data/WN18RR_torch/with_5_negative_samples_data.pickle")
labels = _read_pickle("./data/WN18RR_torch/with_5_negative_samples_labels.pickle")

In [3]:
SPECIAL_TOKENS = ['[CLS]', '[SEP1]', '[SEP2]', '[END]']
# Special tokens get fixed ids; numeric ids are appended by create_vocabulary.
str_to_token_int_map = {'[CLS]':100, '[SEP1]':200, '[SEP2]':300, '[END]':400}

def create_vocabulary(data_file):
    """Extend the shared token map with every numeric id found in ``data_file``.

    Each non-blank line is split on single spaces and must carry integer ids
    at field positions 1, 3 and 5 (subject, relation, object). Every distinct
    id is assigned a token id starting at 1000, in the iteration order of the
    collected set; subjects, relations and objects share one id space.

    Args:
        data_file: Path of the text file to scan.

    Returns:
        The module-level ``str_to_token_int_map`` dict, updated in place
        (keys are the ids as strings, values the assigned token ids).

    Raises:
        IndexError: If a non-blank line has fewer than six fields.
        ValueError: If one of the three id fields is not an integer.
    """
    all_ids = set()
    with open(data_file, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no triple; skipping them avoids an IndexError below.
                continue
            fields = stripped.split(" ")
            # `tail` instead of `object` so the builtin is not shadowed.
            head, relation, tail = fields[1], fields[3], fields[5]
            all_ids.update((int(head), int(relation), int(tail)))

    # Updated in place on purpose: tokenize_triple's default mapping argument
    # is this very dict object, so it sees the new entries without rebinding.
    str_to_token_int_map.update(
        {str(token): index + 1000 for index, token in enumerate(all_ids)}
    )
    return str_to_token_int_map

def tokenize_triple(triple_str, str_to_token_int_map=str_to_token_int_map):
    """Convert one space-separated triple string into a list of integer token ids.

    Surrounding whitespace is stripped, the string is split on single spaces,
    and every resulting piece is looked up in ``str_to_token_int_map``.
    A piece that is missing from the mapping raises ``KeyError``.
    """
    token_ids = []
    for piece in triple_str.strip().split(" "):
        token_ids.append(str_to_token_int_map[piece])
    return token_ids

# Example Usage
# Build the vocabulary from the training sequences. create_vocabulary updates
# the module-level str_to_token_int_map in place and returns that same dict, so
# this call must run before tokenize_triple is used with its default mapping.
data_file = './data/WN18RR_torch/sequences/train_seq.txt'
index_map = create_vocabulary(data_file)

# Tokenize one sample sequence and show the resulting ids as a sanity check.
triple_str = '[CLS] 13266892 [SEP1] 1 [SEP2] 8107499 [END]'
tokenized_triple = tokenize_triple(triple_str, index_map)
print(tokenized_triple)

[100, 9917, 200, 1001, 300, 35799, 400]


In [4]:
def pad_batch(batch, max_length, pad_token_id=0):
    """Right-pad every sequence tensor in ``batch`` and stack the results.

    Each sequence receives ``max_length - len(sequence)`` trailing elements
    equal to ``pad_token_id``; the padded sequences are then stacked along a
    new first dimension and returned as one tensor.
    """
    right_padded = [
        F.pad(seq, (0, max_length - len(seq)), value=pad_token_id)
        for seq in batch
    ]
    return torch.stack(right_padded)

def create_src_mask(padded_batch, pad_token_id=0):
    """Return an int32 mask with 1 where ``padded_batch`` equals ``pad_token_id``, else 0.

    NOTE(review): PyTorch attention modules document bool or float key-padding
    masks; this int mask probably needs ``.bool()`` before being passed to one
    (confirm against the installed PyTorch version).
    """
    is_padding = torch.eq(padded_batch, pad_token_id)
    return is_padding.to(torch.int32)

def collate_fn(batch):  # Custom collate function for DataLoader
    """Turn a list of ``(input_ids, target)`` examples into batch tensors.

    Args:
        batch: Sequence of ``(input_ids, target)`` pairs, one per example.

    Returns:
        A tuple ``(padded_inputs, targets, src_mask)``:
          * ``padded_inputs``: the inputs right-padded with 0 to the longest
            input of the batch and stacked into one tensor;
          * ``targets``: the per-example targets stacked along a new first
            dimension into one tensor;
          * ``src_mask``: the int mask from ``create_src_mask`` holding 1 at
            padded positions.
    """
    inputs, targets = zip(*batch)  # Unpack example pairs
    longest = max(len(seq) for seq in inputs)
    padded_inputs = pad_batch(inputs, longest)
    src_mask = create_src_mask(padded_inputs)
    # zip() leaves the targets as a plain tuple, which has no `.to(device)` and
    # cannot be fed to a loss function. Stack them into one tensor instead;
    # as_tensor also accepts plain Python numbers as targets.
    stacked_targets = torch.stack([torch.as_tensor(target) for target in targets])
    return padded_inputs, stacked_targets, src_mask

# Tokenize every sequence string into one int32 id tensor on the target device.
tokenized_data = torch.tensor(
    [tokenize_triple(triple) for triple in data],
    dtype=torch.int32,
).to(device)
# Labels become a column tensor with one row per example.
labels = torch.tensor(labels).reshape(len(labels), 1).to(device)
# Pair ids with labels and serve them in shuffled mini-batches.
dataset = TensorDataset(tokenized_data, labels)
dataloader = DataLoader(
    dataset,
    batch_size=512,
    shuffle=True,
    collate_fn=collate_fn,
)


In [5]:
# Number of mini-batches the loader yields per pass over the dataset.
len(dataloader)

1018

In [6]:
# class TransformerEncoder(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, num_heads, 
#                  num_encoder_layers, feedforward_dim, dropout=0.1):
#         super().__init__()

#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
#         self.encoder_layers = nn.ModuleList([
#             TransformerEncoderLayer(embedding_dim, num_heads, feedforward_dim, dropout) 
#             for _ in range(num_encoder_layers)
#         ])
#         self.output_layer = nn.Linear(embedding_dim, 1)  # For score output

#     def forward(self, src_tokens, src_mask=None): 
#         embedded = self.embedding(src_tokens) 

#         for layer in self.encoder_layers:
#             embedded = layer(embedded,src_mask) 

#         #score prediction
#             print(self.output_layer)
#         score = self.output_layer(embedded[:, 0, :]) 
#         return score.squeeze()  # Output a single score

# class TransformerEncoderLayer(nn.Module):
#     def __init__(self, embedding_dim, num_heads, feedforward_dim, dropout):
#         super().__init__()
#         self.self_attn = nn.MultiheadAttention(embedding_dim, num_heads, dropout)
#         self.ffn = nn.Sequential(
#             nn.Linear(embedding_dim, feedforward_dim),  
#             nn.ReLU(),
#             nn.Linear(feedforward_dim, embedding_dim)
#         )
#         # LayerNorm and dropout would typically be here, omitted for brevity 
    
#     def forward(self, src, src_mask=None):
#         # Self-attention
#         attn_output, _ = self.self_attn(src, src, src, key_padding_mask=src_mask)
#         # Feed-forward
#         ffn_output = self.ffn(attn_output) 
#         return ffn_output 

In [7]:
# Count the distinct token ids that occur anywhere in the tokenized data.
# NOTE(review): this is a count, not "largest id + 1". create_vocabulary numbers
# ids from 1000 upward, so the largest id is bigger than this count and the
# value is too small to size an nn.Embedding indexed by these ids.
vocabulary_size = torch.unique(tokenized_data).numel()
print(vocabulary_size)

40574


In [8]:
# vocabulary_size = len(set(tokenized_data.flatten().tolist()))
# embedding_dim = 16
# num_heads = 8
# num_encoder_layers = 6
# feedforward_dim = 1024
# dropout=0.1


# model = TransformerEncoder(vocabulary_size,embedding_dim,num_heads,num_encoder_layers,feedforward_dim,dropout).to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# loss_fn = nn.MSELoss()

# for epoch in range(10):
#     for i,(input,target_scores,src_mask) in enumerate(dataloader):
#        print(i)
#        optimizer.zero_grad()
#        outputs = model(input,src_mask)  
#        loss = loss_fn(outputs, target_scores)  
#        loss.backward()
#        optimizer.step()

In [16]:

class SequenceValidityModel(nn.Module):
    """Binary sequence classifier: a Transformer encoder stack plus a linear head.

    Only an encoder is built. Classification needs no target sequence, and
    ``nn.Transformer.forward`` cannot be called with ``src`` alone.

    Args:
        d_model: Size of the feature vector at every sequence position.
        nhead: Number of attention heads (``d_model`` must be divisible by it).
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward network.
    """

    def __init__(self, d_model, nhead, num_layers, dim_feedforward):
        super().__init__()
        # Keyword arguments on purpose. Passed positionally to nn.Transformer,
        # the fourth value lands on num_decoder_layers, so dim_feedforward was
        # silently treated as a layer count and the feed-forward width stayed
        # at its default.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.linear = nn.Linear(d_model, 2)  # Binary classification

    def forward(self, src):
        """Score each sequence in the batch.

        Args:
            src: Float tensor laid out as ``(seq_len, batch, d_model)``.
                Integer token ids must be embedded before they reach this
                module; it contains no embedding layer.

        Returns:
            Tensor of shape ``(batch, 2)`` with one logit per class.
        """
        encoded = self.transformer(src)
        # Classify from the representation of the last sequence position.
        # NOTE(review): with padded batches that position may be padding.
        return self.linear(encoded[-1, :, :])

# Initialize the model and move it to the device that already holds the data
# tensors; leaving it on the CPU fails as soon as `device` is 'cuda'.
model = SequenceValidityModel(d_model=16, nhead=4, num_layers=2, dim_feedforward=128).to(device)

# Define a loss function
# NOTE(review): nn.CrossEntropyLoss takes class-index targets shaped (batch,);
# the labels above are reshaped to (N, 1) and likely need squeezing first.
loss_fn = nn.CrossEntropyLoss()

# Define an optimizer (created after the move so it tracks the on-device parameters)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
# for epoch in tqdm(range(10)):
#     for i, (inputs, labels,src_mask) in enumerate(dataloader):
#         print(i)
#         inputs = inputs.to(device)
#         labels = labels.to(device)

#         # Forward pass
#         outputs = model(inputs)
#         loss = loss_fn(outputs, labels)
#         print(epoch,' ',loss)

#         # Backward pass and optimization
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()