In [None]:
# Mount Google Drive at /content/drive so later cells can reach files stored there.
# NOTE(review): this only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the input-data folder: later cells read the
# dataset files and write checkpoints using bare relative file names.
# NOTE(review): hard-coded absolute Drive path — adjust when running elsewhere.
import os
os.chdir("/content/drive/MyDrive/00.new_Notebook_test/input_data")

In [None]:
# List the working directory to confirm the dataset files are visible.
!ls

In [None]:
# Install third-party dependencies into the runtime.
# NOTE(review): versions are unpinned here, and torchtext is not imported by any
# cell visible in this notebook — confirm it is actually needed.
!pip install torchtext transformers pandas

In [None]:
# Re-install torchtext at a pinned version (overrides the unpinned install above).
# NOTE(review): presumably this may also change the installed torch version to
# satisfy torchtext's requirements — verify before relying on it.
!pip install torchtext==2.0.0

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import numpy as np
import pandas as pd
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:
# Pick the compute device: CUDA when a GPU is visible to torch, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Positional Encoding

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to batch-first embeddings.

    Args:
        embed_size: Size of the embedding (last) dimension.
        max_len: Number of positions to precompute; longer inputs are rejected.

    The table is registered as the buffer ``pos_encoding`` with shape
    ``(max_len, 1, embed_size)``. That layout is kept unchanged so state dicts
    saved with the earlier version of this class still load.
    """

    def __init__(self, embed_size, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pos_encoding = torch.zeros(max_len, embed_size)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * (-math.log(10000.0) / embed_size))
        angles = position * div_term
        pos_encoding[:, 0::2] = torch.sin(angles)
        # With an odd embed_size there is one fewer odd column than even columns,
        # so trim the cosine block to the number of odd columns.
        pos_encoding[:, 1::2] = torch.cos(angles)[:, : embed_size // 2]
        self.register_buffer('pos_encoding', pos_encoding.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encoding of each sequence position.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_size)``, the layout every
                caller in this notebook uses.

        Raises:
            ValueError: If ``seq_len`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pos_encoding.size(0):
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional table size {self.pos_encoding.size(0)}"
            )
        # Index by sequence position (dim 1), not by batch index (dim 0):
        # (seq_len, 1, E) -> (1, seq_len, E) broadcasts over the batch.
        return x + self.pos_encoding[:seq_len].transpose(0, 1)

In [None]:
# Scaled Dot-Product Attention

def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute softmax(Q K^T / sqrt(d_k)) V.

    Args:
        query, key, value: Tensors whose last two dimensions are
            (length, feature); leading dimensions must broadcast under matmul.
        mask: Optional tensor broadcastable to the score matrix. Positions where
            it equals 0 are filled with -1e9 before the softmax.

    Returns:
        Tuple ``(weighted_values, attention_weights)``.
    """
    scale = math.sqrt(query.size(-1))
    logits = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        logits = logits.masked_fill(mask == 0, -1e9)
    weights = F.softmax(logits, dim=-1)
    weighted_values = torch.matmul(weights, value)
    return weighted_values, weights

In [None]:
# Multi-Head Attention

class MultiHeadAttention(nn.Module):
    """Multi-head attention over batch-first inputs.

    Args:
        embed_size: Feature size of the inputs and the output.
        heads: Number of attention heads; must divide ``embed_size`` evenly.
    """

    def __init__(self, embed_size, heads):
        super().__init__()
        assert embed_size % heads == 0
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        # Input projections, followed by the output projection.
        self.query = nn.Linear(embed_size, embed_size)
        self.key = nn.Linear(embed_size, embed_size)
        self.value = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def _split_heads(self, projected, batch, length):
        """Reshape (batch, length, embed) into (batch, heads, length, head_dim)."""
        return projected.view(batch, length, self.heads, self.head_dim).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Inputs are (batch, length, embed_size); the result has the shape of
        ``query``. ``mask`` is passed through to the attention function.
        """
        batch = query.size(0)
        q_len = query.size(1)

        q_heads = self._split_heads(self.query(query), batch, q_len)
        k_heads = self._split_heads(self.key(key), batch, key.size(1))
        v_heads = self._split_heads(self.value(value), batch, value.size(1))

        context, _ = scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)

        # Bring heads back next to the feature dim and merge them.
        merged = context.transpose(1, 2).contiguous().view(batch, q_len, self.embed_size)
        return self.fc_out(merged)

In [None]:
# Feed-Forward Network

class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Linear.

    Args:
        embed_size: Input and output feature size.
        expansion: Multiplier giving the hidden layer width.
    """

    def __init__(self, embed_size, expansion=4):
        super().__init__()
        hidden_size = expansion * embed_size
        self.fc1 = nn.Linear(embed_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, embed_size)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        hidden = self.fc1(x)
        activated = F.relu(hidden)
        return self.fc2(activated)

In [None]:
# Transformer Block

class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer output is added to its input, layer-normalised, and then
    passed through dropout.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        self.attention = MultiHeadAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = FeedForward(embed_size, forward_expansion)
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Run the block; note the (value, key, query) argument order."""
        attended = self.attention(query, key, value, mask)
        # Residual connection around attention uses the query as the skip path.
        normed_attention = self.norm1(attended + query)
        x = self.dropout(normed_attention)

        transformed = self.feed_forward(x)
        normed_output = self.norm2(transformed + x)
        return self.dropout(normed_output)

In [None]:
# Encoder

class Encoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of TransformerBlocks."""

    def __init__(self, src_vocab_size, embed_size, num_layers, heads, device, forward_expansion, dropout, max_len):
        super().__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = PositionalEncoding(embed_size, max_len)

        blocks = [
            TransformerBlock(embed_size, heads, dropout, forward_expansion)
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(blocks)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Encode token ids ``x`` using ``mask`` in every self-attention layer."""
        embedded = self.word_embedding(x)
        hidden = self.dropout(self.position_embedding(embedded))

        # Self-attention: the same tensor serves as value, key and query.
        for block in self.layers:
            hidden = block(hidden, hidden, hidden, mask)

        return hidden

In [None]:
# Decoder

class DecoderBlock(nn.Module):
    """Masked self-attention followed by a TransformerBlock attending to encoder output."""

    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super().__init__()
        self.attention = MultiHeadAttention(embed_size, heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(embed_size, heads, dropout, forward_expansion)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        """Decode ``x`` against ``value``/``key``.

        ``trg_mask`` is applied to the self-attention over ``x``; ``src_mask`` is
        applied inside the TransformerBlock that attends to ``value``/``key``.
        """
        self_attended = self.attention(x, x, x, trg_mask)
        normed = self.norm(self_attended + x)
        query = self.dropout(normed)
        return self.transformer_block(value, key, query, src_mask)

class Decoder(nn.Module):
    """Token embedding + positional encoding, a stack of DecoderBlocks, and a vocabulary projection."""

    def __init__(self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_len):
        super().__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = PositionalEncoding(embed_size, max_len)

        blocks = [
            DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(blocks)
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        """Return per-position vocabulary scores for target token ids ``x``."""
        embedded = self.word_embedding(x)
        hidden = self.dropout(self.position_embedding(embedded))

        # The encoder output is used as both value and key in every block.
        for block in self.layers:
            hidden = block(hidden, enc_out, enc_out, src_mask, trg_mask)

        return self.fc_out(hidden)

In [None]:
# Full Transformer Model

class Transformer(nn.Module):
    """Encoder-decoder Transformer over batch-first token-id tensors.

    Args:
        src_vocab_size, trg_vocab_size: Vocabulary sizes for the two embeddings.
        src_pad_idx: Source token id treated as padding when building the source mask.
        trg_pad_idx: Stored on the instance; not used by the masks built here.
        embed_size, num_layers, forward_expansion, heads, dropout: Model hyperparameters
            forwarded to the encoder and decoder.
        device: Stored on the instance and forwarded to the sub-modules. Masks no
            longer depend on it: they are created on the device of the input tensor.
        max_len: Size of the positional-encoding tables.
    """

    def __init__(self, src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, embed_size=512, num_layers=6,
                 forward_expansion=4, heads=8, dropout=0.1, device="cuda", max_len=100):
        super(Transformer, self).__init__()

        self.encoder = Encoder(src_vocab_size, embed_size, num_layers, heads, device, forward_expansion, dropout, max_len)
        self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_len)
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Return a (N, 1, 1, src_len) boolean mask that is False at padding tokens."""
        # The comparison result already lives on src's device, so no transfer is
        # needed (moving it to self.device broke CPU runs with the "cuda" default).
        return (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg):
        """Return a (N, 1, trg_len, trg_len) lower-triangular (causal) mask."""
        N, trg_len = trg.shape
        # Allocate directly on the input's device instead of CPU-then-copy.
        causal = torch.tril(torch.ones((trg_len, trg_len), device=trg.device))
        return causal.expand(N, 1, trg_len, trg_len)

    def forward(self, src, trg):
        """Return decoder scores of shape (N, trg_len, trg_vocab_size)."""
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out

# Data Preparation

In [None]:
# Load the JSON-lines dataset files (one JSON object per line) into DataFrames.
def _read_json_lines(path):
    """Read a JSON-lines file into a DataFrame."""
    return pd.read_json(path, lines=True)

train_data = _read_json_lines('attribute_train.data')
train_solution = _read_json_lines('attribute_train.solution')
val_data = _read_json_lines('attribute_val.data')
val_solution = _read_json_lines('attribute_val.solution')
test_data = _read_json_lines('attribute_test.data')

In [None]:
# Join each feature table with its labels on the shared 'indoml_id' key.
train_merged = train_data.merge(train_solution, on='indoml_id')
val_merged = val_data.merge(val_solution, on='indoml_id')

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer. It is used for both the
# source titles and the target labels, so both sides share one vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Fixed sequence length and the special-token ids reused across the notebook.
max_len = 100
pad_idx, bos_idx, eos_idx = (
    tokenizer.pad_token_id,
    tokenizer.cls_token_id,  # [CLS] doubles as beginning-of-sequence
    tokenizer.sep_token_id,  # [SEP] doubles as end-of-sequence
)

def tokenize_text(text):
    """Encode ``text`` to token ids, truncated and padded to ``max_len``."""
    return tokenizer.encode(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_len,
    )

In [None]:
# Add token-id columns to both splits: titles are the source sequence and the
# stringified 'L0_category' value is the target sequence.
for split_frame in (train_merged, val_merged):
    split_frame['src_tokens'] = split_frame['title'].apply(tokenize_text)
    split_frame['trg_tokens'] = split_frame['L0_category'].apply(lambda x: tokenize_text(str(x)))

In [None]:
# Dataset and DataLoader Creation

from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Pairs of source/target token-id sequences served as long tensors.

    Args:
        src_tokens: Indexable collection of source token-id sequences.
        trg_tokens: Indexable collection of target token-id sequences.
    """

    def __init__(self, src_tokens, trg_tokens):
        self.src_tokens, self.trg_tokens = src_tokens, trg_tokens

    def __len__(self):
        """Number of examples, taken from the source collection."""
        return len(self.src_tokens)

    def __getitem__(self, idx):
        """Return ``(src, trg)`` for example ``idx`` as ``torch.long`` tensors."""
        return tuple(
            torch.tensor(sequences[idx], dtype=torch.long)
            for sequences in (self.src_tokens, self.trg_tokens)
        )

In [None]:
batch_size = 32

def _dataset_from_frame(frame):
    """Wrap a frame's 'src_tokens'/'trg_tokens' columns in a CustomDataset."""
    return CustomDataset(frame['src_tokens'].tolist(), frame['trg_tokens'].tolist())

# Datasets for the two labelled splits.
train_dataset = _dataset_from_frame(train_merged)
val_dataset = _dataset_from_frame(val_merged)

# Only the training loader shuffles; validation keeps the original row order.
train_iterator = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_iterator = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

In [None]:
# Training Loop

# Hyperparameters
# Source and target share the tokenizer, hence the same vocabulary and pad id.
src_vocab_size = trg_vocab_size = tokenizer.vocab_size
src_pad_idx = trg_pad_idx = pad_idx

# Model shape.
embed_size, num_layers, heads = 512, 3, 8
dropout = 0.1
forward_expansion = 4

# Optimisation settings.
learning_rate, num_epochs, batch_size = 3e-4, 10, 32

In [None]:
# Initialize model, optimizer, and loss function.
# The model is built from the hyperparameter variables defined above rather than
# from repeated literals, so editing the hyperparameter cell actually takes effect.
model = Transformer(
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    src_pad_idx=src_pad_idx,
    trg_pad_idx=trg_pad_idx,
    embed_size=embed_size,
    num_layers=num_layers,
    forward_expansion=forward_expansion,
    heads=heads,
    dropout=dropout,
    device=device,
    max_len=max_len,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Padding positions in the target contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

In [None]:
# Batch the CustomDataset built earlier. Feeding the raw nested Python lists to
# DataLoader makes the default collate return lists of per-position tensors
# instead of (batch, seq_len) tensors, which the training loop cannot use.
train_iterator = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Sanity check: the first few items should be (src, trg) pairs of tensors.
dataset = CustomDataset(
    *(train_merged[column].tolist() for column in ('src_tokens', 'trg_tokens'))
)
for i in range(5):
    src, trg = dataset[i]
    print(*map(type, (src, trg)))  # Should be torch.Tensor
    print(*(tensor.shape for tensor in (src, trg)))

In [None]:
# Rebuild the training loader and inspect the first batch only.
train_iterator = DataLoader(dataset, shuffle=True, batch_size=batch_size)
for i, (src, trg) in enumerate(train_iterator):
    print(*map(type, (src, trg)))
    print(*(tensor.shape for tensor in (src, trg)))
    break


In [None]:
def _teacher_forced_loss(src, trg):
    """Loss for one batch: the decoder sees trg[:, :-1] and must predict trg[:, 1:]."""
    src = src.to(device)
    trg = trg.to(device)
    output = model(src, trg[:, :-1])
    # Flatten (batch, seq, vocab) -> (batch * seq, vocab) for CrossEntropyLoss.
    return criterion(output.reshape(-1, output.shape[2]), trg[:, 1:].reshape(-1))


for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for i, (src, trg) in enumerate(train_iterator):
        optimizer.zero_grad()
        loss = _teacher_forced_loss(src, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        epoch_loss += loss.item()

        # Save checkpoint
        if i % 1000 == 0:  # Adjust as needed
            torch.save(model.state_dict(), f"transformer_model_epoch{epoch}_step{i}.pth")
            print(f"Checkpoint saved at epoch {epoch}, step {i}")

    # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / max(len(train_iterator), 1):.4f}')

    # Evaluate on the held-out split, which was batched into val_iterator but
    # never consumed before; without it overfitting goes unnoticed.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for val_src, val_trg in val_iterator:
            val_loss += _teacher_forced_loss(val_src, val_trg).item()
    print(f'Epoch {epoch + 1}/{num_epochs}, Val Loss: {val_loss / max(len(val_iterator), 1):.4f}')

In [None]:
# Save the final weights (state_dict only) to the current working directory;
# the inference section below reloads this same file name.
model_save_path = "transformer_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

# Testing

import torch

In [None]:
import json
# Re-read the test file as a list of dicts, one per JSON line.
# Note: this rebinds `test_data`, which earlier held a DataFrame.
with open('attribute_test.data', 'r') as f:
    test_data = list(map(json.loads, f))

In [None]:
test_titles = [item['title'] for item in test_data]
# Truncate to the training-time max_len so no title exceeds the length the model
# was trained on. Each entry is a (1, length) tensor.
test_src_tokens = [
    tokenizer.encode(title, max_length=max_len, truncation=True, return_tensors='pt')
    for title in test_titles
]

# Use a separate name for the longest test sequence. Rebinding the global
# `max_len` here changed the size of the model rebuilt below, so its positional
# buffer no longer matched the saved checkpoint.
test_max_len = max((len(tokens[0]) for tokens in test_src_tokens), default=0)
# Pad with the tokenizer's pad id (not a hard-coded 0) so the source mask hides padding.
test_src_tokens_padded = [
    torch.cat([tokens[0], torch.full((test_max_len - len(tokens[0]),), pad_idx, dtype=torch.long)])
    for tokens in test_src_tokens
]

In [None]:
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
checkpoint = torch.load("transformer_model.pth", map_location=device)

# The positional table is saved in the state dict, so the rebuilt model must use
# the same table size as the trained one or load_state_dict raises a size mismatch.
trained_max_len = checkpoint["encoder.position_embedding.pos_encoding"].size(0)

# Rebuild with the same hyperparameter variables used for training.
model = Transformer(
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    src_pad_idx=src_pad_idx,
    trg_pad_idx=trg_pad_idx,
    embed_size=embed_size,
    num_layers=num_layers,
    forward_expansion=forward_expansion,
    heads=heads,
    dropout=dropout,
    device=device,
    max_len=trained_max_len,
).to(device)

model.load_state_dict(checkpoint)
model.eval()

In [None]:
# Unlabelled data: wrap the padded source tensors only. CustomDataset cannot be
# used with None targets (torch.tensor(None) raises), and the unpadded
# (1, length) tensors cannot be stacked into a batch.
test_dataset = torch.utils.data.TensorDataset(torch.stack(test_src_tokens_padded))
test_iterator = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

def generate_predictions(data_loader, model, device, start_token=None, end_token=None, max_steps=None):
    """Greedy-decode one target sequence per source sequence.

    Args:
        data_loader: Yields batches whose first element is a (batch, src_len)
            tensor of source token ids.
        model: Callable as ``model(src, trg)`` returning (batch, trg_len, vocab) scores.
        device: Device the batches are moved to.
        start_token: Id that seeds every decoded sequence (defaults to ``bos_idx``).
        end_token: Id that ends a sequence (defaults to ``eos_idx``).
        max_steps: Maximum decoder input length (defaults to ``max_len``).

    Returns:
        A list with one list of token ids per input row, without the start token
        and cut before the first end token.
    """
    start_token = bos_idx if start_token is None else start_token
    end_token = eos_idx if end_token is None else end_token
    max_steps = max_len if max_steps is None else max_steps

    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            src = batch[0].to(device)
            # Every sequence starts from the start token; one token is appended per step.
            generated = torch.full((src.size(0), 1), start_token, dtype=torch.long, device=device)
            finished = torch.zeros(src.size(0), dtype=torch.bool, device=device)

            for _ in range(max_steps - 1):
                scores = model(src, generated)
                # Only the scores at the last position predict the next token.
                next_token = scores[:, -1, :].argmax(dim=-1)
                generated = torch.cat([generated, next_token.unsqueeze(1)], dim=1)
                finished |= next_token == end_token
                if finished.all():
                    break

            for row in generated[:, 1:].cpu().tolist():
                if end_token in row:
                    row = row[:row.index(end_token)]
                predictions.append(row)

    return predictions
predictions = generate_predictions(test_iterator, model, device)

predicted_texts = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]

In [None]:
# Print the decoded text of the first five predictions as a quick visual check.
for i, pred_text in enumerate(predicted_texts[:5]):
    print(f"Sample {i}: {pred_text}")