In [123]:
import os
import string

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import yaml

In [124]:
DATA_DIR = '/kaggle/input/google-quest-challenge'

# Read the train and test CSV files into DataFrames.
print('Loading data...  ', end='')
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
print('Done!')

# Pull the three free-text columns out of each split as arrays, then slice
# the remaining label columns (selected by position) into two target tensors.
print('Sorting data...  ', end='')
train_question_titles, train_question_bodies, train_answers = (
    train_df[column].values
    for column in ('question_title', 'question_body', 'answer')
)

test_question_titles, test_question_bodies, test_answers = (
    test_df[column].values
    for column in ('question_title', 'question_body', 'answer')
)

# Columns 11..31 feed `question_targets`; column 32 onwards feeds
# `answer_targets`.
question_targets = torch.tensor(train_df.iloc[:, 11:32].values)
answer_targets = torch.tensor(train_df.iloc[:, 32:].values)
print('Done!')

Loading data...  Done!
Sorting data...  Done!


In [125]:
def clean(s):
    """Return `s` with every character listed in `string.punctuation` removed."""
    # Mapping a code point to None tells str.translate to delete it.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return s.translate(deletions)

def get_unique_words(text):
    """Return the set of distinct words in `text`.

    The text is stripped of punctuation via `clean`, lower-cased, and split
    on whitespace before the words are collected.
    """
    normalized = clean(text).lower()
    return {word for word in normalized.split()}
    
def tokenize(text, vocab):
    """Convert `text` into a 1-D LongTensor of ids looked up in `vocab`.

    The text goes through the same normalisation as `get_unique_words`
    (punctuation removed via `clean`, lower-cased, whitespace split).
    Every resulting word is looked up with `vocab[word]`, so a word that is
    missing from a plain dict raises KeyError.
    """
    token_ids = []
    for word in clean(text).lower().split():
        token_ids.append(vocab[word])
    return torch.tensor(token_ids, dtype=torch.long)
    
# Create vocabulary of all words (train and test text columns together, so
# every word passed to `tokenize` below has an id).
print('Extracting vocabulary words...  ', end='')
vocabulary_words = set()
for corpus in (
    train_question_titles, train_question_bodies, train_answers,
    test_question_titles, test_question_bodies, test_answers,
):
    for text in corpus:
        # In-place update; rebinding to `set.union(...)` would copy the whole
        # accumulated set once per document.
        vocabulary_words.update(get_unique_words(text))
print('Done!')

# Number the words in sorted order. Iterating a set of strings directly
# follows hash order, which changes between Python processes unless
# PYTHONHASHSEED is fixed, so ids would differ after every kernel restart.
vocabulary = {word: token for token, word in enumerate(sorted(vocabulary_words))}

# Tokenize data: one LongTensor per document, in the same order as the
# source arrays.
print('Tokenizing data...  ', end='')
train_title_tokens = [tokenize(text, vocabulary) for text in train_question_titles]
train_body_tokens = [tokenize(text, vocabulary) for text in train_question_bodies]
train_answer_tokens = [tokenize(text, vocabulary) for text in train_answers]
test_title_tokens = [tokenize(text, vocabulary) for text in test_question_titles]
test_body_tokens = [tokenize(text, vocabulary) for text in test_question_bodies]
test_answer_tokens = [tokenize(text, vocabulary) for text in test_answers]
print('Done!')

# Get max lengths: longest token sequence per field across train and test.
print('Calculating maximum sequence lengths...  ', end='')
max_title_length = max(t.size(0) for t in train_title_tokens + test_title_tokens)
max_body_length = max(t.size(0) for t in train_body_tokens + test_body_tokens)
max_answer_length = max(t.size(0) for t in train_answer_tokens + test_answer_tokens)
print('Done!')

Extracting vocabulary words...  Done!
Tokenizing data...  Done!
Calculating maximum sequence lengths...  Done!


In [140]:
class Encoder(nn.Module):
    """Embed one token-id sequence, encode it with a Transformer encoder and
    zero-pad the result to a fixed length.

    Args:
        vocab_size: number of rows in the embedding table.
        max_length: length the output is padded to along the sequence axis.
        embedding_dim: embedding width; also used as the Transformer `d_model`.
        nhead: number of attention heads in each encoder layer.
        num_layers: number of stacked encoder layers.

    NOTE(review): no positional information is added to the embeddings before
    they enter the encoder — confirm that ignoring token order is intended.
    """

    def __init__(self, vocab_size, max_length, embedding_dim=512, nhead=8, num_layers=6):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # d_model has to match the embedding width; it used to be fixed at 512,
        # which broke any non-default `embedding_dim`.
        # TransformerEncoder works on its own copies of this layer, so the
        # parameters of `self.encoder_layer` are registered but not used in
        # forward. The attribute is kept so existing state_dict keys and
        # attribute access do not change.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

    def forward(self, x):
        """Encode a single sequence.

        Args:
            x: 1-D LongTensor of token ids, at most `max_length` long.

        Returns:
            Tensor of shape (1, max_length, embedding_dim). Positions past the
            end of the sequence are zeros; an empty `x` gives an all-zero
            tensor.

        A sequence longer than `max_length` produces a negative padding size
        and fails when the padding tensor is created.
        """
        if x.numel() == 0:
            # Nothing to encode: return pure padding on the module's device.
            return self.embedding.weight.new_zeros(1, self.max_length, self.embedding_dim)

        embedded = self.embedding(x.unsqueeze(0))  # (1, seq, dim)
        # The encoder layer was built with the default layout
        # (seq, batch, dim). Feeding (1, seq, dim) directly would be read as
        # `seq` independent length-1 sequences, so tokens would never attend
        # to each other. Swap the axes in, and back out again.
        encoded = self.transformer_encoder(embedded.transpose(0, 1)).transpose(0, 1)
        # new_zeros inherits device and dtype from `encoded`, so the cat below
        # also works when the module lives on a GPU.
        padding = encoded.new_zeros(1, self.max_length - encoded.size(1), encoded.size(2))
        return torch.cat((encoded, padding), dim=1)

class Network(nn.Module):
    """Holds one encoder module each for the title, body and answer inputs."""

    def __init__(self, title_encoder, body_encoder, answer_encoder):
        super().__init__()
        # Assigned left to right, so the submodules register in this order.
        self.title_encoder, self.body_encoder, self.answer_encoder = (
            title_encoder,
            body_encoder,
            answer_encoder,
        )

    def forward(self, title, body, answer):
        # TODO: forward pass is not implemented yet; every call yields None.
        return None