In [1]:
import os
import string

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import yaml


# --- Run configuration ---
EPOCHS = 3            # number of full passes over the training set
LEARNING_RATE = 0.01  # step size handed to the Adam optimizer
LOG_INTERVAL = 0.2    # NOTE(review): not referenced in the visible cells - confirm before removing
SEED = 42             # fixed seed so weight initialisation and dropout are repeatable

# Seed PyTorch's global RNG before any module is constructed.
torch.manual_seed(SEED)



## Load Data

In [2]:
# Location of the competition files. Defaults to the Kaggle mount point; set the
# QUEST_DATA_DIR environment variable to run the notebook anywhere else (the
# hard-coded path raises FileNotFoundError outside Kaggle).
DATA_DIR = os.environ.get('QUEST_DATA_DIR', '/kaggle/input/google-quest-challenge')

# Load training and testing data
print('Loading data...  ', end='')
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_submission_df = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
print('Done!')

# Sort data into separate groups
print('Sorting data...  ', end='')
train_titles = train_df['question_title'].values
train_bodies = train_df['question_body'].values
train_answers = train_df['answer'].values

# Column names of the submission file: the id column followed by the target columns.
test_labels = sample_submission_df.columns
test_qa_ids = test_df['qa_id'].values
test_titles = test_df['question_title'].values
test_bodies = test_df['question_body'].values
test_answers = test_df['answer'].values

# Select the training targets by name, in submission order, instead of by a
# positional offset: the predictions are later written under `test_labels`
# (id first, then one column per target), so the two must line up exactly.
target_columns = list(test_labels[1:])
targets = torch.tensor(train_df[target_columns].values, dtype=torch.float)
print('Done!')

Loading data...  

FileNotFoundError: [Errno 2] File /kaggle/input/google-quest-challenge/train.csv does not exist: '/kaggle/input/google-quest-challenge/train.csv'

## Process Data

In [None]:
def clean(s):
    """Return ``s`` with every ASCII punctuation character removed."""
    # Map each punctuation code point to None so that translate() deletes it.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return s.translate(deletions)

def get_unique_words(text):
    """Return the set of distinct words in ``text``.

    Punctuation is stripped and the text is lower-cased before it is split on
    whitespace.
    """
    normalized = clean(text).lower()
    return {word for word in normalized.split()}
    
def tokenize(text, vocab):
    """Convert ``text`` into a 1-D ``torch.long`` tensor of token ids.

    The text is normalised the same way as in ``get_unique_words`` (punctuation
    removed, lower-cased, split on whitespace) and each word is looked up in
    ``vocab``; a word missing from ``vocab`` raises ``KeyError``.
    """
    token_ids = []
    for word in clean(text).lower().split():
        token_ids.append(vocab[word])
    return torch.tensor(token_ids, dtype=torch.long)
    
# Create vocabulary of all words (train and test, every text field)
print('Extracting vocabulary words...  ', end='')
unique_words = set()
for corpus in (train_titles, train_bodies, train_answers,
               test_titles, test_bodies, test_answers):
    for text in corpus:
        # Update in place: `set.union` would copy the whole vocabulary for every text.
        unique_words.update(get_unique_words(text))
print('Done!')

# Number the words in sorted order so token ids are identical from run to run
# (iteration order of a set of strings changes between interpreter sessions).
vocabulary = {word: token for token, word in enumerate(sorted(unique_words))}

# Tokenize data: one list of token-id tensors per text field, in row order
print('Tokenizing data...  ', end='')
train_title_tokens, train_body_tokens, train_answer_tokens = (
    [tokenize(text, vocabulary) for text in column]
    for column in (train_titles, train_bodies, train_answers)
)
test_title_tokens, test_body_tokens, test_answer_tokens = (
    [tokenize(text, vocabulary) for text in column]
    for column in (test_titles, test_bodies, test_answers)
)
print('Done!')

# Get max lengths: longest token sequence per field across train and test
print('Calculating maximum sequence lengths...  ', end='')
max_title_length = max(
    max(tokens.size(0) for tokens in train_title_tokens),
    max(tokens.size(0) for tokens in test_title_tokens),
)
max_body_length = max(
    max(tokens.size(0) for tokens in train_body_tokens),
    max(tokens.size(0) for tokens in test_body_tokens),
)
max_answer_length = max(
    max(tokens.size(0) for tokens in train_answer_tokens),
    max(tokens.size(0) for tokens in test_answer_tokens),
)
print('Done!')

## The Neural Network

In [None]:
# Progress message; the matching 'Done!' is printed at the end of this cell,
# after the model, optimizer and loss have been created.
print('Setting up neural networks...  ', end='')

class Encoder(nn.Module):
    """Encode one token sequence into a fixed-size convolutional feature map.

    Tokens are embedded, passed through a 6-layer Transformer encoder,
    zero-padded along the sequence axis up to ``max_length`` and then reduced by
    two conv -> max-pool -> ReLU stages.

    Args:
        vocab_size: Number of rows in the embedding table.
        max_length: Sequence length every input is zero-padded to.
        embedding_dim: Width of the embeddings and of the Transformer
            (``d_model``). Must be divisible by the 8 attention heads.
    """

    def __init__(self, vocab_size, max_length, embedding_dim=512):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # d_model has to match the embedding width; it used to be hard-coded to
        # 512, which broke every non-default `embedding_dim`.
        # NOTE(review): TransformerEncoder clones this layer, so `encoder_layer`
        # itself stays registered but unused; kept so existing attribute access
        # and state-dict keys do not change.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=8)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)

    def forward(self, x):
        """Encode a single sequence.

        Args:
            x: 1-D tensor of token ids with at most ``max_length`` entries
                (longer inputs are not supported: the padding size would be
                negative).

        Returns:
            A 4-D tensor with a leading batch dimension of 1 and 20 channels.
        """
        weight = self.embedding.weight
        if x.numel() == 0:
            # Nothing to attend over: skip the Transformer and let the padding
            # below produce an all-zero sequence.
            y = torch.zeros(1, 0, self.embedding_dim, dtype=weight.dtype, device=weight.device)
        else:
            # The Transformer expects (sequence, batch, feature). Feeding
            # (1, seq, dim) made it see `seq` independent one-token sequences,
            # so no token ever attended to another.
            embedded = self.embedding(x).unsqueeze(1)   # (seq, 1, dim)
            y = self.transformer_encoder(embedded)      # (seq, 1, dim)
            y = y.transpose(0, 1)                       # (1, seq, dim)
        # Allocate the padding with the activations' dtype/device so the model
        # also works after `.to(device)` or a dtype change.
        padding = torch.zeros(1, self.max_length - y.size(1), y.size(2),
                              dtype=y.dtype, device=y.device)
        y = torch.cat((y, padding), dim=1).unsqueeze(0)  # (1, 1, max_length, dim)
        y = F.relu(F.max_pool2d(self.conv1(y), 2))
        y = F.relu(F.max_pool2d(self.conv2(y), 2))
        return y

class Network(nn.Module):
    """Predict the 30 target scores for one question/answer pair.

    The three encoder feature maps are stacked along the height axis, reduced by
    three conv -> max-pool -> ReLU stages, flattened and passed through a
    three-layer MLP ending in a sigmoid.

    Args:
        title_encoder: Module encoding the question title.
        body_encoder: Module encoding the question body.
        answer_encoder: Module encoding the answer.
        flat_features: Number of features left after the last pooling stage.
            It depends on the encoders' output sizes; the default of 56000 is
            the value this notebook was written for.
    """

    def __init__(self, title_encoder, body_encoder, answer_encoder, flat_features=56000):
        super(Network, self).__init__()
        self.title_encoder = title_encoder
        self.body_encoder = body_encoder
        self.answer_encoder = answer_encoder
        self.conv1 = nn.Conv2d(20, 30, kernel_size=7)
        self.conv2 = nn.Conv2d(30, 40, kernel_size=7)
        self.conv3 = nn.Conv2d(40, 50, kernel_size=7)
        self.lin1 = nn.Linear(flat_features, 3500)
        self.lin2 = nn.Linear(3500, 875)
        self.lin3 = nn.Linear(875, 30)

    def forward(self, title, body, answer):
        """Score one sample.

        Args:
            title, body, answer: Inputs forwarded unchanged to the respective
                encoder.

        Returns:
            Sigmoid outputs with the leading batch dimension of size 1 removed.

        Raises:
            RuntimeError: If the flattened feature count differs from
                ``flat_features``.
        """
        title_y = self.title_encoder(title)
        body_y = self.body_encoder(body)
        answer_y = self.answer_encoder(answer)
        y = torch.cat((title_y, body_y, answer_y), dim=2)
        y = F.relu(F.max_pool2d(self.conv1(y), 2))
        y = F.relu(F.max_pool2d(self.conv2(y), 2))
        y = F.relu(F.max_pool2d(self.conv3(y), 2))
        # Flatten per sample. `view(-1, 56000)` silently turned a feature map
        # holding a multiple of 56000 values into several fake "samples".
        y = y.view(y.size(0), -1)
        if y.size(1) != self.lin1.in_features:
            raise RuntimeError(
                f'Network expected {self.lin1.in_features} flattened features '
                f'but the encoders produced {y.size(1)}; pass flat_features={y.size(1)}'
            )
        y = F.relu(self.lin1(y))
        y = F.relu(self.lin2(y))
        y = torch.sigmoid(self.lin3(y))
        # Drop only the batch axis; a blanket squeeze() would also drop any
        # other axis that happens to have size 1.
        return y.squeeze(0)
    
    
# One encoder per text field (built in title, body, answer order); each pads to
# the longest sequence found for its own field.
title_encoder, body_encoder, answer_encoder = (
    Encoder(len(vocabulary), field_max_length)
    for field_max_length in (max_title_length, max_body_length, max_answer_length)
)
network = Network(title_encoder, body_encoder, answer_encoder)

# Mean-squared-error loss, optimised with Adam over every parameter of the model.
criterion = nn.MSELoss()
optimizer = optim.Adam(network.parameters(), lr=LEARNING_RATE)
print('Done!')

## Training

In [None]:
print('\n--- Begin Training ---')
# Materialise (index, (title, body, answer, target)) once so every epoch walks
# the same list in the same order.
train_data = list(enumerate(zip(train_title_tokens, train_body_tokens, train_answer_tokens, targets)))
total = len(train_data)
network.train()
for epoch in range(1, EPOCHS + 1):
    total_loss = 0
    # One optimizer step per sample (batch size 1).
    for idx, (title, body, answer, target) in train_data:
        optimizer.zero_grad()
        output = network(title, body, answer)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Running mean loss of the current epoch, redrawn in place via '\r'.
        print(f'Epoch: {epoch:{len(str(EPOCHS))}d}/{EPOCHS}  Loss: {total_loss / (idx + 1):.4f}  Item: {idx + 1:{len(str(total))}d}/{total}', end='\r')
    # Terminate the progress line so this epoch's final loss stays visible
    # instead of being overwritten by the next epoch.
    print()

## Testing

In [None]:
# Switch to inference mode (disables dropout inside the Transformer layers).
network.eval()
test_data = list(enumerate(zip(test_qa_ids, test_title_tokens, test_body_tokens, test_answer_tokens)))
total = len(test_data)
output_data = []
# No gradients are needed for prediction; without no_grad() every forward pass
# would build and keep an autograd graph.
with torch.no_grad():
    for idx, (qa_id, title, body, answer) in test_data:
        output = network(title, body, answer).tolist()
        # Each row is the id followed by the predictions, matching `test_labels`.
        output.insert(0, qa_id)
        output_data.append(output)
        # 1-based counter, consistent with the training progress line.
        print(f'Testing... {idx + 1}/{total}', end='\r')
# Finish the '\r' progress line so 'Done!' does not overprint it.
print()
print('Done!')

output_df = pd.DataFrame(output_data, columns=test_labels)
output_df.to_csv('submission.csv', index=False)
print('Testing output saved to submission.csv')