# Hey Bert!

This is my attempt to teach myself how BERT works.

## Imports

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertConfig


# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Input locations: the competition CSV directory and the tokenizer vocabulary file.
DATA_DIR = '/kaggle/input/google-quest-challenge'
VOCAB_PATH = '/kaggle/input/bertvocab/vocabulary.txt'

# Training hyperparameters: passes over the training set and the SGD step size.
EPOCHS = 2
LEARNING_RATE = 0.01

## Load Data

In [None]:
# Read the three competition CSV files from DATA_DIR
print('Loading data...  ', end='')
sample_submission_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
print('Done!')

# Pull the text fields out of each frame as arrays
print('Sorting data...  ', end='')
train_titles, train_bodies, train_answers = (
    train_df['question_title'].values,
    train_df['question_body'].values,
    train_df['answer'].values,
)

# The sample submission's header is reused as the header of the output file
test_labels = sample_submission_df.columns
test_qa_ids = test_df['qa_id'].values
test_titles, test_bodies, test_answers = (
    test_df['question_title'].values,
    test_df['question_body'].values,
    test_df['answer'].values,
)

# Every training column from position 11 onward is treated as a regression target
# NOTE(review): this positional slice assumes the first 11 columns are inputs/metadata
target_columns = train_df.columns[11:]
targets = torch.tensor(train_df[target_columns].values, dtype=torch.float, device=DEVICE)
print('Done!')

## Process Data

In [None]:
def process_data(title, body, answer, tokenizer):
    """Encode one question/answer example as a batch-of-one tensor of token ids.

    The three text fields are joined into a single string laid out as
    ``[CLS] title [SEP] body [SEP] answer [SEP]`` and tokenized in one pass.

    Args:
        title: Question title text.
        body: Question body text.
        answer: Answer text.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            that returns a list of integer token ids.

    Returns:
        A tensor on ``DEVICE`` with a leading batch dimension of size 1
        (shape ``(1, number_of_tokens)``).
    """
    text = f'[CLS] {title} [SEP] {body} [SEP] {answer} [SEP]'
    # The markers are already written into the text, so the tokenizer must not
    # add its own [CLS]/[SEP] around it; otherwise both ends are duplicated.
    ids = tokenizer.encode(text, add_special_tokens=False)
    return torch.tensor(ids, device=DEVICE).unsqueeze(0)
    

# Tokenize every example into a batch-of-one tensor of token ids
print('Processing data...  ', end='')
tokenizer = DistilBertTokenizer(vocab_file=VOCAB_PATH)

train_ids = [
    process_data(title, body, answer, tokenizer)
    for title, body, answer in zip(train_titles, train_bodies, train_answers)
]
test_ids = [
    process_data(title, body, answer, tokenizer)
    for title, body, answer in zip(test_titles, test_bodies, test_answers)
]

# Longest tokenized sequence across both splits (dimension 1 is the token axis)
longest_train = max(tid.size(1) for tid in train_ids)
longest_test = max(tid.size(1) for tid in test_ids)
max_sequence_length = max(longest_train, longest_test)
print('Done!')

## The Neural Network

In [None]:
print('Setting up neural network...  ', end='')


class Network(nn.Module):
    """DistilBERT encoder followed by a three-layer regression head.

    The head maps the encoder's 768-wide first-token vector through
    768 -> 256 -> 64 -> 30 linear layers and squashes the 30 outputs into
    (0, 1) with a sigmoid.

    Args:
        config: Configuration object passed straight to ``DistilBertModel``.
        max_length: Maximum number of tokens fed to the encoder at once;
            longer inputs are split into consecutive chunks of this size.
    """

    def __init__(self, config, max_length=1024):
        super().__init__()
        self.max_length = max_length

        self.bert = DistilBertModel(config).to(device=DEVICE)
        self.lin1 = nn.Linear(768, 256).to(device=DEVICE)
        self.lin2 = nn.Linear(256, 64).to(device=DEVICE)
        self.lin3 = nn.Linear(64, 30).to(device=DEVICE)

    def forward(self, x):
        """Score a single tokenized example.

        Args:
            x: Token-id tensor with the token axis at dimension 1. Only batch
                row 0 is read, so callers are expected to pass a batch of one.

        Returns:
            A tensor of 30 values in (0, 1).
        """
        chunks = torch.split(x, self.max_length, dim=1)
        # Encode every chunk and keep its first-token vector. Averaging them
        # lets all chunks of a long input contribute to the prediction instead
        # of only the last one. With a single chunk this is just that vector.
        first_token_vectors = [self.bert(chunk)[0][0, 0] for chunk in chunks]
        y = torch.stack(first_token_vectors).mean(dim=0)

        y = F.relu(self.lin1(y))
        y = F.relu(self.lin2(y))
        y = torch.sigmoid(self.lin3(y))
        return y.squeeze()
    

# Size the vocabulary from the tokenizer and the position table from the
# longest tokenized example
config = DistilBertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=max_sequence_length,
)
network = Network(config)

# Mean squared error loss, optimized with plain SGD over all network parameters
criterion = nn.MSELoss()
optimizer = optim.SGD(network.parameters(), lr=LEARNING_RATE)
print('Done!')

## Training

In [None]:
print('\n--- Begin Training ---')
network.train()
total = len(train_ids)
for epoch in range(1, EPOCHS + 1):
    # Summed per-example loss; divided by the item count for the progress line
    total_loss = 0
    for idx, (token_ids, target) in enumerate(zip(train_ids, targets)):
        # One optimizer step per example
        optimizer.zero_grad()
        loss = criterion(network(token_ids), target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # '\r' rewrites the same console line on every iteration
        print(f'Epoch: {epoch:{len(str(EPOCHS))}d}/{EPOCHS}  Loss: {total_loss / (idx + 1):.4f}  Item: {idx + 1:{len(str(total))}d}/{total}', end='\r')
    # Move to a fresh line once the epoch is finished
    print()

## Testing

In [None]:
# Switch to inference mode
network.eval()
total = len(test_ids)
output_data = []
# Predictions are only read, never back-propagated, so skip autograd
# bookkeeping to avoid building a graph for every test example.
with torch.no_grad():
    for idx, (qa_id, ids) in enumerate(zip(test_qa_ids, test_ids)):
        output = network(ids).tolist()
        # Each row is the example id followed by its predictions
        output_data.append([qa_id, *output])
        print(f'Testing... {idx + 1}/{total}', end='\r')
print('Testing... Done!  ')

# Write the rows under the sample submission's column header
output_df = pd.DataFrame(output_data, columns=test_labels)
output_df.to_csv('submission.csv', index=False)
print('Testing output saved to submission.csv')