In [1]:
import functools
import pandas as pd
import torch
import seaborn as sns
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../utilities/')
from utilities import Timer, lmap

sns.set()

In [2]:
# Select the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Utility function: eager version of map
# NOTE(review): `lmap` is also imported from `utilities` in the first cell;
# this definition shadows that import for the rest of the notebook.
def lmap(func, iterable):
    '''Apply func to every item of iterable and return the results as a list'''
    mapped = map(func, iterable)
    return [*mapped]

In [4]:
# model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')

In [5]:
# model_sc = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased')

# 1. Prepare the Data
### 1a. Preprocess/Tokenize

In [6]:
%%capture
def preprocess_BERT(encoder, q_row, max_length = 512):
    '''Preprocess a dataframe row into fixed-length inputs for BERT
    
    The row is laid out as
        [CLS] question tokens [SEP] answer tokens [SEP] padding
    and truncated/padded so every returned list has length max_length.
    
    Parameters
    ----------
    encoder: callable, takes in text (or a list of token strings) together 
             with the keyword add_special_tokens, returns a list of encoded 
             tokens; should be provided by pre-trained model
    
    q_row  : dataframe row containing columns for question_title, 
             question_body, and answer
    
    max_length: total length of each returned list, including the three 
                special tokens. Defaults to 512, the maximum allowed by 
                the BERT model.
    
    Returns
    -------
    pandas Series of entries, each entry a list of length max_length.
    Entries: tokens, segment_ids (0 for the question part and padding, 
    1 for the answer part), and mask (1 for real tokens, 0 for padding).
    
    Raises
    ------
    ValueError if max_length cannot hold the three special tokens.
    '''
    
    # One [CLS] and two [SEP] tokens are always added
    NUM_SPECIAL_TOKENS = 3
    
    if max_length < NUM_SPECIAL_TOKENS:
        raise ValueError(
            'max_length must be at least %d to fit the special tokens, got %r'
            % (NUM_SPECIAL_TOKENS, max_length)
        )
    
    # Get question title, body, and answer from dataframe row.
    # Join title and body with a space so that the last word of the title
    # and the first word of the body are not fused into a single word.
    question = q_row.question_title + ' ' + q_row.question_body
    answer   = q_row.answer
    
    # Encode question and answer without [CLS] and [SEP].
    # Copy into fresh lists since they are truncated in place below.
    question_tok = list(encoder(question, add_special_tokens = False))
    answer_tok   = list(encoder(answer, add_special_tokens = False))

    # Truncate tokens to leave room for the special tokens
    content_budget = max_length - NUM_SPECIAL_TOKENS
    while len(question_tok) + len(answer_tok) > content_budget:
        
        # Pick the longest list, then pop last item in list
        # Default to shortening answer if there is a tie
        # (max returns the first of equal candidates)
        array_to_pop = max([answer_tok, question_tok], key = len)
        array_to_pop.pop()
    
    # Get encodings for [CLS] and [SEP]
    cls_token_encoded = list(encoder(['[CLS]'], add_special_tokens = False))
    sep_token_encoded = list(encoder(['[SEP]'], add_special_tokens = False))
    
    # Combine question, answer, and special tokens
    content_tok = cls_token_encoded + question_tok + \
                  sep_token_encoded + answer_tok   + \
                  sep_token_encoded
    
    # Create padding
    padding_len = max_length - len(content_tok)
    padding     = [0] * padding_len
    
    # Add padding
    final_tok   = content_tok + padding
    
    # Compute segment_ids: 0 covers [CLS] + question + first [SEP],
    # 1 covers answer + final [SEP]
    segment_ids = [0] * (len(question_tok) + 2) + \
                  [1] * (len(answer_tok)   + 1) + \
                  padding
    
    # Compute the mask
    mask        = [1] * len(content_tok) + padding
    
    return pd.Series({
        'tokens'      : final_tok,
        'segment_ids' : segment_ids,
        'mask'        : mask
    })

# Load in tokenizer for BERT base uncased via torch.hub 
# (from the huggingface/pytorch-transformers hub repository)
BERT_base_uncased_tokenizer  = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased') 

# Partially apply the preprocess function: bind the tokenizer's encode method
# as the `encoder` argument, leaving a callable that only takes a dataframe row
preprocess_BERT_base_uncased = functools.partial(preprocess_BERT, 
                                                 BERT_base_uncased_tokenizer.encode)

### 1b. Create a PyTorch Dataset from the Processed Data

In [7]:
# Names of the 30 label columns used as prediction targets: 
# 21 describing the question (question_*) followed by 9 describing the answer (answer_*)
target_cols = ['question_asker_intent_understanding', 'question_body_critical', 
               'question_conversational', 'question_expect_short_answer', 
               'question_fact_seeking', 'question_has_commonly_accepted_answer', 
               'question_interestingness_others', 'question_interestingness_self', 
               'question_multi_intent', 'question_not_really_a_question', 
               'question_opinion_seeking', 'question_type_choice', 
               'question_type_compare', 'question_type_consequence', 
               'question_type_definition', 'question_type_entity', 
               'question_type_instructions', 'question_type_procedure', 
               'question_type_reason_explanation', 'question_type_spelling', 
               'question_well_written', 'answer_helpful', 
               'answer_level_of_information', 'answer_plausible', 
               'answer_relevance', 'answer_satisfaction', 
               'answer_type_instructions', 'answer_type_procedure', 
               'answer_type_reason_explanation', 'answer_well_written']

In [10]:
def create_dataset(df, preprocessor, target_cols = None, batch_size = 32, shuffle = None):
    '''Create a batched PyTorch DataLoader from a pandas dataframe
    
    Parameters
    ----------
    
    df: Pandas dataframe with text columns available for the preprocessor 
        and containing the target columns
        
    preprocessor: callable taking a row of a dataframe and returning 
                  a Series with entries tokens, segment_ids, and mask, 
                  each an equal-length list of integers
                  
    target_cols: list of column names to use as the target.
    If None, no labels are included
    
    batch_size: number of rows per batch. Defaults to 32.
    
    shuffle: whether the DataLoader reshuffles the rows on every pass.
    If None (the default), rows are shuffled only when target_cols is given;
    unlabeled data keeps the dataframe's row order so that predictions can be
    matched back to the rows they were computed from.
    
    Returns
    -------
    
    PyTorch DataLoader yielding batches of 
    (tokens, segment_ids, mask) or (tokens, segment_ids, mask, targets).
    All tensors are placed on the notebook-level `device`.
    
    '''
    # Process the input data into a dataframe with 3 columns
    processed_data = df.apply(preprocessor, axis = 'columns')

    # Convert each of those three columns into a tensor
    def convert_col_to_tensor(col):
        # Each entry is a list of equal length, so the column converts
        # directly into one 2-D integer tensor of shape (rows, list length)
        return torch.tensor(col.tolist(), dtype = torch.long).to(device)

    tokens      = convert_col_to_tensor(processed_data.tokens)
    segment_ids = convert_col_to_tensor(processed_data.segment_ids)
    # Use item access: `processed_data.mask` would resolve to DataFrame.mask
    mask        = convert_col_to_tensor(processed_data['mask'])
    
    data        = [tokens, segment_ids, mask]
    
    # Collect the target columns
    if target_cols is not None:
        targets = torch.tensor(df[target_cols].values, dtype = torch.float32).to(device)
        data.append(targets)

    # Only shuffle labeled data unless the caller asks otherwise
    if shuffle is None:
        shuffle = target_cols is not None

    # Construct a Torch Dataset, then a DataLoader that batches it
    dataset     = TensorDataset(*data)
    loader      = DataLoader(dataset, batch_size, shuffle = shuffle)
    
    return loader

### 1c. Train Test Split

In [11]:
%%capture
# Suppress warnings when tokenizing sentences longer than the allowed length of 512

# Load the original data (paths are relative to the notebook's working directory)
train_df_all = pd.read_csv('../input/google-quest-challenge/train.csv')
test_df      = pd.read_csv('../input/google-quest-challenge/test.csv')

# Create Train and Validation Splits: 80% train / 20% validation,
# with a fixed random_state so the split is reproducible
train_df, valid_df = train_test_split(train_df_all, random_state = 42, train_size = 0.8)

# NOTE(review): Timer comes from the local `utilities` module; presumably this
# starts timing the preprocessing below -- no matching stop call is visible here
Timer.start()
# Create the batched PyTorch data for each split.
# train and valid include the target columns; test is created without targets.
train = create_dataset(train_df, preprocess_BERT_base_uncased, target_cols)
valid = create_dataset(valid_df, preprocess_BERT_base_uncased, target_cols)
test  = create_dataset( test_df, preprocess_BERT_base_uncased)

# 2. Construct the Model