In [None]:
import numpy as np 
import pandas as pd 
import re

import os
import logging
import gc
from pathlib import Path
import pickle

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader

from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel

'''
Forked and edited from:
https://www.kaggle.com/ceshine/pytorch-bert-baseline-public-score-0-54

We use this notebook to generate BERT embeddings for two mentions and the gender pronoun.
We remove punctuation during data pre-processing at this time.
'''

In [None]:
import pandas as pd
# Training frame: the GAP "test" and "validation" TSV files stacked row-wise.
# The original row indices of each file are kept (no ignore_index).
train_df = pd.concat(
    [
        pd.read_csv(tsv_name, delimiter="\t")
        for tsv_name in ("gap-test.tsv", "gap-validation.tsv")
    ],
    axis=0,
)

# Held-out frame: the GAP "development" TSV file.
test_df = pd.read_csv("gap-development.tsv", delimiter="\t")

In [None]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
BERT_MODEL = 'bert-large-uncased'

# The three bracketed THISIS* markers are listed in never_split next to the
# standard BERT special tokens so the tokenizer is asked to keep them whole.
tokenizer = BertTokenizer.from_pretrained(
    BERT_MODEL,
    never_split=(
        "[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]",
        "[THISISA]", "[THISISB]", "[THISISP]",
    ),
)

In [None]:
def insert_tag(row):
    """
    Return row["Text"] with marker words spliced in at the three target offsets.

    " THISISA ", " THISISB " and " THISISP " are inserted at the character
    positions row["A-offset"], row["B-offset"] and row["Pronoun-offset"]
    respectively, so the positions of A, B and the pronoun can be recovered
    after tokenization.
    """
    markers = [
        (row["A-offset"], " THISISA "),
        (row["B-offset"], " THISISB "),
        (row["Pronoun-offset"], " THISISP "),
    ]
    # Insert from the largest offset down to the smallest: an insertion only
    # shifts the characters after it, so the remaining (smaller) offsets stay valid.
    markers.sort(key=lambda pair: pair[0], reverse=True)

    tagged = row["Text"]
    for position, marker in markers:
        tagged = "".join((tagged[:position], marker, tagged[position:]))
    return tagged

def clean_and_replace_target_name(row):
    """
    Strip everything but ASCII letters and shorten both candidate names.

    Reads row['TextClean'], row['A'] and row['B'] and uses the module-level
    `tokenizer`. Steps, in order:

    1. Every character outside a-z / A-Z in the text and in both names is
       replaced by a space.
    2. Each occurrence of the cleaned name A, then of the cleaned name B, is
       replaced by the first token the tokenizer produces for that name.
       The cleaned name is used as the regex pattern; it only contains
       letters and spaces at that point.
    3. The bare THISISA / THISISB / THISISP markers are wrapped in brackets.
    4. Runs of spaces are collapsed to a single space.

    Returns the resulting string.
    """
    non_letters = re.compile("[^a-zA-Z]")
    cleaned = non_letters.sub(" ", row['TextClean'])

    # Swap each target name for the first token of its tokenization.
    for column in ('A', 'B'):
        name = non_letters.sub(" ", row[column])
        cleaned = re.sub(str(name), tokenizer.tokenize(name)[0], cleaned)

    # Turn the plain markers into their bracketed, never-split form.
    for bare, bracketed in (
        (r"THISISA", r"[THISISA]"),
        (r"THISISB", r"[THISISB]"),
        (r"THISISP", r"[THISISP]"),
    ):
        cleaned = re.sub(bare, bracketed, cleaned)

    return re.sub(' +', ' ', cleaned)

def generate_text(row):
    """
    Produce the tagged and cleaned text for one row.

    The tagged text from insert_tag is stored on the row under 'TextClean'
    (the row object passed in is modified), then the result of
    clean_and_replace_target_name for that row is returned.
    """
    tagged_text = insert_tag(row)
    row.loc['TextClean'] = tagged_text
    return clean_and_replace_target_name(row)

## Tokenize

In [None]:
# Register the three position markers in the tokenizer vocabulary with a
# placeholder id of -1. The markers are dropped again in tokenize() before
# any token is converted to an id.
tokenizer.vocab.update({
    "[THISISA]": -1,
    "[THISISB]": -1,
    "[THISISP]": -1,
})

def tokenize(text, tokenizer):
    """
    Tokenize `text` and locate the A, B and pronoun markers.

    The marker tokens "[THISISA]", "[THISISB]" and "[THISISP]" are removed
    from the token stream. For each marker the recorded position is the
    number of tokens kept before it plus one; the callers in this notebook
    prepend "[CLS]" to the token list, which is what the extra one lines up with.

    Returns (tokens, (a_position, b_position, pronoun_position)).
    Raises KeyError if any of the three markers is absent from the tokens.
    """
    markers = ("[THISISA]", "[THISISB]", "[THISISP]")
    positions = {}
    kept = []
    for piece in tokenizer.tokenize(text):
        if piece not in markers:
            kept.append(piece)
        else:
            # The marker itself is not kept; it points at the next real token.
            positions[piece] = len(kept) + 1
    return kept, tuple(positions[marker] for marker in markers)

In [None]:
# For every training row: build the tagged/cleaned text, tokenize it, and
# collect the marker positions plus the token ids of "[CLS]" + tokens + "[SEP]".
offsets_lst, tokens_lst = [], []
for _, row in train_df.iterrows():
    text = generate_text(row)
    tokens, offsets = tokenize(text, tokenizer)
    offsets_lst.append(offsets)
    tokens_lst.append(
        tokenizer.convert_tokens_to_ids(["[CLS]", *tokens, "[SEP]"])
    )

In [None]:
# Same processing as for the training frame, applied to test_df: marker
# positions and the token ids of "[CLS]" + tokens + "[SEP]" for every row.
test_offsets_lst, test_tokens_lst = [], []
for _, row in test_df.iterrows():
    text = generate_text(row)
    tokens, offsets = tokenize(text, tokenizer)
    test_offsets_lst.append(offsets)
    test_tokens_lst.append(
        tokenizer.convert_tokens_to_ids(["[CLS]", *tokens, "[SEP]"])
    )

## Pad the sequences

In [None]:
# Right-pad every training id sequence with zeros up to the length of the
# longest one. max_len is that maximum, so the [:max_len] slice below never
# actually shortens a row.
max_len = max(map(len, tokens_lst))
tokens = np.zeros((len(tokens_lst), max_len), dtype=np.int64)
for i, row in enumerate(tokens_lst):
    row = np.asarray(row[:max_len])
    tokens[i, :row.size] = row

# One (n_rows, max_len) int64 tensor holding all training sentences.
token_tensor = torch.from_numpy(tokens)

In [None]:
# Right-pad every test id sequence with zeros up to the longest test sequence.
# test_max_len is that maximum, so the slice below never shortens a row.
test_max_len = max(map(len, test_tokens_lst))
test_tokens = np.zeros((len(test_tokens_lst), test_max_len), dtype=np.int64)
for i, row in enumerate(test_tokens_lst):
    row = np.asarray(row[:test_max_len])
    test_tokens[i, :row.size] = row

# One (n_rows, test_max_len) int64 tensor holding all test sentences.
test_token_tensor = torch.from_numpy(test_tokens)

## Generate Embedding

In [None]:
import os

# Expose only the first GPU to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Guarded so the notebook still runs on a CPU-only machine.
# NOTE(review): nothing in this notebook moves the model or the token tensors
# to the GPU, so the embedding cells run on CPU either way.
if torch.cuda.is_available():
    torch.cuda.set_device(0)

# Load the pretrained encoder and switch it to evaluation mode. torch modules
# start in training mode, where dropout is active; torch.no_grad() does not
# turn dropout off, so without eval() the extracted embeddings would be noisy
# and differ from run to run. eval() returns the module itself.
bert = BertModel.from_pretrained(BERT_MODEL).eval()

In [None]:
# Run BERT over the training sentences one at a time (batch size 1), keeping
# only the last encoder layer. Positions whose id is not > 0 are masked out.
bert_outputs = []
with torch.no_grad():
    for i, token_ids in enumerate(token_tensor):
        if not i % 40:
            print(i)  # progress marker every 40 sentences
        batch = token_ids.unsqueeze(0)
        bert_output, _ = bert(
            batch,
            attention_mask=(batch > 0).long(),
            token_type_ids=None,
            output_all_encoded_layers=False,
        )
        bert_outputs.append(bert_output)

In [None]:
# Run BERT over the test sentences one at a time (batch size 1), keeping only
# the last encoder layer. Positions whose id is not > 0 are masked out.
test_bert_outputs = []
with torch.no_grad():
    for i, token_ids in enumerate(test_token_tensor):
        if not i % 40:
            print(i)  # progress marker every 40 sentences
        batch = token_ids.unsqueeze(0)
        test_bert_output, _ = bert(
            batch,
            attention_mask=(batch > 0).long(),
            token_type_ids=None,
            output_all_encoded_layers=False,
        )
        test_bert_outputs.append(test_bert_output)

In [None]:
def _dump_pickle(obj, filename):
    """Pickle `obj` to `filename`, closing the file even if pickling fails."""
    # The `with` block guarantees the handle is flushed and closed; passing a
    # bare open(...) to pickle.dump leaves that to garbage collection.
    with open(filename, "wb") as handle:
        pickle.dump(obj, handle)


# Training artefacts: marker positions, unpadded token ids, BERT outputs.
_dump_pickle(offsets_lst, 'offsets_lst.pkl')
_dump_pickle(tokens_lst, 'token_lst_wto_padding.pkl')
_dump_pickle(bert_outputs, 'bert_outputs.pkl')

# Test artefacts, same layout as above.
_dump_pickle(test_offsets_lst, 'test_offsets_lst.pkl')
_dump_pickle(test_tokens_lst, 'test_token_lst_wto_padding.pkl')
_dump_pickle(test_bert_outputs, 'test_bert_outputs.pkl')