Here we simply run logistic regression with BERT embeddings. Code for building BERT embeddings for A, B, and pronoun is taken from this great [Matei's Kernel](https://www.kaggle.com/mateiionita/taming-the-bert-a-baseline).

In [1]:
import os
import gc
import numpy as np
import pandas as pd 
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [2]:
!wget -q https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv
!wget -q https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv
!wget -q https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv
!wget -q https://raw.githubusercontent.com/google-research/bert/master/modeling.py 
!wget -q https://raw.githubusercontent.com/google-research/bert/master/extract_features.py 
!wget -q https://raw.githubusercontent.com/google-research/bert/master/tokenization.py

In [3]:
import modeling
import extract_features
import tokenization

In [4]:
val_df = pd.read_table('gap-validation.tsv', index_col='ID').reset_index(drop=True)
test_df  = pd.read_table('gap-validation.tsv', index_col='ID').reset_index(drop=True)
dev_df  = pd.read_table('gap-development.tsv', index_col='ID').reset_index(drop=True)

In [5]:
!wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip 
!unzip uncased_L-12_H-768_A-12.zip

Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  
bert_config.json		     bert_model.ckpt.index  vocab.txt
bert_model.ckpt.data-00000-of-00001  bert_model.ckpt.meta


In [6]:
def count_char(text, offset):   
    count = 0
    for pos in range(offset):
        if text[pos] != " ": count +=1
    return count

def candidate_length(candidate):
    count = 0
    for i in range(len(candidate)):
        if candidate[i] !=  " ": count += 1
    return count

def count_token_length_special(token):
    count = 0
    special_token = ["#", " "]
    for i in range(len(token)):
        if token[i] not in special_token: count+=1
    return count

def embed_by_bert(df, path_to_bert='uncased_L-12_H-768_A-12', embed_size=768, batch_size=8,
                 layers='-1', max_seq_length=256):
    
    text = df['Text']
    text.to_csv('input.txt', index=False, header=False)
    os.system(f"python3 extract_features.py \
               --input_file=input.txt \
               --output_file=output.jsonl \
               --vocab_file={path_to_bert}/vocab.txt \
               --bert_config_file={path_to_bert}/bert_config.json \
               --init_checkpoint={path_to_bert}/bert_model.ckpt \
               --layers={layers} \
               --max_seq_length={max_seq_length} \
               --batch_size={batch_size}")
    
    bert_output = pd.read_json("output.jsonl", lines=True)
    bert_output.head()
    
    os.system("rm input.txt")
    os.system("rm output.jsonl")
    
    index = df.index
    columns = ["emb_A", "emb_B", "emb_P", "label"]
    emb = pd.DataFrame(index = index, columns = columns)
    emb.index.name = "ID"
    
    for i in tqdm(range(len(text))):
        
        features = bert_output.loc[i, "features"]
        P_char_start = count_char(df.loc[i, 'Text'], df.loc[i, 'Pronoun-offset'])
        A_char_start = count_char(df.loc[i, 'Text'], df.loc[i, 'A-offset'])
        B_char_start = count_char(df.loc[i, 'Text'], df.loc[i, 'B-offset'])
        A_length = candidate_length(df.loc[i, 'A'])
        B_length = candidate_length(df.loc[i, 'B'])
        
        emb_A, emb_B, emb_P = np.zeros(embed_size), np.zeros(embed_size), np.zeros(embed_size)
        char_count, cnt_A, cnt_B = 0, 0, 0
        
        for j in range(2, len(features)):
            token = features[j]["token"]
            token_length = count_token_length_special(token)
            if char_count == P_char_start:
                emb_P += np.asarray(features[j]["layers"][0]['values']) 
            if char_count in range(A_char_start, A_char_start + A_length):
                emb_A += np.asarray(features[j]["layers"][0]['values'])
                cnt_A += 1
            if char_count in range(B_char_start, B_char_start + B_length):
                emb_B += np.asarray(features[j]["layers"][0]['values'])
                cnt_B += 1                
            char_count += token_length
        
        if cnt_A > 0:
            emb_A /= cnt_A
        if cnt_B > 0:
            emb_B /= cnt_B
        
        label = "Neither"
        if (df.loc[i,"A-coref"] == True):
            label = "A"
        if (df.loc[i,"B-coref"] == True):
            label = "B"

        emb.iloc[i] = [emb_A, emb_B, emb_P, label]
        
    return emb    

In [7]:
%%time
val_bert_emb = embed_by_bert(val_df)
test_bert_emb = embed_by_bert(test_df)
dev_bert_emb = embed_by_bert(dev_df)

100%|██████████| 454/454 [00:00<00:00, 937.31it/s]
100%|██████████| 454/454 [00:00<00:00, 694.60it/s]
100%|██████████| 2000/2000 [00:02<00:00, 862.42it/s]


CPU times: user 32.3 s, sys: 15 s, total: 47.3 s
Wall time: 7min 4s


In [25]:
val_bert_emb["emb_A"].head().map(np.asarray).values[0].astype('float').shape

(768,)

In [26]:
def featurize(embedding_df):
    
    pronoun_embs, a_embs, b_embs, labels = [], [], [], []
    
    for i in tqdm(range(len(embedding_df))):
        
        pronoun_embs.append(embedding_df.loc[i, "emb_P"])
        a_embs.append(embedding_df.loc[i, "emb_A"])
        b_embs.append(embedding_df.loc[i, "emb_B"])

        label_map = {'A': 0, 'B': 1, 'Neither': 2}
        labels.append(label_map[embedding_df.loc[i, "label"]])

    
    a_embs = np.asarray(a_embs).astype('float')
    b_embs = np.asarray(b_embs).astype('float') 
    pronoun_embs = np.asarray(pronoun_embs).astype('float')
    
    return np.concatenate([a_embs, b_embs, pronoun_embs], axis=1), np.asarray(labels)

In [29]:
X_train, y_train = featurize(pd.concat([val_bert_emb, dev_bert_emb]).sort_index().reset_index())

100%|██████████| 2454/2454 [00:00<00:00, 27859.32it/s]


In [30]:
X_train.shape, y_train.shape

((2454, 2304), (2454,))

In [31]:
logit = LogisticRegression(C=1e-2, random_state=17, solver='lbfgs', 
                           multi_class='multinomial', max_iter=100,
                          n_jobs=4)

In [32]:
%%time
logit.fit(X_train, y_train)

CPU times: user 64 ms, sys: 340 ms, total: 404 ms
Wall time: 23.1 s


LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=4, penalty='l2', random_state=17, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

## Prediction for stage 1 test set 

In [33]:
!cp gap-development.tsv stage1_test.tsv

In [34]:
stage1_test_df  = pd.read_table('stage1_test.tsv', index_col='ID').reset_index(drop=True)

In [35]:
%%time
stage1_test_bert_emb = embed_by_bert(stage1_test_df)

100%|██████████| 2000/2000 [00:02<00:00, 850.74it/s]


In [36]:
X_test, y_test = featurize(stage1_test_bert_emb)

100%|██████████| 2000/2000 [00:00<00:00, 27172.13it/s]


In [37]:
logit_test_pred = logit.predict_proba(X_test)
log_loss(y_test, logit_test_pred)

0.2775730991751326

In [38]:
# Write the prediction to file for submission
submission = pd.read_csv("../input/sample_submission_stage_1.csv", index_col = "ID")
submission["A"] = logit_test_pred[:, 0]
submission["B"] = logit_test_pred[:, 1]
submission["NEITHER"] = logit_test_pred[:, 2]
submission.to_csv("submission.csv")