Here we simply run logistic regression with BERT embeddings. Code for building BERT embeddings for A, B, and pronoun is taken from this great [Matei's Kernel](https://www.kaggle.com/mateiionita/taming-the-bert-a-baseline).

In [1]:
import os
import gc
import numpy as np
import pandas as pd 
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [2]:
!wget -q https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv
!wget -q https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv
!wget -q https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv
!wget -q https://raw.githubusercontent.com/google-research/bert/master/modeling.py 
!wget -q https://raw.githubusercontent.com/google-research/bert/master/extract_features.py 
!wget -q https://raw.githubusercontent.com/google-research/bert/master/tokenization.py

/bin/bash: wget: command not found
/bin/bash: wget: command not found
/bin/bash: wget: command not found
/bin/bash: wget: command not found
/bin/bash: wget: command not found
/bin/bash: wget: command not found


In [6]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import sys
sys.path.append("/Users/stalaviya/MachineLearning/git_machine_learning/sentiment_research/sentiment_analysis_masters_research/downloads")

import modeling
import extract_features
import tokenization

In [8]:
# val_df = pd.read_table('gap-validation.tsv', index_col='ID').reset_index(drop=True)
# test_df  = pd.read_table('gap-validation.tsv', index_col='ID').reset_index(drop=True)
# dev_df  = pd.read_table('gap-development.tsv', index_col='ID').reset_index(drop=True)

df = pd.read_csv('../data/IMDB_Dataset.csv')

dev_df = df[:30000]
val_df = df[30000:40000]
test_df = df[40000:]

In [9]:
test_df

Unnamed: 0,review,sentiment
40000,First off I want to say that I lean liberal on...,negative
40001,I was excited to see a sitcom that would hopef...,negative
40002,When you look at the cover and read stuff abou...,negative
40003,"Like many others, I counted on the appearance ...",negative
40004,"This movie was on t.v the other day, and I did...",negative
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [10]:
!wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip 
!unzip uncased_L-12_H-768_A-12.zip

/bin/bash: wget: command not found
unzip:  cannot find or open uncased_L-12_H-768_A-12.zip, uncased_L-12_H-768_A-12.zip.zip or uncased_L-12_H-768_A-12.zip.ZIP.


In [11]:
def count_char(text, offset):   
    count = 0
    for pos in range(offset):
        if text[pos] != " ": count +=1
    return count

def candidate_length(candidate):
    count = 0
    for i in range(len(candidate)):
        if candidate[i] !=  " ": count += 1
    return count

def count_token_length_special(token):
    count = 0
    special_token = ["#", " "]
    for i in range(len(token)):
        if token[i] not in special_token: count+=1
    return count

def embed_by_bert(df, path_to_bert='uncased_L-12_H-768_A-12', embed_size=768, batch_size=8,
                 layers='-1', max_seq_length=256):
    
    text = df['review']
    text.to_csv('input.txt', index=False, header=False)
    os.system(f"python3 extract_features.py \
               --input_file=input.txt \
               --output_file=output.jsonl \
               --vocab_file={path_to_bert}/vocab.txt \
               --bert_config_file={path_to_bert}/bert_config.json \
               --init_checkpoint={path_to_bert}/bert_model.ckpt \
               --layers={layers} \
               --max_seq_length={max_seq_length} \
               --batch_size={batch_size}")
    
    bert_output = pd.read_json("output.jsonl", lines=True)
    bert_output.head()
    
    os.system("rm input.txt")
    os.system("rm output.jsonl")
    
    index = df.index
    columns = ["emb_A", "emb_B", "emb_P", "label"]
    emb = pd.DataFrame(index = index, columns = columns)
    emb.index.name = "ID"
    
    for i in tqdm(range(len(text))):
        
        features = bert_output.loc[i, "features"]
        P_char_start = count_char(df.loc[i, 'Text'], df.loc[i, 'Pronoun-offset'])
        A_char_start = count_char(df.loc[i, 'Text'], df.loc[i, 'A-offset'])
        B_char_start = count_char(df.loc[i, 'Text'], df.loc[i, 'B-offset'])
        A_length = candidate_length(df.loc[i, 'A'])
        B_length = candidate_length(df.loc[i, 'B'])
        
        emb_A, emb_B, emb_P = np.zeros(embed_size), np.zeros(embed_size), np.zeros(embed_size)
        char_count, cnt_A, cnt_B = 0, 0, 0
        
        for j in range(2, len(features)):
            token = features[j]["token"]
            token_length = count_token_length_special(token)
            if char_count == P_char_start:
                emb_P += np.asarray(features[j]["layers"][0]['values']) 
            if char_count in range(A_char_start, A_char_start + A_length):
                emb_A += np.asarray(features[j]["layers"][0]['values'])
                cnt_A += 1
            if char_count in range(B_char_start, B_char_start + B_length):
                emb_B += np.asarray(features[j]["layers"][0]['values'])
                cnt_B += 1                
            char_count += token_length
        
        if cnt_A > 0:
            emb_A /= cnt_A
        if cnt_B > 0:
            emb_B /= cnt_B
        
        label = "Neither"
        if (df.loc[i,"A-coref"] == True):
            label = "A"
        if (df.loc[i,"B-coref"] == True):
            label = "B"

        emb.iloc[i] = [emb_A, emb_B, emb_P, label]
        
    return emb    

In [None]:
%%time

val_bert_emb = embed_by_bert(val_df)
test_bert_emb = embed_by_bert(test_df)
dev_bert_emb = embed_by_bert(dev_df)

In [None]:
val_bert_emb["emb_A"].head().map(np.asarray).values[0].astype('float').shape

In [None]:
def featurize(embedding_df):
    
    pronoun_embs, a_embs, b_embs, labels = [], [], [], []
    
    for i in tqdm(range(len(embedding_df))):
        
        pronoun_embs.append(embedding_df.loc[i, "emb_P"])
        a_embs.append(embedding_df.loc[i, "emb_A"])
        b_embs.append(embedding_df.loc[i, "emb_B"])

        label_map = {'A': 0, 'B': 1, 'Neither': 2}
        labels.append(label_map[embedding_df.loc[i, "label"]])

    
    a_embs = np.asarray(a_embs).astype('float')
    b_embs = np.asarray(b_embs).astype('float') 
    pronoun_embs = np.asarray(pronoun_embs).astype('float')
    
    return np.concatenate([a_embs, b_embs, pronoun_embs], axis=1), np.asarray(labels)

In [None]:
X_train, y_train = featurize(pd.concat([val_bert_emb, dev_bert_emb]).sort_index().reset_index())

In [None]:
X_train.shape, y_train.shape

In [None]:
logit = LogisticRegression(C=1e-2, random_state=17, solver='lbfgs', 
                           multi_class='multinomial', max_iter=100,
                          n_jobs=4)

In [None]:
%%time
logit.fit(X_train, y_train)

## Prediction for stage 1 test set 

In [None]:
!cp gap-development.tsv stage1_test.tsv

In [None]:
stage1_test_df  = pd.read_table('stage1_test.tsv', index_col='ID').reset_index(drop=True)

In [None]:
%%time
stage1_test_bert_emb = embed_by_bert(stage1_test_df)

In [None]:
X_test, y_test = featurize(stage1_test_bert_emb)

In [None]:
logit_test_pred = logit.predict_proba(X_test)
log_loss(y_test, logit_test_pred)

In [None]:
# Write the prediction to file for submission
submission = pd.read_csv("../input/sample_submission_stage_1.csv", index_col = "ID")
submission["A"] = logit_test_pred[:, 0]
submission["B"] = logit_test_pred[:, 1]
submission["NEITHER"] = logit_test_pred[:, 2]
submission.to_csv("submission.csv")