In [None]:
import os
import subprocess
import sys
import time
import zipfile

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

Downloading config and weights

In [None]:
# Download the pre-trained uncased BERT archive into the working directory.
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
# Unpack it next to the notebook; run_bert later reads vocab.txt, bert_config.json
# and bert_model.ckpt from the extracted uncased_L-12_H-768_A-12 directory.
with zipfile.ZipFile("uncased_L-12_H-768_A-12.zip","r") as zip_ref:
    zip_ref.extractall()
# List the extracted files as a quick sanity check.
!ls 'uncased_L-12_H-768_A-12'

We will use a few scripts from the BERT repository, which you can find here: https://github.com/google-research/bert

In [None]:
# Fetch the three BERT helper scripts into the working directory so they can be
# imported as modules below; extract_features.py is also run as a script by run_bert.
!wget https://raw.githubusercontent.com/google-research/bert/master/modeling.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/extract_features.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py

In [None]:
import modeling
import extract_features
import tokenization
import tensorflow as tf

Finally, download the GAP development and validation data from the gap-coreference git repo

In [None]:
# Download the GAP development and validation splits (tab-separated files).
!wget https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv
!wget https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv
# Show the working directory to confirm that the downloads are present.
!ls

Next, we feed BERT the text from the GAP files (the two files downloaded above and, later, the competition test set). For each line, we want to obtain contextual embeddings for the 3 target words (A, B, Pronoun). Here are some helper functions to keep track of the offsets of the target words.

In [None]:
def compute_offset_no_spaces(text, offset):
    """Return how many of the first `offset` characters of `text` are not a space.

    This converts a character offset in the raw text into an offset in the
    text with all " " characters removed.
    """
    # Index-based on purpose: an offset past the end of text still raises IndexError.
    return sum(1 for pos in range(offset) if text[pos] != " ")

def count_chars_no_special(text):
    """Return the number of characters in `text` that are not "#"."""
    ignored = ("#",)
    return len([char for char in text if char not in ignored])

def count_length_no_special(text):
    """Return the number of characters in `text` that are neither "#" nor a space."""
    ignored = ("#", " ")
    return len([char for char in text if char not in ignored])

In [None]:
# def run_bert(data):
# 	'''
# 	Runs a forward propagation of BERT on input text, extracting contextual word embeddings
# 	Input: data, a pandas DataFrame containing the information in one of the GAP files

# 	Output: emb, a pandas DataFrame containing contextual embeddings for the words A, B and Pronoun. Each embedding is a numpy array of shape (768)
# 	columns: "emb_A": the embedding for word A
# 	         "emb_B": the embedding for word B
# 	         "emb_P": the embedding for the pronoun
# 	         "label": the answer to the coreference problem: "A", "B" or "NEITHER"
# 	'''
#     # From the current file, take the text only, and write it in a file which will be passed to BERT
# 	text = data["Text"]
# 	text.to_csv("input.txt", index = False, header = False)

#     # The script extract_features.py runs forward propagation through BERT, and writes the output in the file output.jsonl
#     # I'm lazy, so I'm only saving the output of the last layer. Feel free to change --layers = -1 to save the output of other layers.
# 	os.system("python3 extract_features.py \
# 	  --input_file=input.txt \
# 	  --output_file=output.jsonl \
# 	  --vocab_file=uncased_L-12_H-768_A-12/vocab.txt \
# 	  --bert_config_file=uncased_L-12_H-768_A-12/bert_config.json \
# 	  --init_checkpoint=uncased_L-12_H-768_A-12/bert_model.ckpt \
# 	  --layers=-1 \
# 	  --max_seq_length=256 \
# 	  --batch_size=8")

# 	bert_output = pd.read_json("output.jsonl", lines = True)

# 	os.system("rm output.jsonl")
# 	os.system("rm input.txt")

# 	index = data.index
# 	columns = ["emb_A", "emb_B", "emb_P", "label"]
# 	emb = pd.DataFrame(index = index, columns = columns)
# 	emb.index.name = "ID"

# 	for i in range(len(data)): # For each line in the data file
# 		# get the words A, B, Pronoun. Convert them to lower case, since we're using the uncased version of BERT
# 		P = data.loc[i,"Pronoun"].lower()
# 		A = data.loc[i,"A"].lower()
# 		B = data.loc[i,"B"].lower()

# 		# For each word, find the offset not counting spaces. This is necessary for comparison with the output of BERT
# 		P_offset = compute_offset_no_spaces(data.loc[i,"Text"], data.loc[i,"Pronoun-offset"])
# 		A_offset = compute_offset_no_spaces(data.loc[i,"Text"], data.loc[i,"A-offset"])
# 		B_offset = compute_offset_no_spaces(data.loc[i,"Text"], data.loc[i,"B-offset"])
# 		# Figure out the length of A, B, not counting spaces or special characters
# 		A_length = count_length_no_special(A)
# 		B_length = count_length_no_special(B)

# 		# Initialize embeddings with zeros
# 		emb_A = np.zeros(768)
# 		emb_B = np.zeros(768)
# 		emb_P = np.zeros(768)

# 		# Initialize counts
# 		count_chars = 0
# 		cnt_A, cnt_B, cnt_P = 0, 0, 0

# 		features = pd.DataFrame(bert_output.loc[i,"features"]) # Get the BERT embeddings for the current line in the data file
# 		for j in range(2,len(features)):  # Iterate over the BERT tokens for the current line; we skip over the first 2 tokens, which don't correspond to words
# 			token = features.loc[j,"token"]

# 			# See if the character count until the current token matches the offset of any of the 3 target words
# 			if count_chars  == P_offset: 
# 				# print(token)
# 				emb_P += np.array(features.loc[j,"layers"][0]['values'])
# 				cnt_P += 1
# 			if count_chars in range(A_offset, A_offset + A_length): 
# 				# print(token)
# 				emb_A += np.array(features.loc[j,"layers"][0]['values'])
# 				cnt_A +=1
# 			if count_chars in range(B_offset, B_offset + B_length): 
# 				# print(token)
# 				emb_B += np.array(features.loc[j,"layers"][0]['values'])
# 				cnt_B +=1								
# 			# Update the character count
# 			count_chars += count_length_no_special(token)
# 		# Taking the average between tokens in the span of A or B, so divide the current value by the count	
# 		emb_A /= cnt_A
# 		emb_B /= cnt_B

# 		# Work out the label of the current piece of text
# 		label = "Neither"
# 		if (data.loc[i,"A-coref"] == True):
# 			label = "A"
# 		if (data.loc[i,"B-coref"] == True):
# 			label = "B"

# 		# Put everything together in emb
# 		emb.iloc[i] = [emb_A, emb_B, emb_P, label]

# 	return emb

The following method takes the data from a file, passes it through BERT to obtain contextual embeddings for the target words, then returns these embeddings in the emb DataFrame. Below, we will use it on gap-validation and gap-development, and then, in batches, on the competition test set.

In [None]:
def run_bert(data):
    '''
    Runs a forward propagation of BERT on input text, extracting contextual word embeddings.

    Input: data, a pandas DataFrame containing the information in one of the GAP files.
           The columns "Text", "A", "B", "Pronoun-offset", "A-offset" and "B-offset" are read.
           Rows are processed by position, so any index is accepted.

    Output: emb, a pandas DataFrame with the same index as data (index name "ID").
            Each cell is a numpy array of shape (768).
    columns: "emb_A": mean embedding of the BERT tokens that start inside the span of A
             "emb_B": mean embedding of the BERT tokens that start inside the span of B
             "emb_P": embedding of the BERT token that starts at the pronoun offset

    If no token matches A (or B), the corresponding embedding is all NaN; if no token
    matches the pronoun, emb_P stays all zeros. This can happen, for example, when the
    target lies beyond --max_seq_length.

    Side effects: writes input.txt and output.jsonl in the working directory and removes
    them once the BERT output has been loaded.

    Raises: subprocess.CalledProcessError if extract_features.py exits with an error;
            ValueError if BERT returned a different number of rows than data contains.
    '''
    embedding_size = 768

    # Write one raw text per line. Series.to_csv is deliberately not used: it wraps texts
    # containing commas or quotes in double quotes and doubles embedded quotes, which adds
    # tokens and shifts the character offsets compared against the BERT output below.
    with open("input.txt", "w", encoding="utf-8") as input_file:
        for text in data["Text"]:
            # Collapse embedded line breaks so that every text stays on a single line.
            input_file.write(" ".join(str(text).splitlines()) + "\n")

    # The script extract_features.py runs forward propagation through BERT, and writes the
    # output in the file output.jsonl. Only the last layer is saved; change --layers=-1 to
    # save the output of other layers. check=True makes a failed run raise immediately
    # instead of surfacing later as a confusing read error.
    subprocess.run(
        [
            "python3", "extract_features.py",
            "--input_file=input.txt",
            "--output_file=output.jsonl",
            "--vocab_file=uncased_L-12_H-768_A-12/vocab.txt",
            "--bert_config_file=uncased_L-12_H-768_A-12/bert_config.json",
            "--init_checkpoint=uncased_L-12_H-768_A-12/bert_model.ckpt",
            "--layers=-1",
            "--max_seq_length=256",
            "--batch_size=8",
        ],
        check=True,
    )

    bert_output = pd.read_json("output.jsonl", lines = True)

    os.remove("output.jsonl")
    os.remove("input.txt")

    if len(bert_output) != len(data):
        raise ValueError(
            "BERT returned %d rows for %d input texts" % (len(bert_output), len(data))
        )

    emb_A_rows, emb_B_rows, emb_P_rows = [], [], []

    rows = zip(
        data["Text"], data["A"], data["B"],
        data["Pronoun-offset"], data["A-offset"], data["B-offset"],
        bert_output["features"],
    )
    for text, name_A, name_B, raw_P_offset, raw_A_offset, raw_B_offset, features in rows:
        # For each word, find the offset not counting spaces. This is necessary for
        # comparison with the output of BERT.
        P_offset = compute_offset_no_spaces(text, raw_P_offset)
        A_offset = compute_offset_no_spaces(text, raw_A_offset)
        B_offset = compute_offset_no_spaces(text, raw_B_offset)
        # Length of A and B, not counting spaces or special characters. The names are
        # lower-cased because the uncased version of BERT is used.
        A_span = range(A_offset, A_offset + count_length_no_special(name_A.lower()))
        B_span = range(B_offset, B_offset + count_length_no_special(name_B.lower()))

        # Running sums of the matching token embeddings, and match counts for A and B
        sum_A = np.zeros(embedding_size)
        sum_B = np.zeros(embedding_size)
        emb_P = np.zeros(embedding_size)
        cnt_A, cnt_B = 0, 0

        # Number of non-space characters consumed by the tokens seen so far
        count_chars = 0
        for feature in features:
            token = feature["token"]
            # [CLS] and [SEP] are markers added by BERT, not part of the text. Skipping
            # them by name also keeps a trailing [SEP] from being matched on truncated input.
            if token in ("[CLS]", "[SEP]"):
                continue

            hits_P = count_chars == P_offset
            hits_A = count_chars in A_span
            hits_B = count_chars in B_span
            if hits_P or hits_A or hits_B:
                values = np.array(feature["layers"][0]['values'])
                if hits_P:
                    emb_P += values
                if hits_A:
                    sum_A += values
                    cnt_A += 1
                if hits_B:
                    sum_B += values
                    cnt_B += 1

            # Update the character count
            count_chars += count_length_no_special(token)

        # Average over the tokens in the span of A or B. With no matching token the result
        # is explicitly NaN (the value a 0/0 division would give) so a later imputation
        # step can fill it in.
        emb_A_rows.append(sum_A / cnt_A if cnt_A else np.full(embedding_size, np.nan))
        emb_B_rows.append(sum_B / cnt_B if cnt_B else np.full(embedding_size, np.nan))
        emb_P_rows.append(emb_P)

    # Build the result in one go; each cell holds one embedding array.
    emb = pd.DataFrame(
        {"emb_A": emb_A_rows, "emb_B": emb_B_rows, "emb_P": emb_P_rows},
        index = data.index,
        columns = ["emb_A", "emb_B", "emb_P"],
    )
    emb.index.name = "ID"

    return emb

Read the GAP validation and development files and pass them through BERT to obtain the contextual embeddings. Unfortunately, I wasn't able to silence TensorFlow, so it prints a lot of information and warnings when I run this cell.

In [None]:
print("Started at ", time.ctime())



In [None]:
# Load the GAP validation split (tab-separated) and compute BERT embeddings for its targets.
validation_data = pd.read_csv("gap-validation.tsv", sep = '\t')
validation_emb = run_bert(validation_data)

# Same for the GAP development split.
development_data = pd.read_csv("gap-development.tsv", sep = '\t')
development_emb = run_bert(development_data)

# Pairs with the "Started at" timestamp printed earlier to show how long the BERT runs took.
print("Finished at ", time.ctime())

In [None]:
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()

In [None]:
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [None]:
def featurize(embedding_df):
    """Turn a frame of per-row embeddings into a 2-D feature matrix.

    Input: embedding_df, a DataFrame whose "emb_A", "emb_B" and "emb_P" columns hold
           one equally sized 1-D array per row.

    Output: a float numpy array with one row per input row, made of the A, B and
            pronoun embeddings concatenated in that order.

    Rows are taken in positional order, so the frame's index labels are irrelevant
    (they do not have to be 0..n-1 or unique).
    """
    def _as_matrix(column):
        # Stack the per-row arrays of one column into an (n_rows, emb_size) matrix.
        return np.asarray(embedding_df[column].tolist()).astype('float')

    a_embs = _as_matrix("emb_A")
    b_embs = _as_matrix("emb_B")
    pronoun_embs = _as_matrix("emb_P")

    return np.concatenate([a_embs, b_embs, pronoun_embs], axis=1)

In [None]:
# def featurize(embedding_df):
    
#     pronoun_embs, a_embs, b_embs, labels = [], [], [], []
    
#     for i in tqdm(range(len(embedding_df))):
        
#         pronoun_embs.append(embedding_df.loc[i, "emb_P"])
#         a_embs.append(embedding_df.loc[i, "emb_A"])
#         b_embs.append(embedding_df.loc[i, "emb_B"])

#         label_map = {'A': 0, 'B': 1, 'Neither': 2}
#         labels.append(label_map[embedding_df.loc[i, "label"]])

    
#     a_embs = np.asarray(a_embs).astype('float')
#     b_embs = np.asarray(b_embs).astype('float') 
#     pronoun_embs = np.asarray(pronoun_embs).astype('float')
    
#     return np.concatenate([a_embs, b_embs, pronoun_embs], axis=1), np.asarray(labels)

In [None]:
# featurize returns only the feature matrix, so the labels are built here from the
# A-coref / B-coref columns of the raw GAP frames. Both concatenations use the same
# order (validation first, then development) with a fresh 0..n-1 index, which keeps
# row i of X_train aligned with y_train[i].
train_data = pd.concat([validation_data, development_data], ignore_index=True)
train_emb = pd.concat([validation_emb, development_emb], ignore_index=True)

X_train = featurize(train_emb)

# Class encoding: 0 = A, 1 = B, 2 = Neither. This is the column order in which the
# predicted probabilities are later written to the submission (A, B, NEITHER).
y_train = np.where(
    train_data["B-coref"].eq(True), 1,
    np.where(train_data["A-coref"].eq(True), 0, 2),
)

assert len(X_train) == len(y_train), "features and labels are misaligned"

In [None]:
X_train = my_imputer.fit_transform(X_train)

In [None]:
# L2-regularised multinomial logistic regression on the concatenated embeddings.
# C is the inverse regularisation strength in scikit-learn, so the small value used
# here means strong regularisation. random_state is fixed for reproducibility.
# NOTE(review): the multi_class argument is deprecated in recent scikit-learn
# releases - confirm against the installed version.
logit = LogisticRegression(C=0.0075, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=4, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [None]:
logit.fit(X_train, y_train)

In [None]:
import pickle
filename = 'finalized_model.sav'
# Persist the fitted classifier. The context manager guarantees that the file is
# flushed and closed before it is read back later in the notebook.
with open(filename, 'wb') as model_file:
    pickle.dump(logit, model_file)

In [None]:
test_data = pd.read_csv("../input/gendered-pronoun-resolution/test_stage_2.tsv",sep = "\t")


In [None]:
# Reload the classifier saved above. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from an untrusted file.
with open('./finalized_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
len(test_data)

In [None]:
submission = pd.read_csv("../input/gendered-pronoun-resolution/sample_submission_stage_2.csv", index_col = "ID")

In [None]:
len(submission)

In [None]:
submission.describe()

In [None]:
# Run BERT and the classifier over the test set in batches, and write the predicted
# class probabilities into the submission frame.
BATCH_SIZE = 200
PROBA_COLUMNS = ["A", "B", "NEITHER"]

# Predictions are written by row position, so both frames must have the same length
# (and are assumed to list the examples in the same order).
assert len(submission) == len(test_data), "submission and test data differ in length"
proba_positions = [submission.columns.get_loc(name) for name in PROBA_COLUMNS]

# Bounds come from the data itself, so the last (possibly shorter) batch is handled
# without running past the end of the prediction array.
for start in range(0, len(test_data), BATCH_SIZE):
    batch = test_data.iloc[start:start + BATCH_SIZE].reset_index(drop=True)
    test_emb = run_bert(batch)
    X_test = featurize(test_emb)
    # Reuse the imputer statistics learned on the training features rather than
    # refitting on each test batch, so every batch is preprocessed the same way.
    X_test = my_imputer.transform(X_test)
    logit_test_pred = loaded_model.predict_proba(X_test)

    # One positional block assignment; chained submission.iloc[row]["A"] = ... can
    # end up writing to a temporary copy instead of the frame.
    stop = start + len(logit_test_pred)
    submission.iloc[start:stop, proba_positions] = logit_test_pred
    print(stop)
        
    
    

In [None]:
len(submission)

In [None]:
submission.tail()

In [None]:
submission.iloc[12358]['A']

In [None]:
submission.to_csv("./submissionf.csv")

In [None]:
dm = pd.read_csv("./submissionf.csv")

In [None]:
dm.head()