In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import dgl
from dgl import DGLGraph

import re
import numpy as np
import pandas as pd

from pytorch_pretrained_bert import BertTokenizer

import spacy
import pickle
import collections

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import os
# Restrict this process to the first physical GPU and make it the current CUDA device.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})
torch.cuda.set_device(torch.device('cuda', 0))

In [2]:
'''
Load results from BERT Embeddings Generatation with Punc.ipynb and BERT Embeddings Generatation without Punc.ipynb
'''


def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards.

    NOTE(review): unpickling can execute arbitrary code; only load artifacts
    that were produced by the embedding notebooks named above.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Artifacts for the training corpus.
token_lst = _load_pickle('token_lst_wto_padding.pkl')  # tokens of every sentence without padding
bert_outputs_lst = _load_pickle('bert_outputs.pkl')  # list of outputs of bert for every sentence
offsets_lst = _load_pickle('offsets_lst.pkl')

# The same three artifacts for the held-out corpus.
test_token_lst = _load_pickle('test_token_lst_wto_padding.pkl')  # tokens of every sentence without padding
test_bert_outputs_lst = _load_pickle('test_bert_outputs.pkl')  # list of outputs of bert for every sentence
test_offsets_lst = _load_pickle('test_offsets_lst.pkl')

others_bert_outputs = _load_pickle('others_bert_outputs.pkl')
test_others_bert_outputs = _load_pickle('test_others_bert_outputs.pkl')


# The model is trained on gap-test + gap-validation and scored on gap-development.
train_df = pd.concat([
    pd.read_csv("gap-test.tsv", delimiter="\t"),
    pd.read_csv("gap-validation.tsv", delimiter="\t")
], axis=0)

test_df = pd.read_csv("gap-development.tsv", delimiter="\t")

# Model Building

## GATNE Model

In [3]:
import math
class RGATModel(nn.Module):
    """Relation-aware attention encoder applied independently to each graph.

    Every node receives a base embedding (a linear map of its input feature)
    plus an attention-weighted mix of per-edge-type neighbour aggregates.

    Args:
        embedding_size: size of the embedding produced for each node.
        embedding_u_size: size of a node's embedding when it acts as a neighbour.
        edge_type_count: number of edge types (categories).
        dim_a: number of hidden features of the attention scorer.
        feature_dim: size of the input node features (defaults to 1024).
    """

    def __init__(
        self, embedding_size, embedding_u_size, edge_type_count, dim_a, feature_dim=1024
    ):
        super(RGATModel, self).__init__()
        self.embedding_size = embedding_size  # embedding size output for each node
        self.embedding_u_size = embedding_u_size  # embedding size of a node acting as a neighbour
        self.edge_type_count = edge_type_count  # number of edge types
        self.dim_a = dim_a  # number of hidden features in the attention scorer
        self.feature_dim = feature_dim  # size of the input node features

        self.embed_trans = nn.Parameter(torch.FloatTensor(feature_dim, embedding_size))
        self.u_embed_trans = nn.Parameter(torch.FloatTensor(edge_type_count, feature_dim, embedding_u_size))

        self.trans_weights = nn.Parameter(
            torch.FloatTensor(edge_type_count, embedding_u_size, embedding_size)
        )
        self.trans_weights_s1 = nn.Parameter(
            torch.FloatTensor(edge_type_count, embedding_u_size, dim_a)
        )
        self.trans_weights_s2 = nn.Parameter(torch.FloatTensor(edge_type_count, dim_a, 1))

        self.reset_parameters()

    def reset_parameters(self):
        """Draw every weight from N(0, 1 / embedding_size)."""
        std = 1.0 / math.sqrt(self.embedding_size)
        # Same initialisation order as the parameters are declared in __init__.
        for weight in (
            self.embed_trans,
            self.u_embed_trans,
            self.trans_weights,
            self.trans_weights_s1,
            self.trans_weights_s2,
        ):
            nn.init.normal_(weight, std=std)

    def forward(self, features_lst, train_types_lst, node_neigh_lst):
        """Encode a list of graphs.

        Args:
            features_lst: per graph, a ``(3 * num_nodes, feature_dim)`` tensor
                (the node features stacked once per edge type).
            train_types_lst: per graph, a ``(3 * num_nodes,)`` integer tensor
                holding the edge type of each stacked row.
            node_neigh_lst: per graph, an integer tensor of sampled neighbour
                ids indexed as ``[node, edge_type, sample]``.

        Returns:
            A list with one ``(num_nodes, 3 * embedding_size)`` tensor per graph.
        """
        # Follow the module's own device instead of assuming CUDA.
        device = self.embed_trans.device
        rst_hidden = []
        for features, train_types, node_neigh in zip(features_lst, train_types_lst, node_neigh_lst):
            features = features.to(device)
            train_types = train_types.to(device)
            node_neigh = node_neigh.to(device)

            num_nodes = features.shape[0] // 3
            node_embed = torch.matmul(features, self.embed_trans)

            # Project the neighbours sampled for edge type i with that type's
            # own matrix; result is [node, edge_type, sample, embedding_u_size].
            node_embed_tmp = torch.einsum('bijk,ikm->bijm', features[node_neigh], self.u_embed_trans)
            # Sum the neighbour information, then replicate it for the three stacked copies.
            node_type_embed = torch.sum(node_embed_tmp, dim=2).repeat(3, 1, 1)

            trans_w = self.trans_weights[train_types]
            trans_w_s1 = self.trans_weights_s1[train_types]
            trans_w_s2 = self.trans_weights_s2[train_types]

            attention = F.softmax(
                torch.matmul(
                    torch.tanh(torch.matmul(node_type_embed, trans_w_s1)), trans_w_s2
                ).squeeze(2),
                dim=1,
            ).unsqueeze(1)
            # Attention-weighted sum over the per-edge-type aggregates.
            node_type_embed = torch.matmul(attention, node_type_embed)
            node_embed = node_embed + torch.matmul(node_type_embed, trans_w).squeeze(1)
            last_node_embed = F.normalize(node_embed, dim=1)

            # Concatenate the three per-edge-type views of every node.
            last_node_embed = torch.cat(
                (
                    last_node_embed[:num_nodes],
                    last_node_embed[num_nodes:2 * num_nodes],
                    last_node_embed[2 * num_nodes:],
                ),
                dim=1,
            )
            rst_hidden.append(last_node_embed)
        return rst_hidden


## Design the Main Model (RGAT + FFNN)

In [4]:
class Head(nn.Module):
    """The MLP submodule.

    Classifies the concatenation of the graph-encoder states selected at the
    three offsets and the three projected BERT embeddings into 3 classes.

    Args:
        rgat_out_size: width of one graph-encoder node state.
        bert_out_size: width of one projected BERT embedding.
    """
    def __init__(self, rgat_out_size: int, bert_out_size: int):
        super().__init__()
        self.bert_out_size = bert_out_size
        self.rgat_out_size = rgat_out_size

        in_features = bert_out_size * 3 + rgat_out_size * 3
        self.fc = nn.Sequential(
            nn.BatchNorm1d(in_features),
            nn.Dropout(0.5),
            nn.Linear(in_features, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.5),
            nn.Linear(256, 3),
        )
        for module in self.fc:
            if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                # The layers above are plain nn.Linear (no weight normalisation),
                # so only the regular weight/bias need initialising.
                nn.init.kaiming_normal_(module.weight)
                nn.init.constant_(module.bias, 0)

    def forward(self, rgat_outputs, offsets_rgat, bert_embeddings):
        """Return the ``(batch, 3)`` class logits.

        Args:
            rgat_outputs: list with one ``(num_nodes, rgat_out_size)`` tensor per sample.
            offsets_rgat: ``(batch, 3)`` integer tensor of node ids to read per sample.
            bert_embeddings: ``(batch, 3 * bert_out_size)`` tensor.
        """
        # Pick the three node states of each sample and flatten them. Stacking
        # 1-D rows keeps the batch axis even when the batch holds one sample.
        extracted = [
            node_states.index_select(0, node_ids).reshape(-1)
            for node_states, node_ids in zip(rgat_outputs, offsets_rgat)
        ]
        rgat_extracted_outputs = torch.stack(extracted, dim=0)

        embeddings = torch.cat((rgat_extracted_outputs, bert_embeddings), 1)

        return self.fc(embeddings)


class BERT_Head(nn.Module):
    """Projects the three flattened BERT embeddings of a sample.

    Args:
        bert_hidden_size: width of a single BERT embedding.
        out_size: per-embedding output width; the layer emits ``out_size * 3``
            features (defaults to 512).
    """
    def __init__(self, bert_hidden_size: int, out_size: int = 512):
        super().__init__()
        self.fc = nn.Sequential(
            nn.BatchNorm1d(bert_hidden_size * 3),
            nn.Dropout(0.5),
            nn.Linear(bert_hidden_size * 3, out_size * 3),
            nn.ReLU(),
        )

        for module in self.fc:
            if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                # The layer above is a plain nn.Linear (no weight normalisation),
                # so only the regular weight/bias need initialising.
                nn.init.kaiming_normal_(module.weight)
                nn.init.constant_(module.bias, 0)

    def forward(self, bert_embeddings):
        """Flatten everything after the batch axis and apply the projection."""
        # reshape (unlike view) also accepts non-contiguous inputs.
        flat = bert_embeddings.reshape(bert_embeddings.shape[0], -1)
        return self.fc(flat)
        
    
class GPRModel(nn.Module):
    """The main model: graph encoder + BERT projection feeding the MLP head."""

    def __init__(self):
        super().__init__()
        rgat_embedding_size = 256
        self.RGAT = RGATModel(rgat_embedding_size, 10, 3, 20)
        self.BERThead = BERT_Head(1024)  # bert output size
        # Each node state is three concatenated embeddings; BERT_Head emits 512 per mention.
        self.head = Head(rgat_embedding_size * 3, 512)

    def forward(self, offsets_bert, offsets_rgat, bert_embeddings, features, train_types, neighbors):
        """Return the class logits for a batch.

        ``offsets_bert`` is accepted for interface compatibility but is not
        read by this method.
        """
        node_states = self.RGAT(features, train_types, neighbors)
        projected_bert = self.BERThead(bert_embeddings)
        return self.head(node_states, offsets_rgat, projected_bert)


# Data Input

## Generate All Syntactic Graphs with BERT embeddings

In [5]:
# spaCy pipeline used to parse every sentence.
# If loading fails because the model cannot be found, run this in a terminal:
#     python -m spacy download en_core_web_lg
parser = spacy.load('en_core_web_lg')

BERT_MODEL = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(
    BERT_MODEL,
    never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]", "[THISISA]", "[THISISB]", "[THISISP]"),
)

# Give the three custom placeholder tokens the dummy vocabulary id -1.
tokenizer.vocab.update({"[THISISA]": -1, "[THISISB]": -1, "[THISISP]": -1})

### Training Data

In [6]:
def is_target(i, target_offset_list):
    """Tell whether index ``i`` is one of the entries of ``target_offset_list``."""
    found = i in target_offset_list
    return found

def transfer_n_e(nodes, edges):
    """Re-express ``edges`` with the ids stored in the ``nodes`` mapping.

    Returns:
        ``(number of nodes, list of [source_id, target_id] pairs)``.
    """
    remapped = [[nodes[source], nodes[target]] for source, target in edges]
    return len(nodes), remapped

def gen_edge_by_type(tran_edges,edge_type):
    """Group edges by type.

    ``edge_type[k]`` is the type of ``tran_edges[k]``; the returned dict maps
    ``str(type)`` to the list of edges of that type, in input order.
    """
    buckets = dict()
    for position, kind in enumerate(edge_type):
        buckets.setdefault(str(kind), list()).append(tran_edges[position])
    return buckets

def generate_neighbors(network_data, num_nodes, edge_types=('0', '1', '2'), neighbor_samples=4):
    """Sample a fixed number of neighbours per node and edge type.

    Args:
        network_data: mapping from edge-type key to a list of ``(x, y)`` node-id
            pairs. An edge type without edges may be missing from the mapping.
        num_nodes: number of nodes; ids are expected in ``range(num_nodes)``.
        edge_types: keys of ``network_data`` in the order used for the result.
        neighbor_samples: number of neighbours kept per node and edge type.

    Returns:
        A nested list indexed as ``[node][edge_type][sample]`` of Python ints.
        Nodes without neighbours of a type are their own neighbours; short
        lists are padded and long lists replaced by random draws (with
        replacement) from ``np.random``.
    """
    edge_type_count = len(edge_types)
    neighbors = [[[] for __ in range(edge_type_count)] for _ in range(num_nodes)]
    for r, edge_type in enumerate(edge_types):
        # Every edge makes both endpoints neighbours of each other.
        for x, y in network_data.get(edge_type, ()):
            neighbors[x][r].append(y)
            neighbors[y][r].append(x)
        for i in range(num_nodes):  # bring every list to exactly neighbor_samples entries
            current = neighbors[i][r]
            if len(current) == 0:
                neighbors[i][r] = [i] * neighbor_samples
            elif len(current) < neighbor_samples:
                missing = neighbor_samples - len(current)
                current.extend(np.random.choice(current, size=missing).tolist())
            elif len(current) > neighbor_samples:
                neighbors[i][r] = np.random.choice(current, size=neighbor_samples).tolist()
    return neighbors


# Per-sentence model inputs for the training corpus; every list below receives
# one entry per element of token_lst, in the same order.
all_train_types = []
all_neighbors = []
all_features = []

all_graphs = []
rgat_offsets = []
for i, sent_token in enumerate(token_lst):
    sent_token = token_lst[i]  # same object enumerate() already yielded

    # Rebuild a plain-text sentence from the token ids (first and last id are
    # dropped, '#' characters are stripped) and parse it with spaCy.
    # NOTE(review): the code below uses spaCy token indices and BERT positions
    # interchangeably, which presumes spaCy splits the string exactly at the
    # spaces inserted here - TODO confirm.
    sent = ' '.join([re.sub("[#]","",token)   for token in tokenizer.convert_ids_to_tokens(sent_token[1:-1])])
    doc = parser(sent)
    parse_rst = doc.to_json()

    # Shift the stored offsets by one so they index the sentence built above
    # (presumably compensating for the dropped leading token).
    target_offset_list = [item - 1 for item in offsets_lst[i]]
    
    # nodes: parsed-token index -> compact node id, in order of first appearance.
    # edges / edge_type are parallel lists; types are
    #   0 = self-loop, 1 = head -> token, 2 = token -> head.
    nodes = collections.OrderedDict()
    edges = []
    edge_type = []
    for i_word, word in enumerate(parse_rst['tokens']):
        # Keep only tokens that are a target or whose syntactic head is a target.
        if not (is_target(i_word, target_offset_list) or is_target(word['head'], target_offset_list)):
            continue

        # Register both endpoints on first sight, each with its own self-loop.
        if i_word not in nodes:
            nodes[i_word] = len(nodes) 
            edges.append( [i_word, i_word] )
            edge_type.append(0)
        if word['head'] not in nodes:
            nodes[word['head']] = len(nodes) 
            edges.append( [word['head'], word['head']] )
            edge_type.append(0)

        # Every non-root token contributes one edge in each direction.
        if word['dep'] != 'ROOT':
                edges.append( [word['head'], word['id']] )
                edge_type.append(1)
                edges.append( [word['id'], word['head']] )
                edge_type.append(2)

    # Re-express the edges with compact node ids.
    num_nodes, tran_edges = transfer_n_e(nodes, edges)
    
    edge_data_by_type = gen_edge_by_type(tran_edges,edge_type)
    neighbors = generate_neighbors(edge_data_by_type, num_nodes)
    # One block of num_nodes entries per edge type (0, 1, 2).
    train_types = [0]*num_nodes +[1]*num_nodes+[2]*num_nodes
    
    all_train_types.append(torch.from_numpy(np.array(train_types,dtype = int)))
    all_neighbors.append(torch.from_numpy(np.array(neighbors)))
    
    # Node ids of the target tokens; raises KeyError if a target offset never
    # became a node in the loop above.
    rgat_offset = [nodes[offset] for offset in target_offset_list]
    rgat_offsets.append(rgat_offset)
    
    # Build the DGL graph for this sentence on the GPU.
    G = dgl.DGLGraph()
    G = G.to('cuda:0')  
    G.add_nodes(num_nodes)
    G.add_edges(list(zip(*tran_edges))[0],list(zip(*tran_edges))[1]) 

    # Fill the node feature 'h': target tokens read their vector from
    # others_bert_outputs (indexed by position in target_offset_list), all
    # other tokens from bert_outputs_lst at position index + 1.
    for i_word, word in enumerate(parse_rst['tokens']):
        if not (is_target(i_word, target_offset_list) or is_target(word['head'], target_offset_list)):
            continue
        if is_target(i_word, target_offset_list): 
            G.nodes[ [ nodes[i_word] ]].data['h'] = others_bert_outputs[i][0][target_offset_list.index(i_word)].unsqueeze(0).cuda()
        else:
            G.nodes[ [ nodes[i_word] ]].data['h'] = bert_outputs_lst[i][0][i_word + 1].unsqueeze(0).cuda()
        if is_target(word['head'], target_offset_list):
            G.nodes[ [ nodes[word['head']] ]].data['h'] = others_bert_outputs[i][0][target_offset_list.index(word['head'])].unsqueeze(0).cuda()
        else:   
            G.nodes[ [ nodes[word['head']] ]].data['h'] = bert_outputs_lst[i][0][word['head'] + 1].unsqueeze(0).cuda()

    # Edge weights: 1 for self-loops, otherwise 1 / (in-degree of the target - 1);
    # the "- 1" presumably discounts the target's own self-loop - TODO confirm.
    edge_norm = []
    for e1, e2 in tran_edges:
        if e1 == e2:
            edge_norm.append(1)
        else:
            edge_norm.append( 1 / (G.in_degree(e2) - 1 ) )


    edge_type = torch.from_numpy(np.array(edge_type)).cuda()
    edge_norm = torch.from_numpy(np.array(edge_norm)).unsqueeze(1).float().cuda()

    G.edata.update({'rel_type': edge_type,})
    G.edata.update({'norm': edge_norm})
    all_graphs.append(G)
    # Node features stacked three times, matching the three blocks of train_types.
    all_features.append(G.ndata['h'].repeat(3,1))



### Test Data

In [7]:
# Per-sentence model inputs for the held-out corpus; every list below receives
# one entry per element of test_token_lst, in the same order. The loop mirrors
# the training-data loop but reads the test_* artifacts.
test_all_train_types = []
test_all_neighbors = []
test_all_features = []

test_all_graphs = []
test_rgat_offsets = []
for i, sent_token in enumerate(test_token_lst):
    sent_token = test_token_lst[i]  # same object enumerate() already yielded

    # Rebuild a plain-text sentence from the token ids (first and last id are
    # dropped, '#' characters are stripped) and parse it with spaCy.
    # NOTE(review): the code below uses spaCy token indices and BERT positions
    # interchangeably, which presumes spaCy splits the string exactly at the
    # spaces inserted here - TODO confirm.
    sent = ' '.join([re.sub("[#]","",token)   for token in tokenizer.convert_ids_to_tokens(sent_token[1:-1])])
    doc = parser(sent)
    parse_rst = doc.to_json()

    # Shift the stored offsets by one so they index the sentence built above
    # (presumably compensating for the dropped leading token).
    target_offset_list = [item - 1 for item in test_offsets_lst[i]]
    
    # nodes: parsed-token index -> compact node id, in order of first appearance.
    # edges / edge_type are parallel lists; types are
    #   0 = self-loop, 1 = head -> token, 2 = token -> head.
    nodes = collections.OrderedDict()
    edges = []
    edge_type = []
    for i_word, word in enumerate(parse_rst['tokens']):
        # Keep only tokens that are a target or whose syntactic head is a target.
        if not (is_target(i_word, target_offset_list) or is_target(word['head'], target_offset_list)):
            continue

        # Register both endpoints on first sight, each with its own self-loop.
        if i_word not in nodes:
            nodes[i_word] = len(nodes) 
            edges.append( [i_word, i_word] )
            edge_type.append(0)
        if word['head'] not in nodes:
            nodes[word['head']] = len(nodes) 
            edges.append( [word['head'], word['head']] )
            edge_type.append(0)

        # Every non-root token contributes one edge in each direction.
        if word['dep'] != 'ROOT':
                edges.append( [word['head'], word['id']] )
                edge_type.append(1)
                edges.append( [word['id'], word['head']] )
                edge_type.append(2)

    # Re-express the edges with compact node ids.
    num_nodes, tran_edges = transfer_n_e(nodes, edges)
    
    edge_data_by_type = gen_edge_by_type(tran_edges,edge_type)
    neighbors = generate_neighbors(edge_data_by_type, num_nodes)
    # One block of num_nodes entries per edge type (0, 1, 2).
    train_types = [0]*num_nodes +[1]*num_nodes+[2]*num_nodes
    
    test_all_train_types.append(torch.from_numpy(np.array(train_types,dtype = int)))
    test_all_neighbors.append(torch.from_numpy(np.array(neighbors)))
    
    # Node ids of the target tokens; raises KeyError if a target offset never
    # became a node in the loop above.
    test_rgat_offset = [nodes[offset] for offset in target_offset_list]
    test_rgat_offsets.append(test_rgat_offset)
    
    # Build the DGL graph for this sentence on the GPU.
    G = dgl.DGLGraph()
    G = G.to('cuda:0')
    G.add_nodes(num_nodes)
    G.add_edges(list(zip(*tran_edges))[0],list(zip(*tran_edges))[1]) 

    # Fill the node feature 'h': target tokens read their vector from
    # test_others_bert_outputs (indexed by position in target_offset_list), all
    # other tokens from test_bert_outputs_lst at position index + 1.
    for i_word, word in enumerate(parse_rst['tokens']):
        if not (is_target(i_word, target_offset_list) or is_target(word['head'], target_offset_list)):
            continue
        
        if is_target(i_word, target_offset_list): 
            G.nodes[ [ nodes[i_word] ]].data['h'] = test_others_bert_outputs[i][0][target_offset_list.index(i_word)].unsqueeze(0).cuda()
        else:
            G.nodes[ [ nodes[i_word] ]].data['h'] = test_bert_outputs_lst[i][0][i_word + 1].unsqueeze(0).cuda()
        if is_target(word['head'], target_offset_list):
            G.nodes[ [ nodes[word['head']] ]].data['h'] = test_others_bert_outputs[i][0][target_offset_list.index(word['head'])].unsqueeze(0).cuda()
        else:   
            G.nodes[ [ nodes[word['head']] ]].data['h'] = test_bert_outputs_lst[i][0][word['head'] + 1].unsqueeze(0).cuda()

    # Edge weights: 1 for self-loops, otherwise 1 / (in-degree of the target - 1);
    # the "- 1" presumably discounts the target's own self-loop - TODO confirm.
    edge_norm = []
    for e1, e2 in tran_edges:
        if e1 == e2:
            edge_norm.append(1)
        else:
            edge_norm.append( 1 / (G.in_degree(e2) - 1 ) )


    edge_type = torch.from_numpy(np.array(edge_type)).cuda()
    edge_norm = torch.from_numpy(np.array(edge_norm)).unsqueeze(1).float().cuda()

    G.edata.update({'rel_type': edge_type,})
    G.edata.update({'norm': edge_norm})
    test_all_graphs.append(G)
    # Node features stacked three times, matching the three blocks of train_types.
    test_all_features.append(G.ndata['h'].repeat(3,1))

## Design Dataloader and Dataset

In [8]:
class GPRDataset(Dataset):
    """Index-aligned bundle of per-example inputs plus one-hot labels.

    The labels ``y`` have three boolean columns: ``A-coref``, ``B-coref`` and
    ``Neither`` (true when both of the others are false).
    """

    def __init__(self, original_df, graphs, bert_offsets, rgat_offsets, bert_embeddings,features,train_types,neighbors):
        neither = ~(original_df["A-coref"] | original_df["B-coref"])
        label_frame = original_df[["A-coref", "B-coref"]].assign(Neither=neither)
        self.y = label_frame.to_numpy().astype("bool")

        self.graphs = graphs
        self.bert_offsets = bert_offsets  # already +1
        self.bert_embeddings = bert_embeddings  # include [CLS]
        self.rgat_offsets = rgat_offsets
        self.features = features
        self.train_types = train_types
        self.neighbors = neighbors

    def __len__(self):
        """Number of examples, taken from the list of graphs."""
        return len(self.graphs)

    def __getitem__(self, idx):
        """Return the 8-tuple (graph, bert_offsets, rgat_offsets, bert_embeddings,
        label row, features, train_types, neighbors) for example ``idx``."""
        columns = (
            self.graphs,
            self.bert_offsets,
            self.rgat_offsets,
            self.bert_embeddings,
            self.y,
            self.features,
            self.train_types,
            self.neighbors,
        )
        return tuple(column[idx] for column in columns)

In [9]:
def collate(samples):
    """Merge a list of ``GPRDataset`` items into one batch.

    Returns:
        ``(batched_graph, offsets_bert, offsets_rgat, bert_embeddings, labels,
        features, train_types, neighbors)`` where the offsets are stacked
        ``LongTensor``s, ``labels`` holds the class index of each one-hot label
        row, and ``features`` / ``train_types`` / ``neighbors`` stay Python
        lists with one entry per sample.
    """
    graphs, bert_offsets, rgat_offsets, bert_embeddings, labels,features,train_types,neighbors = map(list, zip(*samples))

    batched_graph = dgl.batch(graphs)
    offsets_bert = torch.stack([torch.LongTensor(x) for x in bert_offsets], dim=0)
    offsets_rgat = torch.stack([torch.LongTensor(x) for x in rgat_offsets], dim=0)

    # One-hot rows -> class indices.
    one_hot_labels = torch.stack([torch.from_numpy(x.astype("uint8")) for x in labels], dim=0)
    _, labels = one_hot_labels.max(dim=1)

    # Drop only the singleton axis that follows the batch axis. A bare
    # squeeze() would also remove the batch axis when the batch has one sample.
    bert_embeddings = torch.stack(bert_embeddings, dim=0).squeeze(1)

    return batched_graph, offsets_bert, offsets_rgat, bert_embeddings, labels,features,train_types,neighbors

## Test DataLoader

In [10]:
# Dataset over the held-out split. The training and validation datasets are
# built per fold inside the cross-validation cell.
test_dataset = GPRDataset(
    original_df=test_df,
    graphs=test_all_graphs,
    bert_offsets=test_offsets_lst,
    rgat_offsets=test_rgat_offsets,
    bert_embeddings=test_others_bert_outputs,
    features=test_all_features,
    train_types=test_all_train_types,
    neighbors=test_all_neighbors,
)

In [11]:
# Loader over the held-out split: batches of 4 in dataset order (no shuffling).
# The training loader is created per fold inside the cross-validation cell.
test_dataloarder = DataLoader(test_dataset, batch_size=4, collate_fn=collate)

# Training Part

In [12]:
def send_graph_to_cpu(g):
    """Move graph ``g`` and each of its node and edge features to the CPU.

    Returns the graph object obtained from ``g.to("cpu")``.
    """
    moved = g.to("cpu")
    # Node features first ...
    for name in moved.node_attr_schemes().keys():
        feature = moved.ndata.pop(name)
        moved.ndata[name] = feature.cpu()
    # ... then edge features.
    for name in moved.edge_attr_schemes().keys():
        feature = moved.edata.pop(name)
        moved.edata[name] = feature.cpu()
    return moved

In [13]:
lr_value = 0.0001  # base learning rate after warm-up
total_epoch = 100  # number of training epochs; also the horizon of the decay


def adjust_learning_rate(optimizers, epoch):
    """Set the learning rate of every optimizer for ``epoch`` and return it.

    Schedule:
        * ``epoch < 10``: constant warm-up rate ``1e-5``;
        * ``10 <= epoch <= 36``: ``lr_value * (1 - epoch / total_epoch) ** 0.9``;
        * ``epoch > 36``: ``1.5e-5 * (1 - epoch / total_epoch) ** 0.9``.

    Args:
        optimizers: iterable of objects exposing ``param_groups`` (a list of
            dicts with an ``'lr'`` entry), e.g. ``torch.optim`` optimizers.
        epoch: zero-based epoch number.

    Returns:
        The learning rate that was written into every parameter group.
    """
    # Polynomial decay factor. Clamped at 0 because a negative base raised to
    # a fractional power would yield a complex number past the horizon.
    decay = pow(max(0.0, 1 - 1.0 * epoch / total_epoch), 0.9)

    if epoch > 36:
        lr_tmp = 0.000015 * decay
    elif epoch < 10:
        # warm up
        lr_tmp = 0.00001
    else:
        lr_tmp = lr_value * decay

    for optimizer in optimizers:
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr_tmp

    return lr_tmp

In [14]:
# generate labels
tmp = train_df[["A-coref", "B-coref"]].copy()
tmp["Neither"] = ~(train_df["A-coref"] | train_df["B-coref"])
train_y = tmp.values.astype("bool").argmax(1)

## 5 Fold

In [15]:
from operator import itemgetter


def _fold_dataset(indices):
    """Build a GPRDataset restricted to the training-corpus rows in ``indices``."""
    def pick(items):
        return [items[j] for j in indices]

    return GPRDataset(
        original_df=train_df.iloc[indices],
        graphs=pick(all_graphs),
        bert_offsets=pick(offsets_lst),
        rgat_offsets=pick(rgat_offsets),
        bert_embeddings=pick(others_bert_outputs),
        features=pick(all_features),
        train_types=pick(all_train_types),
        neighbors=pick(all_neighbors),
    )


def _l2_penalty(net):
    """Sum of the L2 norms of every parameter tensor of the three sub-modules."""
    return sum(
        w.norm(2)
        for part in (net.RGAT, net.head, net.BERThead)
        for w in part.parameters()
    )


def _run_eval(net, loader, criterion, keep_logits=False):
    """Evaluate ``net`` on ``loader`` without gradients.

    Returns:
        ``(mean of the per-batch losses, list of per-batch logits)``; the list
        is empty unless ``keep_logits`` is true.
    """
    net.eval()
    total_loss = 0.
    logits = []
    with torch.no_grad():
        for batched_graph, offsets_bert, offsets_rgat, bert_embeddings, labels, features, train_types, neighbors in loader:
            offsets_rgat = offsets_rgat.cuda()
            bert_embeddings = bert_embeddings.cuda()
            labels = labels.cuda()
            prediction = net(offsets_bert, offsets_rgat, bert_embeddings, features, train_types, neighbors)
            if keep_logits:
                logits.append(prediction)
            total_loss += criterion(prediction, labels).detach().item()
    return total_loss / max(len(loader), 1), logits


kfold = StratifiedKFold(n_splits = 5)
test_predict_lst = [] # the test output for every fold
for train_index, test_index in kfold.split(train_df, train_y):
    print("=" * 20)
    print(f"Fold {len(test_predict_lst) + 1}")
    print("=" * 20)

    val_dataset = _fold_dataset(test_index)
    train_dataset = _fold_dataset(train_index)

    train_dataloarder = DataLoader(
        train_dataset,
        collate_fn = collate,
        batch_size = 4,
        shuffle=True,
    )

    val_dataloarder = DataLoader(
        val_dataset,
        collate_fn = collate,
        batch_size = 4,
    )

    model = GPRModel().cuda()
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr_value)
    reg_lambda = 0.035  # weight of the L2 penalty added to the training loss

    print('Dataloader Success---------------------')

    best_val_loss = float('inf')
    for epoch in range(total_epoch):

        if epoch % 5 == 0:
            print('|',">" * epoch," "*(80-epoch),'|')

        lr = adjust_learning_rate([optimizer],epoch)
        model.train()
        for batched_graph, offsets_bert, offsets_rgat, bert_embeddings, labels, features, train_types, neighbors in train_dataloarder:
            bert_embeddings = bert_embeddings.cuda()
            labels = labels.cuda()
            offsets_rgat = offsets_rgat.cuda()

            prediction = model(offsets_bert, offsets_rgat, bert_embeddings, features, train_types, neighbors)
            loss = loss_func(prediction, labels) + _l2_penalty(model) * reg_lambda

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        val_loss, _ = _run_eval(model, val_dataloarder, loss_func)

        if epoch%20 == 0:
            print('Epoch {}, val_loss {:.4f}'.format(epoch, val_loss))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Save on every improvement so the checkpoint on disk always belongs
            # to this fold and matches best_val_loss.
            torch.save(model.state_dict(), 'best_model.pth')
            if epoch > 36: print('Best val loss found: ', best_val_loss)

    print('This fold, the best val loss is: ', best_val_loss)

    # Score the held-out split with the best checkpoint of this fold.
    model = GPRModel().to("cuda:0")
    model.load_state_dict(torch.load('best_model.pth'))
    test_loss, fold_logits = _run_eval(model, test_dataloarder, loss_func, keep_logits=True)
    test_predict = torch.cat(fold_logits, 0) if fold_logits else None

    print('This fold, the test loss is: ', test_loss)
    test_predict_lst.append(test_predict)

Fold 1
Dataloader Success---------------------
|                                                                                   |
Epoch 0, val_loss 1.2307
| >>>>>                                                                             |
| >>>>>>>>>>                                                                        |
| >>>>>>>>>>>>>>>                                                                   |
| >>>>>>>>>>>>>>>>>>>>                                                              |
Epoch 20, val_loss 0.6380
| >>>>>>>>>>>>>>>>>>>>>>>>>                                                         |
| >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>                                                    |
| >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>                                               |
Best val loss found:  0.5709718752924989
| >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>                                          |
Epoch 40, val_loss 0.5864
| >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>              

# Test Part

In [16]:
# Per fold: logits -> class probabilities, clipped away from 0 and 1, as NumPy arrays.
test_predict_arr = [
    pre.cpu().softmax(dim=-1).clamp(min=1e-4, max=1-1e-4).numpy()
    for pre in test_predict_lst
]

In [17]:
# Average the per-fold probabilities element-wise.
final_test_preds = np.stack(test_predict_arr, axis=0).mean(axis=0)

In [18]:
# Display the fold-averaged class probabilities (one row per held-out example).
final_test_preds

array([[0.8161887 , 0.12780455, 0.05600675],
       [0.96323794, 0.01873819, 0.01802389],
       [0.02798579, 0.9661549 , 0.00585947],
       ...,
       [0.5187353 , 0.40772533, 0.07353938],
       [0.6908145 , 0.05375566, 0.2554298 ],
       [0.03592736, 0.87267363, 0.09139912]], dtype=float32)

In [19]:
from sklearn.metrics import log_loss

def extract_target(df):
    """Add integer ``Neither`` and ``target`` columns to ``df`` in place.

    ``Neither`` is 1 where both ``A-coref`` and ``B-coref`` are false, else 0.
    ``target`` is 0 by default, 1 where ``B-coref`` is set and 2 where
    ``Neither`` is 1. The same (mutated) frame is returned.
    """
    neither_mask = ~(df['A-coref'] | df['B-coref'])
    df["Neither"] = neither_mask.astype("int64")

    target = (df['B-coref'] == 1).astype("int64")
    target[df["Neither"] == 1] = 2
    df["target"] = target
    return df
# Attach the integer targets to the held-out frame, then score the averaged probabilities.
test_df = extract_target(test_df)
log_loss(test_df["target"], final_test_preds)

0.4912520516128279

In [20]:
# Number of held-out examples whose target is class 2 ("Neither").
test_df["target"].eq(2).sum()

201

In [21]:
# Most probable class per held-out example.
np.argmax(final_test_preds, axis=1)

array([0, 0, 1, ..., 0, 0, 1])

In [22]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the arg-max predictions on the held-out split.
print(
    classification_report(
        test_df["target"],
        np.argmax(final_test_preds, axis=1),
        digits=3,
    )
)

              precision    recall  f1-score   support

           0      0.828     0.874     0.850       874
           1      0.833     0.846     0.840       925
           2      0.745     0.507     0.604       201

    accuracy                          0.825      2000
   macro avg      0.802     0.743     0.765      2000
weighted avg      0.822     0.825     0.821      2000



In [24]:
# Submission table: the example ID followed by the three averaged class probabilities.
df_sub = pd.DataFrame(final_test_preds, columns=["A", "B", "NEITHER"])
df_sub.insert(0, "ID", test_df.ID)
df_sub.to_csv("submission_RGAT_Model.csv", index=False)
df_sub.head()

Unnamed: 0,ID,A,B,NEITHER
0,development-1,0.816189,0.127805,0.056007
1,development-2,0.963238,0.018738,0.018024
2,development-3,0.027986,0.966155,0.005859
3,development-4,0.014713,0.732934,0.252352
4,development-5,0.039329,0.953091,0.00758
