In [2]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer
import torch
import os, sys

from torch.utils.data import TensorDataset
from transformers import BertModel, AdamW, BertConfig,BertTokenizer,BertPreTrainedModel
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import logging
import datetime, time
from collections import defaultdict
#from torch.utils.tensorboard import SummaryWriter

from torch.nn.init import xavier_uniform_

In [3]:
BASE_DIR = "C:/Users/rajiv/dev/evalwriting/"
INPUT_DIR = BASE_DIR+"inputs/"
RESULT_DIR = BASE_DIR+"result/"
TBOARD_LOG_DIR = BASE_DIR+"tboard/"
LOG_DIR = BASE_DIR+"logs/"
CP_DIR = BASE_DIR+"checkpoints/"


In [4]:
log_file = LOG_DIR+datetime.datetime.now().strftime("%Y%m%d.%H.%M")+".txt"
logging.basicConfig(filename=log_file, level=logging.DEBUG)

logger = logging.getLogger()
logger.addHandler(logging.StreamHandler(sys.stdout))

In [5]:
class BertForMultiSentenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.model = BertModel(config)
        # Initialize model weights (inherited function).
        self.init_weights()


    def forward(self, x, segs, mask):
        # the below returns a tuple. First element in the tuple is last hidden state. Second element in tuple is pooler output
        self.result = self.model(input_ids=x, attention_mask =mask, token_type_ids=segs)
        top_vec = self.result[0]
        return top_vec


class Classifier(nn.Module):
    def __init__(self, hidden_size):
        super(Classifier, self).__init__()
        self.linear1 = nn.Linear(hidden_size, 8)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x, mask_cls):
#        h = self.linear2(self.tanh(self.linear1(x)))
        h = self.linear1(x)
        h = h.squeeze(-1)
        sent_scores = self.softmax(h) * mask_cls.unsqueeze(2).repeat(1,1,8).float()
        return sent_scores, x, h



class PassageClassifier(nn.Module):
    def __init__(self, args=None, num_hidden = 768, load_pretrained_bert = True, bert_config = None):
        super(PassageClassifier, self).__init__()
        self.args = args
        self.bert = BertForMultiSentenceClassification.from_pretrained(
            "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
            output_attentions = False, # Whether the model returns attentions weights.
            output_hidden_states = False, # Whether the model returns all hidden-states.
        )
#        if (args.encoder == 'classifier'):
        self.encoder = Classifier(num_hidden)
        logger.debug("dropout = "+str(nn.Dropout(self.bert.config.hidden_dropout_prob)))
        self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
        for p in self.encoder.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)

    def load_cp(self, state):
        self.load_state_dict(state, strict=True)

    def forward(self, x, segs, clss, mask, mask_cls, sentence_range=None):
        top_vec = self.bert(x, segs, mask)
#        sents_vec = top_vec[torch.arange(top_vec.size(0)).unsqueeze(1), clss]
#        sents_vec = sents_vec * mask_cls[:, :, None].float()
        sents_vec = top_vec.gather(1, clss.unsqueeze(-1).expand(-1, -1, 768))
        sents_vec = self.dropout(sents_vec)
#        cls_outs = sents_vec.clone().detach().cpu().numpy()
#        print(">>> ", cls_outs.shape, np.sum(cls_outs, axis=1)[:, :20])

        sent_scores, x, h = self.encoder(sents_vec, mask_cls)
        sent_scores = sent_scores.squeeze(-1)
        return sent_scores, mask_cls, x, h


## Create model inputs

In [6]:
import glob
train_d, val_d = dict(), dict()

for d_type, dset in (("train", train_d), ("validation", val_d)):
    for inp_file in glob.glob(INPUT_DIR+"*"+d_type+"*"):
        f_name = inp_file.split("\\")[-1]
        namelist = f_name.split("_")[:-1]
        tag = "_".join(namelist)
        file_d = torch.load(inp_file)
        #dset[tag] = file_d [:int(len(file_d)*.1)] #TODO temporary
        dset[tag] = file_d
        #dset = TensorDataset(train_d["fileidx"], train_d["idx"], train_d["src"],train_d["labels"], train_d["segs"], train_d["clss"], train_d["attn"], train_d["mask_cls"])

train_d.keys()

dict_keys(['attn', 'clss', 'id', 'labels', 'mask_cls', 'segs', 'src', 'wends', 'wstarts'])

In [7]:
for dset in (train_d, val_d):
    for k, v in dset.items():
        dset[k] = dset[k][:20]

train_dataset = TensorDataset(train_d["id"], train_d["src"],  train_d["segs"], train_d["clss"], train_d["attn"], train_d["mask_cls"], train_d["wstarts"], train_d["wends"], train_d["labels"])
val_dataset = TensorDataset(val_d["id"], val_d["src"],  val_d["segs"], val_d["clss"], val_d["attn"], val_d["mask_cls"], val_d["wstarts"], val_d["wends"], val_d["labels"])



### Training

In [9]:
with torch.cuda.device('cuda:0'):
    torch.cuda.empty_cache()

for k, v in train_d.items():
    del v

In [10]:
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

with torch.cuda.device('cuda:0'):
    torch.cuda.empty_cache()

import gc
gc.collect()

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3070 Laptop GPU


In [11]:
batch_size = 4

train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )


In [None]:

model = PassageClassifier()
torch.cuda.empty_cache()
_= model.cuda()


In [17]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))




The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.model.embeddings.word_embeddings.weight            (30522, 768)
bert.model.embeddings.position_embeddings.weight          (512, 768)
bert.model.embeddings.token_type_embeddings.weight          (2, 768)
bert.model.embeddings.LayerNorm.weight                        (768,)
bert.model.embeddings.LayerNorm.bias                          (768,)

==== First Transformer ====

bert.model.encoder.layer.0.attention.self.query.weight    (768, 768)
bert.model.encoder.layer.0.attention.self.query.bias          (768,)
bert.model.encoder.layer.0.attention.self.key.weight      (768, 768)
bert.model.encoder.layer.0.attention.self.key.bias            (768,)
bert.model.encoder.layer.0.attention.self.value.weight    (768, 768)
bert.model.encoder.layer.0.attention.self.value.bias          (768,)
bert.model.encoder.layer.0.attention.output.dense.weight   (768, 768)
bert.model.encoder.layer.0.attention.output.dense.bias        

In [18]:
## set all layers to train
for param in model.bert.parameters():
    param.requires_grad=True
for param in model.encoder.parameters():
    param.requires_grad=True

In [19]:
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)

In [20]:
from transformers import get_linear_schedule_with_warmup

epochs =10

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = int(total_steps * .2),
                                            num_training_steps = total_steps)


In [21]:
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()
loss_c = torch.nn.BCELoss(reduction='none')


In [22]:
def save_cp(model_state, optimizer_state, scheduler, epoch, step=0):
    cp = {'model_state': model_state, 'optimizer_state':optimizer_state, 'scheduler': scheduler}
    filename = datetime.datetime.now().strftime("%Y%m%d.%H.")+str(epoch)+"_"+str(step)+".pth.tar"
    torch.save(cp, CP_DIR+filename )

def load_cp(filename):
    cp = torch.load(filename)
    return cp['model_state'], cp['optimizer_state'], cp['scheduler']

logger.debug("logging initializer")
print(log_file)

logging initializer
C:/Users/rajiv/dev/evalwriting/logs/20220119.07.26.txt


In [None]:
EVAL = True
# For each epoch...
for epoch_i in range(epochs):
    if EVAL:
        break
    # ========================================
    #               Training
    # ========================================
    

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            
            logger.debug('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        
        id, src, segs, clss, attn, mask_cls, wstarts, wends, labels = batch

        src, segs, clss, attn, mask_cls, wstarts, wends, labels = src.to(device), segs.to(device), clss.to(device), attn.to(device), mask_cls.to(device), wstarts.to(device), wends.to(device),labels.to(device)

        model.zero_grad()        

        probs, mask_cls, x, h = model( src, segs, clss, attn, mask_cls)
        loss = loss_c(probs, labels.float())
        loss = (loss * mask_cls.unsqueeze(2).repeat(1,1,8).float()).sum()
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        if step % 5000 == 0 and step != 0:
            logger.debug("saving checkpoint"+ str(epoch_i))
            save_cp(model.state_dict(), optimizer.state_dict(), scheduler.state_dict(), epoch_i, step)

        optimizer.step()
        scheduler.step()


    # save checkpoint
    #if (epoch_i+1 )%5 == 0:
    if True:
        logger.debug("saving checkpoint"+ str(epoch_i))
        save_cp(model.state_dict(), optimizer.state_dict(), scheduler.state_dict(), epoch_i)

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)
    logger.debug("  Average training loss: {0:.2f}".format(avg_train_loss))
    logger.debug("  Training epcoh took: {:}".format(training_time))


    print("")
    print("Running Validation...")
    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    step = -1
    filesets = list()
    auclist = []
    accuracy = []
    for batch in validation_dataloader:
        step += 1
        id, src, segs, clss, attn, mask_cls, wstarts, wends, labels = batch
        src, segs, clss, attn, mask_cls, wstarts, wends, labels = src.to(device), segs.to(device), clss.to(device), attn.to(device), mask_cls.to(device), wstarts.to(device), wends.to(device),labels.to(device)
        
        with torch.no_grad():
            probs, mask_cls, x, h = model( src, segs, clss, attn, mask_cls)
            loss = loss_c(probs, labels.float())
            loss = (loss * mask_cls.unsqueeze(2).repeat(1,1,8).float()).sum()
        total_eval_loss += loss.item()

    avg_eval_loss = total_eval_loss / len(validation_dataloader)
    #print("Average AUC", np.mean(np.array([x for x in auclist])) )
    logger.debug("  Average validation loss: {0:.2f}".format(avg_eval_loss))
    logger.debug("  Validation epcoh took: {:}".format(format_time(time.time() - t0)))




In [23]:
#load checkpoints if provided
#cp_file = BASE_DIR+"checkpoints/"+"20220115.18.5_0.pth.tar"
cp_file = BASE_DIR+"checkpoints/"+"20220118.13.71_0.pth.tar"
if cp_file:
    m_state, o_state, s_state = load_cp(cp_file)
    model.load_cp(m_state)
    optimizer.load_state_dict(o_state)
    scheduler.load_state_dict(s_state)

In [24]:
# load train dataset, construct the expected outputs
# construct the predicted outputs from bert call
# run the metrics

groundtruth = {}
disc_types = {'Claim': 0,
 'Concluding Statement': 1,
 'Counterclaim': 2,
 'Evidence': 3,
 'Lead': 4,
 'Position': 5,
 'Rebuttal': 6,
 'None': 7}
disc_types = {v:k for k, v in disc_types.items()}

file_id_map = pd.read_csv(BASE_DIR+"inputs/f_mapping.csv")
file_id_map = {row.to_dict()['f_id']:row.to_dict()['id'] for k, row in file_id_map.iterrows()}
train_df = pd.read_csv("../data/train.csv")
pred_by_file = defaultdict(list)


for batch in train_dataloader:
    id, src, segs, clss, attn, mask_cls, wstarts, wends, labels = batch
    src, segs, clss, attn, mask_cls, wstarts, wends, labels = src.to(device), segs.to(device), clss.to(device), attn.to(device), mask_cls.to(device), wstarts.to(device), wends.to(device),labels.to(device)
    
    with torch.no_grad():
        probs, mask_cls, x, h = model( src, segs, clss, attn, mask_cls)
        pred_labs = torch.argmax(probs, dim=2)

        for r in range(pred_labs.size(dim=0)):
            id_lookup = int(id[r])
            # read ground truth
            f_name = file_id_map[id_lookup]
            gt_df = train_df [train_df.id == f_name]
            gt_df = gt_df[["id", "discourse_type", "predictionstring"]]
            gt_df.to_csv(BASE_DIR+"/result/"+f_name+".gt.csv")
            groundtruth[f_name] = gt_df
            ws, we = wstarts[r].tolist(), wends[r].tolist()
            preds = pred_labs[r, :]
            for s_num in range(int(mask_cls[r, :].sum())):
                p_dict = {}
                p_dict["id"] = f_name
                p_dict["discourse_type"] = disc_types[int(preds[s_num])]
                ws_i, we_i = int(ws[s_num]), int(we[s_num])
                p_dict["predictionstring"] = " ".join([str(x) for x in range(ws_i, we_i+1)])
                if p_dict["predictionstring"].strip() == "":
                    p_dict["predictionstring"] = "7"
                pred_by_file[f_name].append(p_dict)

        loss = loss_c(probs, labels.float())
        loss = (loss * mask_cls.unsqueeze(2).repeat(1,1,8).float()).sum()
        #total_eval_loss += loss.item()



In [25]:
pred_by_file_cleaned = defaultdict(list)
for f_name, preds in pred_by_file.items():
    merged_preds = []
    for i, p in enumerate(preds):
        if p["discourse_type"] == "None":
            continue

        if i != 0:
            if merged_preds[-1]["id"] == p["id"] and merged_preds[-1]["discourse_type"] == p["discourse_type"]:
                w_idxs_a = [int(x) for x in merged_preds[-1]["predictionstring"].split(" ")]
                w_idxs_b = [int(x) for x in p["predictionstring"].split(" ")]
                if w_idxs_a[-1] == w_idxs_b[0]-1:
                    merged_preds[-1]['predictionstring'] = merged_preds[-1]['predictionstring'] + " " + p["predictionstring"]
                else:
                    merged_preds.append(p)
            else:
                merged_preds.append(p)
        else:
            merged_preds.append(p)
        
    pred_by_file_cleaned[f_name].extend(merged_preds)

for f_name, sents in pred_by_file_cleaned.items():
    pred_df = pd.DataFrame(sents)
    pred_df.to_csv(BASE_DIR+"/result/"+f_name+".pred.csv")
    pred_by_file_cleaned[f_name] = pred_df



In [26]:
# Below copied from https://www.kaggle.com/robikscube/student-writing-competition-twitch-stream?scriptVersionId=83303421&cellId=31

def calc_overlap(row):
    """
    Calculates the overlap between prediction and
    ground truth and overlap percentages used for determining
    true positives.
    """
    set_pred = set(row.predictionstring_pred.split(" "))
    set_gt = set(row.predictionstring_gt.split(" "))
    # Length of each and intersection
    len_gt = len(set_gt)
    len_pred = len(set_pred)
    inter = len(set_gt.intersection(set_pred))
    overlap_1 = inter / len_gt
    overlap_2 = inter / len_pred
    return [overlap_1, overlap_2]


def score_feedback_comp_micro(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation
    """
    gt_df = (
        gt_df[["id", "discourse_type", "predictionstring"]]
        .reset_index(drop=True)
        .copy()
    )
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    pred_df["pred_id"] = pred_df.index
    gt_df["gt_id"] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(
        gt_df,
        left_on=["id", "class"],
        right_on=["id", "discourse_type"],
        how="outer",
        suffixes=("_pred", "_gt"),
    )
    joined["predictionstring_gt"] = joined["predictionstring_gt"].fillna(" ")
    joined["predictionstring_pred"] = joined["predictionstring_pred"].fillna(" ")

    joined["overlaps"] = joined.apply(calc_overlap, axis=1)

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    joined["overlap1"] = joined["overlaps"].apply(lambda x: eval(str(x))[0])
    joined["overlap2"] = joined["overlaps"].apply(lambda x: eval(str(x))[1])

    joined["potential_TP"] = (joined["overlap1"] >= 0.5) & (joined["overlap2"] >= 0.5)
    joined["max_overlap"] = joined[["overlap1", "overlap2"]].max(axis=1)
    tp_pred_ids = (
        joined.query("potential_TP")
        .sort_values("max_overlap", ascending=False)
        .groupby(["id", "predictionstring_gt"])
        .first()["pred_id"]
        .values
    )

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    fp_pred_ids = [p for p in joined["pred_id"].unique() if p not in tp_pred_ids]

    matched_gt_ids = joined.query("potential_TP")["gt_id"].unique()
    unmatched_gt_ids = [c for c in joined["gt_id"].unique() if c not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    # calc microf1
    my_f1_score = TP / (TP + 0.5 * (FP + FN))
    return my_f1_score


def score_feedback_comp(pred_df, gt_df, return_class_scores=False):
    class_scores = {}
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    for discourse_type, gt_subset in gt_df.groupby("discourse_type"):
        pred_subset = (
            pred_df.loc[pred_df["class"] == discourse_type]
            .reset_index(drop=True)
            .copy()
        )
        class_score = score_feedback_comp_micro(pred_subset, gt_subset)
        class_scores[discourse_type] = class_score
    f1 = np.mean([v for v in class_scores.values()])
    if return_class_scores:
        return f1, class_scores
    return f1

In [27]:
scores = []
for k, gt_df in groundtruth.items():
    pred_df = pred_by_file_cleaned[k]
    pred_df = pred_df.rename(columns={"discourse_type":"class"})
    #print(pred_df[["id", "class", "predictionstring"]])
    scores.append(score_feedback_comp(pred_df, gt_df))

print(np.mean(np.array(scores)))

0.6392498110355253
