In [None]:
# !pip install --upgrade torch
import pandas as pd 
import torch 
import numpy as np
import os 
from tqdm import tqdm 

In [None]:
from transformers import LongformerTokenizerFast
# Fast Longformer tokenizer, loaded from the local copy of the pretrained files.
tokenizer = LongformerTokenizerFast.from_pretrained(
    "../input/allenailongformerbase4096/longformer"
)


In [None]:
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoConfig, AutoModel, AutoTokenizer


class LongformerSingle(nn.Module):
    """Per-token classification head on top of a transformer backbone.

    Args:
        backbone: module called as ``backbone(tokens, attention_mask=...)`` whose
            result exposes a ``last_hidden_state`` attribute.
        config: object providing ``hidden_size`` (input width of the linear head).
        num_labels: number of classes predicted for every token. Defaults to 15,
            the value that used to be hard-coded, so existing checkpoints still
            load. NOTE(review): 15 presumably is begin/inside tags for 7 discourse
            types plus one blank label - confirm against the training code.
    """

    def __init__(self, backbone, config, num_labels=15):
        super().__init__()
        self.config = config
        self.backbone = backbone
        self.classifier = nn.Linear(config.hidden_size, num_labels)

    def forward(self, tokens, attention_mask):
        """Return class probabilities for every token.

        The backbone's ``last_hidden_state`` is projected by the linear head and
        normalised with a softmax over the last (class) axis, so the output is a
        probability distribution per token rather than raw logits.
        """
        hidden_states = self.backbone(
            tokens, attention_mask=attention_mask).last_hidden_state
        return torch.softmax(self.classifier(hidden_states), dim=-1)


def build_model():
    """Build a ``LongformerSingle`` around the locally stored Longformer backbone.

    The configuration is read from the local ``config.json`` and the backbone
    weights from the local ``pytorch_model.bin``; both are handed to
    ``LongformerSingle``.
    """
    longformer_config = AutoConfig.from_pretrained(
        "../input/allenailongformerbase4096/longformer/config.json"
    )
    longformer = AutoModel.from_pretrained(
        "../input/allenailongformerbase4096/longformer/pytorch_model.bin",
        config=longformer_config,
    )
    return LongformerSingle(config=longformer_config, backbone=longformer)

In [None]:
# Instantiate the network; the fine-tuned weights are loaded into it in the next cell.
model = build_model()

In [None]:
# Load the fine-tuned weights into the model.
# map_location="cpu" lets a checkpoint that was saved from a GPU run deserialize
# on a CPU-only machine; load_state_dict then copies the values into the model's
# own parameters, wherever those live.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
state_dict = torch.load("../input/train-longformer/saved_model4.pth", map_location="cpu")
model.load_state_dict(state_dict)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

In [None]:
# Switch the model to evaluation mode before running inference
# (nn.Module.eval() disables training-only behaviour such as dropout).
model.eval()

In [None]:
# Label id -> discourse type name; id 7 is the 'blank' (no discourse element) label.
target_map_rev = {0:'Lead', 1:'Position', 2:'Evidence', 3:'Claim', 4:'Concluding Statement',
             5:'Counterclaim', 6:'Rebuttal', 7:'blank'}
# GET TEST TEXT IDS
# Only files whose name ends in '.txt' are essays; strip just that extension
# (a substring test / global replace would also hit names merely containing 'txt').
files = os.listdir('../input/feedback-prize-2021/test')
TEST_IDS = [os.path.splitext(f)[0] for f in files if f.endswith('.txt')]
print('There are',len(TEST_IDS),'test texts.')

In [None]:
# CONVERT TEST TEXT TO TOKENS
# Every text is padded / truncated to MAX_LEN tokens so the ids and attention
# masks fit in fixed-size arrays with one row per test text.
MAX_LEN = 1024
test_tokens = np.zeros((len(TEST_IDS), MAX_LEN), dtype='int32')
test_attention = np.zeros((len(TEST_IDS), MAX_LEN), dtype='int32')
offset_mapping = []
for id_num, n in enumerate(TEST_IDS):

    # READ TEST TEXT, TOKENIZE, AND SAVE IN TOKEN ARRAYS
    name = f'../input/feedback-prize-2021/test/{n}.txt'
    # Context manager so the file handle is closed right after reading.
    with open(name, 'r') as text_file:
        txt = text_file.read()
    tokens = tokenizer.encode_plus(txt, max_length=MAX_LEN, padding='max_length',
                                   truncation=True, return_offsets_mapping=True)
    test_tokens[id_num,] = tokens['input_ids']
    test_attention[id_num,] = tokens['attention_mask']
    # Character (start, end) span of every token, kept alongside the arrays.
    offset_mapping.append(tokens['offset_mapping'])

In [None]:
# Run inference in mini-batches on the selected device instead of pushing the
# whole test set through a single forward pass on the CPU (which does not scale
# to a large test set). Results are moved back to the CPU so `preds` stays a CPU
# tensor for the cells below.
INFER_BATCH_SIZE = 8
model.to(device)
batch_preds = []
with torch.no_grad():
    for batch_start in range(0, len(test_tokens), INFER_BATCH_SIZE):
        batch_stop = batch_start + INFER_BATCH_SIZE
        # Integer ids / masks, the dtype the tokenizer itself produces.
        batch_ids = torch.tensor(test_tokens[batch_start:batch_stop],
                                 dtype=torch.long, device=device)
        batch_mask = torch.tensor(test_attention[batch_start:batch_stop],
                                  dtype=torch.long, device=device)
        batch_preds.append(model(batch_ids, batch_mask).cpu())

if batch_preds:
    preds = torch.cat(batch_preds)
else:
    # No test texts: keep a correctly shaped, empty result.
    preds = torch.empty((0, test_tokens.shape[1], model.classifier.out_features))

In [None]:
# Sanity-check the shape of the model output before decoding it into spans.
print("test pred shape : ", preds.shape)

In [None]:
# Label id -> discourse type name, in id order (0..7); the last entry is the 'blank' label.
target_map_rev = dict(enumerate([
    'Lead',
    'Position',
    'Evidence',
    'Claim',
    'Concluding Statement',
    'Counterclaim',
    'Rebuttal',
    'blank',
]))

def get_preds(dataset = 'train', verbose = True, text_ids = None, preds = None):
    all_predictions = []
    for id_num in range(len(preds)):
#         print(id_num) # predictions from preds
        if (id_num % 100 == 0) & (verbose): print(id_num, ', ', end = '') # for monitoring stuff 
        n = text_ids[id_num] # n here we are getting test_id for that num
#         print(n)
        name = f'../input/feedback-prize-2021/{dataset}/{n}.txt'
#         print(name)
        txt = open(name, 'r').read() # opening text file 
        tokens = tokenizer.encode_plus(txt, max_length = MAX_LEN, padding = 'max_length', truncation = True, return_offsets_mapping = True)
        off = tokens['offset_mapping']
        w = []
#         print("encoding and tokenization done")
        
        blank = True
        for i in range(len(txt)):
            if (txt[i] != ' ') & (txt[i] != '\n') & (blank == True):
                w.append(i)
                blank = False
            elif (txt[i] == ' ') | (txt[i] == '\n'):
                blank = True
        w.append(1e6)
        word_map = -1 * np.ones(MAX_LEN, dtype = 'int32')
        w_i = 0
        for i in range(len(off)):
#             print(off[i][1]," is zero? ")
            if off[i][1] == 0: continue
            while off[i][0] >= w[w_i + 1]: w_i += 1
#             if int(w_i)!= 0 : print(w_i, "there is not an error")
            word_map[i] = int(w_i)
#         print(word_map)
        pred = preds[id_num,] / 2.0
        i = 0
#         print(pred)
        while i < MAX_LEN:
            prediction = []
            start = int(pred[i])
            if start in [0, 1, 2, 3, 4, 5, 6, 7]:
#                 print(word_map[i])
                prediction.append(word_map[i])
                i += 1
                if i >= MAX_LEN: break
                while pred[i] == start + 0.5:
                    if not word_map[i] in prediction: prediction.append(word_map[i])
                    i += 1
                    if i >= MAX_LEN: break
            else: i += 1
            prediction = [x for x in prediction if x != -1]
#             print(prediction) # we are getting prediction as the blank array
            if len(prediction) > 4: all_predictions.append((n, target_map_rev[int(start)], ' '.join([str(x) for x in prediction])))
    
#     print(all_predictions)
    # MAKE DATAFRAME
    df = pd.DataFrame(all_predictions)
    df.columns = ['id', 'class', 'predictionstring']
    return df

# def calc_overlap(row):
#     set_pred = set(row.predictionstring_pred.split(' '))
#     set_gt = set(row.predictionstring_gt.split(' '))
#     len_gt = len(set_gt)
#     len_pred = len(set_pred)
#     inter = len(set_gt.intersection(set_pred))
#     overlap_1 = inter / len_gt
#     overlap_2 = inter / len_pred
#     return [overlap_1, overlap_2]

# def score_feedback_comp(pred_df, gt_df):
#     gt_df = gt_df[['id', 'discourse_type', 'predictionstring']].reset_index(drop = True).copy()
#     pred_df = pred_df[['id', 'class', 'predictionstring']].reset_index(drop = True).copy()
#     pred_df['pred_id'] = pred_df.index
#     gt_df['gt_id'] = gt_df.index
#     joined = pred_df.merge(gt_df, left_on = ['id', 'class'], right_on = ['id', 'discourse_type'], how = 'outer', suffixes = ('_pred', '_gt'))
#     joined['predictionstring_gt'] = joined['predictionstring_gt'].fillna(' ')
#     joined['predictionstring_pred'] = joined['predictionstring_pred'].fillna(' ')
#     joined['overlaps'] = joined.apply(calc_overlap, axis=1)
#     joined['overlap1'] = joined['overlaps'].apply(lambda x: eval(str(x))[0])
#     joined['overlap2'] = joined['overlaps'].apply(lambda x: eval(str(x))[1])
#     joined['potential_TP'] = (joined['overlap1'] >= 0.5) & (joined['overlap2'] >= 0.5)
#     joined['max_overlap'] = joined[['overlap1','overlap2']].max(axis=1)
#     tp_pred_ids = joined.query('potential_TP')         .sort_values('max_overlap', ascending=False)         .groupby(['id','predictionstring_gt']).first()['pred_id'].values
#     fp_pred_ids = [p for p in joined['pred_id'].unique() if p not in tp_pred_ids]
#     matched_gt_ids = joined.query('potential_TP')['gt_id'].unique()
#     unmatched_gt_ids = [c for c in joined['gt_id'].unique() if c not in matched_gt_ids]
#     TP = len(tp_pred_ids)
#     FP = len(fp_pred_ids)
#     FN = len(unmatched_gt_ids)
#     my_f1_score = TP / (TP + 0.5*(FP+FN))
#     return my_f1_score

In [None]:
# Reduce the per-token class probabilities to label ids, then decode them into
# one row per predicted span.
print('Predicting Test...')
test_preds = np.argmax(preds, axis=-1)
sub = get_preds(text_ids=TEST_IDS, preds=test_preds, dataset='test', verbose=False)

# Minimum number of words a predicted span of each class must have to be kept.
map_clip = {
    'Lead': 9,
    'Position': 5,
    'Evidence': 14,
    'Claim': 3,
    'Concluding Statement': 11,
    'Counterclaim': 6,
    'Rebuttal': 4,
}

def threshold(df):
    """Return a copy of ``df`` without the spans that are too short for their class.

    ``df`` needs a ``class`` and a ``len`` column. A row is removed when its
    ``len`` is strictly below the ``map_clip`` entry of its class; rows whose
    class has no entry are kept. The input frame is not modified.
    """
    too_short = []
    for class_name, min_words in map_clip.items():
        of_class = df.loc[df['class'] == class_name]
        too_short.extend(of_class[of_class['len'] < min_words].index)
    return df.drop(index=too_short)

# Attach each span's word count, drop the spans below their class minimum, then
# write the three submission columns to disk and display the result.
sub = threshold(
    sub.assign(len=sub['predictionstring'].apply(lambda ps: len(ps.split())))
)

sub[['id', 'class', 'predictionstring']].to_csv('submission.csv', index = False)
sub

In [None]:
# Display the final, length-filtered submission frame for inspection.
sub

In [None]:
# test_samples = pd.DataFrame()
# test_samples['id'] = TEST_IDS
# # test_samples["input_ids"]= test_tokens 
# # test_samples["test_attention"] = test_attention
# test_samples["preds"] = None 
# test_samples["pred_scores"] = None


In [None]:
# test_samples

In [None]:
# 

# Per-token predicted label id and the probability of that label, for every test text.
pred_class = np.argmax(preds, axis=2)
pred_scrs = np.max(preds.numpy(), axis=2)
final_preds = [labels.tolist() for labels in pred_class]
final_scores = [scores.tolist() for scores in pred_scrs]

# Build `test_samples` here: no live cell creates it, and filling a frame with
# chained assignment (df[col][j] = ...) is unreliable in pandas.
# Position 0 of every row is dropped.
# NOTE(review): presumably this skips the leading special token - confirm.
test_samples = pd.DataFrame({
    "id": TEST_IDS,
    "preds": [labels[1:] for labels in final_preds],
    "pred_scores": [scores[1:] for scores in final_scores],
})

# TODO: per-row post-processing is not implemented yet; iterate with
# `test_samples.iterrows()` when adding it.

In [None]:
# def jn(pst, start, end):
#     return " ".join([str(x) for x in pst[start:end]])


# def link_evidence(oof):
#     thresh = 1
#     idu = oof['id'].unique()
#     idc = idu[1]
#     eoof = oof[oof['class'] == "Evidence"]
#     neoof = oof[oof['class'] != "Evidence"]
#     for thresh2 in range(26,27, 1):
#         retval = []
#         for idv in idu:
#             for c in  ['Lead', 'Position', 'Evidence', 'Claim', 'Concluding Statement',
#                    'Counterclaim', 'Rebuttal']:
#                 q = eoof[(eoof['id'] == idv) & (eoof['class'] == c)]
#                 if len(q) == 0:
#                     continue
#                 pst = []
#                 for i,r in q.iterrows():
#                     pst = pst +[-1] + [int(x) for x in r['predictionstring'].split()]
#                 start = 1
#                 end = 1
#                 for i in range(2,len(pst)):
#                     cur = pst[i]
#                     end = i
#                     #if pst[start] == 205:
#                     #   print(cur, pst[start], cur - pst[start])
#                     if (cur == -1 and c != 'Evidence') or ((cur == -1) and ((pst[i+1] > pst[end-1] + thresh) or (pst[i+1] - pst[start] > thresh2))):
#                         retval.append((idv, c, jn(pst, start, end)))
#                         start = i + 1
#                 v = (idv, c, jn(pst, start, end+1))
#                 #print(v)
#                 retval.append(v)
#         roof = pd.DataFrame(retval, columns = ['id', 'class', 'predictionstring']) 
#         roof = roof.merge(neoof, how='outer')
#         return roof
    
# proba_thresh = {
#     "Lead": 0.7,
#     "Position": 0.55,
#     "Evidence": 0.65,
#     "Claim": 0.55,
#     "Concluding Statement": 0.7,
#     "Counterclaim": 0.5,
#     "Rebuttal": 0.55,
# }

# min_thresh = {
#     "Lead": 9,
#     "Position": 5,
#     "Evidence": 14,
#     "Claim": 3,
#     "Concluding Statement": 11,
#     "Counterclaim": 6,
#     "Rebuttal": 4,
# }

# submission = []
# for sample_idx, sample in enumerate(test_samples):
#     preds = sample["preds"]
#     offset_mapping = sample["offset_mapping"]
#     sample_id = sample["id"]
#     sample_text = sample["text"]
#     sample_input_ids = sample["input_ids"]
#     sample_pred_scores = sample["pred_scores"]
#     sample_preds = []

#     if len(preds) < len(offset_mapping):
#         preds = preds + ["O"] * (len(offset_mapping) - len(preds))
#         sample_pred_scores = sample_pred_scores + [0] * (len(offset_mapping) - len(sample_pred_scores))
    
#     idx = 0
#     phrase_preds = []
#     while idx < len(offset_mapping):
#         start, _ = offset_mapping[idx]
#         if preds[idx] != "O":
#             label = preds[idx][2:]
#         else:
#             label = "O"
#         phrase_scores = []
#         phrase_scores.append(sample_pred_scores[idx])
#         idx += 1
#         while idx < len(offset_mapping):
#             if label == "O":
#                 matching_label = "O"
#             else:
#                 matching_label = f"I-{label}"
#             if preds[idx] == matching_label:
#                 _, end = offset_mapping[idx]
#                 phrase_scores.append(sample_pred_scores[idx])
#                 idx += 1
#             else:
#                 break
#         if "end" in locals():
#             phrase = sample_text[start:end]
#             phrase_preds.append((phrase, start, end, label, phrase_scores))

#     temp_df = []
#     for phrase_idx, (phrase, start, end, label, phrase_scores) in enumerate(phrase_preds):
#         word_start = len(sample_text[:start].split())
#         word_end = word_start + len(sample_text[start:end].split())
#         word_end = min(word_end, len(sample_text.split()))
#         ps = " ".join([str(x) for x in range(word_start, word_end)])
#         if label != "O":
#             if sum(phrase_scores) / len(phrase_scores) >= proba_thresh[label]:
#                 if len(ps.split()) >= min_thresh[label]:
#                     temp_df.append((sample_id, label, ps))
    
#     temp_df = pd.DataFrame(temp_df, columns=["id", "class", "predictionstring"])
#     submission.append(temp_df)

# submission = pd.concat(submission).reset_index(drop=True)
# submission = link_evidence(submission)
# submission.to_csv("submission.csv", index=False)

In [None]:
# # GET TEST PREDICTIONS
# sub = get_preds( dataset='test', verbose=False, text_ids=TEST_IDS, preds=test_preds )
# sub['len'] = sub['predictionstring'].apply(lambda x:len(x.split()))
# sub = threshold(sub)


# sub.head()