In [82]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import random
import os


def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


class args:
    model_name = "allenai/longformer-base-4096"
    max_length = 1024
    input = "../feedback-prize-2021/"
    
target_id_map = {
    "B-Lead": 0,
    "I-Lead": 1,
    "B-Position": 2,
    "I-Position": 3,
    "B-Evidence": 4,
    "I-Evidence": 5,
    "B-Claim": 6,
    "I-Claim": 7,
    "B-Concluding Statement": 8,
    "I-Concluding Statement": 9,
    "B-Counterclaim": 10,
    "I-Counterclaim": 11,
    "B-Rebuttal": 12,
    "I-Rebuttal": 13
}

id_target_map = {v: k for k, v in target_id_map.items()}


In [None]:

tokenizer = AutoTokenizer.from_pretrained(args.model_name)
# Initializing a model from the configuration
model = AutoModel.from_pretrained(args.model_name)
device = ("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:

class TextModel(nn.Module):
    def __init__(self,model_name = None,num_labels = 1):
        super(TextModel,self).__init__()
        config = AutoConfig.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)# 768
        self.drop_out = nn.Dropout(0.1)
        self.drop_out1 = nn.Dropout(0.1)
        self.drop_out2 = nn.Dropout(0.2)
        self.drop_out3 = nn.Dropout(0.3)
        self.drop_out4 = nn.Dropout(0.4)
        self.drop_out5 = nn.Dropout(0.5)
        self.output = nn.Linear(config.hidden_size,num_labels)

        if 'deberta-v2-xxlarge' in model_name:
            self.model.embeddings.requires_grad_(False)
            self.model.encoder.layer[:24].requires_grad_(False) # 冻结24/48
        if 'deberta-v2-xlarge' in model_name:
            self.model.embeddings.requires_grad_(False)
            self.model.encoder.layer[:14].requires_grad_(False) # 冻结12/24
        
        if 'funnel-transformer-xlarge' in model_name:
            self.model.embeddings.requires_grad_(False)
            self.model.encoder.layer[:1].requires_grad_(False) # 冻结1/3

    def forward(self,input_ids,attention_mask,labels = None):
        if 'gpt' in self.model.name_or_path:
            emb = self.model(input_ids)[0]
        else:
            emb = self.model(input_ids,attention_mask)[0]

        preds1 = self.output(self.dropout1(emb))
        preds2 = self.output(self.dropout2(emb))
        preds3 = self.output(self.dropout3(emb))
        preds4 = self.output(self.dropout4(emb))
        preds5 = self.output(self.dropout5(emb))
        preds = (preds1 + preds2 + preds3 + preds4 + preds5) / 5

        logits = torch.softmax(preds,dim = -1)
        if labels is not None:
            loss = self.get_loss(preds,labels,attention_mask)
            return loss,logits
        else:
            return logits

    def get_loss(self,outputs,targets,attention_mask):
        loss_fct =nn.CrossEntropyLoss()

        active_loss = attention_mask.reshape(-1) == 1
        active_logits = outputs.reshape(-1,outputs.shape[-1])
        true_labels = targets.reshape(-1)
        idxs = np.where(active_loss.cpu().numpy()==1)[0]
        active_logits = active_logits[idxs]
        true_labels = true_labels[idxs].to(torch.long)

        loss = loss_fct(active_logits,true_labels)

        return loss


class FeedbackDataset:
    def __init__(self, samples, max_len, tokenizer):
        self.samples = samples
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.length = len(samples)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        input_ids = self.samples[idx]["input_ids"]

        # add start token id to the input_ids
        input_ids = [self.tokenizer.cls_token_id] + input_ids

        if len(input_ids) > self.max_len - 1:
            input_ids = input_ids[: self.max_len - 1]

        # add end token id to the input_ids
        input_ids = input_ids + [self.tokenizer.sep_token_id]
        attention_mask = [1] * len(input_ids)

        return {
            "ids": input_ids,
            "mask": attention_mask,
        }


class Collate:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        output = dict()
        output["ids"] = [sample["ids"] for sample in batch]
        output["mask"] = [sample["mask"] for sample in batch]
    
        # convert to tensors
        output["ids"] = torch.tensor(output["ids"], dtype=torch.long)
        output["mask"] = torch.tensor(output["mask"], dtype=torch.long)

        return output


def prepare_df(arg, tokenizer, ids):
    samples_df = []
    for id_num in range(len(ids)):
        if id_num%100==0: print(id_num,', ',end='')
        n = ids[id_num]
        name = f'../feedback-prize-2021/train/{n}.txt'
        text = open(name, 'r').read()
        
        encoded_text = tokenizer.encode_plus(
                    text,
                    return_offsets_mapping=True,
                    truncation = True,
                    max_length = arg.max_length,
                    padding='max_length'
                    )

        input_ids = encoded_text["input_ids"]
        attention_masks = encoded_text["attention_mask"]
        offset_mapping = encoded_text["offset_mapping"]

        sample = {
                "id": n,
                "input_ids": input_ids,
                "attention_masks":attention_masks,
                "text": text,
                "offset_mapping": offset_mapping,
                }

        samples_df.append(sample)
    return samples_df

def train_and_eval():
    

In [48]:
import pandas as pd
train = pd.read_csv("../feedback-prize-2021/train.csv")
ids = train.id.unique()

tokenizer = AutoTokenizer.from_pretrained(args.model)
test_samples = prepare_df(args, tokenizer, ids)
collate = Collate(tokenizer=tokenizer)


raw_preds = []
for fold_ in range(5):
    current_idx = 0
    test_dataset = FeedbackDataset(test_samples, args.max_len, tokenizer)
    model = TextModel(model_name=args.model, num_labels=len(target_id_map) - 1)



In [37]:

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer(question, text, return_tensors="pt",return_offsets_mapping=True)
input_ids = encoding["input_ids"]

# default is local attention everywhere
# the forward method will automatically set global attention on question tokens
attention_mask = encoding["attention_mask"]

outputs = model(input_ids, attention_mask=attention_mask)
start_logits = outputs.start_logits
end_logits = outputs.end_logits
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

answer_tokens = all_tokens[torch.argmax(start_logits) : torch.argmax(end_logits) + 1]
answer = tokenizer.decode(
    tokenizer.convert_tokens_to_ids(answer_tokens)
)  # remove space prepending space token

AttributeError: 'LongformerBaseModelOutputWithPooling' object has no attribute 'start_logits'

In [85]:
import copy
import os

import numpy as np
import pandas as pd
import torch
from joblib import Parallel, delayed
from tqdm import tqdm
import pandas as pd
train = pd.read_csv("../feedback-prize-2021/train.csv")
ids = train.id.unique()

def prepare_df(args, tokenizer, df, train_ids):
    training_samples = []
    for idx in tqdm(train_ids):
        name = args.input +'train/{idx}.txt'
        text = open(name, 'r').read()
    
        encoded_text = tokenizer.encode_plus(
                    text,
                    add_special_tokens=False,
                    return_offsets_mapping=True
                    )

        input_ids = encoded_text["input_ids"]
        input_labels = copy.deepcopy(input_ids)
        offset_mapping = encoded_text["offset_mapping"]

        for k in range(len(input_labels)):
            input_labels[k] = "O"

        sample = {
                "id": idx,
                "input_ids": input_ids,
                "text": text,
                "offset_mapping": offset_mapping,
                }

        temp_df = df[df["id"] == idx]

        for _, row in temp_df.iterrows():
            text_labels = [0] * len(text)
            discourse_start = int(row["discourse_start"])
            discourse_end = int(row["discourse_end"])
            prediction_label = row["discourse_type"]
            text_labels[discourse_start:discourse_end] = [1] * (discourse_end - discourse_start)
            target_idx = []
            for map_idx, (offset1, offset2) in enumerate(encoded_text["offset_mapping"]):
                if sum(text_labels[offset1:offset2]) > 0:
                    if len(text[offset1:offset2].split()) > 0:
                        target_idx.append(map_idx)

            targets_start = target_idx[0]
            targets_end = target_idx[-1]
            pred_start = "B-" + prediction_label
            pred_end = "I-" + prediction_label
            input_labels[targets_start] = pred_start
            input_labels[targets_start + 1 : targets_end + 1] = [pred_end] * (targets_end - targets_start)

        sample["input_ids"] = input_ids
        sample["input_labels"] = input_labels
        training_samples.append(sample)
    return training_samples

a = prepare_df(args,tokenizer,train,ids)


  6%|▌         | 875/15594 [00:19<05:21, 45.83it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (18882 > 4096). Running this sequence through the model will result in indexing errors
100%|██████████| 15594/15594 [05:39<00:00, 45.95it/s]


In [88]:
a[0]["input_labels"]

['O',
 'O',
 'O',
 'O',
 'B-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'I-Lead',
 'B-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'I-Position',
 'B-Evidence',
 'I-Evidence',
 'I-Evidence',
 'I-Evidence',
 'I-Evidence',
 'I-Evidence',
 'I-Evidence',
 'I-Evidence',
 'I-Evidence',
 'I-Evidence',
 'I-Evidence',
 'I-Evidence',
 'I-Evidence',
 '

In [89]:
import torch
from transformers import BertTokenizerFast
 
 
MODELNAME="bert-base-chinese"
 
text = "Many people believe that the Electoral College should be abolished,"
tokens = tokenizer.tokenize(text,add_special_tokens=True)
outputs = tokenizer.encode_plus(text, return_offsets_mapping=True, add_special_tokens=True)  #add_special_tokens=True 添加 [cls] [sep]等标志
token_span=outputs["offset_mapping"]
print(tokens)
print(token_span)
 
print("hello")
 
'''
offset_mapping  记录的是tokenizer后的token与原来的关系
'''

['<s>', 'Many', 'Ġpeople', 'Ġbelieve', 'Ġthat', 'Ġthe', 'ĠElectoral', 'ĠCollege', 'Ġshould', 'Ġbe', 'Ġabolished', ',', '</s>']
[(0, 0), (0, 4), (5, 11), (12, 19), (20, 24), (25, 28), (29, 38), (39, 46), (47, 53), (54, 56), (57, 66), (66, 67), (0, 0)]
hello


'\noffset_mapping  记录的是tokenizer后的token与原来的关系\n'

In [13]:

from sklearn.model_selection import KFold
import pandas as pd
df = pd.read_csv('../feedback-prize-2021/train.csv')
kf = KFold(n_splits=5,random_state=42,shuffle=True)

kf.get_n_splits(df)
n = 1
for train_index, test_index in kf.split(df):
    df.loc[test_index,"kfold"] = int(n)
    n +=1
df["kfold"] = df["kfold"].astype(int)

In [14]:
df.to_csv("../feedback-prize-2021/df_n_fold.csv",index = None)

In [16]:
import pandas as pd
df = pd.read_csv('../feedback-prize-2021\df_n_fold.csv')