In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import utils
import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.autonotebook import tqdm
import torch.nn.functional as F

In [None]:
# Length every tokenised example is padded to (see process_data).
MAX_LEN = 160
# Batch sizes for the training and validation/test data loaders.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
# Maximum number of passes over the training folds.
EPOCHS = 4

# Directory with the pretrained RoBERTa weights, config and BPE vocabulary files.
ROBERTA_PATH = "../input/roberta-base"
# Training data; run() expects a `kfold` column assigning each row to a fold.
TRAINING_FILE = "../input/tweet-train-folds/train_folds.csv"

# Byte-level BPE tokenizer. The vocabulary files are resolved relative to
# ROBERTA_PATH so the tokenizer and the model cannot drift apart when the
# path is changed.
TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file = f"{ROBERTA_PATH}/vocab.json",
    merges_file = f"{ROBERTA_PATH}/merges.txt",
    lowercase = True,
    add_prefix_space = True
)

In [None]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    """Turn one (tweet, selected_text, sentiment) example into padded model inputs.

    Args:
        tweet: Raw tweet text. It is converted with ``str``, whitespace runs are
            collapsed to single spaces and one leading space is added.
        selected_text: The excerpt of ``tweet`` that carries the sentiment;
            normalised the same way as ``tweet``.
        sentiment: ``"positive"``, ``"negative"`` or ``"neutral"``.
        tokenizer: Object whose ``encode(text)`` result exposes ``ids`` (token ids)
            and ``offsets`` (``(start, end)`` character spans, end exclusive).
        max_len: Length the returned lists are padded to. Sequences that are
            already longer are returned as-is (no truncation is performed).

    Returns:
        dict with the keys ``ids``, ``mask``, ``token_type_ids``, ``targets_start``,
        ``targets_end`` (token positions inside ``ids``), ``orig_tweet``,
        ``orig_selected`` (the normalised strings), ``sentiment`` and ``offsets``.

    Raises:
        ValueError: if ``selected_text`` is empty, does not occur in ``tweet``, or
            no token overlaps it, so no target span can be derived.
        KeyError: if ``sentiment`` is not one of the three known labels.
    """
    # Normalise whitespace and prefix both strings with a single space.
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    # Character-level search for the first occurrence of the selection
    # (without its leading space) inside the tweet.
    needle = selected_text[1:]
    idx0 = tweet.find(needle) if needle else -1

    # Flag every tweet character that belongs to the selection.
    char_targets = [0] * len(tweet)
    if idx0 != -1:
        for i in range(idx0, idx0 + len(needle)):
            char_targets[i] = 1

    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids              # token ids of the tweet
    tweet_offsets_orig = tok_tweet.offsets      # per-token (start, end) character spans

    # Indices of the tokens that overlap at least one selected character.
    target_idx = [
        i
        for i, (offset1, offset2) in enumerate(tweet_offsets_orig)
        if sum(char_targets[offset1: offset2]) > 0
    ]
    if not target_idx:
        # Previously this surfaced as an opaque IndexError from the lookups below.
        raise ValueError(
            f"selected_text {selected_text!r} could not be located in tweet {tweet!r}"
        )

    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    # Vocabulary ids of the three sentiment words; they can be re-derived with
    # tokenizer.encode("positive").ids etc.
    sentiment_id = {"positive": 1313, "negative": 2430, "neutral": 7974}

    # Special ids: "<s>" = 0, "<pad>" = 1, "</s>" = 2.
    # Layout: <s> sentiment </s> <s> tweet tokens ... </s>
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [0] + input_ids_orig + [2]

    token_type_ids = [0] * 4 + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)

    # The special tokens do not correspond to any tweet characters.
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets_orig + [(0, 0)]

    # Shift the targets past the four prefix tokens.
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)    # padded positions are masked out
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)

    return {
        "ids": input_ids,
        "mask": mask,
        "token_type_ids": token_type_ids,
        "targets_start": targets_start,
        "targets_end": targets_end,
        "orig_tweet": tweet,
        "orig_selected": selected_text,
        "sentiment": sentiment,
        "offsets": tweet_offsets
    }

In [None]:
class TweetDataset:
    """Indexable dataset that tokenises tweets on access.

    Each item is the output of ``process_data`` with the numeric fields
    converted to ``torch.long`` tensors and the string fields passed through.
    """

    def __init__(self, tweet, sentiment, selected_text):
        # Parallel sequences: tweet text, sentiment label and selected excerpt.
        self.tweet = tweet
        self.sentiment = sentiment
        self.seleted_text = selected_text    # (sic) attribute name kept as in the original
        # Shared module-level tokenizer and padding length.
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN

    def __len__(self):
        """Number of examples."""
        return len(self.tweet)

    def __getitem__(self, item):
        """Return the encoded example at position ``item`` as a dict."""
        features = process_data(
            self.tweet[item],
            self.seleted_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )

        def as_long(key):
            return torch.tensor(features[key], dtype = torch.long)

        # Key order mirrors the dict returned by process_data.
        sample = {
            key: as_long(key)
            for key in ("ids", "mask", "token_type_ids", "targets_start", "targets_end")
        }
        for key in ("orig_tweet", "orig_selected", "sentiment"):
            sample[key] = features[key]
        sample["offsets"] = as_long("offsets")
        return sample

In [None]:
hidden_size = 768    # width of one roberta-base hidden state
class TweetModel(transformers.BertPreTrainedModel):
    """RoBERTa encoder with a sentiment-conditioned span-extraction head.

    Six hidden-state layers are concatenated per token, projected, and each
    tweet token is multiplied element-wise by a projection of the sentiment
    token (position 1) before being mapped to start/end logits.

    The returned logits cover token positions 3 onward only, so their index 0
    corresponds to position 3 of the input sequence.
    """

    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        # Pretrained RoBERTa backbone; conf must have output_hidden_states = True.
        self.roberta = transformers.RobertaModel.from_pretrained(ROBERTA_PATH, config = conf)
        self.drop_out = nn.Dropout(0.5)
        self.w0 = nn.Linear(hidden_size * 6, hidden_size * 6)
        self.w1 = nn.Linear(hidden_size * 6, hidden_size * 2)
        self.w2 = nn.Parameter(torch.zeros((hidden_size * 2, hidden_size * 2), dtype = torch.float32))
        self.w3 = nn.Linear(hidden_size * 6, hidden_size * 2)
        nn.init.xavier_normal_(self.w2)
        # NOTE: self.lstm, self.softmax and self.relu are not used by forward().
        # The LSTM is kept because it owns parameters: removing it would change
        # the state_dict keys that existing checkpoints were saved with.
        self.lstm = nn.LSTM(input_size = hidden_size * 6, hidden_size = hidden_size, num_layers = 1, bidirectional = True, batch_first = True)
        # Two outputs per token: start logit and end logit.
        self.l0 = nn.Linear(hidden_size * 2, 2)
        self.tanh = torch.nn.Tanh()
        self.softmax = torch.nn.Softmax(dim = -1)
        self.relu = torch.nn.ReLU()

    def forward(self, ids, mask, token_type_ids):
        """Return ``(start_logits, end_logits)``, each of shape batch x (seq_len - 3)."""
        outputs = self.roberta(ids, attention_mask = mask, token_type_ids = token_type_ids)
        # Index instead of tuple-unpacking so the call keeps working when the
        # backbone returns extra items or a ModelOutput object. Item 2 holds the
        # per-layer hidden states (presumably embeddings + 12 layers = 13 tensors
        # of batch x seq_len x 768 for roberta-base).
        hidden_states = outputs[2]

        # batch x seq_len x (768 * 6)
        out = torch.cat([hidden_states[layer] for layer in (0, 4, 5, 10, 11, 12)], dim = -1)
        out = self.drop_out(out)
        out = self.w0(out)

        sentiment = self.w1(out[:, 1, :])    # batch x (hidden_size * 2), from the sentiment token
        sentence = out[:, 3:, :]             # batch x (seq_len - 3) x (hidden_size * 6)

        sentence_out = self.w3(sentence)
        sentence_out = torch.matmul(sentence_out, self.w2)
        # Broadcast the sentiment vector over every token position
        # (same result as the permute / multiply / permute sequence).
        sentence_out = torch.mul(sentence_out, sentiment.unsqueeze(1))
        sentence_out = self.tanh(sentence_out)

        logits = self.l0(sentence_out)       # batch x (seq_len - 3) x 2
        start_logits, end_logits = logits.split(1, dim = -1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [None]:
# Loss function
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Sum of the cross-entropy losses for the span start and the span end.

    The positions are given relative to the full input sequence; 3 is
    subtracted from them before they are used as class indices.
    """
    criterion = nn.CrossEntropyLoss()
    pairs = (
        (start_logits, start_positions),
        (end_logits, end_positions),
    )
    start_loss, end_loss = (
        criterion(logits, positions - 3) for logits, positions in pairs
    )
    return start_loss + end_loss

In [None]:
# Training function
def train_fn(data_loader, model, optimizer, device, scheduler = None):
    """Run one training epoch and show running loss / Jaccard in the progress bar.

    Args:
        data_loader: Iterable of batches shaped like ``TweetDataset`` items.
        model: Called as ``model(ids=, mask=, token_type_ids=)`` and expected to
            return ``(start_logits, end_logits)``.
        optimizer: Stepped once per batch.
        device: Device the tensors are moved to.
        scheduler: Optional learning-rate scheduler, stepped once per batch
            when provided.
    """
    model.train()
    losses = utils.AverageMeter()
    jaccards = utils.AverageMeter()

    tk0 = tqdm(data_loader, total = len(data_loader))
    for d in tk0:
        ids = d["ids"].to(device, dtype = torch.long)
        mask = d["mask"].to(device, dtype = torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype = torch.long)

        targets_start = d["targets_start"].to(device, dtype = torch.long)
        targets_end = d["targets_end"].to(device, dtype = torch.long)

        orig_tweet = d["orig_tweet"]
        orig_selected = d["orig_selected"]
        sentiment = d["sentiment"]
        offsets = d["offsets"]

        model.zero_grad()
        outputs_start, outputs_end = model(ids = ids, mask = mask, token_type_ids = token_type_ids)

        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
        loss.backward()
        optimizer.step()
        # The scheduler is optional (default None); calling .step() on None
        # used to raise AttributeError.
        if scheduler is not None:
            scheduler.step()

        outputs_start = torch.softmax(outputs_start, dim = 1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim = 1).cpu().detach().numpy()

        jaccard_scores = []
        for i, tweet in enumerate(orig_tweet):
            # +3 maps the predicted indices back to positions in the full
            # sequence (loss_fn subtracts the same 3 from the targets).
            jaccard_score, _ = calculate_jaccard_score(
                original_tweet = tweet,
                target_string = orig_selected[i],
                sentiment_val = sentiment[i],
                idx_start = np.argmax(outputs_start[i, :]) + 3,
                idx_end = np.argmax(outputs_end[i, :]) + 3,
                offsets = offsets[i]
            )
            jaccard_scores.append(jaccard_score)
        jaccards.update(np.mean(jaccard_scores), ids.size(0))
        losses.update(loss.item(), ids.size(0))
        # Show the running averages in the progress bar.
        tk0.set_postfix(loss = losses.avg, jaccard = jaccards.avg)

In [None]:
# Jaccard evaluation metric
def calculate_jaccard_score(
        original_tweet,
        target_string,
        sentiment_val,
        idx_start,
        idx_end,
        offsets,
        verbose = False
):
    """Decode the predicted token span and score it against the target.

    The text of tokens ``idx_start`` .. ``idx_end`` (inclusive) is cut out of
    ``original_tweet`` via ``offsets`` and concatenated; an end index that lies
    before the start is replaced by the start. ``sentiment_val`` and ``verbose``
    are accepted but not used.

    Returns:
        ``(jaccard, predicted_text)`` where ``jaccard`` is
        ``utils.jaccard(target_string, predicted_text)``.
    """
    last = idx_start if idx_end < idx_start else idx_end

    filtered_output = "".join(
        original_tweet[offsets[ix][0]: offsets[ix][1]]
        for ix in range(idx_start, last + 1)
    )

    return utils.jaccard(target_string, filtered_output), filtered_output

In [None]:
# Evaluation function
def eval_fn(data_loader, model, device):
    """Evaluate ``model`` on ``data_loader`` without gradients.

    Prints and returns the running-average Jaccard score accumulated over all
    batches (each batch weighted by its size).
    """
    model.eval()
    loss_meter = utils.AverageMeter()
    jaccard_meter = utils.AverageMeter()

    with torch.no_grad():
        progress = tqdm(data_loader, total = len(data_loader))
        for batch in progress:
            ids = batch["ids"].to(device, dtype = torch.long)
            mask = batch["mask"].to(device, dtype = torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype = torch.long)

            gold_start = batch["targets_start"].to(device, dtype = torch.long)
            gold_end = batch["targets_end"].to(device, dtype = torch.long)

            start_logits, end_logits = model(ids = ids, mask = mask, token_type_ids = token_type_ids)
            loss = loss_fn(start_logits, end_logits, gold_start, gold_end)

            start_probs = torch.softmax(start_logits, dim = 1).cpu().detach().numpy()
            end_probs = torch.softmax(end_logits, dim = 1).cpu().detach().numpy()

            # One Jaccard score per tweet; +3 maps the predicted indices back
            # to positions in the full sequence.
            batch_scores = [
                calculate_jaccard_score(
                    original_tweet = tweet,
                    target_string = batch["orig_selected"][i],
                    sentiment_val = batch["sentiment"][i],
                    idx_start = np.argmax(start_probs[i, :]) + 3,
                    idx_end = np.argmax(end_probs[i, :]) + 3,
                    offsets = batch["offsets"][i],
                    verbose = False
                )[0]
                for i, tweet in enumerate(batch["orig_tweet"])
            ]

            n_examples = ids.size(0)
            jaccard_meter.update(np.mean(batch_scores), n_examples)
            loss_meter.update(loss.item(), n_examples)
            # Show the running averages in the progress bar.
            progress.set_postfix(loss = loss_meter.avg, jaccard = jaccard_meter.avg)

    print("Jaccard = ", jaccard_meter.avg)
    return jaccard_meter.avg

In [None]:
import os
def run(fold):
    """Train on every fold except ``fold`` and validate on ``fold``.

    Reads TRAINING_FILE, builds the data loaders, fine-tunes a fresh TweetModel
    for up to EPOCHS epochs and hands the validation Jaccard of each epoch to
    ``utils.EarlyStopping`` together with the path ``model_{fold}.bin``
    (presumably where the best checkpoint is written — see utils).

    Args:
        fold: Value of the ``kfold`` column to hold out for validation.
    """
    dfx = pd.read_csv(TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop = True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop = True)

    # Training set
    train_dataset = TweetDataset(
        tweet = df_train.text.values,
        sentiment = df_train.sentiment.values,
        selected_text = df_train.selected_text.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = TRAIN_BATCH_SIZE,
        shuffle = True,
        num_workers = 4,
        drop_last = True
    )
    # Validation set
    valid_dataset = TweetDataset(
        tweet = df_valid.text.values,
        sentiment = df_valid.sentiment.values,
        selected_text = df_valid.selected_text.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size = VALID_BATCH_SIZE,
        shuffle = False,
        num_workers = 2,
        # Keep the final partial batch: dropping it would leave some
        # validation tweets out of the reported Jaccard score.
        drop_last = False
    )

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(ROBERTA_PATH)
    # TweetModel concatenates several hidden-state layers, so all must be returned.
    model_config.output_hidden_states = True
    model = TweetModel(conf = model_config)
    model.to(device)

    # One scheduler step is taken per training batch; with drop_last = True the
    # loader length is the exact number of batches per epoch, so the schedule
    # ends together with training.
    num_train_steps = len(train_data_loader) * EPOCHS
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these substrings get no weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    optimizer = AdamW(optimizer_parameters, lr = 2e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = 200,
        num_training_steps = num_train_steps
    )

    es = utils.EarlyStopping(patience = 3, mode = "max")
    print("Training is Starting for fold", fold)

    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler = scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print("Jaccard Score = ", jaccard)
        es(jaccard, model, model_path = f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break

In [None]:
# Train with fold 0 held out for validation.
run(fold=0)

In [None]:
# Train with fold 1 held out for validation.
run(fold=1)

In [None]:
# Train with fold 2 held out for validation.
run(fold=2)

In [None]:
# Train with fold 3 held out for validation.
run(fold=3)

In [None]:
# Train with fold 4 held out for validation.
run(fold=4)

In [None]:
# Load the test set. TweetDataset requires a selected_text for every row, so
# the full tweet text is used as a stand-in.
df_test = pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
df_test["selected_text"] = df_test["text"].values

In [None]:
# Model configuration for inference: TweetModel concatenates several
# hidden-state layers, so the backbone must return all of them.
model_config = transformers.RobertaConfig.from_pretrained(ROBERTA_PATH)
model_config.output_hidden_states = True

# Inference runs on the GPU.
device = torch.device("cuda")

In [None]:
# Directory containing the five fold checkpoints model_0.bin .. model_4.bin.
MODEL_DIR = "../input/qa-dot"
N_FOLDS = 5

def _load_fold_model(fold):
    """Build a TweetModel, load the checkpoint of ``fold`` and switch to eval mode."""
    model = TweetModel(conf = model_config)
    model.to(device)
    # map_location places the checkpoint tensors on the inference device
    # regardless of the device they were saved from.
    state_dict = torch.load(f"{MODEL_DIR}/model_{fold}.bin", map_location = device)
    model.load_state_dict(state_dict)
    model.eval()
    return model

fold_models = [_load_fold_model(fold) for fold in range(N_FOLDS)]
# The individual names are kept because the inference cell refers to them.
model1, model2, model3, model4, model5 = fold_models

In [None]:
# Predicted text for every test tweet, in test-set order.
final_output = []

test_dataset = TweetDataset(
    tweet = df_test.text.values,
    sentiment = df_test.sentiment.values,
    selected_text = df_test.selected_text.values
)
data_loader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle = False,
    batch_size = VALID_BATCH_SIZE,
    num_workers = 1
)

# The five fold models whose predictions are averaged.
ensemble = (model1, model2, model3, model4, model5)

with torch.no_grad():
    tk0 = tqdm(data_loader, total = len(data_loader))
    for d in tk0:
        ids = d["ids"].to(device, dtype = torch.long)
        mask = d["mask"].to(device, dtype = torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype = torch.long)

        orig_tweet = d["orig_tweet"]
        orig_selected = d["orig_selected"]
        sentiment = d["sentiment"]
        offsets = d["offsets"]

        # Accumulate the per-model start/end probabilities, in model order.
        start_sum = None
        end_sum = None
        for fold_model in ensemble:
            start_logits, end_logits = fold_model(ids = ids, mask = mask, token_type_ids = token_type_ids)
            start_probs = torch.softmax(start_logits, dim = 1)
            end_probs = torch.softmax(end_logits, dim = 1)
            start_sum = start_probs if start_sum is None else start_sum + start_probs
            end_sum = end_probs if end_sum is None else end_sum + end_probs

        # The averages are already probability distributions; a second softmax
        # would only flatten them and is not needed for the argmax below.
        outputs_start = (start_sum / len(ensemble)).cpu().numpy()
        outputs_end = (end_sum / len(ensemble)).cpu().numpy()

        for i, tweet in enumerate(orig_tweet):
            # +3 maps the predicted indices back to positions in the full sequence.
            _, output_sentence = calculate_jaccard_score(
                original_tweet = tweet,
                target_string = orig_selected[i],
                sentiment_val = sentiment[i],
                idx_start = np.argmax(outputs_start[i, :]) + 3,
                idx_end = np.argmax(outputs_end[i, :]) + 3,
                offsets = offsets[i]
            )
            final_output.append(output_sentence)

In [None]:
def post_process(selected):
    """Lower-case ``selected`` and drop repeated words.

    Words are split on whitespace and re-joined with single spaces. The first
    occurrence of each word is kept in its original position, so the result is
    deterministic (a plain ``set`` gives an arbitrary, run-dependent order).
    """
    unique_words = dict.fromkeys(selected.lower().split())
    return " ".join(unique_words)

In [None]:
sample = pd.read_csv("../input/tweet-sentiment-extraction/sample_submission.csv")

# (old, new) substitutions applied, in this order, to single-word predictions only.
# NOTE(review): '..' is collapsed before '...', so a run of three dots becomes
# two dots rather than one; the order is kept exactly as it was — confirm intent.
_SINGLE_WORD_FIXES = (("!!!!", "!"), ("..", "."), ("...", "."))

def _clean_prediction(text):
    """Apply post_process, then the punctuation fixes if the text is one word."""
    text = post_process(text)
    for old, new in _SINGLE_WORD_FIXES:
        if len(text.split()) == 1:
            text = text.replace(old, new)
    return text

sample.loc[:, 'selected_text'] = final_output
sample['selected_text'] = sample['selected_text'].map(_clean_prediction)
sample.to_csv("submission.csv", index=False)