# Story:

I don't usually write much commentary in my kernels, but for this one I will.

First of all I am very thankful to: @akensert, @ajinomoto132 and @adityaecdrid who helped me a lot in finding the mistake I was making. It was due to them that I was able to find the mistake in my training code.

@akensert shared a kernel with data-processing similar to mine but a different model and loss function.
This kernel was written using Tensorflow. You can checkout the kernel here: https://www.kaggle.com/akensert/complete-tf2-1-mixed-precision-implementation

Please upvote @akensert's kernel mentioned above! :)

So I set out to replicate the same score in PyTorch. Previously I was using BCEWithLogitsLoss, whereas the TF kernel used cross-entropy loss. This was one of the major differences. Another major difference was that the TF kernel used the last two hidden states instead of just the last one.

So, I tried and failed.

I then tried again, cleaned up my code, started comparing line-by-line and failed again.

After 2 days of frustration, I made a discussion post asking for help: https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/141019

Then came @ajinomoto132. He mentioned he had replicated the model and gladly shared his code to help me out!!! His code can be found here: https://www.kaggle.com/ajinomoto132/starter-kernel-in-pytorch .

Please upvote @ajinomoto132's kernel mentioned above! :)

After a few more hours of struggle, I was able to find the mistake I was making: I was not using `.from_pretrained` when creating the BertModel. Quite a stupid mistake, I would say.

Since the community helped me so much, I am releasing the fixed version of my code which is also much cleaner than the previous versions of my code.

I love this community! Thank you for all the help!

# All the important imports

In [None]:
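'''
PyTorch workflow:
1. Analyse the data and build the dataset
2. Build the DataLoader
3. Build the BERT model
    3.1 Load the pretrained model (BERT via the third-party transformers library)
    3.2 Add the layers that come after BERT
    3.3 Build the loss
    3.4 Build the optimizer and EarlyStopping
4. Train
5. Test
'''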

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

pd.set_option('display.max_columns', None)
import transformers
from transformers import AdamW
import tokenizers
import torch
import torch.nn as nn

# Directory that holds the competition CSV files (train.csv / test.csv).
path = '../input/tweet-sentiment-extraction/'
# Training table; its 'text', 'selected_text' and 'sentiment' columns feed the training dataset below.
df_train = pd.read_csv(os.path.join(path, 'train.csv'))

class EarlyStopping:
    """Track a loss across calls, checkpoint on improvement and flag when to stop.

    Each call receives the current loss and the model. When the loss improves
    by more than ``delta`` the model's ``state_dict`` is saved to ``path`` and
    the patience counter is reset; otherwise the counter is incremented and
    ``early_stop`` is set to True once it reaches ``patience``.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How many consecutive non-improving calls to tolerate.
                            Default: 7
            verbose (bool): If True, reports every loss improvement through ``trace_func``.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): Callable used to emit messages.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf (lower case) exists in every NumPy release; the np.Inf alias
        # was removed in NumPy 2.0 and would raise AttributeError here.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record ``val_loss``; save ``model`` on improvement, else count towards stopping."""
        # Higher score is better, so the loss is negated.
        score = -val_loss

        if self.best_score is None:
            # First call: always take it as the baseline.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # Not better than the best score by more than delta.
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves the model's state_dict to ``self.path`` and remembers ``val_loss`` as the new minimum.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss


class config:
    # Constants shared by the datasets, the model and the training/inference code.
    MAX_LEN = 128  # length (in tokens) every encoded example is padded to
    TRAIN_BATCH_SIZE = 48  # batch size passed to the DataLoaders
    EPOCH = 3  # number of passes over the training data in main()
    BERT_PATH = '../input/bert-base-uncased/'  # directory with the pretrained BERT files
    MODEL_PATH = 'model.bin'  # NOTE(review): not referenced in the visible code
    # WordPiece tokenizer built from the vocab.txt inside BERT_PATH, lower-casing its input.
    TOKENIZER = tokenizers.BertWordPieceTokenizer(os.path.join(BERT_PATH, 'vocab.txt'), lowercase=True)

In [None]:
'''
1. Define the DataLoader
'''


class TweetDataset:
    """Training dataset: encodes a tweet plus its sentiment and locates the answer span.

    Every item is laid out as ``[CLS] <sentiment word> [SEP] <tweet tokens>``
    and then truncated / zero-padded to ``max_len`` so that all tensors in a
    batch have the same length.

    Args:
        tweet: Indexable collection of tweet texts.
        selected_text: Indexable collection of the answer substrings.
        sentiment: Indexable collection of sentiment labels
            ('positive', 'negative' or 'neutral').
        tokenizer: Object whose ``encode(text)`` result exposes ``ids`` and
            ``offsets`` including the leading/trailing special tokens.
            Defaults to ``config.TOKENIZER``.
        max_len: Fixed output length. Defaults to ``config.MAX_LEN``.
    """

    # Token ids prepended for each sentiment label; presumably the ids of the
    # words themselves in the bert-base-uncased vocabulary (verify if the
    # vocabulary changes).
    SENTIMENT_ID = {
        'positive': 3893,
        'negative': 4997,
        'neutral': 8699
    }
    # Number of prefix tokens ([CLS], sentiment, [SEP]) placed before the tweet.
    N_PREFIX = 3

    def __init__(self, tweet, selected_text, sentiment, tokenizer=None, max_len=None):
        self.tweet = tweet
        # (sic) attribute name kept as-is for backward compatibility.
        self.seleted_text = selected_text
        self.sentiment = sentiment
        self.tokenizer = config.TOKENIZER if tokenizer is None else tokenizer
        self.max_len = config.MAX_LEN if max_len is None else max_len

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        """Return the model inputs and span targets for row ``item``.

        Returns:
            dict with the long tensors 'ids', 'token_type_ids', 'mask'
            (each of length ``max_len``), 'target_start' / 'target_end'
            (token positions of the answer span), 'tweet_offset'
            (``max_len`` x 2 character offsets), and the raw strings
            'sentiment', 'tweet' and 'selected'.

        Raises:
            ValueError: if the selected text is empty, does not occur in the
                tweet, or does not overlap any token.
            KeyError: if the sentiment label is unknown.
        """
        tweet = str(self.tweet[item])
        selected_text = str(self.seleted_text[item])
        sentimen = str(self.sentiment[item])

        # 1) Character span [char_start, char_end) of the first occurrence of
        #    the selected text inside the tweet.
        char_start = tweet.find(selected_text) if selected_text else -1
        if char_start < 0:
            raise ValueError(
                f'selected_text {selected_text!r} not found in tweet {tweet!r} (item {item})')
        char_end = char_start + len(selected_text)

        # Drop the [CLS]/[SEP] the tokenizer adds; the prefix is rebuilt below.
        tok_tweet = self.tokenizer.encode(tweet)
        input_ids_orig = list(tok_tweet.ids[1:-1])
        tweet_offset = list(tok_tweet.offsets[1:-1])

        # 2) Tokens whose character range overlaps the selected span.
        target_idx = [
            j for j, (offset1, offset2) in enumerate(tweet_offset)
            if max(offset1, char_start) < min(offset2, char_end)
        ]
        if not target_idx:
            raise ValueError(
                f'selected_text {selected_text!r} does not overlap any token of tweet {tweet!r} (item {item})')
        target_start = target_idx[0] + self.N_PREFIX
        target_end = target_idx[-1] + self.N_PREFIX

        # 3) [CLS]=101, sentiment token, [SEP]=102, then the tweet tokens.
        input_ids = [101] + [self.SENTIMENT_ID[sentimen]] + [102] + input_ids_orig
        token_type_ids = [0, 0, 0] + [1] * len(input_ids_orig)
        mask = [1] * len(input_ids)
        tweet_offset = [(0, 0)] * self.N_PREFIX + tweet_offset

        # 4) Truncate over-long examples so every tensor has exactly max_len
        #    entries (otherwise batching variable-length tensors fails), and
        #    keep the targets inside the truncated sequence.
        max_len = self.max_len
        input_ids = input_ids[:max_len]
        token_type_ids = token_type_ids[:max_len]
        mask = mask[:max_len]
        tweet_offset = tweet_offset[:max_len]
        target_start = min(target_start, max_len - 1)
        target_end = min(target_end, max_len - 1)

        # 5) Zero-pad up to max_len.
        padding_length = max_len - len(input_ids)
        if padding_length > 0:
            input_ids = input_ids + [0] * padding_length
            token_type_ids = token_type_ids + [0] * padding_length
            mask = mask + [0] * padding_length
            tweet_offset = tweet_offset + ([(0, 0)] * padding_length)
        return {
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'target_start': torch.tensor(target_start, dtype=torch.long),
            'target_end': torch.tensor(target_end, dtype=torch.long),
            'sentiment': sentimen,
            'tweet': tweet,
            'selected': selected_text,
            'tweet_offset': torch.tensor(tweet_offset, dtype=torch.long)
        }


# Test case

'''
Building the BERT model:
transformers.BertModel
1. Load the model
    1) Load the configuration
    2) Load the model

2. Inputs of the BERT model
https://www.cnblogs.com/deep-deep-learning/p/12792041.html
    1) ids: the encoded tokens
    2) mask: specifies which tokens take part in self-attention
    3) token_type_ids: distinguishes the two sentences

3. Outputs of the BERT model:
sequence_output, pooled_output, (), ()
    1) sequence_output: output sequence, the embedding of every token [batch, length, embedding]
    2) pooled_output: the embedding of the CLS token [batch, embedding]
    3) hidden_states (output_hidden_states=True): 13 * [batch, length, embedding]
    4) attentions
'''


class Tweet(transformers.BertPreTrainedModel):
    """BERT encoder with a linear head that scores every token as span start / span end.

    The head consumes the concatenation of the last two hidden-state layers,
    so ``conf.output_hidden_states`` must be enabled.

    Args:
        conf: BERT configuration used both for this wrapper and for the
            pretrained encoder loaded from ``config.BERT_PATH``.

    Raises:
        ValueError: if ``conf.output_hidden_states`` is not enabled.
    """

    def __init__(self, conf):
        super(Tweet, self).__init__(conf)
        if not getattr(conf, 'output_hidden_states', False):
            raise ValueError('Tweet requires conf.output_hidden_states = True')
        # Load the pretrained weights (a bare BertModel(conf) would start from random weights).
        self.bert = transformers.BertModel.from_pretrained(pretrained_model_name_or_path=config.BERT_PATH, config=conf)
        # Fine-tune the whole encoder.
        for param in self.bert.parameters():
            param.requires_grad = True
        self.drop_out = nn.Dropout(0.1)
        # Two hidden-state layers are concatenated, hence 2 * hidden_size inputs;
        # the two outputs are the start and end logits.
        self.l0 = nn.Linear(conf.hidden_size * 2, 2)

    def forward(self, ids, mask, token_type_ids):
        """Return ``(start_logits, end_logits)``, one raw score per input token.

        Raw logits (no softmax) are returned; ``nn.CrossEntropyLoss`` in
        ``loss_fn`` consumes them directly.
        """
        outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Index 2 holds the per-layer hidden states when output_hidden_states
        # is on. Integer indexing works whether the encoder returns a plain
        # tuple or a ModelOutput, and tolerates extra trailing fields, unlike
        # unpacking into exactly three names.
        hidden_states = outputs[2]
        out = torch.cat((hidden_states[-1], hidden_states[-2]), dim=-1)
        out = self.drop_out(out)
        logist = self.l0(out)
        start_logists, end_logists = logist.split(1, dim=-1)
        return start_logists.squeeze(-1), end_logists.squeeze(-1)


def loss_fn(start_logists, end_logists, start_positions, end_positions):
    """Sum of the cross-entropy losses of the start head and the end head.

    Args:
        start_logists: Start logits passed as the input of ``nn.CrossEntropyLoss``.
        end_logists: End logits passed as the input of ``nn.CrossEntropyLoss``.
        start_positions: Target class indices for the start logits.
        end_positions: Target class indices for the end logits.

    Returns:
        Scalar tensor: start loss + end loss (each averaged by the criterion's
        default reduction).
    """
    criterion = nn.CrossEntropyLoss()
    head_losses = [
        criterion(logits, positions)
        for logits, positions in (
            (start_logists, start_positions),
            (end_logists, end_positions),
        )
    ]
    return head_losses[0] + head_losses[1]


# Build the optimizer and EarlyStopping.
# Two ways to design the optimizer parameter groups:
# 1. choose different hyperparameters by parameter name
# 2. choose different hyperparameters by sub-module

# Early stopping: checkpoints the model to 'checkpoint.pt' whenever the monitored loss improves.
es = EarlyStopping(path='checkpoint.pt')

from torch.utils.data import DataLoader

# # 1. Build the dataset
# train_dataloader = DataLoader(TweetDataset(tweet=df_train['text'],sentiment=df_train['sentiment'],selected_text=df_train['selected_text']),batch_size=config.TRAIN_BATCH_SIZE)
# # 2. Build the model
# model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
# model_config.output_hidden_states = True
# m = Tweet(model_config)
# # Smoke test
# for t1 in train_dataloader:
#     m(t1['ids'], t1['mask'], t1['token_type_ids'])

# Start training.
# Fall back to the CPU when no GPU is present instead of failing on the first .to(device).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# shuffle=True: visit the training rows in a new random order every epoch
# rather than always in file order.
train_dataloader = DataLoader(TweetDataset(tweet=df_train['text'],
                                           selected_text=df_train['selected_text'],
                                           sentiment=df_train['sentiment']),
                              batch_size=config.TRAIN_BATCH_SIZE,
                              shuffle=True)
model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
# The model head concatenates the last two hidden-state layers, so they must be returned.
model_config.output_hidden_states = True
model = Tweet(conf=model_config).to(device)

# Parameters whose name contains one of the no_decay markers (biases and
# LayerNorm parameters) get no weight decay and their own learning rate.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_parameters = [
    {'params': [p for n, p in param_optimizer if not any(i in n for i in no_decay)], 'weight_decay': 0.01, 'lr': 3e-5},
    {'params': [p for n, p in param_optimizer if any(i in n for i in no_decay)], 'weight_decay': 0.00, 'lr': 5e-5}
]
# optimizer_parameters = [
#     {'params':model.bert.parameters(),'weight_decay':0.01,'lr':3e-5},
#     {'params':model.l0.parameters(),'lr':1e-3}
# ]

# The lr given here is only a default; both groups above set their own 'lr'.
optimizer = AdamW(optimizer_parameters, lr=5e-5)
# Learning-rate schedule: multiply the lr by `factor` after the monitored
# value has failed to improve for more than `patience` scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       factor=0.1,
                                                       patience=3,
                                                       eps=1e-8)

from tqdm.autonotebook import tqdm


def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting word sets.

    Returns:
        float in [0, 1]. Two strings that both contain no words are treated
        as identical and score 1.0 (the plain formula would divide by zero).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Both word sets are empty.
        return 1.0
    return float(len(c)) / union_size


def calculate_jaccard_score(tweet, orig_selected, start_logist, end_logist, sentiment, offset):
    """Score one predicted span against the ground-truth selection.

    Args:
        tweet: Original tweet text.
        orig_selected: Ground-truth selected text.
        start_logist: Predicted start token position (int or 0-d tensor).
        end_logist: Predicted end token position (int or 0-d tensor).
        sentiment: Sentiment label of the tweet.
        offset: Per-token ``(char_start, char_end)`` offsets into ``tweet``.

    Returns:
        Word-level Jaccard similarity between ``orig_selected`` and the
        predicted text.
    """
    # Neutral tweets and tweets of fewer than two words: predict the whole
    # tweet. The length test uses the tweet itself, not the ground truth,
    # so the score is not computed with information that is unavailable at
    # inference time.
    if sentiment == 'neutral' or len(tweet.split()) < 2:
        logist_selected = tweet
    else:
        start_idx = int(start_logist)
        end_idx = int(end_logist)
        # An inverted prediction collapses to the single end token.
        if start_idx > end_idx:
            start_idx = end_idx
        char_start = int(offset[start_idx][0])
        char_end = int(offset[end_idx][1])
        logist_selected = tweet[char_start:char_end]
    return jaccard(orig_selected, logist_selected)

def main():
    """Train the module-level ``model`` for up to ``config.EPOCH`` epochs.

    Uses the module-level ``train_dataloader``, ``optimizer``, ``scheduler``,
    ``es`` and ``device``. After every epoch the sample-weighted mean training
    loss is passed to the LR scheduler and to early stopping; training ends
    early once ``es.early_stop`` is set.

    NOTE(review): the monitored value is the *training* loss because no
    validation split exists in this script.
    """
    for epoch in range(config.EPOCH):
        model.train()
        tk0 = tqdm(train_dataloader, total=len(train_dataloader))
        loss_sum = 0.0
        sample_count = 0
        for data in tk0:
            start_logists, end_logists = model(data['ids'].to(device), data['mask'].to(device), data['token_type_ids'].to(device))
            loss = loss_fn(start_logists, end_logists, data['target_start'].to(device), data['target_end'].to(device))
            batch_size = data['ids'].shape[0]
            loss_sum += loss.item() * batch_size
            sample_count += batch_size
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # The plateau scheduler is stepped once per epoch (below), not per
            # batch: per-batch losses are noisy and would cut the lr within a
            # handful of iterations.

            # Jaccard similarity of this batch's predictions.
            output_start = torch.argmax(start_logists, dim=-1).tolist()
            output_end = torch.argmax(end_logists, dim=-1).tolist()
            # Collect one score per sample so the mean covers the whole batch.
            jaccard_scores = [
                calculate_jaccard_score(tweet, data['selected'][p_i], output_start[p_i], output_end[p_i],
                                        data['sentiment'][p_i], data['tweet_offset'][p_i])
                for p_i, tweet in enumerate(data['tweet'])
            ]
            tk0.set_postfix({'loss': loss.item(), 'jaccard_scores': np.mean(jaccard_scores)})
        # Average over the samples actually seen rather than a global dataframe length.
        epoch_loss = loss_sum / max(sample_count, 1)
        scheduler.step(epoch_loss)
        es(epoch_loss, model)
        if es.early_stop:
            break

In [None]:
# Run the training loop on the module-level model, optimizer and data loader.
main()

In [None]:
'''
1. Define the DataLoader (test set)
'''


class TweetTestDataset:
    """Inference dataset: encodes a tweet plus its sentiment, without span targets.

    Every item is laid out as ``[CLS] <sentiment word> [SEP] <tweet tokens>``
    and then truncated / zero-padded to ``max_len`` so that all tensors in a
    batch have the same length.

    Args:
        tweet: Indexable collection of tweet texts.
        sentiment: Indexable collection of sentiment labels
            ('positive', 'negative' or 'neutral').
        tokenizer: Object whose ``encode(text)`` result exposes ``ids`` and
            ``offsets`` including the leading/trailing special tokens.
            Defaults to ``config.TOKENIZER``.
        max_len: Fixed output length. Defaults to ``config.MAX_LEN``.
    """

    # Token ids prepended for each sentiment label; presumably the ids of the
    # words themselves in the bert-base-uncased vocabulary (verify if the
    # vocabulary changes).
    SENTIMENT_ID = {
        'positive': 3893,
        'negative': 4997,
        'neutral': 8699
    }
    # Number of prefix tokens ([CLS], sentiment, [SEP]) placed before the tweet.
    N_PREFIX = 3

    def __init__(self, tweet, sentiment, tokenizer=None, max_len=None):
        self.tweet = tweet
        self.sentiment = sentiment
        self.tokenizer = config.TOKENIZER if tokenizer is None else tokenizer
        self.max_len = config.MAX_LEN if max_len is None else max_len

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        """Return the model inputs for row ``item``.

        Returns:
            dict with the long tensors 'ids', 'token_type_ids', 'mask'
            (each of length ``max_len``) and 'tweet_offset' (``max_len`` x 2
            character offsets), plus the raw strings 'sentiment' and 'tweet'.

        Raises:
            KeyError: if the sentiment label is unknown.
        """
        tweet = str(self.tweet[item])
        sentimen = str(self.sentiment[item])
        # Drop the [CLS]/[SEP] the tokenizer adds; the prefix is rebuilt below.
        tok_tweet = self.tokenizer.encode(tweet)
        input_ids_orig = list(tok_tweet.ids[1:-1])
        tweet_offset = list(tok_tweet.offsets[1:-1])

        # [CLS]=101, sentiment token, [SEP]=102, then the tweet tokens.
        input_ids = [101] + [self.SENTIMENT_ID[sentimen]] + [102] + input_ids_orig
        token_type_ids = [0, 0, 0] + [1] * len(input_ids_orig)
        mask = [1] * len(input_ids)
        tweet_offset = [(0, 0)] * self.N_PREFIX + tweet_offset

        # Truncate over-long examples so every tensor has exactly max_len
        # entries; otherwise batching variable-length tensors fails.
        max_len = self.max_len
        input_ids = input_ids[:max_len]
        token_type_ids = token_type_ids[:max_len]
        mask = mask[:max_len]
        tweet_offset = tweet_offset[:max_len]

        # Zero-pad up to max_len.
        padding_length = max_len - len(input_ids)
        if padding_length > 0:
            input_ids = input_ids + [0] * padding_length
            token_type_ids = token_type_ids + [0] * padding_length
            mask = mask + [0] * padding_length
            tweet_offset = tweet_offset + ([(0, 0)] * padding_length)
        return {
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'sentiment': sentimen,
            'tweet': tweet,
            'tweet_offset': torch.tensor(tweet_offset, dtype=torch.long)
        }

In [None]:
# Inference on the test set and creation of the submission file.
# NOTE(review): this cell rebinds the names `df_train` and `train_dataloader`
# to the *test* data; the names are kept so that other cells relying on them
# keep working.
df_train = pd.read_csv(os.path.join(path, 'test.csv'))
train_dataloader = DataLoader(TweetTestDataset(tweet=df_train['text'],
                                           sentiment=df_train['sentiment']),
                              batch_size=config.TRAIN_BATCH_SIZE)
# Uncomment to predict with a saved checkpoint instead of the model currently in memory:
# model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
# model_config.output_hidden_states = True
# model = Tweet(conf=model_config).to(device)
# model.load_state_dict(torch.load('./output/checkpoint.pt'))

tk0 = tqdm(train_dataloader, total=len(train_dataloader))

logist_selecteds = []
model.eval()
# No gradients are needed for prediction; skipping them saves memory and time.
with torch.no_grad():
    for data in tk0:
        start_logists, end_logists = model(data['ids'].to(device), data['mask'].to(device), data['token_type_ids'].to(device))
        # Most likely start / end token position for every tweet in the batch.
        output_start = torch.argmax(start_logists, dim=-1).tolist()
        output_end = torch.argmax(end_logists, dim=-1).tolist()
        for p_i, tweet in enumerate(data['tweet']):
            start_idx = output_start[p_i]
            end_idx = output_end[p_i]
            # Same guard as calculate_jaccard_score: an inverted span would
            # otherwise slice to an empty string.
            if start_idx > end_idx:
                start_idx = end_idx
            offset = data['tweet_offset'][p_i]
            logist_selected = tweet[int(offset[start_idx][0]): int(offset[end_idx][1])]
            logist_selecteds.append(logist_selected)
# NOTE(review): the training-time metric returns the whole tweet for neutral
# sentiment; that post-processing is not applied to the submission here.


df_train['selected_text'] = logist_selecteds
df_train[['textID','selected_text']].to_csv('submission.csv',index=False)
pd.set_option('max_colwidth', 60)
df_train.sample(25)