In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import BertModel, BertPreTrainedModel, BertConfig, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import KFold

import re

tqdm.pandas()

# Import Training Dataset

In [None]:
# Load the competition training data from the Kaggle input directory.
# Later cells read the 'text', 'selected_text' and 'sentiment' columns of this frame.
train_dataset = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')

In [None]:
# Per-column count of missing values in the raw training data.
train_dataset.isna().sum()

In [None]:
# Discard every row that has a missing value in any column.
train_dataset = train_dataset.dropna()

# Data Cleaning

In [None]:
def remove_leading_truncuated_word(row):
    """Drop a stray one-character fragment from the start of `selected_text`.

    The first whitespace-separated word of `selected_text` is treated as a
    truncated leftover when it is exactly one character long and does not
    occur as a whole word in `text` (comparison is case-insensitive).

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with string fields
            'text' and 'selected_text'.

    Returns:
        str: The lower-cased `selected_text` without its first word when that
        word is a truncated fragment; otherwise `selected_text` unchanged.
        An empty or whitespace-only `selected_text` is returned unchanged.
    """
    text = row['text']
    selected_text = row['selected_text']

    selected_words = selected_text.lower().split()
    if not selected_words:
        # Nothing to inspect: indexing [0] here would raise IndexError.
        return selected_text

    first_selected_text_word = selected_words[0]
    is_single_char = len(first_selected_text_word) == 1
    if is_single_char and first_selected_text_word not in text.lower().split():
        return " ".join(selected_words[1:])

    return selected_text

def modified_text_to_bert_input(text, tokenizer):
    """Round-trip `text` through the tokenizer and return the re-joined string.

    Args:
        text: Raw input string.
        tokenizer: Object exposing `tokenize(str)` and
            `convert_tokens_to_string(tokens)`.

    Returns:
        The string produced by converting the tokenizer's tokens back to text.
    """
    word_pieces = tokenizer.tokenize(text)
    return tokenizer.convert_tokens_to_string(word_pieces)

In [None]:
# Tokenizer loaded from the local BERT directory.
tokenizer = BertTokenizer.from_pretrained('/kaggle/input/bertbaseuncased', max_len=192)

# Pass both text columns through the tokenizer round-trip so that the strings
# compared later ('text' vs 'selected_text') have been normalised the same way.
train_dataset['text'] = train_dataset['text'].apply(
    lambda raw: modified_text_to_bert_input(raw, tokenizer)
)
train_dataset['selected_text'] = train_dataset['selected_text'].apply(
    lambda raw: modified_text_to_bert_input(raw, tokenizer)
)

In [None]:
# Strip a leading one-character fragment from 'selected_text' where present.
train_dataset['selected_text'] = train_dataset.apply(remove_leading_truncuated_word, axis=1)

# Turn empty selections into NaN and drop those rows.
# The result of replace() is assigned back explicitly: calling
# `df[col].replace(..., inplace=True)` operates on a temporary column object
# and does not update the frame under pandas Copy-on-Write.
train_dataset['selected_text'] = train_dataset['selected_text'].replace('', np.nan)
train_dataset = train_dataset.dropna()

## Check if the 'selected_text' is a substring of 'text'

In [None]:
def is_substring(row):
    """Return True when the row's 'selected_text' occurs inside its 'text'.

    Args:
        row: A pandas Series (label-indexed) holding string values under the
            labels 'selected_text' and 'text'.
    """
    needle = row.loc['selected_text']
    haystack = row.loc['text']
    return needle in haystack

# Flag, row by row, whether 'selected_text' is contained in 'text' ...
train_dataset['is_substring'] = train_dataset.apply(is_substring, axis=1)
# ... and show how many rows are True / False.
train_dataset['is_substring'].value_counts()

## Investigate dataset with neutral sentiment

In [None]:
# Work on an explicit copy of the neutral rows: adding a column to a filtered
# view of `train_dataset` triggers SettingWithCopyWarning.
neutral_dataset = train_dataset[train_dataset['sentiment'] == 'neutral'].copy()
# True where the labelled span is the entire tweet.
neutral_dataset['is_same'] = neutral_dataset['selected_text'] == neutral_dataset['text']
neutral_dataset['is_same'].value_counts()

As we can see, most of the data with neutral sentiment will have 'text' == 'selected_text'.

We therefore do not feed neutral-sentiment rows to the model; for those rows we simply copy the whole 'text' as 'selected_text'.

In [None]:
# Keep only non-neutral rows. `.copy()` makes the result an independent frame,
# so the columns added to `train_dataset` in later cells are not written onto a
# filtered view (which raises SettingWithCopyWarning).
train_dataset = train_dataset[train_dataset['sentiment'] != 'neutral'].copy()

# Dataset
## Helper Function

In [None]:
def get_start_and_end_char_index(row):
    """Locate `selected_text` inside `text` at character level.

    Args:
        row: Mapping-like object with string fields 'text' and 'selected_text'.

    Returns:
        tuple[int, int]: `(start, end)` where `start` is the index of the first
        occurrence of 'selected_text' in 'text' and `end` is
        `start + len(selected_text)` (i.e. exclusive). When 'selected_text'
        does not occur in 'text', `str.find` yields -1, so the result is
        `(-1, len(selected_text) - 1)`.
    """
    span = row['selected_text']
    first_char = row['text'].find(span)
    return first_char, first_char + len(span)

def target_answer(row, tokenizer):
    """Convert the character span of the answer into sub-token indices.

    The text is split into whitespace-delimited words, each word is split into
    sub-tokens with `tokenizer.tokenize`, and the character span
    [`start_char_idx`, `end_char_idx`) is mapped onto the resulting sub-token
    sequence. The span is then tightened to the exact tokenization of
    'selected_text' when such a match exists inside it.

    Args:
        row: Mapping-like object with 'text', 'selected_text',
            'start_char_idx' and 'end_char_idx' (end is exclusive).
        tokenizer: Object exposing `tokenize(str)` and
            `convert_tokens_to_string(tokens)`.

    Returns:
        tuple: `(start_token_idx, end_token_idx, answer_text)` where the token
        indices are inclusive positions in the sub-token sequence of 'text'
        and `answer_text` is the string rebuilt from those sub-tokens.
    """
    def is_whitespace(c):
        return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

    # Split into words and remember, for every character, which word it is in.
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True

    for c in row['text']:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    start_idx = row['start_char_idx']
    end_idx = row['end_char_idx']

    # Word indices of the first and last character of the answer.
    start_position = char_to_word_offset[start_idx]
    end_position = char_to_word_offset[end_idx - 1]

    # orig_to_tok_index[i] is the position of word i's first sub-token.
    orig_to_tok_index = []
    all_doc_tokens = []
    for token in doc_tokens:
        orig_to_tok_index.append(len(all_doc_tokens))
        all_doc_tokens.extend(tokenizer.tokenize(token))

    tok_start_position = orig_to_tok_index[start_position]
    if end_position < len(doc_tokens) - 1:
        # Last sub-token of the end word = one before the next word's first.
        tok_end_position = orig_to_tok_index[end_position + 1] - 1
    else:
        tok_end_position = len(all_doc_tokens) - 1

    # Improve span answer: look for the exact tokenized answer inside the span.
    tok_answer_text = tokenizer.tokenize(row['selected_text'])

    for new_start in range(tok_start_position, tok_end_position + 1):
        for new_end in range(tok_end_position, tok_start_position - 1, -1):
            text_span = all_doc_tokens[new_start : new_end + 1]
            if text_span == tok_answer_text:
                return new_start, new_end, tokenizer.convert_tokens_to_string(text_span)

    # No exact match: fall back to the whole-word span.
    text_span = all_doc_tokens[tok_start_position : tok_end_position + 1]

    return tok_start_position, tok_end_position, tokenizer.convert_tokens_to_string(text_span)

## Get 'start_char_idx' & 'end_char_idx' for each data
'start_char_idx' is the position of the first character of 'selected_text' in 'text'

'end_char_idx' is the position one past the last character of 'selected_text' in 'text' (i.e. `start_char_idx + len(selected_text)`)

In [None]:
# Apply get_start_and_end_char_index to every row; result_type='expand' spreads
# the returned (start, end) tuple across the two new columns.
train_dataset[['start_char_idx', 'end_char_idx']] = train_dataset.apply(get_start_and_end_char_index, axis=1, result_type='expand')

## Get 'target_start_idx' & 'target_end_idx' for each data
'target_start_idx' is the position of the first token of 'selected_text' in 'text'

'target_end_idx' is the position of the last token of 'selected_text' in 'text'

In [None]:
# Apply target_answer to every row (progress_apply is the tqdm-enabled apply
# registered by tqdm.pandas() in the imports cell); the returned 3-tuple is
# expanded into the token start index, token end index and rebuilt answer text.
train_dataset[['target_start_idx', 'target_end_idx', 'target_answer']] = train_dataset.progress_apply(target_answer, axis=1, result_type='expand', args=(tokenizer,))
# Display the frame with the new columns.
train_dataset

In [None]:
# Per-column count of missing values after the target columns were added.
train_dataset.isna().sum()

In [None]:
# Compare the labelled span with the span rebuilt from the token indices and
# count how many rows agree / disagree.
train_dataset['is_same'] = train_dataset['selected_text'].eq(train_dataset['target_answer'])
train_dataset['is_same'].value_counts()

From the results above, we can see that some 'target_answer' values are not the same as 'selected_text'.

This might be due to truncated words in 'selected_text' that do not appear among the tokens of 'text'.

## Pytorch Dataset

In [None]:
class TweetDataset(Dataset):
    """Torch dataset yielding encoded (sentiment, text) pairs with span targets.

    Args:
        df: DataFrame with columns 'text', 'selected_text',
            'target_start_idx', 'target_end_idx' and 'sentiment'.
        tokenizer: Object exposing
            `encode_plus(first, second, pad_to_max_length=...)` that returns a
            mapping with 'input_ids', 'token_type_ids' and 'attention_mask'.
    """

    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer

        # Columns are materialised as plain Python lists for positional access.
        self.text = df['text'].tolist()
        self.selected_text = df['selected_text'].tolist()
        self.target_start_idx = df['target_start_idx'].tolist()
        self.target_end_idx = df['target_end_idx'].tolist()
        self.sentiment = df['sentiment'].tolist()

    def __len__(self):
        """Number of examples."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return the tensors for example `idx` as a dict."""
        encoded = self._process_data(
            self.text[idx],
            self.sentiment[idx],
            self.target_start_idx[idx],
            self.target_end_idx[idx],
        )
        tensor_dtypes = (
            ('input_ids', torch.long),
            ('token_type_ids', torch.long),
            ('attention_mask', torch.float),
            ('target_start_idx', torch.long),
            ('target_end_idx', torch.long),
        )
        return {key: torch.tensor(encoded[key], dtype=dtype) for key, dtype in tensor_dtypes}

    def _process_data(self, text, sentiment, target_start_idx, target_end_idx):
        """Encode the (sentiment, text) pair and attach the shifted targets.

        Both target indices are shifted by 3 before being stored on the
        encoding (presumably to skip the tokens placed before the text, e.g.
        [CLS] sentiment [SEP] — confirm against the tokenizer).
        """
        encoded = self.tokenizer.encode_plus(sentiment, text, pad_to_max_length=True)

        shifted = {
            'target_start_idx': target_start_idx + 3,
            'target_end_idx': target_end_idx + 3,
        }
        for key, value in shifted.items():
            encoded[key] = value

        return encoded

# Model

In [None]:
class SentimentExtractionBert(nn.Module):
    """BERT encoder with a linear head that scores span start/end per token.

    Every token representation is concatenated with the representation of the
    token at position 1 before being scored. With inputs built as
    `encode_plus(sentiment, text)` (as done elsewhere in this notebook) that
    position holds the sentiment word.
    """

    def __init__(self):
        super(SentimentExtractionBert, self).__init__()
        self.num_labels = 2

        self.bert = BertModel.from_pretrained('/kaggle/input/bertbaseuncased/')
        self.dropout = nn.Dropout(0.2)
        # Take the width from the loaded config instead of hard-coding 768,
        # so the head always matches the encoder that was actually loaded.
        hidden_size = self.bert.config.hidden_size
        self.outputs = nn.Linear(hidden_size * 2, self.num_labels)

        # Initialise weight for the outputs layer
        nn.init.zeros_(self.outputs.bias)
        nn.init.normal_(self.outputs.weight, mean=0.0, std=0.02)

    def forward(self,
        input_ids,
        attention_mask,
        token_type_ids,
    ):
        """Compute start and end logits for every input position.

        Args:
            input_ids, attention_mask, token_type_ids: Batched tensors passed
                straight to the underlying BERT model.

        Returns:
            tuple: `(start_logits, end_logits)`, each with one score per
            position of the input sequence (trailing label axis squeezed out).
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
            )

        sequence_output = outputs[0]
        # Broadcast the position-1 representation over the actual sequence
        # length rather than a fixed 192, so inputs of any length work.
        seq_len = sequence_output.size(1)
        sentiment_rep = sequence_output[:, 1].unsqueeze(1).repeat(1, seq_len, 1)
        sent_seq_output = torch.cat([sequence_output, sentiment_rep], dim=-1)

        logits = self.outputs(self.dropout(sent_seq_output))
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

# Training
## Hyperparameters

In [None]:
FOLD = 5

EPOCHS = 3
BATCH_SIZE = 32
LEARNING_RATE = 3e-5

# Optimizer Hyperparams
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 0

# Fix the random seeds so that the output-layer initialisation, dropout and
# DataLoader shuffling are repeatable across Restart & Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Split the training frame into FOLD folds. No shuffle argument is passed, so
# scikit-learn's default splitting behaviour applies.
kf = KFold(n_splits=FOLD)
# Materialise the splits once: kfold_list[i] is the (train_idx, valid_idx) pair
# of positional indices for fold i.
kfold_list = list(kf.split(train_dataset))

In [None]:
def train_one_epoch(model, train_dl, optimizer, scheduler):
    """Run one pass over `train_dl`, updating `model` after every batch.

    The loss of a batch is the sum of the cross-entropy of the start logits
    and the cross-entropy of the end logits against their target indices.
    The scheduler is stepped once per batch, right after the optimizer.

    Args:
        model: Module called as `model(input_ids, attention_mask, token_type_ids)`
            and returning `(start_logits, end_logits)`.
        train_dl: Iterable of batches; each batch is a dict with the keys
            'input_ids', 'token_type_ids', 'attention_mask',
            'target_start_idx' and 'target_end_idx'.
        optimizer: Optimizer for the model parameters.
        scheduler: Learning-rate scheduler.

    Returns:
        float: Mean batch loss over the epoch.
    """
    batch_keys = (
        'input_ids',
        'token_type_ids',
        'attention_mask',
        'target_start_idx',
        'target_end_idx',
    )
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    running_loss = 0
    for batch in tqdm(train_dl):
        # Move every tensor of the batch onto the training device.
        on_device = {key: batch[key].to(device) for key in batch_keys}

        optimizer.zero_grad()
        start_logits, end_logits = model(
            on_device['input_ids'],
            on_device['attention_mask'],
            on_device['token_type_ids'],
        )
        start_loss = loss_fn(start_logits, on_device['target_start_idx'])
        end_loss = loss_fn(end_logits, on_device['target_end_idx'])
        batch_loss = start_loss + end_loss

        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(train_dl)

In [None]:
def train(fold):
    """Fine-tune a fresh model on the training split of `fold` and save it.

    Builds the dataset/loader from the fold's training indices, sets up AdamW
    (no weight decay for biases and LayerNorm weights) with a linear
    warmup/decay schedule, trains for EPOCHS epochs while appending one line
    per epoch to ./log.txt, and finally writes the weights to
    'ft_ckpt_<fold>.pth'.

    Args:
        fold: Index into `kfold_list` selecting which split to train on.
    """
    model = SentimentExtractionBert().to(device)

    train_idx = kfold_list[fold][0]
    train_ds = TweetDataset(train_dataset.iloc[train_idx], tokenizer)
    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

    # Partition parameters by whether weight decay should apply to them.
    no_decay = ["bias", "LayerNorm.weight"]
    decayed_params, undecayed_params = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            undecayed_params.append(param)
        else:
            decayed_params.append(param)

    optimizer_grouped_parameters = [
        {"params": decayed_params, "weight_decay": WEIGHT_DECAY},
        {"params": undecayed_params, "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, betas=(0.9, 0.98), eps=1e-8)

    # One scheduler step per batch, so the schedule spans all batches of all epochs.
    t_total = len(train_dl) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=t_total)

    for epoch in range(EPOCHS):
        training_loss = train_one_epoch(model, train_dl, optimizer, scheduler)
        current_lr = optimizer.param_groups[0]['lr']
        log_line = "Fold {}, Epoch {}, Learning Rate: {},Loss: {}\n".format(fold, epoch, current_lr, training_loss)
        with open("./log.txt", 'a') as f:
            f.write(log_line)

    torch.save(model.state_dict(), 'ft_ckpt_{}.pth'.format(fold))

In [None]:
# Train one model per fold; each call saves its own 'ft_ckpt_<fold>.pth' checkpoint.
for fold in range(FOLD):
    train(fold)

# Testing

In [None]:
def load_fold_model(fold):
    """Rebuild the model for `fold` from its checkpoint, ready for inference.

    Args:
        fold: Fold number; the weights are read from 'ft_ckpt_<fold>.pth'.

    Returns:
        SentimentExtractionBert: Model on `device`, switched to eval mode.
    """
    model = SentimentExtractionBert()
    # map_location lets checkpoints saved from a GPU run load on a CPU-only machine.
    state_dict = torch.load("ft_ckpt_{}.pth".format(fold), map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model


# One model per fold; the individual names are kept because the prediction
# code refers to model0 ... model4 directly.
model0, model1, model2, model3, model4 = (load_fold_model(fold_idx) for fold_idx in range(5))


In [None]:
def process_test_data(row):
    """Encode one test row into single-example batch tensors.

    Args:
        row: Mapping-like object with 'sentiment' and 'text'.

    Returns:
        dict: 'input_ids' and 'token_type_ids' as long tensors and
        'attention_mask' as a float tensor, each with a leading axis of size 1
        added via `unsqueeze(0)`.
    """
    encoded = tokenizer.encode_plus(row['sentiment'], row['text'], pad_to_max_length=True)

    field_dtypes = {
        'input_ids': torch.long,
        'token_type_ids': torch.long,
        'attention_mask': torch.float,
    }
    return {
        field: torch.tensor(encoded[field], dtype=dtype).unsqueeze(0)
        for field, dtype in field_dtypes.items()
    }

def postprocessing(text):
    """Undo two spacing artefacts in decoded text.

    Applies, in order: " ` " -> "`" and "* * * *" -> "****".

    Args:
        text: Decoded prediction string.

    Returns:
        str: The text with both substitutions applied.
    """
    substitutions = (
        (" ` ", "`"),
        ("* * * *", "****"),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
    
def test(row):
    if row['sentiment'] == 'neutral':
        predicted_text = row['text']

    else:
        data = process_test_data(row)

        input_ids = data['input_ids'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)

        with torch.no_grad():
            start_logits_0, end_logits_0 = model0(input_ids, attention_mask, token_type_ids)
            start_logits_1, end_logits_1 = model1(input_ids, attention_mask, token_type_ids)
            start_logits_2, end_logits_2 = model2(input_ids, attention_mask, token_type_ids)
            start_logits_3, end_logits_3 = model3(input_ids, attention_mask, token_type_ids)
            start_logits_4, end_logits_4 = model4(input_ids, attention_mask, token_type_ids)

        start_logits = (
                    start_logits_0
                    + start_logits_1
                    + start_logits_2
                    + start_logits_3 
                    + start_logits_4
        ) / 5
        end_logits = (
                end_logits_0
                + end_logits_1
                + end_logits_2
                + end_logits_3
                + end_logits_4
        ) / 5
        
        predicted_start_idx = torch.argmax(torch.softmax(start_logits, dim=1), dim=1).item()
        predicted_end_idx = torch.argmax(torch.softmax(end_logits, dim=1), dim=1).item()

        input_ids = input_ids.squeeze()
        
        predicted_text = tokenizer.decode(input_ids[predicted_start_idx : predicted_end_idx + 1], skip_special_tokens=True)
        predicted_text = postprocessing(predicted_text)
        
    return predicted_text

In [None]:
# Load the competition test set.
test_dataset = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')

# Run the prediction function on every row (with a tqdm progress bar) and
# store the result in a new 'predicted_text' column.
test_dataset['predicted_text'] = test_dataset.progress_apply(test, axis=1)

# Submission

In [None]:
# Start from the sample submission and overwrite its 'selected_text' column
# with the predictions (pandas aligns the assignment on the row index).
submission = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')
submission['selected_text'] = test_dataset['predicted_text']

# Write the submission file without the index column.
submission.to_csv('./submission.csv', index=False)