In [None]:
# Environment setup: upgrade pip, then install the `tokenizers` package imported further down.
# NOTE(review): no versions are pinned here — consider pinning for reproducible runs.
!pip install --upgrade pip
!pip install tokenizers

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc
import os
import random
import time
import warnings

warnings.filterwarnings('ignore')

# torch modules
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# transformer modules
from transformers import RobertaConfig
from transformers import RobertaTokenizer
from transformers import RobertaModel
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

# tokenizer modules
import tokenizers

# sklearn modules
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [None]:
def seed_everything(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and,
    when available, every CUDA device), and switches cuDNN into deterministic
    mode so repeated runs with the same seed select the same kernels.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Only affects interpreters started after this point; the hash seed of the
    # running process was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # Benchmark mode auto-tunes the convolution algorithm per input shape
        # and may choose different kernels between runs, so it has to be off
        # for the deterministic flag above to be meaningful.
        torch.backends.cudnn.benchmark = False

# Global seed for the notebook; `seed` is also reused as the `random_state`
# of the StratifiedKFold splitter further down.
seed = 42
seed_everything(seed)

In [None]:
# Scratch cell: check the tensor shapes produced by a "left-pad + Conv1d + Linear" stack.
x = torch.randn(32, 192, 768)  # shape (32, 192, 768)
# Swap dims 1 and 2, then add one zero on the left of the last dim: (32, 768, 193).
x = torch.nn.functional.pad(input=x.transpose(1, 2), pad=(1, 0), mode='constant', value=0)
conv = nn.Conv1d(768, 128, 2, stride=1)  # kernel size 2, stride 1: length 193 -> 192
fc = nn.Linear(128, 1)
cout = conv(x)  # (32, 128, 192)
out = fc(cout.transpose(1, 2))  # Linear acts on the last dim: (32, 192, 128) -> (32, 192, 1)
print(cout.size())
print(cout.transpose(1, 2).size())
print(out.size())

In [None]:
# map text, selected_text and offsets

# offsets - (start, end) character positions of each token in the text
# pseudocode

# get indices of the selected_text in text

def process_data(text, selected_text, sentiment, tokenizer, max_length):
    """
    Encode one (text, selected_text, sentiment) example for span extraction.

    Both strings are lower-cased, whitespace-normalised and prefixed with a
    single space. The first occurrence of the selection inside the text is
    marked character by character, the text is tokenized, and the first and
    last tokens overlapping the marked characters become the span targets.
    The final sequence is laid out as
    ``<s> sentiment </s> </s> text-tokens </s>`` and right-padded up to
    ``max_length``.

    Args:
        text: Full tweet text (converted with ``str()``).
        selected_text: Sub-string of ``text`` to be predicted (converted with ``str()``).
        sentiment: One of 'positive', 'negative', 'neutral' (case-insensitive).
        tokenizer: Object whose ``encode(text)`` result exposes ``.ids`` and
            ``.offsets`` (character ``(start, end)`` pairs per token).
        max_length: Length to pad to. Sequences that are already this long or
            longer are returned as they are (no truncation is performed).

    Returns:
        dict with keys 'input_ids', 'token_type_ids', 'attention_mask',
        'offsets', 'target_start_idx', 'target_end_idx', 'orig_text',
        'orig_selected_text' and 'sentiment'. 'orig_text' and
        'orig_selected_text' hold the normalised strings, and the target
        indices already include the 4 leading special positions.

    Raises:
        ValueError: if the normalised selection is empty, does not occur in
            the normalised text, or overlaps no token.
        KeyError: if ``sentiment`` is not one of the three known labels.
    """
    text = " " + " ".join(str(text).strip().lower().split())
    selected_text = " " + " ".join(str(selected_text).strip().lower().split())
    sentiment = sentiment.lower()

    # Search for the selection without its artificial leading space, so a
    # match may also start in the middle of a word.
    needle = selected_text[1:]
    idx0 = text.find(needle) if needle else -1
    if idx0 < 0:
        raise ValueError(
            f"selected_text {selected_text!r} not found in text {text!r}")
    idx1 = idx0 + len(needle)

    # 1 for every character that belongs to the selection, 0 elsewhere.
    char_targets = [0] * len(text)
    char_targets[idx0:idx1] = [1] * (idx1 - idx0)

    # encoding by tokenizer
    output = tokenizer.encode(text)

    input_ids_orig = list(output.ids)
    offsets_orig = list(output.offsets)

    # Tokens whose character span touches at least one selected character.
    target_idx = [j for j, (offset1, offset2) in enumerate(offsets_orig)
                  if any(char_targets[offset1: offset2])]
    if not target_idx:
        raise ValueError(
            f"no token overlaps selected_text {selected_text!r} in text {text!r}")

    # token ids for sentiments
    SENTIMENT = {'positive': 1313, 'negative': 2430, 'neutral': 7974}

    # <s> sentiment </s> </s> text </s>  (ids 0 and 2 are <s> and </s>)
    input_ids = [0, SENTIMENT[sentiment], 2, 2] + input_ids_orig + [2]
    token_type_ids = [0] * len(input_ids)
    attention_mask = [1] * len(input_ids)
    offsets = [(0, 0)] * 4 + offsets_orig + [(0, 0)]

    # Account for the 4 special positions placed in front of the text tokens.
    target_start_idx = target_idx[0] + 4
    target_end_idx = target_idx[-1] + 4

    # add padding
    padding_length = max_length - len(input_ids)

    if padding_length > 0:
        # RoBERTa's <pad> id is 1 (0 is <s>).
        input_ids += [1] * padding_length
        # The RoBERTa checkpoint has a single token type, so 0 is the only
        # valid index for the token-type embedding.
        token_type_ids += [0] * padding_length
        attention_mask += [0] * padding_length
        offsets += [(0, 0)] * padding_length

    assert len(input_ids) == len(token_type_ids) == len(attention_mask) == len(offsets), \
    f"Lengths mismatch: {len(input_ids)}, {len(token_type_ids)}, {len(attention_mask)}, {len(offsets)} "

    return {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask,
        'offsets': offsets,
        'target_start_idx': target_start_idx,
        'target_end_idx': target_end_idx,
        'orig_text': text,
        'orig_selected_text': selected_text,
        'sentiment': sentiment}


class TweetDataset(Dataset):
    """Dataset that encodes one tweet per item through ``process_data``.

    Args:
        text: Indexable collection of tweet texts.
        selected_text: Indexable collection of target sub-strings, aligned with ``text``.
        sentiment: Indexable collection of sentiment labels, aligned with ``text``.
        tokenizer: Tokenizer forwarded unchanged to ``process_data``.
        max_length: Padding length forwarded unchanged to ``process_data``.
    """

    # Keys of the ``process_data`` result that are converted to long tensors.
    _TENSOR_KEYS = (
        'input_ids',
        'token_type_ids',
        'attention_mask',
        'offsets',
        'target_start_idx',
        'target_end_idx')

    # Keys of the ``process_data`` result that are passed through untouched.
    _RAW_KEYS = ('orig_text', 'orig_selected_text', 'sentiment')

    def __init__(
        self,
        text=None,
        selected_text=None,
        sentiment=None,
        tokenizer=None,
        max_length=None):

        self.text = text
        self.selected_text = selected_text
        self.sentiment = sentiment
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples; works for arrays as well as plain sequences."""
        return len(self.text)

    def __getitem__(self, item):
        """Encode example ``item`` and return it as a dict of tensors and strings."""
        output = process_data(
            text=self.text[item],
            selected_text=self.selected_text[item],
            sentiment=self.sentiment[item],
            tokenizer=self.tokenizer,
            max_length=self.max_length)

        sample = {
            key: torch.tensor(output[key], dtype=torch.long)
            for key in self._TENSOR_KEYS}
        for key in self._RAW_KEYS:
            sample[key] = output[key]
        return sample

In [None]:
# add conv layers to the roberta model

class RobertaBaseModel(nn.Module):
    """RoBERTa encoder with a single linear layer producing start/end logits.

    The hidden states of five encoder layers are averaged, passed through
    dropout and projected to two values per token, which are returned as
    separate start and end logit tensors.
    """

    def __init__(self):
        super(RobertaBaseModel, self).__init__()
        config = RobertaConfig.from_pretrained('roberta-base', output_hidden_states=True)
        self.roberta_model = RobertaModel.from_pretrained('roberta-base', config=config)
        self.fc = nn.Linear(config.hidden_size, 2)
        self.dropout = nn.Dropout(0.2)

        nn.init.normal_(self.fc.weight, std=0.02)
        # normal_(bias, 0) would sample from N(0, 1) (second argument is the
        # mean, std defaults to 1); start the bias at exactly zero instead.
        nn.init.zeros_(self.fc.bias)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Return ``(start_logits, end_logits)`` with the last (size-1) dim squeezed.

        Args:
            input_ids: Token ids passed to the encoder.
            token_type_ids: Optional token type ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
        """
        outputs = self.roberta_model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask)

        # Third output element holds all hidden states; integer indexing works
        # for tuple outputs and for dict-style model outputs alike.
        hs = outputs[2]

        # NOTE(review): hs[6] is indexed from the front, unlike the other
        # four entries — confirm that mixing in this layer is intentional.
        x = torch.stack([hs[-1], hs[-2], hs[-3], hs[-4], hs[6]])
        x = torch.mean(x, 0)
        x = self.dropout(x)
        x = self.fc(x)
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits
    

class RobertaConvHead(nn.Module):
    """RoBERTa encoder followed by two parallel Conv1d + Linear heads.

    The last four hidden states are averaged; one convolutional head produces
    the start logits and an identically shaped second head produces the end
    logits.
    """

    def __init__(self):
        super(RobertaConvHead, self).__init__()
        config = RobertaConfig.from_pretrained('roberta-base')
        config.output_hidden_states = True

        self.roberta = RobertaModel.from_pretrained('roberta-base', config=config)

        _size = 128  # number of output channels of each convolution

        self.conv1a = nn.Conv1d(config.hidden_size, _size, 2)
        self.conv1b = nn.Conv1d(config.hidden_size, _size, 2)

        self.fc1a = nn.Linear(_size, 1)
        self.fc1b = nn.Linear(_size, 1)

        self.relu = nn.ReLU(True)
        self.leaky_relu = nn.LeakyReLU(0.1)

        self.dropout = nn.Dropout(0.2)

        # Weights ~ N(0, 0.02); biases start at exactly zero. The previous
        # normal_(bias, 0) call drew biases from N(0, 1), because the second
        # positional argument is the mean and std defaults to 1.
        for layer in (self.conv1a, self.conv1b, self.fc1a, self.fc1b):
            nn.init.normal_(layer.weight, std=0.02)
            nn.init.zeros_(layer.bias)

    def forward(
        self,
        input_ids,
        relu=False,
        token_type_ids=None,
        attention_mask=None):
        """Return ``(start_logits, end_logits)`` with the last (size-1) dim squeezed.

        Args:
            input_ids: Token ids passed to the encoder.
            relu: When exactly ``True`` the heads use ReLU; otherwise LeakyReLU(0.1).
            token_type_ids: Optional token type ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
        """
        outputs = self.roberta(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask)

        # Third output element holds all hidden states; integer indexing works
        # for tuple outputs and for dict-style model outputs alike.
        hs = outputs[2]

        # stack the last four layer hidden states
        hidden_states = self.dropout(torch.stack([hs[-1], hs[-2], hs[-3], hs[-4]]))

        # mean pool over the stacked layers
        _mp = torch.mean(hidden_states, dim=0)

        # One zero on the left of the last axis, so the kernel-size-2
        # convolution returns the same length it was given before padding.
        _out = nn.functional.pad(
            _mp.transpose(1, 2), pad=(1, 0), mode='constant', value=0)

        activation = self.relu if relu is True else self.leaky_relu

        cout_a = self.dropout(activation(self.conv1a(_out)))
        cout_b = self.dropout(activation(self.conv1b(_out)))

        out_a = self.fc1a(cout_a.transpose(1, 2))
        out_b = self.fc1b(cout_b.transpose(1, 2))

        start_logits = out_a.squeeze(-1)
        end_logits = out_b.squeeze(-1)

        return start_logits, end_logits

In [None]:
def loss_fn(start_logits, end_logits, target_start_idx, target_end_idx):
    """Sum of the cross-entropy losses for the start and the end position.

    Args:
        start_logits: Per-position scores for the span start.
        end_logits: Per-position scores for the span end.
        target_start_idx: Class indices of the true start positions.
        target_end_idx: Class indices of the true end positions.

    Returns:
        Scalar tensor: start loss plus end loss (default mean reduction).
    """
    start_term = nn.functional.cross_entropy(start_logits, target_start_idx)
    end_term = nn.functional.cross_entropy(end_logits, target_end_idx)
    return start_term + end_term


def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting word sets.

    Returns:
        Float in [0, 1]. When neither string contains a word the sets are
        identical (both empty) and 1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return float(len(a & b)) / union_size

# To compute jaccard score we need two strings - text and selected_text
# we have the text we only need to find the selected_text

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    """Jaccard score between the true span and the span predicted for one example.

    The predicted start/end are the argmax positions of the given scores. If
    the predicted start lies after the predicted end, the whole ``text`` is
    used as the prediction.

    Args:
        text: Text the offsets refer to.
        start_idx: True start token position.
        end_idx: True end token position.
        start_logits: Scores over token positions for the start.
        end_logits: Scores over token positions for the end.
        offsets: Per-token ``(start, end)`` character offsets into ``text``.
    """
    best_start = np.argmax(start_logits)
    best_end = np.argmax(end_logits)

    predicted = (
        text
        if best_start > best_end
        else get_selected_text(text, best_start, best_end, offsets))
    reference = get_selected_text(text, start_idx, end_idx, offsets)

    return jaccard(reference, predicted)


def get_selected_text(text, start_idx, end_idx, offsets):
    """Rebuild the text covered by tokens ``start_idx`` .. ``end_idx`` (inclusive).

    Each token contributes ``text[start:end]`` for its offset pair. A single
    space is appended after a token whenever the following offset entry
    starts beyond the current token's end.
    """
    pieces = []
    total = len(offsets)
    for pos in range(start_idx, end_idx + 1):
        tok_start = offsets[pos][0]
        tok_end = offsets[pos][1]
        pieces.append(text[tok_start: tok_end])

        following = pos + 1
        if following < total and tok_end < offsets[following][0]:
            pieces.append(" ")
    return "".join(pieces)


def train_fn(model, dataloader, optimizer=None, scheduler=None, device=None):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Network called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask``; must return ``(start_logits, end_logits)``.
        dataloader: Iterable of batches (dicts of tensors).
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional scheduler, stepped once per batch when given.
        device: Device the batch tensors are moved to.

    Returns:
        List with the loss value (Python float) of every batch.
    """
    model.train()
    batch_losses = []
    batch_keys = (
        'input_ids',
        'token_type_ids',
        'attention_mask',
        'target_start_idx',
        'target_end_idx')

    for i, data in enumerate(dataloader):
        fetched = [data[key] for key in batch_keys]
        input_ids, token_type_ids, attention_mask, target_start_idx, target_end_idx = [
            tensor.to(device) for tensor in fetched]

        optimizer.zero_grad()

        start_logits, end_logits = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask)

        loss = loss_fn(start_logits, end_logits, target_start_idx, target_end_idx)
        batch_losses.append(loss.item())

        # progress report every 100 batches
        if i % 100 == 0:
            it_loss = loss.item()
            print(f"iter: {i} | loss: {it_loss:.5f}")

        loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

    return batch_losses
      

def valid_fn(model, dataloader, optimizer=None, scheduler=None, device=None):
    """Evaluate ``model`` on ``dataloader`` and return the summed Jaccard score.

    The model is switched to evaluation mode first, so dropout is disabled
    while scoring. It is left in that mode when the function returns.

    Args:
        model: Network called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask``; must return ``(start_logits, end_logits)``.
        dataloader: Iterable of batches (dicts) that also carry 'offsets',
            'orig_text', 'target_start_idx' and 'target_end_idx'.
        optimizer: Unused; kept for signature compatibility.
        scheduler: Unused; kept for signature compatibility.
        device: Device the model inputs are moved to.

    Returns:
        Sum (not mean) of the per-example Jaccard scores as a float.
    """
    # Without this, dropout left on by training makes validation scores noisy.
    model.eval()
    jaccard_score = 0.0

    with torch.no_grad():

        for data in dataloader:
            offsets = data['offsets'].numpy()
            text_original = data['orig_text']

            # Targets are only needed for scoring, so they stay on the host
            # as NumPy arrays instead of being moved to the device and back.
            start_idx = data['target_start_idx'].numpy()
            end_idx = data['target_end_idx'].numpy()

            input_ids = data['input_ids'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)

            start_logits, end_logits = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask)

            start_probs = torch.softmax(start_logits, dim=1).cpu().numpy()
            end_probs = torch.softmax(end_logits, dim=1).cpu().numpy()

            # compute jaccard scores
            for j in range(len(input_ids)):
                jaccard_score += compute_jaccard_score(
                    text_original[j],
                    start_idx[j],
                    end_idx[j],
                    start_probs[j],
                    end_probs[j],
                    offsets[j])

    return jaccard_score


def run(
    fold,
    df,
    train_idx,
    valid_idx,
    num_epochs,
    tokenizer,
    max_length,
    batch_size,
    lr,
    model,
    device):
    """Train and validate ``model`` on one cross-validation fold, then save it.

    Args:
        fold: Zero-based fold number (only used in the checkpoint file name).
        df: DataFrame with 'text', 'selected_text' and 'sentiment' columns.
        train_idx: Positional row indices of the training split.
        valid_idx: Positional row indices of the validation split.
        num_epochs: Number of passes over the training split.
        tokenizer: Tokenizer handed to the datasets.
        max_length: Padding length handed to the datasets.
        batch_size: Batch size of both data loaders.
        lr: Learning rate of the AdamW optimizer.
        model: Model to train; expected to be on ``device`` already.
        device: Device the batches are moved to.

    Side effects:
        Prints timing, loss and Jaccard score after every epoch and writes the
        final weights to ``fold{fold+1}_roberta_e{num_epochs}_bs{batch_size}.pth``
        in the current working directory.
    """
    # datasets and dataloaders
    train_df = df.iloc[train_idx]
    valid_df = df.iloc[valid_idx]

    train_dataset = TweetDataset(
        train_df.text.values,
        train_df.selected_text.values,
        train_df.sentiment.values,
        tokenizer=tokenizer,
        max_length=max_length)

    valid_dataset = TweetDataset(
        valid_df.text.values,
        valid_df.selected_text.values,
        valid_df.sentiment.values,
        tokenizer=tokenizer,
        max_length=max_length)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True)

    valid_loader = DataLoader(
        valid_dataset,
        batch_size=batch_size,
        shuffle=False)

    optimizer = AdamW(model.parameters(), lr=lr)

    # Linear decay over all optimizer steps, deliberately without warm-up
    # (the scheduler is stepped once per batch by train_fn).
    num_training_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_training_steps=num_training_steps,
        num_warmup_steps=0)

    # train and eval function
    for epoch in range(num_epochs):
        start = time.time()

        train_loss = train_fn(
            model,
            train_loader,
            optimizer=optimizer,
            scheduler=scheduler,
            device=device)

        end = time.time()
        t = end - start  # training time only; validation is not included

        jaccard_score = valid_fn(model, valid_loader, device=device)

        train_loss = np.mean(train_loss)
        # valid_fn returns a sum, so divide by the number of validation rows.
        jscore = jaccard_score / len(valid_dataset)
        print(f"time: {(t/60):.2f} mins")
        print(f"Epoch: {epoch+1}/{num_epochs} | train loss: {train_loss:.5f} | jaccard score: {jscore:.5f}")

    # Use num_epochs rather than the loop variable, which would be undefined
    # if the loop body never ran (num_epochs == 0).
    torch.save(model.state_dict(),
               f'fold{fold+1}_roberta_e{num_epochs}_bs{batch_size}.pth')

In [None]:
# Number of training epochs per fold (passed to run() as num_epochs).
EPOCHS = 4

# Directory holding the RoBERTa vocab.json / merges.txt files (Kaggle input path).
ROBERTA_PATH = "/kaggle/input/tokenizers/model_dumps/roberta-base"

# Byte-level BPE tokenizer built from the files above; lower-cases its input
# and adds a prefix space.
TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file=f"{ROBERTA_PATH}/vocab.json",
    merges_file=f"{ROBERTA_PATH}/merges.txt",
    lowercase=True,
    add_prefix_space=True)

MAX_LENGTH = 192  # length every encoded example is padded to
BATCH_SIZE = 32   # batch size for both the train and the validation loader
LR = 1e-5 * 3     # learning rate handed to the optimizer

# Training data location (Kaggle input path).
TRAIN_PATH = "/kaggle/input/tweet-sentiment-extraction/train.csv"

# Load only the three columns that are used; missing values become the string 'none'.
df = pd.read_csv(TRAIN_PATH, usecols=['text', 'selected_text', 'sentiment']).fillna('none')
# Shuffle all rows. No random_state is passed here.
df = df.sample(frac=1)
# Uncomment to run on a 1000-row subset.
#df = df[:1000]
print(df.shape)

In [None]:
# Train on the GPU when one is visible, otherwise on the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# 6-fold split stratified on the sentiment label.
skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=seed)

for fold, (train_idx, valid_idx) in enumerate(skf.split(df.text, df.sentiment)):
    # A freshly initialised model per fold, so no weights carry over.
    MODEL = RobertaConvHead().to(DEVICE)

    print(f"Fold: {fold+1}")

    run(
        fold,
        df,
        train_idx,
        valid_idx,
        EPOCHS,
        TOKENIZER,
        MAX_LENGTH,
        BATCH_SIZE,
        LR,
        MODEL,
        DEVICE)

    print("\n")

print("Finished running script.")