In [None]:
import numpy as np
import pandas as pd
import os
import random
import torch 
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
import tokenizers
from transformers import RobertaModel, RobertaConfig, RobertaTokenizer
from twittersentimentextactionmodule import *

preprocess<br>
data augmentation<br>
model architecture, loss<br>
training schedule, optimizer<br>
postprocess

In [None]:
# Runtime switches and resource locations used by the rest of the notebook.
#   online -- True: pull 'roberta-base' by name through transformers;
#             False: read the files from the local '../input/roberta-base' directory.
#   e_cuda -- whether models and tensors are moved to the GPU. Derived from the
#             runtime so the notebook also runs on CPU-only machines (a hard-coded
#             True makes every `.cuda()` call fail there).
online = False
e_cuda = torch.cuda.is_available()

if online:
    # Fetch the tokenizer and dump its vocabulary files into the working directory
    # so the `tokenizers` library can load them from the paths below.
    rtokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    rtokenizer.save_vocabulary(".")
    config_vocab_file = 'vocab.json'
    config_merges_file = 'merges.txt'
    config_roberta_config = 'roberta-base'
    config_roberta_model = 'roberta-base'
else:
    config_vocab_file = '../input/roberta-base/vocab.json'
    config_merges_file = '../input/roberta-base/merges.txt'
    config_roberta_config = '../input/roberta-base/config.json'
    config_roberta_model = '../input/roberta-base/pytorch_model.bin'

Hugging Face recommends training a byte-level BPE tokenizer (rather than, say, a WordPiece tokenizer like BERT's) because it builds its vocabulary from an alphabet of single bytes, so every word can be decomposed into tokens (no more `<unk>` tokens!).

In [None]:
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset turning tweet rows into fixed-length model inputs.

    Every item is a dict with:
      * ``ids``     -- token ids laid out as ``0, <sentiment ids>, 2, 2, <tweet ids>, 2``
                       and right-padded with id ``1`` up to ``max_len``;
      * ``masks``   -- 1 where ``ids`` is not the pad id, 0 elsewhere;
      * ``tweet``   -- the lower-cased, whitespace-normalised tweet with one leading space;
      * ``offsets`` -- per-token ``(start, end)`` character offsets into ``tweet``
                       (``(0, 0)`` for every non-tweet token);
      * ``start_idx`` / ``end_idx`` -- only when ``df`` has a ``selected_text``
                       column: first and last token index of the selected span.
    """

    def __init__(self, df, max_len=96):
        """Keep a reference to ``df`` and build the byte-level BPE tokenizer.

        Args:
            df: frame with ``text`` and ``sentiment`` columns, optionally ``selected_text``.
            max_len: length every ``ids`` / ``masks`` / ``offsets`` sequence is padded
                (or truncated) to.
        """
        self.df = df
        self.max_len = max_len
        # Targets are produced only when the label column is present.
        self.labeled = 'selected_text' in df
        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file = config_vocab_file,
            merges_file = config_merges_file,
            lowercase = True,
            add_prefix_space = True)

    def __getitem__(self, index):
        """Return the input dict (plus target indices when labeled) for row ``index``."""
        row = self.df.iloc[index]

        ids, masks, tweet, offsets = self.get_input_data(row)
        data = {'ids': ids, 'masks': masks, 'tweet': tweet, 'offsets': offsets}

        if self.labeled:
            start_idx, end_idx = self.get_target_idx(row, tweet, offsets)
            data['start_idx'] = start_idx
            data['end_idx'] = end_idx

        return data

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def get_input_data(self, row):
        """Tokenize one row into ``(ids, masks, tweet, offsets)``.

        Args:
            row: object exposing ``text`` and ``sentiment`` string attributes.

        Returns:
            ``ids`` and ``masks`` as 1-D tensors of length ``max_len``, the normalised
            ``tweet`` string, and ``offsets`` as a ``(max_len, 2)`` tensor.
        """
        tweet = " " + " ".join(row.text.lower().split())
        encoding = self.tokenizer.encode(tweet)
        sentiment_id = self.tokenizer.encode(row.sentiment).ids
        ids = [0] + sentiment_id + [2, 2] + encoding.ids + [2]
        # One (0, 0) placeholder per prefix token so offsets stay aligned with ids
        # even if the sentiment ever encodes to more than one token.
        offsets = [(0, 0)] * (len(sentiment_id) + 3) + list(encoding.offsets) + [(0, 0)]

        # Rows longer than max_len would otherwise yield tensors of differing length
        # and break batch collation; cut them and keep the closing token.
        if len(ids) > self.max_len:
            ids = ids[:self.max_len - 1] + [2]
            offsets = offsets[:self.max_len - 1] + [(0, 0)]

        pad_len = self.max_len - len(ids)
        if pad_len > 0:
            ids += [1] * pad_len
            offsets += [(0, 0)] * pad_len

        ids = torch.tensor(ids)
        masks = (ids != 1).long()
        offsets = torch.tensor(offsets)

        return ids, masks, tweet, offsets

    def get_target_idx(self, row, tweet, offsets):
        """Locate the token span covering ``row.selected_text`` inside ``tweet``.

        Args:
            row: object exposing a ``selected_text`` string attribute.
            tweet: normalised tweet string as returned by ``get_input_data``.
            offsets: per-token ``(start, end)`` character offsets into ``tweet``
                (tensor or any sequence of pairs).

        Returns:
            ``(start_idx, end_idx)`` as Python ints: the first and last token whose
            character range overlaps the first occurrence of the selected text.
            When the selected text is empty or does not occur in ``tweet``, the span
            of all tweet tokens is returned instead (or ``(0, 0)`` if there are none),
            rather than raising ``IndexError``.
        """
        selected_text = " ".join(row.selected_text.lower().split())

        # First occurrence of the normalised selection, as a half-open char range.
        span_start = tweet.find(selected_text) if selected_text else -1
        span_end = span_start + len(selected_text)

        # Plain Python pairs: avoids slicing/indexing with 0-d tensors per token.
        pairs = offsets.tolist() if hasattr(offsets, 'tolist') else list(offsets)

        target_idx = []
        if span_start >= 0:
            for j, (tok_start, tok_end) in enumerate(pairs):
                # A token is part of the target when its char range overlaps the span.
                if max(tok_start, span_start) < min(tok_end, span_end):
                    target_idx.append(j)

        if not target_idx:
            # Selection not locatable: fall back to every token that maps to tweet text.
            target_idx = [j for j, (tok_start, tok_end) in enumerate(pairs)
                          if tok_end > tok_start] or [0]

        start_idx = target_idx[0]
        end_idx = target_idx[-1]

        return start_idx, end_idx

In [None]:
def get_train_val_loaders(df, train_idx, val_idx, batch_size):
    """Build the train/validation DataLoaders for one cross-validation fold.

    Args:
        df: full training frame.
        train_idx: positional row indices forming the training split.
        val_idx: positional row indices forming the validation split.
        batch_size: batch size used by both loaders.

    Returns:
        Dict with keys ``"train"`` and ``"val"``. The train loader shuffles and
        drops the last incomplete batch; the val loader does neither.
    """
    make_loader = torch.utils.data.DataLoader

    train_split = df.iloc[train_idx]
    val_split = df.iloc[val_idx]

    return {
        "train": make_loader(
            TweetDataset(train_split),
            batch_size=batch_size,
            shuffle=True,
            num_workers=2,
            drop_last=True),
        "val": make_loader(
            TweetDataset(val_split),
            batch_size=batch_size,
            shuffle=False,
            num_workers=2),
    }

def get_test_loader(df, batch_size):
    """Wrap ``df`` in a TweetDataset and return an ordered (unshuffled) DataLoader."""
    test_dataset = TweetDataset(df)
    return torch.utils.data.DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        num_workers=2,
        shuffle=False)

In [None]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Sum of the cross-entropy losses of the start and end position predictions.

    Each logits tensor is scored against its matching target positions with
    ``nn.CrossEntropyLoss`` (default settings); the two results are added.
    """
    criterion = nn.CrossEntropyLoss()
    return criterion(start_logits, start_positions) + criterion(end_logits, end_positions)

In [None]:
def get_selected_text(text, start_idx, end_idx, offsets):
    """Rebuild the substring of ``text`` covered by tokens ``start_idx..end_idx``.

    ``offsets[k]`` is the ``(start, end)`` character range of token ``k``. The
    pieces are concatenated in order; a single space is inserted after a token
    whenever the following token starts beyond the current token's end. An empty
    string is returned when ``start_idx > end_idx``.
    """
    n_tokens = len(offsets)
    pieces = []
    for tok in range(start_idx, end_idx + 1):
        begin, end = offsets[tok][0], offsets[tok][1]
        pieces.append(text[begin: end])
        following = tok + 1
        if following < n_tokens and end < offsets[following][0]:
            pieces.append(" ")
    return "".join(pieces)

def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting word sets.

    Returns:
        A float in ``[0, 1]``. When neither string contains any word the sets are
        identical, so ``1.0`` is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return float(len(a & b)) / union_size

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    """Jaccard score between the true span and the span predicted from the logits.

    The predicted start/end tokens are the argmax of ``start_logits`` and
    ``end_logits``. If the predicted start lies after the predicted end, the
    whole ``text`` is used as the prediction.
    """
    start_pred, end_pred = np.argmax(start_logits), np.argmax(end_logits)

    true = get_selected_text(text, start_idx, end_idx, offsets)
    if start_pred <= end_pred:
        pred = get_selected_text(text, start_pred, end_pred, offsets)
    else:
        # Inconsistent prediction: fall back to the full text.
        pred = text

    return jaccard(true, pred)

In [None]:
def train_model(model, dataloaders_dict, criterion, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs``, validating after every training pass.

    Args:
        model: module called as ``model(ids, masks)`` and returning
            ``(start_logits, end_logits)``.
        dataloaders_dict: dict with ``'train'`` and ``'val'`` loaders whose batches
            carry ``ids``, ``masks``, ``tweet``, ``offsets``, ``start_idx``, ``end_idx``.
        criterion: callable ``(start_logits, end_logits, start_idx, end_idx) -> loss``.
        optimizer: optimizer stepping ``model``'s parameters.
        num_epochs: number of train+val passes.

    Prints one table row per phase with the mean loss and mean Jaccard score.
    The means are taken over the samples actually processed, because a loader
    built with ``drop_last=True`` yields fewer samples than ``len(dataset)``.
    The model is left in eval mode (the last phase run is ``'val'``).
    """
    if e_cuda:
        model.cuda()

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            # train(True) enables training mode, train(False) is eval mode.
            model.train(is_train)

            epoch_loss = 0.0
            epoch_jaccard = 0.0
            n_seen = 0  # samples processed in this phase

            for data in dataloaders_dict[phase]:
                ids = data['ids']
                masks = data['masks']
                tweet = data['tweet']
                offsets = data['offsets'].numpy()
                start_idx = data['start_idx']
                end_idx = data['end_idx']

                if e_cuda:
                    ids = ids.cuda()
                    masks = masks.cuda()
                    start_idx = start_idx.cuda()
                    end_idx = end_idx.cuda()

                optimizer.zero_grad()

                # Gradients are tracked only while training.
                with torch.set_grad_enabled(is_train):
                    start_logits, end_logits = model(ids, masks)
                    loss = criterion(start_logits, end_logits, start_idx, end_idx)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                batch_n = len(ids)
                n_seen += batch_n
                # criterion returns a per-batch value; weight it by the batch size.
                epoch_loss += loss.item() * batch_n

                start_true = start_idx.cpu().detach().numpy()
                end_true = end_idx.cpu().detach().numpy()
                start_probs = torch.softmax(start_logits.detach(), dim=1).cpu().numpy()
                end_probs = torch.softmax(end_logits.detach(), dim=1).cpu().numpy()

                for i in range(batch_n):
                    epoch_jaccard += compute_jaccard_score(
                        tweet[i],
                        start_true[i],
                        end_true[i],
                        start_probs[i],
                        end_probs[i],
                        offsets[i])

            # Guard against an empty loader so the report never divides by zero.
            denom = max(n_seen, 1)
            epoch_loss = epoch_loss / denom
            epoch_jaccard = epoch_jaccard / denom

            print('|  {}/{}  | {:^5} | {:.4f} |  {:.4f} |'.format(
                epoch + 1, num_epochs, phase, epoch_loss, epoch_jaccard))

In [None]:
def init_logits(test_loader):
    """Create empty per-batch accumulators for start and end predictions.

    Args:
        test_loader: loader (or any iterable) of test batches.

    Returns:
        ``(s_logits, e_logits)``: two dicts mapping every batch index
        ``0..n_batches-1`` to its own empty list.
    """
    try:
        # Sized loaders report their batch count directly; iterating instead would
        # load and tokenize the whole test set just to count batches.
        n_batches = len(test_loader)
    except TypeError:
        # Unsized iterables: fall back to counting by iteration.
        n_batches = sum(1 for _ in test_loader)

    s_logits = {i: [] for i in range(n_batches)}
    e_logits = {i: [] for i in range(n_batches)}
    return s_logits, e_logits

In [None]:
# tf.keras.layers.Conv1D(
#     filters, kernel_size, strides=1, padding='valid', data_format='channels_last',
#     dilation_rate=1, activation=None, use_bias=True,
#     kernel_initializer='glorot_uniform', bias_initializer='zeros',
#     kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None,
#     kernel_constraint=None, bias_constraint=None, **kwargs
# )        
# 768 -> filters     -> Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution).
# 2   -> kernel_size -> An integer or tuple/list of a single integer, specifying the length of the 1D convolution window.
# tf.keras.layers.Conv1D(768, 2, padding='same')(x1)

# in_channels (int)  – Number of channels in the input image
# out_channels (int) – Number of channels produced by the convolution
# kernel_size (int or tuple) – Size of the convolving kernel
# torch.nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros')

# torch.nn.Linear(in_features, out_features, bias=True)

# x1 = tf.keras.layers.Dropout(0.1)(x[0])
# x1 = tf.keras.layers.Conv1D(768, 2, padding='same')(x1)
# x1 = tf.keras.layers.LeakyReLU()(x1)
# x1 = tf.keras.layers.Dense(1)(x1)
# x1 = tf.keras.layers.Flatten()(x1)
# x1 = tf.keras.layers.Activation('softmax')(x1)

# hs[-1] torch.Size([32, 96, 768])
## torch.Size([3, 32, 96, 768])
#x = x.permute(1, 2, 3, 0) # channels to last dim
#print(x.size())

In [None]:

class TweetModel(nn.Module):
    """RoBERTa encoder with two convolutional heads scoring span start and end.

    The last three hidden-state layers are averaged, passed through dropout, and
    fed to two independent ``Conv1d -> leaky_relu -> Linear`` heads that emit one
    logit per position for the span start and the span end respectively.
    """

    def __init__(self, p_drop, l_std, m_size):
        """
        Args:
            p_drop: dropout probability applied to the averaged hidden states.
            l_std: standard deviation used to initialise the linear head weights.
            m_size: number of output channels of each convolution.
        """
        super(TweetModel, self).__init__()

        config = RobertaConfig.from_pretrained(
            config_roberta_config, output_hidden_states=True)
        self.roberta = RobertaModel.from_pretrained(
            config_roberta_model, config=config)

        # nn.Conv1d expects (N, C, L): batch size, channels, sequence length.
        # kernel_size=2 with the default padding of 0 shortens the sequence by one
        # position (e.g. 96 -> 95).
        # NOTE(review): the original notes mention padding='same'; confirm whether
        # the one-position-shorter output is intended.
        self.dropout = nn.Dropout(p_drop)
        self.conv = nn.Conv1d(config.hidden_size, m_size, kernel_size=2)
        self.conv2 = nn.Conv1d(config.hidden_size, m_size, kernel_size=2)

        # normal_(bias, 0) passes 0 as the mean and leaves std at its default.
        # NOTE(review): a constant-zero bias may have been intended — confirm.
        self.fc = nn.Linear(m_size, 1)
        nn.init.normal_(self.fc.weight, std=l_std)
        nn.init.normal_(self.fc.bias, 0)

        self.fc2 = nn.Linear(m_size, 1)
        nn.init.normal_(self.fc2.weight, std=l_std)
        nn.init.normal_(self.fc2.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)``, one logit per output position.

        With inputs of shape (batch, seq_len) each returned tensor has shape
        (batch, seq_len - 1) because of the unpadded kernel-size-2 convolution.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Index instead of tuple-unpacking: position 2 holds the hidden states both
        # when the model returns a plain tuple and when it returns an output object
        # (iterating such an object yields field names, not tensors).
        hs = outputs[2]

        # Average the last three layers -> (batch, seq_len, hidden).
        x = torch.stack([hs[-1], hs[-2], hs[-3]])
        x = torch.mean(x, 0)
        # Conv1d wants channels before length -> (batch, hidden, seq_len).
        x = self.dropout(x).transpose(1, 2)

        # Back to (batch, seq_len - 1, m_size) for the linear heads.
        x1 = F.leaky_relu(self.conv(x)).transpose(1, 2)
        x2 = F.leaky_relu(self.conv2(x)).transpose(1, 2)

        # Linear(m_size, 1) then squeeze -> (batch, seq_len - 1).
        start_logits = self.fc(x1).squeeze(-1)
        end_logits = self.fc2(x2).squeeze(-1)

        return start_logits, end_logits

In [None]:
# Hyper-parameters for the cross-validated training run.
seed = 93
num_epochs = 3
batch_size = 32
n_splits = 10
p_drop = 0.1
learning_rate=3e-5
l_std=0.03
m_size=128

# Seed every RNG in use (python, numpy, torch CPU and CUDA) so that shuffling,
# dropout and head initialisation are repeatable, not only the fold split.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Echo the configuration as a small markdown-style table.
print(f'| param  |        |')
print(f'|--------|--------|')
print(f'| seed   |  {seed}    |')
print(f'| epochs |   {num_epochs}    |')
print(f'| batch  |  {batch_size}    |')
print(f'| splits |  {n_splits}    |')
print(f'| p_drop |   {p_drop}  |')
print(f'| lr     | {learning_rate}  |')
print(f'| std    |  {l_std}  |')
print(f'| m_size | {m_size}    |')

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
# %%time

# Cast the text columns to str so missing values do not break `.lower()` later.
train_df = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
train_df['text'] = train_df['text'].astype(str)
train_df['selected_text'] = train_df['selected_text'].astype(str)

test_df = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
test_df['text'] = test_df['text'].astype(str)
test_loader = get_test_loader(test_df, batch_size)

# One empty list per test batch; every fold appends its predictions to them.
s_logits, e_logits = init_logits(test_loader)

# Train one fresh model per stratified fold and collect its test-set predictions.
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df.sentiment)):
    # print_and_log is not defined in this notebook; presumably it comes from the
    # star import at the top — verify.
    print_and_log(f'Fold: {fold+1}')
    print('| Epoch |       |  Loss  | Jaccard |')
    print('|-------|-------|--------|---------|')

    model = TweetModel(p_drop, l_std, m_size)
    # https://www.fast.ai/2018/07/02/adam-weight-decay/
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.999))
    criterion = loss_fn
    dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx, batch_size)

    train_model(
        model,
        dataloaders_dict,
        criterion,
        optimizer,
        num_epochs)

    # Switch to eval mode explicitly (disables dropout) instead of relying on the
    # mode train_model happens to leave the model in.
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(test_loader):
            ids = data['ids']
            masks = data['masks']

            if e_cuda:
                ids = ids.cuda()
                masks = masks.cuda()

            start_logits, end_logits = model(ids, masks)
            # Store per-position probabilities; folds are averaged at prediction time.
            s_logits[i].append(torch.softmax(start_logits, dim=1).cpu().detach().numpy())
            e_logits[i].append(torch.softmax(end_logits, dim=1).cpu().detach().numpy())

In [None]:
# Persist the per-batch, per-fold prediction dicts next to the notebook.
# NOTE(review): these are dicts of lists, not plain arrays, so reading them back
# presumably needs np.load(..., allow_pickle=True) — confirm before relying on it.
for out_name, logits_by_batch in (('s_logits.npy', s_logits), ('e_logits.npy', e_logits)):
    np.save(out_name, logits_by_batch)

In [None]:
# %%time

# Turn the fold-averaged start/end probabilities into one predicted string per tweet.
predictions = []

for batch_no, data in enumerate(test_loader):
    # Only the text and its offsets are needed here; nothing is moved to the GPU,
    # so this cell also runs when CUDA is unavailable.
    tweet = data['tweet']
    offsets = data['offsets'].numpy()
    # Average over the per-fold entries stored for this batch.
    start_probs = np.mean(s_logits[batch_no], axis=0)
    end_probs = np.mean(e_logits[batch_no], axis=0)
    for j in range(len(tweet)):
        start_pred = np.argmax(start_probs[j])
        end_pred = np.argmax(end_probs[j])
        if start_pred > end_pred:
            # Inconsistent span: fall back to the whole tweet.
            pred = tweet[j]
        else:
            pred = get_selected_text(tweet[j], start_pred, end_pred, offsets[j])
        predictions.append(pred)

In [None]:
# Fill the sample submission with the predictions, post-process, and write it out.
sub_df = pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')
sub_df['selected_text'] = predictions

# Punctuation clean-ups applied, in this order, to single-word predictions only.
# NOTE(review): '..' is collapsed before '...', so the third rule rarely matches —
# confirm the order is intentional.
for old, new in (('!!!!', '!'), ('..', '.'), ('...', '.')):
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x, old=old, new=new: x.replace(old, new) if len(x.split())==1 else x)

sub_df.to_csv('submission.csv', index=False)
sub_df.head()