In [None]:
import numpy as np
import pandas as pd
import os
import warnings
import random
import torch 
from torch import nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
import tokenizers
from transformers import RobertaModel, RobertaConfig
warnings.filterwarnings('ignore')

# Seed

In [None]:
def seed_everything(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and, when CUDA is
    available, every CUDA device), exports ``PYTHONHASHSEED`` and configures
    cuDNN for deterministic behaviour.

    Args:
        seed_value: Integer seed shared by all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Read by Python at interpreter start-up, so this only influences
    # interpreters launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # These are plain flags and are harmless without a GPU. Benchmark mode
    # lets cuDNN auto-tune and pick a different algorithm from run to run,
    # which would undo the determinism requested on the line above it.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# TODO: Average across many seeds. Yes, later.
# Single seed for this run. It is also interpolated into the checkpoint file
# names that the inference cell loads, so it must match the trained folds.
seed = 4444
seed_everything(seed)

In [None]:
# Directory the tokenizer and model read their files from
# (vocab.json, merges.txt, config.json, pytorch_model.bin).
# NOTE(review): an earlier comment here mentioned a RoBERTa fine-tuned on the
# twitter training dataset; the path name suggests the plain roberta-base
# weights, i.e. no extra language-model fine-tuning here -- confirm.
BASE_ROBERTA = "../input/roberta-base"

# Data Loader

In [None]:
class TweetDataset(torch.utils.data.Dataset):
    """Tokenised tweets (plus span targets when labels exist) for the model.

    Every item is a dict with:
        ids      -- token ids laid out as ``0, sentiment, 2, 2, tweet, 2``
                    and right-padded with id 1 up to ``max_len``
        masks    -- 1 where ``ids`` is not the padding id 1, else 0
        tweet    -- normalised tweet (lower-cased, single-spaced, one leading
                    space)
        offsets  -- per-token (start, end) character offsets into ``tweet``;
                    (0, 0) for the non-tweet positions and for padding
        start_idx, end_idx -- only when ``df`` has a ``selected_text`` column:
                    first / last token position overlapping the selection

    Args:
        df: DataFrame with ``text`` and ``sentiment`` columns and, for
            labelled data, ``selected_text``.
        max_len: Fixed length of every returned sequence.
    """

    def __init__(self, df, max_len=96):
        self.df = df
        self.max_len = max_len
        # For a DataFrame, ``in`` tests the column names.
        self.labeled = 'selected_text' in df
        # TODO: Change with the fine-tuned model.
        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file=f'{BASE_ROBERTA}/vocab.json',
            merges_file=f'{BASE_ROBERTA}/merges.txt',
            lowercase=True,
            add_prefix_space=True)

    def __getitem__(self, index):
        """Return the feature dict (and targets, if labelled) for one row."""
        row = self.df.iloc[index]

        ids, masks, tweet, offsets = self.get_input_data(row)
        data = {
            'ids': ids,
            'masks': masks,
            'tweet': tweet,
            'offsets': offsets,
        }

        if self.labeled:
            start_idx, end_idx = self.get_target_idx(row, tweet, offsets)
            data['start_idx'] = start_idx
            data['end_idx'] = end_idx

        return data

    def __len__(self):
        return len(self.df)

    def get_input_data(self, row):
        """Encode one row into fixed-length id / mask / offset tensors.

        Returns:
            ``(ids, masks, tweet, offsets)`` where the three tensors all have
            ``max_len`` entries along their first dimension.
        """
        tweet = " " + " ".join(row.text.lower().split())
        encoding = self.tokenizer.encode(tweet)
        sentiment_id = self.tokenizer.encode(row.sentiment).ids

        ids = [0] + sentiment_id + [2, 2] + encoding.ids + [2]
        # One (0, 0) per non-tweet token in front of the tweet: the leading 0,
        # the sentiment token(s) and the two separators. Derived from the
        # sentiment length so offsets stay aligned with ids even if a
        # sentiment ever encodes to more than one token.
        prefix_len = len(sentiment_id) + 3
        offsets = [(0, 0)] * prefix_len + list(encoding.offsets) + [(0, 0)]

        if len(ids) > self.max_len:
            # Over-long inputs would otherwise produce ragged tensors that
            # cannot be stacked into a batch; cut them but keep the final 2.
            ids = ids[:self.max_len - 1] + [2]
            offsets = offsets[:self.max_len - 1] + [(0, 0)]
        else:
            pad_len = self.max_len - len(ids)
            ids += [1] * pad_len
            offsets += [(0, 0)] * pad_len

        ids = torch.tensor(ids)
        masks = (ids != 1).long()
        offsets = torch.tensor(offsets)

        return ids, masks, tweet, offsets

    def get_target_idx(self, row, tweet, offsets):
        """Locate the token span covering ``row.selected_text`` in ``tweet``.

        Args:
            row: Object with a ``selected_text`` string attribute.
            tweet: Normalised tweet as returned by ``get_input_data``.
            offsets: Per-token (start, end) character offsets into ``tweet``
                (tensor or sequence of pairs).

        Returns:
            ``(start_idx, end_idx)``: first and last token positions whose
            character range overlaps the first occurrence of the selection.

        Raises:
            IndexError: If the selection is empty, does not occur in the
                tweet, or no token overlaps it.
        """
        selected = " ".join(row.selected_text.lower().split())

        # First occurrence of the normalised selection, as a half-open
        # character range [char_start, char_end).
        char_start = tweet.find(selected) if selected else -1
        if char_start < 0:
            raise IndexError(
                f"selected_text {row.selected_text!r} not found in tweet {tweet!r}")
        char_end = char_start + len(selected)

        # Plain Python ints are cheaper to compare than 0-d tensors.
        pairs = offsets.tolist() if hasattr(offsets, "tolist") else offsets
        target_idx = [
            j for j, (tok_start, tok_end) in enumerate(pairs)
            if max(tok_start, char_start) < min(tok_end, char_end)
        ]
        if not target_idx:
            raise IndexError(
                f"no token overlaps selected_text {row.selected_text!r} "
                f"in tweet {tweet!r}")

        return target_idx[0], target_idx[-1]
        
def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    """Create the train / validation DataLoaders for one split of ``df``.

    Args:
        df: Full labelled DataFrame.
        train_idx: Positional row indices of the training rows.
        val_idx: Positional row indices of the validation rows.
        batch_size: Batch size used by both loaders.

    Returns:
        Dict with a shuffled, drop-last ``"train"`` loader and an ordered
        ``"val"`` loader.
    """
    train_rows, val_rows = df.iloc[train_idx], df.iloc[val_idx]
    make_loader = torch.utils.data.DataLoader

    return {
        "train": make_loader(
            TweetDataset(train_rows),
            batch_size=batch_size,
            shuffle=True,
            num_workers=2,
            drop_last=True),
        "val": make_loader(
            TweetDataset(val_rows),
            batch_size=batch_size,
            shuffle=False,
            num_workers=2),
    }

def get_test_loader(df, batch_size=32):
    """Wrap ``df`` in a TweetDataset and return an ordered DataLoader over it."""
    dataset = TweetDataset(df)
    return torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=2)

# Model

In [None]:
class TweetModel(nn.Module):
    """RoBERTa encoder with a start / end token classification head.

    The head takes a learned softmax-weighted mix of all hidden-state outputs
    of the encoder, applies dropout five times with independent masks,
    classifies each copy and averages the results.
    """

    def __init__(self):
        super().__init__()
        # TODO: Change with the fine-tuned roberta
        config = RobertaConfig.from_pretrained(
            f'{BASE_ROBERTA}/config.json', output_hidden_states=True)
        self.config = config
        self.roberta = RobertaModel.from_pretrained(
            f'{BASE_ROBERTA}/pytorch_model.bin', config=self.config)
        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)

        # The encoder returns one hidden state per layer plus the embedding
        # output, hence the +1. All weights start at -3 except the last one
        # (0), so after the softmax the final layer dominates the mix.
        n_weights = config.num_hidden_layers + 1
        weights_init = torch.zeros(n_weights).float()
        weights_init.data[:-1] = -3
        self.layer_weights = nn.Parameter(weights_init)

        # Not used in forward(); kept so the module's parameter set (and thus
        # its state_dict keys) stays the same as before.
        self.intermediate_classifier = nn.Linear(config.hidden_size, config.hidden_size // 2)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Calling self.roberta.init_weights() here was identified as a bug by
        # the original author, so it is intentionally not called.
        nn.init.normal_(self.classifier.weight, std=0.02)
        # NOTE(review): the second positional argument is the mean, so the
        # bias is drawn from N(0, 1); a zero init was presumably intended --
        # confirm before changing, since it alters training.
        nn.init.normal_(self.classifier.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)``, one score per input token.

        Args:
            input_ids: Token ids, passed straight to the encoder.
            attention_mask: Mask passed to the encoder alongside the ids.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Index instead of tuple-unpacking: position 2 holds the hidden states
        # both for a plain tuple return and for dict-like output objects,
        # where unpacking would yield the keys rather than the tensors.
        hidden_layers = outputs[2]

        # Stack the layers on a new trailing axis so the 1-D layer weights
        # broadcast over it.
        stacked = torch.stack(
            [self.dropout(layer) for layer in hidden_layers], dim=3)

        # Weighted mean.
        x = (torch.softmax(self.layer_weights, dim=0) * stacked).sum(-1)

        # Multi-sample dropout: average the classifier output over five
        # independently dropped-out copies of the features.
        x = torch.mean(torch.stack([
            self.classifier(self.high_dropout(x))
            for _ in range(5)
        ], dim=0), dim=0)

        # Unpacking into two names requires exactly two output labels.
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [None]:
# Build one model instance for the quick forward-pass check in the next cell.
# It is in training mode here (no .eval() call), so dropout is active.
model = TweetModel()

In [None]:
# Smoke test: push one hand-written, already tokenised example (batch of 1)
# through the model and print both logit vectors.
sample_ids = torch.tensor(
    [[0, 1313, 2, 2, 42, 16, 41, 6344, 3545, 328, 2]], dtype=torch.long)
sample_mask = torch.ones_like(sample_ids)  # every position is a real token

output = model(sample_ids, sample_mask)
start_pos_logit, end_pos_logit = output[0], output[1]
print(start_pos_logit)
print(end_pos_logit)

# Loss Function

In [None]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Cross-entropy of the start prediction plus that of the end prediction.

    Both terms use the default mean reduction over the batch.
    """
    cross_entropy = nn.functional.cross_entropy
    return (cross_entropy(start_logits, start_positions)
            + cross_entropy(end_logits, end_positions))

# Evaluation Function

In [None]:
def get_selected_text(text, start_idx, end_idx, offsets):
    """Rebuild the text covered by tokens ``start_idx``..``end_idx`` (inclusive).

    Each token contributes ``text[start:end]`` from its offset pair; a single
    space is appended whenever the next token's start lies beyond the current
    token's end. An empty range yields an empty string.
    """
    n_tokens = len(offsets)
    pieces = []
    for tok in range(start_idx, end_idx + 1):
        begin, finish = offsets[tok][0], offsets[tok][1]
        pieces.append(text[begin:finish])
        following = tok + 1
        if following < n_tokens and finish < offsets[following][0]:
            pieces.append(" ")
    return "".join(pieces)

def jaccard(str1, str2, smooth=0):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Args:
        str1, str2: Strings to compare; split on whitespace into word sets.
        smooth: Constant added to both numerator and denominator.

    Returns:
        ``(|A & B| + smooth) / (|A | B| + smooth)``. When that denominator is
        zero (both strings without words and ``smooth == 0``) the result is
        1.0, matching what any positive ``smooth`` gives for that input,
        instead of raising ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    shared = len(a & b)
    denominator = len(a) + len(b) - shared + smooth
    if denominator == 0:
        return 1.0
    return (float(shared) + smooth) / denominator

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    """Jaccard score between the true span and the argmax-predicted span.

    The predicted span runs from the argmax of ``start_logits`` to the argmax
    of ``end_logits``; if the start lands after the end, the whole ``text``
    is used as the prediction.
    """
    start_pred, end_pred = np.argmax(start_logits), np.argmax(end_logits)

    pred = (text if start_pred > end_pred
            else get_selected_text(text, start_pred, end_pred, offsets))
    true = get_selected_text(text, start_idx, end_idx, offsets)

    return jaccard(true, pred)

# Inference

In [None]:
# Load trained model folds and predict
# Folder with one checkpoint per fold, read below as
# roberta_fold{fold}_{seed}.pth.
MODEL_FOLDS_BASE_FOLDER = "../input/tweeter-stratification-no-fine-tuning-seed-4444/"

In [None]:
%%time


# TODO: Remove incorrect selected_text or complete it?
test_df = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
test_df['text'] = test_df['text'].astype(str)
test_loader = get_test_loader(test_df)

# Use the GPU when one is present, otherwise fall back to the CPU so the cell
# still runs (slowly) on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TODO: Make this work with many seeds later.
models = []
for fold in range(5):
    model = TweetModel()
    # map_location lets checkpoints saved from a GPU load on any device.
    state_dict = torch.load(
        f'{MODEL_FOLDS_BASE_FOLDER}/roberta_fold{fold+1}_{seed}.pth',
        map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    models.append(model)

predictions = []
# Number of tweets whose predicted start lies after the predicted end; for
# those the whole tweet is used as the prediction.
count = 0

with torch.no_grad():
    for data in test_loader:
        ids = data['ids'].to(device)
        masks = data['masks'].to(device)
        tweet = data['tweet']
        offsets = data['offsets'].numpy()

        fold_start_logits = []
        fold_end_logits = []
        for fold_model in models:
            output = fold_model(ids, masks)
            fold_start_logits.append(output[0].cpu().numpy())
            fold_end_logits.append(output[1].cpu().numpy())

        # Ensemble by averaging the raw logits across folds.
        start_logits = np.mean(np.array(fold_start_logits), axis=0)
        end_logits = np.mean(np.array(fold_end_logits), axis=0)

        for i in range(len(ids)):
            start_pred = np.argmax(start_logits[i])
            end_pred = np.argmax(end_logits[i])
            # TODO: Better post-processing?
            if start_pred > end_pred:
                count += 1
                pred = tweet[i]
            else:
                pred = get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
            predictions.append(pred)

In [None]:
# How many predictions fell back to the full tweet (start after end),
# then the total number of predictions as the cell output.
print(count)
len(predictions)

In [None]:
# Character length of every predicted span.
pred_length = list(map(len, predictions))

In [None]:
# Histogram of the prediction lengths themselves. Going through
# value_counts() first would plot how often each *frequency* occurs rather
# than how the lengths are distributed.
ax = pd.Series(pred_length).hist(bins=50)
ax.set_xlabel('prediction length (characters)')
ax.set_ylabel('number of predictions')
ax.set_title('Distribution of predicted span lengths');

# Submission

In [None]:
def _collapse_single_word_punctuation(text):
    """Shorten runs of '!' and '.' in predictions that consist of one word.

    Multi-word (or empty) predictions are returned unchanged. The replacements
    never touch whitespace, so checking the word count once is equivalent to
    re-checking it before every replacement.
    """
    if len(text.split()) != 1:
        return text
    # Applied strictly in this order; '..' is collapsed before '...', and the
    # result depends on that order.
    for old, new in (('!!!!', '!'), ('..', '.'), ('...', '.')):
        text = text.replace(old, new)
    return text


sub_df = pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')
sub_df['selected_text'] = predictions
sub_df['selected_text'] = sub_df['selected_text'].apply(_collapse_single_word_punctuation)
sub_df.to_csv('submission.csv', index=False)
sub_df.head()