In [None]:
import numpy as np
import pandas as pd
import os
import warnings
import random
import torch 
from torch import nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
import tokenizers
from transformers import RobertaModel, RobertaConfig
from tqdm.notebook import tqdm
import sys
import matplotlib.pyplot as plt
import re
import string

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silences every Python warning for the rest of the session (including
# deprecation warnings from the libraries imported above).
warnings.filterwarnings('ignore')

def seed_everything(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global RNG and PyTorch (CPU, plus all
    CUDA devices when CUDA is available), exports PYTHONHASHSEED, and puts
    cuDNN into deterministic mode.

    Args:
        seed_value: integer seed shared by all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # PYTHONHASHSEED is read at interpreter start-up, so this is inherited by
    # child processes rather than changing hashing in the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # Benchmark mode auto-tunes convolution algorithms and may select a
        # different one from run to run, which defeats the deterministic
        # setting above; it must be off for reproducible results.
        torch.backends.cudnn.benchmark = False

# Global seed: applied to all RNGs here and reused for the fold splitter below.
seed = 42
seed_everything(seed)

# NOTE(review): in the visible cells batch_size is only used as the row count
# for the .head() previews; get_test_loader has its own default of 32.
batch_size = 32
N = 10  # number of cross-validation folds

# Only skf.n_splits is read in the visible cells (to enumerate checkpoints);
# no split is performed in this notebook section.
skf = StratifiedKFold(n_splits=N, shuffle=True, random_state=seed)
NUM_WORKERS = 2  # DataLoader worker processes

# Pretrained RoBERTa assets: model config, weights, and BPE vocab / merges.
ROBERTA_PATH = '/kaggle/input/robertamodel0524/'
MODEL_CONFIG_PATH = ROBERTA_PATH+'roberta-base-config.json'
MODEL_PATH = ROBERTA_PATH+'roberta-base-pytorch_model.bin'
MODEL_VOCAB_PATH = ROBERTA_PATH+'roberta-base-vocab.json'
MODEL_VOCAB_MERGES_PATH = ROBERTA_PATH+'roberta-base-merges.txt'
# Directory the per-fold fine-tuned checkpoints (roberta_fold{k}.pth) are loaded from.
outdir = '/kaggle/input/roberta714kernel/'

test_file = '/kaggle/input/tweet-sentiment-extraction/test.csv'
submission_template = '/kaggle/input/tweet-sentiment-extraction/sample_submission.csv'

MAX_LEN = 96  # token sequence length each example is padded to
LINEAR_DROPOUT = 0.2  # dropout probability applied before the final linear layer

# Special-token ids inserted by hand when assembling model inputs
# (presumably RoBERTa's <s>, <pad> and </s> ids — confirm against the vocab file).
CLS_TOK = 0
PAD_TOK = 1
SEP_TOK = 2

In [None]:
class TweetDataset(torch.utils.data.Dataset):
  """Tokenised tweets (and, when labelled, target token spans) for the model.

  Each item is a dict with:
    ids      LongTensor: CLS, sentiment token(s), SEP, SEP, tweet tokens, SEP,
             then PAD_TOK up to `max_len`
    masks    LongTensor, 1 on every non-padding position and 0 on padding
    tweet    lower-cased, whitespace-normalised tweet with one leading space
    offsets  LongTensor of (start, end) character offsets into `tweet`, one
             pair per position; (0, 0) for every non-tweet position
  and, when `df` has a `selected_text` column, `start_idx` / `end_idx`: the
  first and last token positions overlapping the selected text.

  Args:
    df: DataFrame with `text` and `sentiment` columns (optionally
      `selected_text`).
    max_len: length every sequence is padded or truncated to.
  """

  def __init__(self, df, max_len=MAX_LEN):
    self.df = df
    self.max_len = max_len
    # Column membership test: targets are only produced for labelled frames.
    self.labeled = 'selected_text' in df
    self.tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file = MODEL_VOCAB_PATH, 
        merges_file = MODEL_VOCAB_MERGES_PATH, 
        lowercase=True,
        add_prefix_space=True)

  def __getitem__(self, index):
    """Return the model inputs (and targets, if labelled) for row `index`."""
    data = {}
    row = self.df.iloc[index]

    ids, masks, tweet, offsets = self.get_input_data(row)
    data['ids'] = ids
    data['masks'] = masks
    data['tweet'] = tweet
    data['offsets'] = offsets

    if self.labeled:
      start_idx, end_idx = self.get_target_idx(row, tweet, offsets)
      data['start_idx'] = start_idx
      data['end_idx'] = end_idx

    return data

  def __len__(self):
    return len(self.df)

  def get_input_data(self, row):
    """Encode one row into (ids, masks, tweet, offsets).

    The tweet part is truncated so that the whole sequence, including the
    closing SEP, fits in `max_len`; without this, over-long rows produce
    tensors of a different length from the rest and cannot be batched.
    """
    tweet = " " + " ".join(row.text.lower().split())
    encoding = self.tokenizer.encode(tweet)
    sentiment_id = self.tokenizer.encode(row.sentiment).ids

    prefix = [CLS_TOK] + sentiment_id + [SEP_TOK, SEP_TOK]
    # Room left for tweet tokens once the prefix and closing SEP are counted.
    room = max(self.max_len - len(prefix) - 1, 0)
    tweet_ids = list(encoding.ids[:room])
    tweet_offsets = list(encoding.offsets[:room])

    ids = prefix + tweet_ids + [SEP_TOK]
    # One (0, 0) per prefix token, so offsets stay aligned with ids even if
    # the sentiment were to encode to more than one token.
    offsets = [(0, 0)] * len(prefix) + tweet_offsets + [(0, 0)]

    pad_len = self.max_len - len(ids)
    if pad_len > 0:
      ids += [PAD_TOK] * pad_len
      offsets += [(0, 0)] * pad_len

    ids = torch.tensor(ids)
    masks = (ids != PAD_TOK).long()
    offsets = torch.tensor(offsets)

    return ids, masks, tweet, offsets

  def get_target_idx(self, row, tweet, offsets):
    """Return (start_idx, end_idx): first and last token positions whose
    character span overlaps the first occurrence of the selected text.

    Raises:
      ValueError: if the normalised `selected_text` is empty, does not occur
        in `tweet`, or overlaps no token (e.g. it was truncated away).
    """
    needle = " ".join(row.selected_text.lower().split())
    char_start = tweet.find(needle) if needle else -1
    if char_start < 0:
      raise ValueError(
          f"selected_text {row.selected_text!r} not found in tweet {tweet!r}")
    char_end = char_start + len(needle)  # exclusive

    # A token is part of the target when its [lo, hi) span shares at least
    # one character with [char_start, char_end); (0, 0) spans never do.
    target_idx = [
        j for j, (lo, hi) in enumerate(offsets)
        if max(int(lo), char_start) < min(int(hi), char_end)]
    if not target_idx:
      raise ValueError(
          f"selected_text {row.selected_text!r} overlaps no token of tweet {tweet!r}")

    return target_idx[0], target_idx[-1]

In [None]:
def get_test_loader(df, batch_size=32):
  """Wrap `df` in a TweetDataset and return an unshuffled DataLoader over it.

  Row order is preserved (shuffle=False); NUM_WORKERS worker processes are
  used for loading.
  """
  dataset = TweetDataset(df)
  return torch.utils.data.DataLoader(
      dataset,
      batch_size=batch_size,
      shuffle=False,
      num_workers=NUM_WORKERS)

In [None]:
class TweetModel(nn.Module):
  """RoBERTa encoder with a linear head scoring each token as span start / end.

  The head is applied to the element-wise mean of the encoder's last three
  hidden-state layers.
  """

  def __init__(self):
    super(TweetModel, self).__init__()

    # output_hidden_states=True makes the encoder return every layer's hidden
    # states, which forward() needs.
    config = RobertaConfig.from_pretrained(
        MODEL_CONFIG_PATH, output_hidden_states=True)
    self.roberta = RobertaModel.from_pretrained(
        MODEL_PATH, config=config)
    self.dropout = nn.Dropout(LINEAR_DROPOUT)
    self.fc = nn.Linear(config.hidden_size, 2)
    nn.init.normal_(self.fc.weight, std=0.02)
    # NOTE(review): the positional 0 is the *mean*, so the bias is drawn from
    # N(0, 1) rather than set to zero. Left unchanged so that seeded training
    # runs stay reproducible; confirm whether zero-init was intended.
    nn.init.normal_(self.fc.bias, 0)

  def forward(self, input_ids, attention_mask):
    """Return (start_logits, end_logits), each with the last dim of the
    head output removed (one score per token position).

    Args:
      input_ids: token ids passed to the encoder.
      attention_mask: mask passed to the encoder as `attention_mask`.
    """
    outputs = self.roberta(input_ids, attention_mask=attention_mask)
    # Index rather than tuple-unpack: the hidden states are the third entry
    # both for a plain tuple and for a dict-like output object, whereas
    # unpacking a dict-like output would yield its keys.
    hs = outputs[2]

    x = torch.stack([hs[-1], hs[-2], hs[-3]])
    x = torch.mean(x, 0)
    x = self.dropout(x)
    x = self.fc(x)
    start_logits, end_logits = x.split(1, dim=-1)
    start_logits = start_logits.squeeze(-1)
    end_logits = end_logits.squeeze(-1)

    return start_logits, end_logits

In [None]:
def get_selected_text(text, start_idx, end_idx, offsets):
  """Rebuild the substring covered by tokens start_idx..end_idx (inclusive).

  Each token contributes text[start:end] from its offset pair. A single space
  is appended after a token whenever the following token's start offset lies
  beyond this token's end offset, so the result can end with a space.
  Returns "" when start_idx > end_idx.
  """
  pieces = []
  last_pos = len(offsets) - 1
  for pos in range(start_idx, end_idx + 1):
    begin, stop = offsets[pos][0], offsets[pos][1]
    pieces.append(text[begin:stop])
    # Gap between this token and the next one -> restore the separator.
    if pos < last_pos and stop < offsets[pos + 1][0]:
      pieces.append(" ")
  return "".join(pieces)

def jaccard(str1, str2): 
  """Word-level Jaccard similarity of two strings (case-insensitive).

  Both strings are lower-cased and split on whitespace; the score is
  |A ∩ B| / |A ∪ B| over the resulting word sets.

  Returns:
    A float in [0, 1]. When neither string contains a word the union is
    empty; the two (empty) sets are identical, so 1.0 is returned instead of
    dividing by zero.
  """
  a = set(str1.lower().split())
  b = set(str2.lower().split())
  union_size = len(a | b)
  if union_size == 0:
    return 1.0
  return float(len(a & b)) / union_size

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
  """Jaccard score between the labelled span and the predicted span.

  The predicted start / end positions are the argmax of the respective
  scores. If the predicted start lies after the predicted end, the whole
  `text` is used as the prediction.
  """
  reference = get_selected_text(text, start_idx, end_idx, offsets)

  first, last = np.argmax(start_logits), np.argmax(end_logits)
  if first <= last:
    candidate = get_selected_text(text, first, last, offsets)
  else:
    # Inconsistent span: fall back to the full text.
    candidate = text

  return jaccard(reference, candidate)

In [None]:
%%time

# Ensemble inference over the test set.
# For every tweet, each loaded fold model proposes a span, one extra proposal
# comes from the averaged probabilities, and the most frequent proposal wins.
test_df = pd.read_csv(test_file)
# Force text to str so non-string cells (e.g. NaN from empty fields) do not
# break the `.lower()` call in the dataset.
test_df['text'] = test_df['text'].astype(str)
test_loader = get_test_loader(test_df)
predictions = []  # winning span per test row, in row order
max_votes = []    # number of votes the winning span received
models = []

print("loading models..")
# Loads roberta_fold3.pth .. roberta_fold{n_splits}.pth.
# NOTE(review): folds 1 and 2 are skipped — confirm this is intentional.
for fold in range(2, skf.n_splits):
    model = TweetModel()
    model.cuda()
    model.load_state_dict(torch.load(f'{outdir}roberta_fold{fold+1}.pth'))
    model.eval()
    print(f"load {outdir}roberta_fold{fold+1}.pth")
    models.append(model)
    
for data in tqdm(test_loader):
  ids = data['ids'].cuda()
  masks = data['masks'].cuda()
  tweet = data['tweet']
  offsets = data['offsets'].numpy()

  # Despite the names, these hold per-model softmax probabilities over token
  # positions (one array per model), not raw logits.
  start_logits = []
  end_logits = []
  for model in models:
    with torch.no_grad():
        output = model(ids, masks)
        start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
        end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

  # Average the probabilities across models.
  mean_start_logits = np.mean(start_logits, axis=0)
  mean_end_logits = np.mean(end_logits, axis=0)

  for i in range(len(ids)):
    prediction = []
    mean_pred = None
    # Span from the model-averaged probabilities; an inverted span
    # (start after end) falls back to the whole tweet.
    mean_start_pred = np.argmax(mean_start_logits[i])
    mean_end_pred = np.argmax(mean_end_logits[i])
    if mean_start_pred > mean_end_pred:
        mean_pred = tweet[i]
    else:
        mean_pred = get_selected_text(tweet[i], mean_start_pred, mean_end_pred, offsets[i])
    
    # One span per individual model, with the same fallback rule.
    for k in range(len(models)):
        start = np.argmax(start_logits[k][i])
        end = np.argmax(end_logits[k][i])
        if start > end:
            pred = tweet[i]
        else:
            pred = get_selected_text(tweet[i], start, end, offsets[i])
        prediction.append(pred)
    # Tally votes. mean_pred is inserted first, so on a tie the first-inserted
    # candidate wins (dicts iterate in insertion order) — the averaged
    # prediction acts as the tie-breaker.
    votes = {}
    votes[mean_pred] = 1
    for p in prediction:
        if p in votes:
            votes[p] += 1
        else:
            votes[p] = 1
    
    max_vote = max(votes.values())
    for v in votes:
        if votes[v] == max_vote:
            predictions.append(v)
            max_votes.append(votes[v])
            break

In [None]:
# Fill the submission template with the ensemble predictions.
sub_df = pd.read_csv(submission_template)
sub_df['selected_text'] = predictions

# Punctuation clean-up, applied only to single-word predictions.
# The three passes run in this exact order; since '..' is collapsed before
# '...', reordering them would change the output.
sub_df['selected_text'] = (
    sub_df['selected_text']
    .apply(lambda x: x.replace('!!!!', '!') if len(x.split())==1 else x)
    .apply(lambda x: x.replace('..', '.') if len(x.split())==1 else x)
    .apply(lambda x: x.replace('...', '.') if len(x.split())==1 else x)
)

sub_df.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission; batch_size (32) is reused here
# purely as a row count.
sub_df.head(batch_size)

In [None]:
# Diagnostic copy of the test set with the predicted span and its vote count.
# Reads from the shared `test_file` constant instead of repeating the path,
# so the inference cell and this cell cannot drift apart.
test_df_out = pd.read_csv(test_file)
test_df_out['selected_text'] = predictions

# Same punctuation clean-up as for the submission, applied only to
# single-word predictions. The pass order matters: '..' is collapsed before
# '...'.
test_df_out['selected_text'] = (
    test_df_out['selected_text']
    .apply(lambda x: x.replace('!!!!', '!') if len(x.split())==1 else x)
    .apply(lambda x: x.replace('..', '.') if len(x.split())==1 else x)
    .apply(lambda x: x.replace('...', '.') if len(x.split())==1 else x)
)

test_df_out['max_votes'] = max_votes
test_df_out.to_csv('test_pred.csv', index=False)

In [None]:
# Preview the first rows of the diagnostic frame; batch_size (32) is reused
# here purely as a row count.
test_df_out.head(batch_size)