## Full credit to the original author [@shoheiazuma](https://kaggle.com/shoheiazuma) of the original notebook. Big thanks also to [@abhishek](https://www.kaggle.com/abhishek) for his example notebooks and videos on how to build and run models on a single TPU and on multiple TPUs.

I did some tidying and reorganisation of the code to learn more about how to switch code between CPU/GPU/TPU. More improvements are possible as we go along, so please join me in simplifying the process of writing and running code on CPUs, GPUs and TPUs.

Please feel free to answer there as well as comment below.

#### Forked from https://www.kaggle.com/shoheiazuma/tweet-sentiment-roberta-pytorch

### This is the inference version of the notebook, the [training version can be found here](https://www.kaggle.com/neomatrix369/tse2020-roberta-pytorch-multi-tpu-10-skfd-1-2).

### Loading TPU models on GPU instances, after manually downgrading the model version from 4 to 3

#### Thanks to [@msmelguizo](https://www.kaggle.com/msmelguizo) for suggesting the solution, see this [discussion](https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/157695#886125). And thanks to [@cpmpml](https://www.kaggle.com/cpmpml) for leading me to the below solution.

In [None]:
%%bash
# Copy every saved checkpoint into the working directory and rewrite the
# "<name>/version" entry inside the checkpoint zip archive to 3, so that
# checkpoints written with serialization version 4 can be loaded here.
# Best-effort install; the loop below only needs cp, zip and unzip.
apt-get install -y xarchiver || true
SAVED_MODEL_PATH="/kaggle/input/tse2020-roberta-pytorch-multi-tpu-10-skfd-d"
NEW_MODEL_PATH="/kaggle/working"
# With nullglob an empty match expands to nothing, so the loop simply does not
# run when no checkpoint exists (instead of iterating over the literal pattern).
shopt -s nullglob
# Glob directly instead of parsing `ls` output, and quote every expansion so
# paths containing spaces or glob characters are not word-split.
for model_file in "$SAVED_MODEL_PATH"/*.pth
do
    just_filename=$(basename "${model_file%.*}")
    # Skip checkpoints that were already patched and moved on a previous run.
    if [[ ! -e "$NEW_MODEL_PATH/$just_filename.pth" ]]; then
        echo "Copying $model_file to $NEW_MODEL_PATH"
        cp "$model_file" /tmp
        # zip stores the entry under the relative path, so work from /tmp.
        cd /tmp/ || exit 1
        mkdir -p "$just_filename"
        echo 3 > "$just_filename/version"
        echo ""
        # -u replaces the existing "<name>/version" entry in the archive.
        zip -u "$just_filename.pth" "$just_filename/version"
        echo "Moving model file $just_filename.pth to $NEW_MODEL_PATH"
        mv "$just_filename.pth" "$NEW_MODEL_PATH/$just_filename.pth"
    fi
    echo "(After) Contents of '$just_filename/version' in the model file '$model_file'"
    unzip -p "$NEW_MODEL_PATH/$(basename "$model_file")" "$just_filename/version"
done

In [None]:
import warnings
warnings.filterwarnings("ignore")

import string
import re

import numpy as np
import pandas as pd
import os
import random
import torch 
from torch import nn
from torch.nn import functional as F
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
import tokenizers
import transformers
from transformers import RobertaModel, RobertaConfig

In [None]:
accelerator_device = "gpu"
def print_to_console(string_to_print, end='\n', flush=False):
    """Print a message using the mechanism that suits the active accelerator.

    Args:
        string_to_print: The message to emit.
        end: Line terminator handed to ``print``; ignored on the TPU path.
        flush: Whether to flush the output stream immediately.
    """
    if accelerator_device != "tpu":
        print(string_to_print, end=end, flush=flush)
        return
    # NOTE(review): `xm` is not defined in any cell visible here; presumably
    # torch_xla.core.xla_model has to be imported as `xm` before
    # accelerator_device is switched to "tpu" -- confirm.
    xm.master_print(string_to_print, flush=flush)

In [None]:
# Notebook-wide configuration.
# MAX_LEN, TRAIN_BATCH_SIZE, VALID_BATCH_SIZE and EPOCHS are not referenced by
# any other cell visible in this notebook (TweetDataset falls back to its own
# default max_len of 96) -- presumably carried over from the training version.
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
EPOCHS = 5
# Directory holding the pretrained RoBERTa files (vocab.json, merges.txt,
# config.json, pytorch_model.bin) that the cells below read.
ROBERTA_PATH = "/kaggle/input/roberta-base"
# Module-level byte-level BPE tokenizer; the visible cells do not use it
# because TweetDataset constructs its own instance.
TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file=f"{ROBERTA_PATH}/vocab.json", 
    merges_file=f"{ROBERTA_PATH}/merges.txt", 
    lowercase=True,
    add_prefix_space=True
)
# Earlier versions read the checkpoints straight from
# "/kaggle/input/tse2020-roberta-pytorch-multi-tpu-10-skfd-d"; the bash cell at
# the top now copies patched checkpoints into the working folder instead.
SAVED_MODEL_PATH="/kaggle/working/"
# Number of fold checkpoints (roberta_fold0.pth ... roberta_fold9.pth) to load.
NUM_OF_SAVED_MODELS = 10
# Logical CPU count reported by the OS; not referenced by the visible cells.
cpu_count = os.cpu_count()

# Seed

In [None]:
def seed_everything(seed_value):
    """Seed Python's `random`, NumPy and PyTorch, and export PYTHONHASHSEED.

    Args:
        seed_value: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed_value)

seed = 42
seed_everything(seed)

# Data Loader

In [None]:
class TweetDataset(torch.utils.data.Dataset):
    """Dataset that turns tweet rows into fixed-length RoBERTa inputs.

    Every item is a dict with ``ids``, ``masks``, ``tweet`` and ``offsets``;
    when the frame has a ``selected_text`` column the item also carries the
    ``start_idx`` / ``end_idx`` token positions of the labelled span.

    Args:
        df: Frame with ``text`` and ``sentiment`` columns (and optionally
            ``selected_text``).
        max_len: Exact length of every returned sequence; shorter inputs are
            padded with id 1 and longer inputs are truncated.
        tokenizer: Optional object exposing ``encode(text)`` whose result has
            ``ids`` and ``offsets``. When omitted, a byte-level BPE tokenizer
            is built from the roberta-base vocabulary files.
    """

    def __init__(self, df, max_len=96, tokenizer=None):
        self.df = df
        self.max_len = max_len
        self.labeled = 'selected_text' in df
        if tokenizer is None:
            tokenizer = tokenizers.ByteLevelBPETokenizer(
                vocab_file='../input/roberta-base/vocab.json', 
                merges_file='../input/roberta-base/merges.txt', 
                lowercase=True,
                add_prefix_space=True)
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        """Return the model inputs (and targets, when labelled) for one row."""
        data = {}
        row = self.df.iloc[index]
        
        ids, masks, tweet, offsets = self.get_input_data(row)
        data['ids'] = ids
        data['masks'] = masks
        data['tweet'] = tweet
        data['offsets'] = offsets
        
        if self.labeled:
            start_idx, end_idx = self.get_target_idx(row, tweet, offsets)
            data['start_idx'] = start_idx
            data['end_idx'] = end_idx
        
        return data

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)
    
    def get_input_data(self, row):
        """Encode one row as ``[0] sentiment [2, 2] tweet [2]`` plus padding.

        Returns:
            Tuple ``(ids, masks, tweet, offsets)``: ``ids`` and ``masks`` are
            tensors of length ``max_len``, ``tweet`` is the lower-cased,
            whitespace-normalised text with a leading space, and ``offsets``
            holds one ``(start, end)`` character span per token, ``(0, 0)``
            for every non-tweet position.
        """
        tweet = " " + " ".join(row.text.lower().split())
        encoding = self.tokenizer.encode(tweet)
        sentiment_id = self.tokenizer.encode(row.sentiment).ids
        ids = [0] + sentiment_id + [2, 2] + encoding.ids + [2]
        # One dummy offset per prefix token: the leading 0, the sentiment
        # token(s) and the two separators. Deriving the count keeps ids and
        # offsets aligned even if the sentiment encodes to several tokens.
        prefix_len = 3 + len(sentiment_id)
        offsets = [(0, 0)] * prefix_len + list(encoding.offsets) + [(0, 0)]

        # Over-long inputs would yield tensors of differing lengths, which
        # cannot be stacked into a batch; cut them and keep the closing token.
        if len(ids) > self.max_len:
            ids = ids[:self.max_len - 1] + [2]
            offsets = offsets[:self.max_len - 1] + [(0, 0)]
                
        pad_len = self.max_len - len(ids)
        if pad_len > 0:
            ids += [1] * pad_len
            offsets += [(0, 0)] * pad_len
        
        ids = torch.tensor(ids)
        # Attend to everything except the padding id 1.
        masks = torch.where(ids != 1, torch.tensor(1), torch.tensor(0))
        offsets = torch.tensor(offsets)
        
        return ids, masks, tweet, offsets
        
    def get_target_idx(self, row, tweet, offsets):
        """Locate the first and last token overlapping ``row.selected_text``.

        Returns:
            Tuple ``(start_idx, end_idx)`` of token positions in ``offsets``.

        Raises:
            IndexError: If the selected text is empty or no token overlaps it.
        """
        selected_text = " " +  " ".join(row.selected_text.lower().split())

        len_st = len(selected_text) - 1
        idx0 = None
        idx1 = None

        # Try every position whose character matches the first character of
        # the selected text and stop at the first full match.
        for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
            if " " + tweet[ind: ind+len_st] == selected_text:
                idx0 = ind
                idx1 = ind + len_st - 1
                break

        # Per-character mask of the matched span.
        char_targets = [0] * len(tweet)
        if idx0 is not None and idx1 is not None:
            for ct in range(idx0, idx1 + 1):
                char_targets[ct] = 1

        # A token is a target when its character span touches the mask.
        target_idx = []
        for j, (offset1, offset2) in enumerate(offsets):
            if sum(char_targets[offset1: offset2]) > 0:
                target_idx.append(j)

        start_idx = target_idx[0]
        end_idx = target_idx[-1]
        
        return start_idx, end_idx
        
def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    """Build the training and validation loaders for one fold.

    Args:
        df: Full frame to split.
        train_idx: Positional row indices of the training part.
        val_idx: Positional row indices of the validation part.
        batch_size: Batch size shared by both loaders.

    Returns:
        Dict with a shuffled, drop-last ``"train"`` loader and an ordered
        ``"val"`` loader.
    """
    train_rows, val_rows = df.iloc[train_idx], df.iloc[val_idx]
    make_loader = torch.utils.data.DataLoader

    return {
        "train": make_loader(
            TweetDataset(train_rows),
            batch_size=batch_size,
            shuffle=True,
            num_workers=2,
            drop_last=True),
        "val": make_loader(
            TweetDataset(val_rows),
            batch_size=batch_size,
            shuffle=False,
            num_workers=2),
    }

def get_test_loader(df, batch_size=32):
    """Wrap ``df`` in a TweetDataset and return an ordered loader over it.

    Args:
        df: Frame of rows to predict on.
        batch_size: Number of rows per batch.
    """
    dataset = TweetDataset(df)
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2)

# Model

In [None]:
class TweetModel(nn.Module):
    """RoBERTa encoder with a linear head producing span start/end logits."""

    def __init__(self):
        super().__init__()
        
        # Hidden states of every layer are needed by forward().
        config = RobertaConfig.from_pretrained(
            f'{ROBERTA_PATH}/config.json', output_hidden_states=True)    
        config.output_hidden_states = True
        self.roberta = RobertaModel.from_pretrained(
            f'{ROBERTA_PATH}/pytorch_model.bin', config=config)

        self.dropout = nn.Dropout(0.5)
        # Two outputs per token: start logit and end logit.
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        # NOTE(review): the second positional argument is the mean, so the
        # bias is drawn with mean 0 and the default std; a constant-zero
        # init may have been intended -- left unchanged.
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return per-token ``(start_logits, end_logits)``.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            Two tensors with the trailing size-1 dimension squeezed away.
        """
        outputs = self.roberta(input_ids, attention_mask)
        # Index instead of tuple-unpacking: position 2 holds the per-layer
        # hidden states both for the plain tuple returned by older
        # transformers releases and for the ModelOutput object returned by
        # newer ones (unpacking a ModelOutput would iterate over its keys).
        hs = outputs[2]
         
        # Average the last three layers before the classification head.
        x = torch.stack([hs[-1], hs[-2], hs[-3]])
        x = torch.mean(x, 0)
        x = self.dropout(x)
        x = self.fc(x)
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
                
        return start_logits, end_logits

# Evaluation Function

In [None]:
def get_selected_text(text, start_idx, end_idx, offsets):
    """Rebuild the substring covered by tokens ``start_idx..end_idx``.

    Args:
        text: The tweet the offsets refer to.
        start_idx: Index of the first token to include.
        end_idx: Index of the last token to include (inclusive).
        offsets: Per-token ``(start, end)`` character spans into ``text``.

    Returns:
        The concatenated token texts, with a single space appended after any
        token that is followed by a gap before the next token's start.
    """
    pieces = []
    token_count = len(offsets)
    for pos in range(start_idx, end_idx + 1):
        begin, finish = offsets[pos][0], offsets[pos][1]
        pieces.append(text[begin:finish])
        following = pos + 1
        if following < token_count and finish < offsets[following][0]:
            pieces.append(" ")
    return "".join(pieces)

def jaccard(str1, str2): 
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        ``|A & B| / |A | B|`` over the sets of whitespace-separated,
        lower-cased words. When neither string contains a word the two sets
        are identical, so 1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty or whitespace-only.
        return 1.0
    return float(len(a & b)) / union_size

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    """Score one prediction against its labelled span with word-level Jaccard.

    Args:
        text: The tweet the offsets refer to.
        start_idx: Labelled first token index.
        end_idx: Labelled last token index.
        start_logits: Per-token scores for the span start.
        end_logits: Per-token scores for the span end.
        offsets: Per-token ``(start, end)`` character spans into ``text``.

    Returns:
        Jaccard similarity between the labelled and the predicted text.
    """
    best_start, best_end = np.argmax(start_logits), np.argmax(end_logits)
    # An inverted span is not usable; fall back to the whole tweet.
    pred = (
        text
        if best_start > best_end
        else get_selected_text(text, best_start, best_end, offsets)
    )
    true = get_selected_text(text, start_idx, end_idx, offsets)
    return jaccard(true, pred)

In [None]:
%env JOBLIB_TEMP_FOLDER=/tmp
%env JOBLIB_START_METHOD="forkserver"  ### commented out helped, usually its set to stop Parallel from hanging or going idle
%env TMPDIR=/tmp

In [None]:
# Use the GPU when one is present and fall back to the CPU otherwise, so this
# cell does not hard-code a CUDA device on sessions without an accelerator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Stand-alone RoBERTa config with hidden states enabled. No other cell visible
# in this notebook reads it; TweetModel builds its own config.
model_config = transformers.RobertaConfig.from_pretrained(ROBERTA_PATH)
model_config.output_hidden_states = True

In [None]:
%%time
# One TweetModel per saved fold, keyed by fold index.
models = {}

for index in range(NUM_OF_SAVED_MODELS):
    model_filename=f"{SAVED_MODEL_PATH}/roberta_fold{index}.pth"
    print(f"Loading model {index} from {model_filename}")
    models[index] = TweetModel()
    models[index].to(device)
    # map_location remaps the stored tensors onto the inference device, so a
    # checkpoint written on another accelerator still loads here.
    # torch.load unpickles the file: only load checkpoints you trust.
    state_dict = torch.load(model_filename, map_location=device)
    models[index].load_state_dict(state_dict)
    # Inference only: switch dropout off.
    models[index].eval()

In [None]:
%%time
# Predict the selected text for every test tweet with the fold ensemble.
test_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')
# Cast to str so missing tweets do not break the string handling downstream.
test_df['text'] = test_df['text'].astype(str)
test_loader = get_test_loader(test_df)
predictions = []

for data in test_loader:
    print('Reading test data via the test loader...')
    # Move the batch to the device the models live on instead of
    # hard-coding CUDA.
    ids = data['ids'].to(device)
    masks = data['masks'].to(device)
    tweet = data['tweet']
    offsets = data['offsets'].numpy()

    # Despite the names, these collect softmax probabilities per model.
    start_logits = []
    end_logits = []
    # No gradients are needed for inference; one context covers all models.
    with torch.no_grad():
        for model in models:
            print(f'Processing model {model}...')
            output = models[model](ids, masks)
            start_logits.append(torch.softmax(output[0], dim=1).cpu().numpy())
            end_logits.append(torch.softmax(output[1], dim=1).cpu().numpy())
    
    print('Preparing predictions...')
    # Average the per-model probabilities across the ensemble.
    start_logits = np.mean(start_logits, axis=0)
    end_logits = np.mean(end_logits, axis=0)
    for i in range(len(ids)):    
        start_pred = np.argmax(start_logits[i])
        end_pred = np.argmax(end_logits[i])
        if start_pred > end_pred:
            # Inverted span: fall back to the whole tweet.
            pred = tweet[i]
        else:
            pred = get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
        predictions.append(pred)

In [None]:
# Attach the predictions to a fresh copy of the test set and write the
# two-column submission file.
sample = pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
sample['selected_text'] = predictions
sample.loc[:, ['textID', 'selected_text']].to_csv("submission.csv", index=False)

In [None]:
# Show the first rows of `sample` as a quick visual check of the result.
sample.head()

In [None]:
# Force-delete every *.pth entry in the current working directory.
# NOTE(review): presumably this removes the patched checkpoints that the first
# cell moved into /kaggle/working so they are not kept as notebook output --
# confirm the working directory.
!rm -fr *.pth