In this kernel we use the predefined (pretrained) RoBERTa model as-is, rather than training it on the given data.
This kernel is adapted from this [kernel](https://www.kaggle.com/abhishek/roberta-inference-5-folds).

# Imports


In [None]:
import numpy as np             # for algebric functions
import pandas as pd            # to handle dataframes
import os                      # to import files 
#!pip install transformers
import transformers            # Transformers (pytorch-transformers /pytorch-pretrained-bert) provides general-purpose architectures (BERT, RoBERTa,..)
import tokenizers              # A tokenizer is in charge of preparing the inputs for a model. 
import string                  
import torch                   # pytorch
import torch.nn as nn   
from torch.nn import functional as F
from tqdm import tqdm          # TQDM is a progress bar library
import re                      # regular expression
import json
import requests

# Packages used in this kernel:



* Transformers

  Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.

  See the [Transformers documentation](https://huggingface.co/transformers/index.html) for details.


* Tokenizers
  
  A tokenizer is in charge of preparing the inputs for a model. The library comprises tokenizers for all the models. Most of the tokenizers are available in two flavors: a full Python implementation and a “Fast” implementation based on the Rust library tokenizers.

  For more Info: [Tokenizers from transformers](https://huggingface.co/transformers/main_classes/tokenizer.html)

* Pytorch (torch)

  PyTorch is an open source machine learning library based on the Torch library, used for applications such as computer vision and natural language processing. It is primarily developed by Facebook's AI Research lab.

  For PyTorch Beginner Tutorial:
  * https://www.kaggle.com/anandsubbu007/pytorch-basics-tutorial-1 
  * https://www.kaggle.com/anandsubbu007/pytorch-autograd-tutorial-2 
  * https://www.kaggle.com/anandsubbu007/deep-nn-pytorch-tutorial-3 
  * https://www.kaggle.com/anandsubbu007/cnn-cifar10-pytorch-tutorial-4

  

For the RoBERTa vocabulary and merges file map (tokenizer source):
* https://huggingface.co/transformers/_modules/transformers/tokenization_roberta.html

For the RoBERTa pretrained model archive map (model source):
* https://huggingface.co/transformers/_modules/transformers/modeling_roberta.html

In [None]:
# Run-wide settings.
MAX_LEN = 192                   # length each encoded example is padded up to (see process_data)
TRAIN_BATCH_SIZE = 32           # not referenced by the inference code visible in this notebook
VALID_BATCH_SIZE = 8            # batch size handed to the test DataLoader
EPOCHS = 5                      # not referenced by the inference code visible in this notebook
ROBERTA_PATH = 'roberta-base'   # model identifier passed to RobertaModel.from_pretrained

# Downloading Vocab & Merge File from Roberta

In [None]:
# Look up the download locations of the roberta-base merges file, vocabulary
# file and model weights in the maps exposed by the installed transformers package.
# NOTE(review): `pretrained_vocab_files_map` and
# `modeling_roberta.ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP` look like attributes of
# older transformers releases — confirm against the version this notebook pins.
# Each chained `.get()` raises AttributeError if the first lookup returns None.
pre_voc_file = transformers.RobertaTokenizer.pretrained_vocab_files_map
merges_file  = pre_voc_file.get('merges_file').get('roberta-base')
vocab_file = pre_voc_file.get('vocab_file').get('roberta-base')
model_bin = transformers.modeling_roberta.ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.get('roberta-base')

In [None]:
#pre_voc_file

In [None]:
# Download the vocabulary, merges and model-weight files and store them in the
# working directory as vocab.json, merge.txt and model.bin.
json_f = requests.get(vocab_file)
txt_f = requests.get(merges_file)
mod_bin = requests.get(model_bin)

# Fail fast on HTTP errors so an error page is never saved as a vocabulary,
# merges or weight file.
for response in (json_f, txt_f, mod_bin):
    response.raise_for_status()

data = json_f.json()
# Saving the JSON vocabulary file.
with open('vocab.json', 'w') as f:
    json.dump(data, f)
# Saving the merges file; the context manager closes the handle deterministically.
with open('merge.txt', 'wb') as f:
    f.write(txt_f.content)
# NOTE(review): model.bin is written here but is not read by any cell visible in
# this notebook (the model is built via from_pretrained(ROBERTA_PATH)) — confirm
# whether this download is still needed.
with open('model.bin', 'wb') as f:
    f.write(mod_bin.content)

In [None]:
# Shared byte-level BPE tokenizer, built from the vocab.json / merge.txt files
# in the working directory; lower-cases input and adds a prefix space.
TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file="vocab.json",
    merges_file="merge.txt",
    lowercase=True,
    add_prefix_space=True,
)

In [None]:
class TweetModel(transformers.BertPreTrainedModel):
    """RoBERTa encoder followed by a linear head that scores span start and end.

    The encoder is loaded with ``RobertaModel.from_pretrained(ROBERTA_PATH)``.
    The head ``l0`` is freshly initialised (normal, std=0.02) and maps the
    concatenation of two hidden-state layers to two logits per position.

    Args:
        conf: model configuration. It must have ``output_hidden_states``
            enabled, because ``forward`` reads the per-layer hidden states,
            and must expose ``hidden_size``.
    """

    def __init__(self, conf):
        super().__init__(conf)
        self.roberta = transformers.RobertaModel.from_pretrained(ROBERTA_PATH, config=conf)
        self.drop_out = nn.Dropout(0.1)
        # The head consumes two concatenated hidden-state layers, so its input
        # width is twice the encoder width. Reading it from the config (768 for
        # roberta-base) keeps the head correct for other encoder sizes.
        self.l0 = nn.Linear(conf.hidden_size * 2, 2)
        torch.nn.init.normal_(self.l0.weight, std=0.02)

    def forward(self, ids, mask, token_type_ids):
        """Return ``(start_logits, end_logits)`` for the given encoded batch.

        Args:
            ids: input token ids.
            mask: attention mask, forwarded as ``attention_mask``.
            token_type_ids: segment ids, forwarded unchanged.

        Returns:
            Two tensors obtained by splitting the head output along its last
            dimension and squeezing that dimension away.
            (presumably each is (batch, seq_len) — TODO confirm)
        """
        outputs = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # The hidden states are the third element of the encoder output. Index
        # it positionally rather than unpacking exactly three values, so the
        # lookup does not depend on the output being a plain 3-tuple.
        hidden_states = outputs[2]

        # Concatenate the last two entries of the hidden states feature-wise.
        out = torch.cat((hidden_states[-1], hidden_states[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out)

        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

## Roberta Model

The RoBERTa model was proposed in RoBERTa: A Robustly Optimized BERT Pretraining Approach by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google’s BERT model released in 2018.

We are using the [RoBERTa model](https://huggingface.co/transformers/model_doc/roberta.html#robertamodel) in this kernel. For more models, [click here](https://huggingface.co/models).



In [None]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    """Encode one tweet and locate the tokens covering ``selected_text``.

    Both texts are whitespace-normalised and prefixed with a single space.
    The first occurrence of the selected text inside the tweet is marked at
    character level, and the tokens overlapping that span become the targets.

    Args:
        tweet: tweet text (converted with ``str``).
        selected_text: text expected to occur inside ``tweet`` (converted with ``str``).
        sentiment: one of ``'positive'``, ``'negative'``, ``'neutral'``;
            any other value raises ``KeyError``.
        tokenizer: object whose ``encode(text)`` result exposes ``ids`` and
            ``offsets`` lists (offsets as ``(start, end)`` character pairs).
        max_len: length the sequences are padded up to. Longer sequences are
            returned unpadded and untruncated.

    Returns:
        dict with keys ``ids``, ``mask``, ``token_type_ids``, ``targets_start``,
        ``targets_end``, ``orig_tweet``, ``orig_selected``, ``sentiment`` and
        ``offsets``. Target indices are positions in ``ids`` (that is, shifted
        by the four prefix tokens).

    If the selected text is empty, is not found in the tweet, or no token
    overlaps it, the targets fall back to the whole tweet span instead of
    raising ``IndexError``.
    """
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    # Search without the artificial leading space. An empty needle is treated
    # as "not found" rather than matching at position 0.
    needle = selected_text[1:]
    first_char = tweet.find(needle) if needle else -1

    char_targets = [0] * len(tweet)
    if first_char >= 0:
        char_targets[first_char:first_char + len(needle)] = [1] * len(needle)

    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets

    # Tokens whose character range touches at least one marked character.
    target_idx = [
        j for j, (offset1, offset2) in enumerate(tweet_offsets)
        if any(char_targets[offset1:offset2])
    ]

    if target_idx:
        targets_start = target_idx[0]
        targets_end = target_idx[-1]
    else:
        # Nothing matched: point at the whole tweet so callers still get
        # valid indices.
        targets_start = 0
        targets_end = max(len(input_ids_orig) - 1, 0)

    # presumably the vocabulary ids of the three sentiment words — verify
    # against vocab.json
    sentiment_id = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }

    # Layout: 0, sentiment id, 2, 2, tweet tokens, 2
    # (0 / 2 / 1 look like the <s> / </s> / <pad> ids — TODO confirm)
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    # Account for the four tokens placed in front of the tweet tokens.
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)

    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }


class TweetDataset:
    """Map-style dataset that encodes one tweet per index via ``process_data``.

    Holds three parallel, index-aligned sequences (tweet text, sentiment and
    selected text) together with the module-level ``TOKENIZER`` and ``MAX_LEN``.
    """

    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN

    def __len__(self):
        """Number of examples, taken from the tweet sequence."""
        return len(self.tweet)

    def __getitem__(self, item):
        """Encode example ``item``: numeric fields become long tensors, text stays as-is."""
        encoded = process_data(
            self.tweet[item],
            self.selected_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len,
        )

        def as_long(key):
            # Every numeric field is materialised as an int64 tensor.
            return torch.tensor(encoded[key], dtype=torch.long)

        return {
            'ids': as_long("ids"),
            'mask': as_long("mask"),
            'token_type_ids': as_long("token_type_ids"),
            'targets_start': as_long("targets_start"),
            'targets_end': as_long("targets_end"),
            'orig_tweet': encoded["orig_tweet"],
            'orig_selected': encoded["orig_selected"],
            'sentiment': encoded["sentiment"],
            'offsets': as_long("offsets"),
        }

# Evaluation Model

In [None]:
def _jaccard(str1, str2):
    """Return the word-level, case-insensitive Jaccard similarity of two strings.

    Returns 0.0 when neither string contains a word (avoids dividing by zero).
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.0
    shared = words1 & words2
    return len(shared) / (len(words1) + len(words2) - len(shared))


def calculate_jaccard_score(
    original_tweet, 
    target_string, 
    sentiment_val, 
    idx_start, 
    idx_end, 
    offsets,
    verbose=False):
    """Turn predicted token indices into text and score it against the target.

    Args:
        original_tweet: the tweet string the offsets refer to.
        target_string: reference text the extracted text is compared with.
        sentiment_val: sentiment label; ``"neutral"`` selects the whole tweet.
        idx_start: index of the first selected token.
        idx_end: index of the last selected token (inclusive). If it is smaller
            than ``idx_start`` it is clamped to ``idx_start``.
        offsets: per-token ``(start, end)`` character offsets into
            ``original_tweet``.
        verbose: when truthy, print non-neutral predictions that differ from
            the target (case-insensitive, stripped comparison).

    Returns:
        ``(jac, filtered_output)``: the word-level Jaccard similarity between
        the stripped target and the stripped extracted text, and the extracted
        text itself. Previously ``jac`` was always the constant 0.
    """
    if idx_end < idx_start:
        idx_end = idx_start

    pieces = []
    for ix in range(idx_start, idx_end + 1):
        start_char, end_char = offsets[ix][0], offsets[ix][1]
        pieces.append(original_tweet[start_char:end_char])
        # Re-insert a space when the next token does not start where this one ends.
        if (ix + 1) < len(offsets) and end_char < offsets[ix + 1][0]:
            pieces.append(" ")
    filtered_output = "".join(pieces)

    # Neutral tweets and tweets with fewer than two words are returned whole.
    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    if sentiment_val != "neutral" and verbose:
        if filtered_output.strip().lower() != target_string.strip().lower():
            print("********************************")
            print(f"Output= {filtered_output.strip()}")
            print(f"Target= {target_string.strip()}")
            print(f"Tweet= {original_tweet.strip()}")
            print("********************************")

    jac = _jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output


# Importing Test Data

In [None]:
# Load the test set and mirror the tweet text into a `selected_text` column,
# so the test rows carry the same three fields TweetDataset is constructed from.
df_test = pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
df_test["selected_text"] = df_test["text"].values

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Fetch the configuration for the same model identifier the model itself uses.
model_config = transformers.RobertaConfig.from_pretrained(ROBERTA_PATH)             # to download from internet
# TweetModel.forward reads the per-layer hidden states, so they must be returned.
model_config.output_hidden_states = True

In [None]:
# Construct a TweetDataset over the test frame. The instance is not bound to a
# name here; the same construction is repeated further down and kept as
# `test_dataset`.
TweetDataset(tweet=df_test.text.values,
             sentiment=df_test.sentiment.values,
             selected_text=df_test.selected_text.values)

In [None]:
# Build the model, move it to the selected device and switch it to eval mode.
# NOTE(review): no fine-tuned checkpoint is loaded anywhere in the visible code,
# so the `l0` head keeps its random initialisation — confirm this is intended.
model = TweetModel(conf=model_config)
model.to(device)
model.eval()
print("k")  # marker printed once the steps above have completed

In [None]:
# Collects one extracted text per test row; the prediction loop appends to it
# in DataLoader order (the loader is created with shuffle=False).
final_output = []

In [None]:
# Wrap the test frame in a dataset and iterate it in its original order,
# VALID_BATCH_SIZE rows at a time, loading in the main process.
test_dataset = TweetDataset(
    tweet=df_test["text"].values,
    sentiment=df_test["sentiment"].values,
    selected_text=df_test["selected_text"].values,
)

data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)


# Prediction

In [None]:
# Run the model over every test batch and append one extracted text per tweet
# to `final_output`. Gradients are disabled for the whole loop.
with torch.no_grad():
    tk0 = tqdm(data_loader, total=len(data_loader))
    for d in tk0:
        # Only the model inputs are moved to the device. The target indices in
        # the batch are not needed for prediction and stay where they are.
        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        offsets = d["offsets"].numpy()

        outputs_start, outputs_end = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

        # Highest-scoring start / end position for every row of the batch.
        best_start = np.argmax(outputs_start, axis=1)
        best_end = np.argmax(outputs_end, axis=1)

        for px, tweet in enumerate(orig_tweet):
            # The score is discarded; only the extracted text is kept.
            _, output_sentence = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=orig_selected[px],
                sentiment_val=sentiment[px],
                idx_start=best_start[px],
                idx_end=best_end[px],
                offsets=offsets[px],
            )
            final_output.append(output_sentence)


# Submission

In [None]:
# Write the predictions into the sample submission layout and save it.
sample = pd.read_csv("../input/tweet-sentiment-extraction/sample_submission.csv")
# One prediction per submission row is required; a mismatch usually means the
# prediction cell was run more than once or not at all.
if len(final_output) != len(sample):
    raise ValueError(
        f"expected {len(sample)} predictions, got {len(final_output)}"
    )
# Replace the column outright instead of writing through `.loc[:, ...]`, which
# sets values into the existing column and therefore depends on its current dtype.
sample['selected_text'] = final_output
sample.to_csv("submission.csv", index=False)

In [None]:
# Show the first rows of the submission frame as a quick visual check.
sample.head()