In [None]:
# Install sacrebleu, fairseq and fastBPE from the local wheel files shipped in
# the /kaggle/input/fairseq-and-fastbpe dataset (versions are pinned by the
# wheel filenames; the cp37 tags mean these wheels target CPython 3.7).
# NOTE(review): sacrebleu is presumably installed first because fairseq
# depends on it — confirm before reordering.
!pip install /kaggle/input/fairseq-and-fastbpe/sacrebleu-1.4.9-py3-none-any.whl
!pip install /kaggle/input/fairseq-and-fastbpe/fairseq-0.9.0-cp37-cp37m-linux_x86_64.whl
!pip install /kaggle/input/fairseq-and-fastbpe/fastBPE-0.1.0-cp37-cp37m-linux_x86_64.whl

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm

import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler

from sklearn import model_selection
from sklearn import metrics
import transformers
import tokenizers

import argparse
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary

from joblib import Parallel, delayed
import warnings
warnings.filterwarnings("ignore")

In [None]:
# --- Configuration -----------------------------------------------------------
# NOTE(review): LEARNING_RATE, TRAIN_BATCH_SIZE and EPOCHS are not referenced
# by any cell visible in this notebook; presumably carried over from the
# training notebook — confirm before removing.
LEARNING_RATE = 6e-5
# Fixed length every encoded example is padded to (read by TweetDataset).
MAX_LEN = 126
TRAIN_BATCH_SIZE = 35
# Batch size used by the inference DataLoader.
VALID_BATCH_SIZE = 32
EPOCHS = 3
# Root of the attached Kaggle datasets; keep the trailing slash because paths
# below are built by plain string concatenation.
INPUT_PATH = "/kaggle/input/"
MODEL_INPUT_PATH = f"{INPUT_PATH}bertweet-model/"

# fairseq's fastBPE wrapper takes an argparse-style namespace carrying the
# path to the BPE merge codes.
key = argparse.Namespace(bpe_codes= f"{MODEL_INPUT_PATH}BERTweet_base_transformers/bpe.codes")
bpe = fastBPE(key)

# Load the token -> id dictionary that matches the BPE codes above.
vocab = Dictionary()
vocab.add_from_file(f"{MODEL_INPUT_PATH}BERTweet_base_transformers/dict.txt")

In [None]:
class TweetDataset:
    """Map-style dataset that turns (tweet, sentiment, selected_text) rows into model inputs.

    Each item is encoded with the module-level ``bpe`` encoder and ``vocab``
    dictionary as ``<s> tweet </s> </s> sentiment [sentiment] </s>`` and padded
    to ``MAX_LEN`` positions.

    Args:
        tweets: iterable of raw tweet texts.
        sentiments: iterable of sentiment labels, aligned with ``tweets``.
        selected_texts: iterable of target substrings, aligned with ``tweets``.
    """

    def __init__(self, tweets, sentiments, selected_texts):
        # Collapse every run of whitespace to one space and prefix a single
        # space; the decoding code downstream receives these normalised strings.
        self.tweets = [' '+' '.join(str(tweet).split()) for tweet in tweets]
        self.sentiments = [' '+' '.join(str(sentiment).split()) for sentiment in sentiments]
        self.selected_texts = [' '+' '.join(str(selected_text).split()) for selected_text in selected_texts]
        self.max_len = MAX_LEN

    def __len__(self):
        """Return the number of examples."""
        return len(self.tweets)

    @staticmethod
    def _encode(bpe_text):
        """Map a space-separated BPE string to a list of vocabulary ids."""
        return vocab.encode_line(bpe_text, append_eos=False, add_if_not_exist=False).long().tolist()

    def __getitem__(self, item):
        """Encode example ``item``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors of
            length ``max_len``), scalar long tensors ``targets_start`` /
            ``targets_end`` (start index and one-past-the-end index of the last
            occurrence of the selected text in the token sequence, or 0/0 when
            it is not found), plus the normalised ``orig_tweet``,
            ``orig_selected`` and ``sentiment`` strings.
        """
        enc_tweet = self._encode('<s> ' + bpe.encode(self.tweets[item]) + ' </s>')

        bpe_sentiment = bpe.encode(self.sentiments[item])
        # NOTE(review): sentiments are stored with a leading space in __init__,
        # so this comparison is always true and the sentiment is always
        # duplicated. Left as-is because the fine-tuned checkpoints presumably
        # saw the same input layout — confirm against the training code before
        # changing it.
        if self.sentiments[item] != 'neutral':
            e_sentiment = '</s> ' + bpe_sentiment + " " + bpe_sentiment + ' </s>'
        else:
            e_sentiment = '</s> ' + bpe_sentiment + ' </s>'
        enc_sentiment = self._encode(e_sentiment)

        # Previously an over-long example produced a negative padding length
        # and therefore ids/mask longer than token_type_ids (and than the other
        # samples), which breaks batching. Trim tokens from the end of the
        # tweet (keeping its closing </s>) so the sentiment segment survives.
        overflow = len(enc_tweet) + len(enc_sentiment) - self.max_len
        if overflow > 0:
            keep = max(len(enc_tweet) - 1 - overflow, 1)
            enc_tweet = enc_tweet[:keep] + enc_tweet[-1:]
        # Hard cap as a last resort if max_len is smaller than the sentiment segment.
        enc_tweet_sentiment = (enc_tweet + enc_sentiment)[:self.max_len]

        padding_len = self.max_len - len(enc_tweet_sentiment)
        # Id 0 is used as filler; the attention mask zeroes those positions out.
        input_ids = enc_tweet_sentiment + ([0] * padding_len)
        attention_mask = ([1] * len(enc_tweet_sentiment)) + ([0] * padding_len)
        token_type_ids = [0] * self.max_len

        enc_selected_text_ids = self._encode(bpe.encode(self.selected_texts[item]))

        start_index, end_index = 0, 0
        # Guard against an empty target (previously an IndexError on [0]).
        if enc_selected_text_ids:
            span_len = len(enc_selected_text_ids)
            # Scan every alignment; as before, the last exact match wins.
            for j in range(len(enc_tweet_sentiment) - span_len + 1):
                if enc_tweet_sentiment[j:j + span_len] == enc_selected_text_ids:
                    start_index = j
                    end_index = j + span_len

        return {
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets_start': torch.tensor(start_index, dtype=torch.long),
            'targets_end': torch.tensor(end_index, dtype=torch.long),
            'orig_tweet': self.tweets[item],
            'orig_selected': self.selected_texts[item],
            'sentiment': self.sentiments[item]
        }

In [None]:
class TweetModel(transformers.BertPreTrainedModel):
    """RoBERTa encoder with a linear head producing start/end span logits.

    The head consumes the concatenation of the last two hidden-state layers,
    so ``conf.output_hidden_states`` must be enabled.

    Args:
        conf: transformers configuration used for both this wrapper and the
            underlying ``RobertaModel``.
    """

    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.roberta = transformers.RobertaModel.from_pretrained(
            "/kaggle/input/bertweet-model/BERTweet_base_transformers/model.bin",
            config=conf
        )
        self.drop_out = nn.Dropout(0.1)
        # Not used in forward(); kept so the module's attributes are unchanged.
        self.activation = nn.LeakyReLU()
        # Two concatenated hidden layers -> 2 logits (start, end). The width is
        # taken from the config instead of a hard-coded 768.
        self.l0 = nn.Linear(conf.hidden_size * 2, 2)

        torch.nn.init.normal_(self.l0.weight, std=0.02)

    def forward(self, ids, mask, token_type_ids):
        """Return ``(start_logits, end_logits)`` for a batch of token ids.

        Args:
            ids: token id tensor.
            mask: attention mask tensor aligned with ``ids``.
            token_type_ids: segment id tensor aligned with ``ids``.

        Raises:
            ValueError: if the encoder did not return hidden states.
        """
        outputs = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # Index instead of unpacking exactly three values: this also works when
        # the encoder returns additional outputs after the hidden states.
        if len(outputs) < 3:
            raise ValueError(
                "RobertaModel returned no hidden states; "
                "set config.output_hidden_states = True"
            )
        hidden_states = outputs[2]

        # Concatenate the final two layers along the feature dimension.
        out = torch.cat((hidden_states[-1], hidden_states[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out)

        # Split the size-2 last dimension into separate start / end logits.
        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [None]:
# INFERENCE
# Read the competition test set. TweetDataset requires a selected_text for
# every row, so the full tweet text is copied into that column as a stand-in.
df_test = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv").assign(
    selected_text=lambda frame: frame.text.values
)

In [None]:
# Build the encoder configuration from the BERTweet config file.
_config_path = f"{MODEL_INPUT_PATH}BERTweet_base_transformers/config.json"
model_config = transformers.RobertaConfig.from_pretrained(_config_path)
# TweetModel.forward reads the per-layer hidden states, so they must be returned.
model_config.output_hidden_states = True

In [None]:
# Wrap the test frame's columns in the dataset (tweets, sentiments, selected_texts).
test_dataset = TweetDataset(
    df_test.text.values,
    df_test.sentiment.values,
    df_test.selected_text.values,
)

# Keep the original row order (no shuffling) so predictions line up with the
# rows of the submission file.
data_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=1,
)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Number of fine-tuned checkpoints (model_0.bin .. model_7.bin) in the ensemble.
N_FOLDS = 8


def load_fold_model(fold):
    """Build a TweetModel, load checkpoint ``model_{fold}.bin`` and return it in eval mode.

    Args:
        fold: zero-based checkpoint index.

    Returns:
        The model, moved to ``device`` and switched to evaluation mode.
    """
    model = TweetModel(conf=model_config)
    model.to(device)
    # map_location lets checkpoints saved from a GPU run load on a CPU-only runtime.
    state_dict = torch.load(
        f"{INPUT_PATH}twitroberta/model_{fold}.bin",
        map_location=device,
    )
    model.load_state_dict(state_dict)
    model.eval()
    return model


# Individual names are kept because the inference cell refers to model1..model8.
(model1, model2, model3, model4,
 model5, model6, model7, model8) = [load_fold_model(fold) for fold in range(N_FOLDS)]

In [None]:
import string
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
# Built once instead of once per tweet.
punctuation_marks = set(string.punctuation)

# The eight fine-tuned models created in the loading cell.
ensemble = (model1, model2, model3, model4, model5, model6, model7, model8)


def _average_logits(models, ids, mask, token_type_ids):
    """Run every model on the batch and return the mean start and end logits."""
    start_sum, end_sum = None, None
    for member in models:
        start_logits, end_logits = member(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        start_sum = start_logits if start_sum is None else start_sum + start_logits
        end_sum = end_logits if end_sum is None else end_sum + end_logits
    return start_sum / len(models), end_sum / len(models)


def _extract_selected_text(tweet, start_idx, end_idx):
    """Decode the predicted token span of ``tweet`` back into text.

    Falls back to the whole tweet when the span is inverted or decodes to
    nothing. A trailing punctuation mark or English stop word is dropped.
    """
    if start_idx > end_idx:
        return tweet

    # Re-encode the tweet exactly as the dataset does so indices line up.
    bpe_tweet = '<s> ' + bpe.encode(tweet) + ' </s>'
    token_ids = vocab.encode_line(bpe_tweet, append_eos=False, add_if_not_exist=False).long().tolist()
    span_ids = token_ids[start_idx:end_idx + 1]

    text = bpe.decode(" ".join([vocab[i] for i in span_ids]))
    text = text.replace("<s>", "").replace("</s>", "").replace("<unk>", "")

    # The span can decode to an empty string (only special tokens, or indices
    # past the end of the tweet); guard before looking at the last word, which
    # previously raised IndexError.
    words = text.split()
    if words and (words[-1] in punctuation_marks or words[-1] in stop_words):
        text = " ".join(words[:-1])

    if text.strip() == "":
        return tweet
    return text


final_output = []
with torch.no_grad():
    for batch in tqdm(data_loader, total=len(data_loader)):
        ids = batch["ids"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)

        mean_start, mean_end = _average_logits(ensemble, ids, mask, token_type_ids)

        start_probs = torch.softmax(mean_start, dim=1).cpu().numpy()
        end_probs = torch.softmax(mean_end, dim=1).cpu().numpy()

        for row, tweet in enumerate(batch["orig_tweet"]):
            start_idx = int(np.argmax(start_probs[row]))
            end_idx = int(np.argmax(end_probs[row]))
            final_output.append(_extract_selected_text(tweet, start_idx, end_idx))

In [None]:
# Fill the sample submission's selected_text column with the predictions
# (same row order as the test set) and write the submission file.
a = pd.read_csv(
    "/kaggle/input/tweet-sentiment-extraction/sample_submission.csv"
).assign(selected_text=final_output)
a.to_csv("submission.csv", index=False)

In [None]:
# Display 20 randomly sampled rows of the submission frame as a spot check.
a.sample(20)