In [None]:
import pandas as pd
import os
import torch
import statistics
import pytorch_lightning as pl
from torch import nn
import glob
import gc
import torch.nn.functional as F
from transformers import RobertaTokenizer, AutoModel, AutoTokenizer, ElectraTokenizer, AutoModelForQuestionAnswering, RobertaForQuestionAnswering
from pytorch_lightning.loggers import WandbLogger
from torch.utils.data import Dataset, DataLoader, random_split
from pytorch_lightning import Trainer
import re

# Checkpoints

In [None]:
def sort_ckpt(x):
    """Return the first ``<digit>.<digits>`` number found in ``x`` as a float.

    For the checkpoint paths listed below, that number is the score embedded
    in the file name (e.g. ``..._jaccard_0.737.ckpt`` -> ``0.737``).
    Raises ``IndexError`` when ``x`` contains no such number.
    """
    first_decimal = re.findall(r"[0-9]\.[0-9]*", x)[0]
    return float(first_decimal)


# Checkpoints stored under the "roberta-base-squad-2" directory.
_squad2_checkpoints = [
    "../input/tsecheckpoints/roberta-base-squad-2/roberta-base:v105/epoch_002_jaccard_0.737.ckpt",
    "../input/tsecheckpoints/roberta-base-squad-2/roberta-base:v106/epoch_002_jaccard_0.739.ckpt",
    "../input/tsecheckpoints/roberta-base-squad-2/roberta-base:v107/epoch_002_jaccard_0.744.ckpt",
    "../input/tsecheckpoints/roberta-base-squad-2/roberta-base:v108/epoch_001_jaccard_0.742.ckpt",
    "../input/tsecheckpoints/roberta-base-squad-2/roberta-base:v109/epoch_001_jaccard_0.742.ckpt",
]

# Checkpoints stored under the "roberta-base-lower-lr" directory.
_lower_lr_checkpoints = [
    "../input/tsecheckpoints/roberta-base-lower-lr/roberta-base:v110/epoch_002_jaccard_0.728.ckpt",
    "../input/tsecheckpoints/roberta-base-lower-lr/roberta-base:v111/epoch_001_jaccard_0.741.ckpt",
    "../input/tsecheckpoints/roberta-base-lower-lr/roberta-base:v112/epoch_001_jaccard_0.738.ckpt",
    "../input/tsecheckpoints/roberta-base-lower-lr/roberta-base:v113/epoch_001_jaccard_0.744.ckpt",
    "../input/tsecheckpoints/roberta-base-lower-lr/roberta-base:v114/epoch_002_jaccard_0.733.ckpt",
]

# Settings shared by the cells below; "roberta_base" is the ordered list of
# checkpoints that the inference loop iterates over.
hyperparameters = dict(
    batch_size=32,
    tokenizer_config=dict(
        return_tensors="pt",
        padding=True,
        max_length=512,
        truncation=True,
    ),
    roberta_base=_squad2_checkpoints + _lower_lr_checkpoints,
)

In [None]:
# Backbone export, left disabled in this notebook:
# model = RobertaForQuestionAnswering.from_pretrained("roberta-base")
# model.save_pretrained('./roberta/base/model')

# Load the tokenizer from the attached input directory and write a copy into
# the working directory. The module-level `tokenizer` is reused further down
# to decode the ensembled spans.
tokenizer = RobertaTokenizer.from_pretrained(
    '../input/tsepretrainedmodels/roberta/base/tokenizer',
    use_fast=True,
)
tokenizer.save_pretrained('./roberta/base/tokenizer')

# Dataset + Datamodule

In [None]:
class TSEDataset(Dataset):
    """Map-style dataset that serves each row of a table as a plain dict."""

    def __init__(self, dataset):
        # Kept by reference; rows are fetched positionally through ``.iloc``.
        self.dataset = dataset

    def __len__(self):
        """Number of rows in the wrapped table."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the row at position ``idx`` as a ``{column: value}`` dict."""
        return self.dataset.iloc[idx].to_dict()


class TSEDataModule(pl.LightningDataModule):
    """
    Inference-only DataModule for the tweet sentiment extraction CSV.

    It reads the CSV at ``data_dir``, adds a ``question`` column built from
    each row's ``sentiment`` and serves batches through ``test_dataloader``.
    Each batch is a ``(encodings, textIDs)`` pair, where ``encodings`` is the
    tokenizer output for the (question, text) pairs of the batch.

    Only the test path is implemented; this class defines no train or
    validation dataloaders.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
    """

    def __init__(
        self,
        data_dir: str = "../input/tweet-sentiment-extraction/test.csv",
        train_val_test_split: float = 0.8,
        batch_size: int = 32,
        val_batch_size: int = 32,
        num_workers: int = 0,
        max_length: int = 256,
        pin_memory: bool = False,
        k_folds: int = 5,
        current_fold: int = 0,
    ):
        """
        Args:
            data_dir: Path of the CSV file to read (a file, despite the name).
            train_val_test_split: Stored as ``self.train_val_split``; not used
                anywhere else in this class.
            batch_size: Stored; ``test_dataloader`` uses ``val_batch_size``.
            val_batch_size: Batch size of the test dataloader.
            num_workers: Forwarded to the ``DataLoader``.
            max_length: ``max_length`` handed to the tokenizer in ``collate_fn``.
            pin_memory: Forwarded to the ``DataLoader``.
            k_folds: Stored; not used anywhere else in this class.
            current_fold: Stored; not used anywhere else in this class.
        """
        super().__init__()

        self.current_fold = current_fold
        self.k_folds = k_folds
        self.data_dir = data_dir
        self.tokenizer = RobertaTokenizer.from_pretrained('../input/tsepretrainedmodels/roberta/base/tokenizer', use_fast=True)
        self.train_val_split = train_val_test_split
        self.batch_size = batch_size
        self.val_batch_size = val_batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.max_length = max_length
        self.data_test = None
        self.full_dataset = None

    @staticmethod
    def _preprocess_text(text):
        """Apply the tweet clean-up rules (URL masking plus a few literal fixes).

        NOTE(review): the previous ``prepare_data`` ran these rules on every
        row and then threw the result away, so the tokenizer has always been
        fed the raw ``text`` column. That behaviour is kept: nothing in this
        class calls this helper. It is retained only so the rules are not
        lost; confirm against the training pipeline before wiring it in.
        """
        text = re.sub(r"http\S+", "URL", text)
        text = re.sub(r"www\.[a-zA-Z].\S+", "URL", text)
        text = str(text).replace("ï¿½", "`")
        text = text.replace("ï", "")
        text = text.replace("Aam   these", "these")
        text = text.replace("Aam  these", "these")
        text = text.replace("d stinks", "stinks")
        text = text.replace("Wave looks interesting. ht", "Wave looks interesting.")
        return text

    def _load_dataset(self):
        """Read the CSV and return it wrapped in a ``TSEDataset``.

        Rows containing any missing value are dropped. The ``question`` column
        is the row's ``sentiment`` followed by ``"?"``.
        """
        csv = pd.read_csv(self.data_dir).dropna()
        # One vectorised column assignment instead of a per-row ``iterrows`` loop.
        csv = csv.assign(question=csv["sentiment"] + "?")
        return TSEDataset(csv)

    def prepare_data(self):
        """Load the CSV into ``self.full_dataset``."""
        self.full_dataset = self._load_dataset()

    def setup(self, stage = None):
        """Set ``self.data_test`` to the loaded dataset.

        If ``prepare_data`` has not populated ``self.full_dataset`` on this
        instance, the CSV is loaded here, so the dataloader never receives
        ``None`` as its dataset.
        """
        if self.full_dataset is None:
            self.full_dataset = self._load_dataset()
        self.data_test = self.full_dataset

    def test_dataloader(self):
        """Return the test ``DataLoader``; batches are built by ``collate_fn``."""
        return DataLoader(
            dataset=self.data_test,
            batch_size=self.val_batch_size,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            collate_fn=self.collate_fn,
        )

    def collate_fn(self, batch):
        """Tokenize one batch of row dicts.

        Returns:
            ``(encodings, text_ids)``: the tokenizer output for the batch's
            (question, text) pairs, padded and truncated to
            ``self.max_length``, and the batch's ``textID`` values.
        """
        collate = torch.utils.data.dataloader.default_collate(batch)
        encodings = self.tokenizer(
            collate["question"],
            collate["text"],
            truncation=True,
            return_tensors="pt",
            padding=True,
            max_length=self.max_length,
        )
        return encodings, collate["textID"]

# Model

In [None]:
class TSEModel(pl.LightningModule):
    """
    Inference wrapper around ``RobertaForQuestionAnswering``.

    ``test_step`` runs the backbone on each batch and appends, per example,
    the decoded span, the start/end logits and the input ids to
    ``self.outputs`` so they can be read back after ``trainer.test``.

    Only the test loop is implemented here; there is no training step,
    validation step or optimizer configuration.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html
    """

    def __init__(
        self,
    ):
        super().__init__()

        # Backbone and tokenizer are read from local directories.
        self.model = RobertaForQuestionAnswering.from_pretrained('../input/tsepretrainedmodels/roberta/base/model')
        self.tokenizer = RobertaTokenizer.from_pretrained('../input/tsepretrainedmodels/roberta/base/tokenizer', use_fast=True)
        # Per-example results accumulated by ``test_step``; all five lists grow
        # in lockstep, one entry per example.
        self.outputs = {"textID": [], "selected_text": [], "start": [], "end": [], "x": []}

    def forward(self, x):
        """Run the backbone on the keyword arguments in ``x``."""
        return self.model(**x)

    def step(self, batch):
        """Run the backbone once and derive the predicted span boundaries.

        Returns:
            ``(loss, start, end, start_logits, end_logits)`` where ``loss`` is
            the ``loss`` attribute of the backbone output, returned as-is, and
            ``start`` / ``end`` are the argmax of the corresponding logits
            along dimension 1.
        """
        # A single forward pass; every returned value is read from this output
        # instead of re-running the backbone for each one.
        outputs = self.forward(batch)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        start = start_logits.argmax(dim=1)
        end = end_logits.argmax(dim=1)
        return outputs.loss, start, end, start_logits, end_logits

    def convert_pred_response(self, x, start, end):
        """Decode ``input_ids[start : end + 1]`` of every example into a string.

        Special tokens are skipped. When ``end`` is smaller than ``start`` the
        slice is empty, so the decoded string for that example is empty too.
        """
        return [
            self.tokenizer.convert_tokens_to_string(
                self.tokenizer.convert_ids_to_tokens(
                    input_id[first : last + 1], skip_special_tokens=True
                )
            )
            for input_id, first, last in zip(x.input_ids, start, end)
        ]

    def test_step(self, batch, batch_idx: int):
        """Predict spans for one batch and append the results to ``self.outputs``."""
        x, text_ids = batch
        _, start, end, s_logit, e_logit = self.step(x)
        preds = self.convert_pred_response(x, start, end)
        self.outputs["textID"] += text_ids
        self.outputs["selected_text"] += preds
        # Keep the accumulated tensors on the CPU: they outlive this model, and
        # holding them on the accelerator would pin device memory between
        # checkpoints. ``list`` splits each tensor into one row per example.
        self.outputs["start"] += list(s_logit.detach().cpu())
        self.outputs["end"] += list(e_logit.detach().cpu())
        self.outputs["x"] += list(x.input_ids.detach().cpu())

# Inference

In [None]:
# IDs in CSV order; every per-example list below is indexed by this order.
examples_id = list(pd.read_csv("../input/tweet-sentiment-extraction/test.csv")["textID"])
# One inner list per example, filled with one entry per checkpoint.
targets = [[] for _ in range(len(examples_id))]
start=[[] for _ in range(len(examples_id))]
end= [[] for _ in range(len(examples_id))]
x= [[] for _ in range(len(examples_id))]

trainer = Trainer(gpus=1)
datamodule = TSEDataModule()

for model_checkpoints in hyperparameters['roberta_base']:
    # ``load_from_checkpoint`` is a classmethod: call it on the class rather
    # than building (and immediately discarding) a fully initialised model.
    model = TSEModel.load_from_checkpoint(model_checkpoints)
    trainer.test(model=model, datamodule=datamodule)
    # Results are matched to ``examples_id`` purely by position. The datamodule
    # drops rows with missing values while ``examples_id`` does not, so a
    # count mismatch would silently shift every prediction.
    assert len(model.outputs['selected_text']) == len(examples_id), (
        f"{model_checkpoints}: got {len(model.outputs['selected_text'])} predictions "
        f"for {len(examples_id)} test rows"
    )
    for index, target in enumerate(model.outputs['selected_text']):
        targets[index].append(target)
        start[index].append(model.outputs['start'][index])
        end[index].append(model.outputs['end'][index])
        x[index].append(model.outputs['x'][index])
    # Release the checkpoint before the next one is loaded.
    del model
    gc.collect()
    torch.cuda.empty_cache()

# Bagging Ensemble Inference

In [None]:
def _decode_ensembled_span(start_logits, end_logits, input_ids):
    """Average per-checkpoint logits and decode the resulting token span.

    ``start_logits`` / ``end_logits`` hold one logit tensor per checkpoint for
    a single example. The span runs from the argmax of the mean start logits
    to the argmax of the mean end logits, inclusive. Special tokens are
    skipped when decoding.
    """
    span_start = torch.stack(start_logits).mean(dim=0).argmax(dim=0)
    span_end = torch.stack(end_logits).mean(dim=0).argmax(dim=0)
    span_tokens = tokenizer.convert_ids_to_tokens(
        input_ids[span_start : span_end + 1], skip_special_tokens=True
    )
    return tokenizer.convert_tokens_to_string(span_tokens)


# One single-element list per example. The input ids are taken from the first
# checkpoint's entry for that example.
bagging_targets = [
    [_decode_ensembled_span(example_starts, example_ends, example_ids[0])]
    for example_starts, example_ends, example_ids in zip(start, end, x)
]

In [None]:
# Raw test rows, read again so the original tweet text is available next to
# the predictions.
test = pd.read_csv("../input/tweet-sentiment-extraction/test.csv")

# Submission table: one row per test ID, holding the single ensembled span
# stored for that example.
submission_columns = {
    'textID': examples_id,
    'selected_text': [spans[0] for spans in bagging_targets],
}
df = pd.DataFrame(data=submission_columns)

# Post-Processing

In [None]:
# Tweets of at most three whitespace-separated tokens are submitted verbatim.
# `df` and `test` are paired by position, so the mask and the replacement
# values are passed as plain arrays rather than index-aligned Series.
#
# The assignment goes through `df.loc` because the row objects yielded by
# `iterrows()` may be copies; writing to them is not guaranteed to reach `df`.
is_short = (test["text"].str.split().str.len() <= 3).to_numpy()
df.loc[is_short, "selected_text"] = test.loc[is_short, "text"].to_numpy()
df.to_csv('./submission.csv', index=False)