In [2]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaModel, RobertaTokenizer, RobertaConfig
from tqdm import tqdm
import os

In [3]:
from torch import cuda
# Prefer the GPU whenever PyTorch reports one is available; otherwise use the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Location of the unlabelled test split inside the attached Kaggle input dataset.
TEST_CSV_PATH = "/kaggle/input/roberta-data-inference/test_data.csv"
df_test = pd.read_csv(TEST_CSV_PATH)

In [5]:
# Sanity check: (rows, columns) of the frame that was just loaded.
df_test.shape

(5724, 3)

In [6]:
# Peek at the first rows; the dataset class below reads the 'ID' and 'text' columns.
df_test.head()

Unnamed: 0,ID,article_link,text
0,27927,https://www.huffingtonpost.com/entry/teacher-e...,states slow to shut down weak teacher educatio...
1,1660,https://www.theonion.com/drone-places-fresh-ki...,drone places fresh kill on steps of white house
2,96,https://www.theonion.com/report-majority-of-in...,report: majority of instances of people gettin...
3,6237,https://local.theonion.com/sole-remaining-lung...,"sole remaining lung filled with rich, satisfyi..."
4,6650,https://www.huffingtonpost.com/entry/the-gops-...,the gop's stockholm syndrome


In [7]:
# Key settings for this inference-only notebook (no training happens here).
# MAX_LEN: token length handed to the tokenizer as max_length for every example.
MAX_LEN = 200
# TEST_BATCH_SIZE: number of examples per batch produced by the test DataLoader.
TEST_BATCH_SIZE = 1
# Tokenizer for the 'roberta-base' checkpoint; files are downloaded on first use.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [15]:
class CustomDataset(Dataset):
    """Tokenised, unlabelled view over a dataframe, used for inference.

    Args:
        dataframe: Frame providing a ``'text'`` column (raw text) and an
            ``'ID'`` column (identifier that is converted to a long tensor).
        tokenizer: Callable Hugging Face tokenizer; invoked once per example.
        max_len: Length every example is padded or truncated to.

    Each item is a dict with the keys ``'ids'`` (token ids), ``'mask'``
    (attention mask) and ``'ID'`` (the row identifier), all ``torch.long``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe['text']
        self.ids = dataframe['ID']
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Tokenise the example at *position* ``index`` and return its tensors."""
        # DataLoader samplers hand out positions 0..len-1, so look rows up by
        # position. Plain ``series[index]`` is label-based and fails (or picks
        # the wrong row) when the frame was filtered/shuffled without a reset.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse runs of whitespace (tabs, newlines, double spaces) to one space.
        comment_text = " ".join(comment_text.split())

        # Explicit padding/truncation guarantees every example is exactly
        # ``max_len`` tokens long, which default batch collation requires.
        inputs = self.tokenizer(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        row_id = self.ids.iloc[index]

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'ID': torch.tensor(row_id, dtype=torch.long)
        }

In [16]:
# Alias the loaded frame, report its shape, then wrap it in the tokenising dataset.
test_dataset = df_test
print(f"TEST Dataset: {test_dataset.shape}")

testing_set = CustomDataset(
    dataframe=test_dataset,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

TEST Dataset: (5724, 3)


In [17]:
# Loader settings: keep the original row order (no shuffling) and load in the
# main process (no worker subprocesses).
test_params = dict(
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

testing_loader = DataLoader(testing_set, **test_params)

In [18]:
class RoBERTaClass(torch.nn.Module):
    """RoBERTa encoder followed by dropout and a single-logit linear head.

    The sub-module attribute names (``l1``, ``l2``, ``l3``) are part of the
    state-dict keys, so they must stay as they are for saved weights to load.
    """

    def __init__(self):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained('roberta-base')
        self.l2 = torch.nn.Dropout(0.3)
        # 768 input features in, one raw logit out.
        self.l3 = torch.nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Return one raw (pre-sigmoid) logit per input sequence.

        Args:
            ids: Token-id tensor passed to the encoder.
            mask: Attention mask passed to the encoder as ``attention_mask``.
        """
        # No token-type ids are passed to the encoder.
        encoded = self.l1(ids, attention_mask=mask)
        # Use the hidden state at the first position of every sequence as the
        # sequence representation.
        first_token = encoded.last_hidden_state[:, 0]
        return self.l3(self.l2(first_token))

In [20]:
# Build the classifier, then move its parameters onto the selected device.
model = RoBERTaClass()
model = model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# map_location remaps the stored tensors onto the device chosen above, so a
# checkpoint that was saved on a GPU still loads when only the CPU is available.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
state_dict = torch.load(
    "/kaggle/input/roberta-data-inference/roberta_model.pth",
    map_location=device,
)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [24]:
# Accumulator for the submission: parallel lists of row IDs and predicted labels.
result_dict = {'ID': [], "label": []}

In [25]:
def validation():
    """Run the model over ``testing_loader`` and record hard predictions.

    Reads the module-level ``model``, ``testing_loader`` and ``device`` and
    fills the module-level ``result_dict`` in place:

    * ``result_dict['ID']``    - the ``'ID'`` value of every example, in
      loader order;
    * ``result_dict['label']`` - 1 where ``sigmoid(logit) >= 0.5``, else 0.

    Both lists are emptied first, so calling the function again replaces the
    previous results instead of appending duplicates. Batches of any size are
    supported.

    Returns:
        None.
    """
    model.eval()
    # Start from a clean slate so re-running the cell cannot duplicate rows.
    result_dict['ID'].clear()
    result_dict['label'].clear()
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            # Flatten the logits to one probability per example so the same
            # code handles a batch of one and a batch of many.
            probs = torch.sigmoid(model(ids, mask)).reshape(-1)
            # IDs are only bookkeeping; they never need to go to the device.
            result_dict['ID'].extend(data['ID'].reshape(-1).tolist())
            result_dict['label'].extend((probs >= 0.5).long().tolist())

In [26]:
# Run inference over the test loader; predictions are written into result_dict.
validation()

In [27]:
# Number of IDs collected by validation().
len(result_dict['ID'])

5724

In [28]:
# Number of predicted labels; should equal the number of IDs collected.
len(result_dict['label'])

5724

In [29]:
# Write the collected IDs and labels to the submission file, without the
# dataframe index column.
csv_file = "submission.csv"
df = pd.DataFrame.from_dict(result_dict)
df.to_csv(csv_file, index=False)

In [30]:
# Preview the first ten collected IDs.
result_dict['ID'][:10]

[27927, 1660, 96, 6237, 6650, 27135, 2718, 19300, 13658, 12840]

In [31]:
# Preview the first ten predicted labels (1 = sigmoid output >= 0.5, else 0).
result_dict['label'][:10]

[0, 1, 1, 1, 0, 0, 0, 1, 1, 1]