In [None]:
import numpy as np
import pandas as pd 
import re
import tqdm.notebook as tqdm
import transformers
from transformers import AutoTokenizer
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import os

In [None]:
# Disable the tokenizers library's internal parallelism for this process.
# NOTE(review): presumably this avoids the fork-related warning/deadlock when
# DataLoader worker processes are spawned later — confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
# Load the labelled training split and the unlabelled test split from the
# notebook's input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
    )
)

In [None]:
# Seed the RNGs so weight init of the classification head, DataLoader
# shuffling and dropout are repeatable across "Restart & Run All".
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is visible, otherwise fall back to CPU so the notebook
# still runs (slowly) on machines without CUDA instead of failing at model.to().
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fixed token length every tweet is padded/truncated to by the tokenizer.
MAX_LENGTH = 200
# Passed to the tokenizer as `stride`.
DOC_STRIDE = 128
# Mini-batch size used by every DataLoader.
BATCH_SIZE = 4

# Hugging Face model identifier used for both the tokenizer and the classifier.
MODEL_PATH = 'roberta-large'

# Number of full passes over the training split.
EPOCHS = 13

In [None]:
def clean_text(text):
    """Strip Twitter-specific noise from a single tweet string.

    The following are removed, in order: a leading ``RT`` marker with its
    trailing whitespace, http/https URLs, ``#`` characters (the hashtag word
    itself is kept) and ``@mention`` handles. Whitespace left behind by the
    removals is not collapsed.

    Args:
        text: The raw tweet text.

    Returns:
        The tweet with the patterns above deleted.
    """
    noise_patterns = (
        r'^RT[\s]+',
        r'https?://[^\s\n\r]+',
        r'#',
        r'@\w+',
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)
    return text

def preprocess_data(data_to_process):
    """Return a copy of the frame whose ``text`` column has been cleaned.

    The input frame is left untouched; every other column is carried over
    unchanged.

    Args:
        data_to_process: DataFrame with a ``text`` column of strings.

    Returns:
        A new DataFrame with ``clean_text`` applied to each ``text`` value.
    """
    cleaned_text = data_to_process['text'].apply(clean_text)
    return data_to_process.assign(text=cleaned_text)

In [None]:
# Single shared tokenizer matching MODEL_PATH; loaded once here and reused by
# the tokenisation helpers below.
TOKENIZER = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_PATH)

In [None]:
# Run the same cleaning pipeline over both splits so train and test text are
# normalised identically.
p_train_data, p_test_data = map(preprocess_data, (train_data, test_data))

In [None]:
# Display the first ten cleaned rows as a quick sanity check of the cleaning step.
p_train_data.head(10)

In [None]:
def _tokenize_texts(texts):
    """Encode each string in ``texts`` with the module-level ``TOKENIZER``.

    Every text is tokenised on its own, padded to ``MAX_LENGTH`` and truncated
    to ``MAX_LENGTH``.

    Args:
        texts: A list of strings.

    Returns:
        A dict with two parallel lists, ``'input_ids'`` and
        ``'attention_mask'``, holding one entry per input text.
    """
    encoded = {
        'input_ids': [],
        'attention_mask': [],
    }
    for text in tqdm.tqdm(texts, total=len(texts)):
        # NOTE(review): `stride` is forwarded as in the original call; it
        # presumably only matters when overflowing tokens are requested —
        # confirm whether it can be dropped.
        tokens = TOKENIZER(text=text,
                           padding='max_length',
                           max_length=MAX_LENGTH,
                           truncation='only_first',
                           stride=DOC_STRIDE)
        encoded['input_ids'].append(tokens['input_ids'])
        encoded['attention_mask'].append(tokens['attention_mask'])
    return encoded


def tokenize_train_data(datas):
    """Tokenise a labelled frame.

    Args:
        datas: DataFrame with a ``text`` column and a ``target`` column.

    Returns:
        A dict with parallel lists under ``'input_ids'``,
        ``'attention_mask'`` and ``'target'``, in row order.
    """
    tokenized_datas = _tokenize_texts(datas['text'].tolist())
    tokenized_datas['target'] = datas['target'].tolist()
    return tokenized_datas


def tokenize_test_data(datas):
    """Tokenise an unlabelled frame.

    Args:
        datas: DataFrame with a ``text`` column.

    Returns:
        A dict with parallel lists under ``'input_ids'`` and
        ``'attention_mask'``, in row order.
    """
    return _tokenize_texts(datas['text'].tolist())

In [None]:
class TweetTrainTensorDataset:
    """Map-style dataset over tokenised, labelled tweets.

    Wraps a dict of parallel lists (``'input_ids'``, ``'attention_mask'``,
    ``'target'``) and converts one row to tensors on each lookup.
    """

    def __init__(self, tokenized_data):
        # Keep the dict as-is; tensors are only created on access.
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data['input_ids'])

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict of tensors; ``target`` is float32."""
        columns = self.tokenized_data
        return {
            'input_ids': torch.tensor(columns['input_ids'][idx]),
            'attention_mask': torch.tensor(columns['attention_mask'][idx]),
            'target': torch.tensor(columns['target'][idx], dtype=torch.float32),
        }
    
class TweetTestTensorDataset:
    """Map-style dataset over tokenised, unlabelled tweets.

    Stores the ``'input_ids'`` and ``'attention_mask'`` lists of the given
    dict and converts one row to tensors on each lookup.
    """

    def __init__(self, tokenized_data):
        self.input_ids, self.attention_masks = (
            tokenized_data['input_ids'],
            tokenized_data['attention_mask'],
        )

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict with ``input_ids`` and ``attention_mask`` tensors."""
        return {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_masks[idx]),
        }

In [None]:
# Tokenise both cleaned frames once, up front, so the Dataset wrappers only
# have to index precomputed lists.
t_train_data = tokenize_train_data(datas=p_train_data)
t_test_data = tokenize_test_data(datas=p_test_data)

In [None]:
# Wrap the tokenised dicts in indexable datasets that yield tensors per row.
dataset = TweetTrainTensorDataset(tokenized_data=t_train_data)
test_dataset = TweetTestTensorDataset(tokenized_data=t_test_data)

In [None]:
# os.cpu_count() returns None when the CPU count cannot be determined, which
# is not a valid `num_workers`; fall back to 0 (load in the main process).
NUM_WORKERS = os.cpu_count() or 0

# 80/20 hold-out split of the labelled data with a fixed seed.
train_dataset, valid_dataset = train_test_split(dataset, train_size=0.8, shuffle=True, random_state=1)

# Only the training loader reshuffles; validation and test keep a fixed order
# so predictions line up with the input rows.
train_dl = DataLoader(train_dataset, BATCH_SIZE, pin_memory=True, shuffle=True, num_workers=NUM_WORKERS)
valid_dl = DataLoader(valid_dataset, BATCH_SIZE, pin_memory=True, num_workers=NUM_WORKERS)
test_dl = DataLoader(test_dataset, BATCH_SIZE)

In [None]:
def eval_fn(model, valid_dl, device):
    """Evaluate ``model`` on ``valid_dl`` without gradient tracking.

    Prints the mean batch loss and the F1 score computed over *all*
    validation predictions at once. Averaging per-batch F1 values, as was
    done previously, is not the dataset F1 and is inflated by batches that
    contain no positive labels.

    Args:
        model: Callable returning ``(preds, loss)`` for a batch dict, where
            ``preds`` is an array of 0/1 predictions and ``loss`` a scalar tensor.
        valid_dl: Sized iterable of batch dicts of tensors that include a
            ``'target'`` entry.
        device: Device every batch tensor is moved to.

    Returns:
        The sum of the batch losses as a tensor, or ``None`` when
        ``valid_dl`` yields no batches.
    """
    model.eval()
    losses = []
    loss_sum = None
    all_targets = []
    all_preds = []
    with torch.no_grad():
        for data in tqdm.tqdm(valid_dl, total=len(valid_dl)):
            data = {k: v.to(device) for k, v in data.items()}
            preds, loss = model(data)

            loss_sum = loss if loss_sum is None else loss_sum + loss
            losses.append(loss.item())

            # Flatten to plain int lists so labels and predictions from every
            # batch can be scored together after the loop.
            all_targets.extend(data['target'].cpu().numpy().reshape(-1).astype(int).tolist())
            all_preds.extend(np.asarray(preds).reshape(-1).astype(int).tolist())

    if losses:
        f1 = f1_score(all_targets, all_preds, zero_division=1)
        print(f'Val_Loss : {sum(losses) / len(losses):.5f}, F1 : {f1:.5f}')

    return loss_sum
    
def train_fn(model, train_dl, optimizer, device, scheduler = None):
    """Run one training epoch and return the per-batch losses.

    After each optimizer step a progress line is printed with the running
    mean loss and the running F1. The F1 is derived from true-positive,
    false-positive and false-negative counts accumulated over every batch
    seen so far, not from an average of per-batch F1 values (which is biased
    upward for small batches).

    Args:
        model: Module returning ``(preds, loss)`` for a batch dict, where
            ``preds`` is an array of 0/1 predictions and ``loss`` a scalar tensor.
        train_dl: Sized iterable of batch dicts of tensors that include a
            ``'target'`` entry.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.
        scheduler: Accepted for interface compatibility; not used here.

    Returns:
        List of float losses, one per batch.
    """
    model.train()
    losses = []
    true_pos = false_pos = false_neg = 0
    optimizer.zero_grad()
    for data in tqdm.tqdm(train_dl, total=len(train_dl)):
        data = {k: v.to(device) for k, v in data.items()}

        preds, loss = model(data)

        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        losses.append(loss.item())

        # Update the confusion counts with this batch.
        is_pos = data['target'].detach().cpu().numpy().reshape(-1) >= 0.5
        said_pos = np.asarray(preds).reshape(-1) >= 0.5
        true_pos += int(np.sum(is_pos & said_pos))
        false_pos += int(np.sum(~is_pos & said_pos))
        false_neg += int(np.sum(is_pos & ~said_pos))

        # F1 = 2TP / (2TP + FP + FN); report 1.0 while nothing positive has
        # been seen or predicted, matching the previous zero_division=1 choice.
        denom = 2 * true_pos + false_pos + false_neg
        running_f1 = 2 * true_pos / denom if denom else 1.0
        print(f'Loss : {sum(losses) / len(losses):.5f}, F1 : {running_f1:.5f}', end="\r")

    return losses

In [None]:
class TweetModel(nn.Module):
    """Binary tweet classifier built on ``RobertaForSequenceClassification``.

    The wrapped model is created with ``num_labels = 1``, so it emits a single
    logit per example; the positive-class probability is the sigmoid of that
    logit.
    """

    def __init__(self,roberta_path):
        """Load the pretrained classifier from ``roberta_path``."""
        super().__init__()

        # This loss applies the sigmoid internally, so it must be fed raw logits.
        self.loss_fn = nn.functional.binary_cross_entropy_with_logits

        self.roberta = transformers.RobertaForSequenceClassification.from_pretrained(roberta_path, num_labels = 1)
        self.sigmoid = torch.sigmoid

    def _raw_logits(self, data):
        """Run the wrapped model and return its first output (the logits)."""
        output = self.roberta(data['input_ids'], data['attention_mask'])
        return output[0]

    def _hard_predictions(self, logits):
        """Turn logits into a NumPy array of 0./1. labels (probability >= 0.5 -> 1)."""
        probs = self.sigmoid(logits).cpu().detach().numpy()
        return (probs >= 0.5).astype(probs.dtype)

    def forward(self, data):
        """Compute predictions and loss for a labelled batch.

        Args:
            data: Dict with ``'input_ids'``, ``'attention_mask'`` and
                ``'target'`` tensors.

        Returns:
            ``(preds, loss)``: ``preds`` is a NumPy array of 0./1. values with
            the same shape as the logits, ``loss`` is the scalar loss tensor.
        """
        logits = self._raw_logits(data)
        # The loss is computed on the raw logits. Passing already-sigmoided
        # values to a "*_with_logits" loss squashes them twice and distorts
        # both the loss value and its gradients.
        loss = self.loss_fn(logits, data['target'].reshape(-1,1))

        return self._hard_predictions(logits), loss

    def predict(self, data):
        """Return 0./1. predictions for an unlabelled batch.

        Uses the same ``>= 0.5`` threshold as ``forward`` so that training,
        validation and inference agree on the boundary case.
        """
        return self._hard_predictions(self._raw_logits(data))

In [None]:
def train_model(epochs, model, train_dl, valid_dl, optimizer, scheduler, device):
    """Train for ``epochs`` epochs, validating after each one.

    Each epoch runs ``train_fn`` on ``train_dl`` and then ``eval_fn`` on
    ``valid_dl``. The value returned by ``eval_fn`` is fed to
    ``scheduler.step`` and the mean training loss of the epoch is printed.

    Args:
        epochs: Number of epochs to run.
        model: Model passed through to ``train_fn`` / ``eval_fn``.
        train_dl: Training batches.
        valid_dl: Validation batches.
        optimizer: Optimizer passed to ``train_fn``.
        scheduler: Object whose ``step(metric)`` is called once per epoch with
            the validation result; may be ``None`` to disable scheduling.
        device: Device passed through to ``train_fn`` / ``eval_fn``.
    """
    for epoch in tqdm.tqdm(range(epochs)):
        # train_fn puts the model into training mode itself.
        losses = train_fn(model, train_dl, optimizer, device)
        val_losses = eval_fn(model, valid_dl, device)

        if scheduler is not None:
            scheduler.step(val_losses)

        # ".5f" (five decimals) matches the other log lines; the previous
        # "5f" was a field width, not a precision.
        print(f'EPOCH : {epoch}, Loss : {sum(losses) / len(losses):.5f}')

In [None]:
# Build the classifier from the pretrained checkpoint named by MODEL_PATH.
model = TweetModel(roberta_path=MODEL_PATH)

In [None]:
# NOTE(review): lr=1e-7 looks very small for fine-tuning, and the plateau
# scheduler below can only shrink it further — confirm this is intentional.
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=1e-7,
    eps=1e-6,
    weight_decay=0.001,
)
# Reduce the learning rate once the monitored validation loss has stopped
# decreasing for more than `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    mode='min',
    patience=2,
    verbose=True,
)

In [None]:
# Move the model to the configured device. Left as a bare expression so the
# notebook also displays the module structure.
model.to(DEVICE)

In [None]:
%%time
# Full training run; the %%time cell magic reports how long the cell took.
train_model(EPOCHS, model, train_dl, valid_dl, optimizer, scheduler, DEVICE)

In [None]:
def create_submission(model, test_data, test_dl, device):
    """Predict every test batch and assemble the submission columns.

    Args:
        model: Object exposing ``eval()`` and ``predict(batch)``; ``predict``
            returns an array of 0/1 values for the batch.
        test_data: DataFrame with an ``id`` column, in the same row order as
            the batches in ``test_dl``.
        test_dl: Sized iterable of batch dicts of tensors.
        device: Device every batch tensor is moved to.

    Returns:
        Dict with ``'id'`` (list of ids) and ``'target'`` (list of int
        predictions), suitable for ``pd.DataFrame``.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for data in tqdm.tqdm(test_dl, total=len(test_dl)):
            data = {k: v.to(device) for k, v in data.items()}
            preds = model.predict(data)

            # Flatten before converting: calling int() on each 1-element row
            # of a 2-D array relies on an array-to-scalar conversion that
            # NumPy has deprecated.
            predictions.extend(np.asarray(preds).reshape(-1).astype(int).tolist())

    submission = {'id' : test_data['id'].to_list(),
                 'target': predictions}

    return submission
            

In [None]:
# Predict the test split with the trained model; ids come from the raw test
# frame, which has the same row order as test_dl.
submission = create_submission(
    model=model,
    test_data=test_data,
    test_dl=test_dl,
    device=DEVICE,
)

In [None]:
# Write the predictions to disk, then read the file back so the frame shown
# below is exactly what was saved.
submission_path = 'submission.csv'
pd.DataFrame(submission).to_csv(submission_path, index=False)
df_submission = pd.read_csv(submission_path)
df_submission