In [None]:
import numpy as np
import pandas as pd
import re
import os
import string
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
from transformers import ElectraTokenizer, ElectraForSequenceClassification,AdamW,get_linear_schedule_with_warmup
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import StratifiedKFold

In [None]:
class CFG:
    """Notebook-wide configuration constants."""

    # Directory that holds the competition CSV files read by this notebook.
    ROOT_DIR = '../input/nlp-getting-started'
    # Mini-batch size shared by the training, validation and inference loaders.
    BATCH_SIZE = 32
    # Hugging Face model id used for both the tokenizer and the classifier.
    ELECTRA_MODEL = 'google/electra-base-discriminator'
    # Upper bound on epochs per fold (early stopping may end a fold sooner).
    EPOCHS = 30
    # Device used by the inference loop; falls back to CPU so that
    # `model.to(device)` does not fail on a runtime without CUDA.
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
def preprocess(text):
    """Normalize a tweet for tokenization.

    Steps, in order: lowercase; strip URLs; decode a few HTML entities;
    strip @mentions; drop non-ASCII characters; remove most punctuation
    while keeping ``.``, ``!`` and ``?`` (runs of each are collapsed to a
    single character); remove apostrophes and parentheses; collapse
    whitespace.

    Args:
        text: Raw text as a ``str``.

    Returns:
        The cleaned, single-spaced, lowercase string.
    """
    text = text.lower()

    # Remove hyperlinks. Only the URL token itself is dropped (``\S+``), so
    # words that follow a link on the same line are preserved.
    text = re.sub(r'https?://\S+', '', text)

    # Decode HTML entities. `&amp;?` is a regex (the semicolon is optional),
    # so it must go through re.sub rather than str.replace.
    text = re.sub(r'&amp;?', 'and', text)
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')

    # Remove mentions.
    text = re.sub(r'@\w+', '', text)

    # Remove non-ASCII characters.
    text = text.encode('ascii', errors='ignore').decode()

    # Remove some punctuation (except . ! ?). The hyphen is escaped so that
    # `,-/` is not read as a character range, which would also match `.`.
    text = re.sub(r'[:"#$%&*+,\-/;<=>@\\^_`{|}~]+', '', text)

    # Collapse repeated sentence punctuation to a single character.
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\.+', '.', text)

    # Drop apostrophes and parentheses.
    text = re.sub(r"['()]", '', text)

    # Collapse every run of whitespace to one space and trim the ends.
    return ' '.join(text.split())

In [None]:
class LitDataNLP(pl.LightningDataModule):
    """Data module that tokenizes ``train.csv`` and serves one stratified CV fold.

    Args:
        fold: Zero-based index of the fold used for validation; the other
            folds form the training set.
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        data_dir: Directory that contains ``train.csv``.
        batch_size: Batch size of both dataloaders.
        max_length: Token length every text is padded/truncated to.
        n_splits: Number of stratified folds.
        seed: ``random_state`` of the fold splitter.
    """

    def __init__(self, fold, tokenizer, data_dir:str = './', batch_size: int = 32,
                 max_length: int = 64, n_splits: int = 5, seed: int = 42):
        super().__init__()
        self.fold = fold
        self.tokenizer = tokenizer
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.max_length = max_length
        self.n_splits = n_splits
        self.seed = seed

    def setup(self, stage=None):
        """Read ``train.csv``, tokenize it and store the tensors of ``self.fold``.

        Raises:
            ValueError: If ``self.fold`` is not in ``[0, n_splits - 1]``.
        """
        # Fail loudly instead of silently falling back to some other fold.
        if not 0 <= self.fold < self.n_splits:
            raise ValueError(
                f'fold must be in [0, {self.n_splits - 1}], got {self.fold}')

        train_df = pd.read_csv(os.path.join(self.data_dir, 'train.csv'))
        # Raw text is tokenized as-is; the `preprocess` helper is not applied.
        train_df = train_df.loc[train_df['text'] != '', ['text', 'target']]
        texts = train_df['text'].tolist()  # the tokenizer expects a list of str
        labels = train_df['target'].values

        encoded = self.tokenizer.batch_encode_plus(
            texts,
            max_length=self.max_length,
            add_special_tokens=True,
            return_attention_mask=True,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation=True,
        )
        input_ids = np.array(encoded['input_ids'])
        attention_masks = np.array(encoded['attention_mask'])

        # One split yields the row indices for ids, masks and labels alike,
        # so the three arrays cannot get out of sync.
        skf = StratifiedKFold(self.n_splits, shuffle=True, random_state=self.seed)
        tr_idx, val_idx = list(skf.split(input_ids, labels))[self.fold]

        self.train_inputs = torch.tensor(input_ids[tr_idx])
        self.validation_inputs = torch.tensor(input_ids[val_idx])
        self.train_labels = torch.tensor(labels[tr_idx], dtype=torch.long)
        self.validation_labels = torch.tensor(labels[val_idx], dtype=torch.long)
        self.train_masks = torch.tensor(attention_masks[tr_idx], dtype=torch.long)
        self.validation_masks = torch.tensor(attention_masks[val_idx], dtype=torch.long)

    def train_dataloader(self):
        """Return a randomly sampled loader of (input_ids, mask, label) batches."""
        train_data = TensorDataset(self.train_inputs, self.train_masks, self.train_labels)
        train_sampler = RandomSampler(train_data)
        return DataLoader(train_data, sampler=train_sampler, batch_size=self.batch_size)

    def val_dataloader(self):
        """Return a sequential loader of (input_ids, mask, label) batches."""
        validation_data = TensorDataset(self.validation_inputs, self.validation_masks, self.validation_labels)
        validation_sampler = SequentialSampler(validation_data)
        return DataLoader(validation_data, sampler=validation_sampler, batch_size=self.batch_size)

In [None]:
class LitNLPModel(pl.LightningModule):
    """ELECTRA sequence classifier with two labels, wrapped as a LightningModule.

    Args:
        lr: Initial learning rate handed to AdamW.
        steps_per_epoch: Optimizer steps per epoch used to size the linear
            learning-rate schedule (total steps = steps_per_epoch * CFG.EPOCHS).
    """

    def __init__(self, lr: float = 6e-6, steps_per_epoch: int = 189):
        super().__init__()
        self.lr = lr
        self.steps_per_epoch = steps_per_epoch
        self.model = ElectraForSequenceClassification.from_pretrained(CFG.ELECTRA_MODEL, num_labels=2)
        # NOTE(review): depending on the installed Lightning version, F1 may
        # default to micro averaging, which for single-label two-class input
        # equals accuracy rather than positive-class F1 — confirm.
        self.f1_score = pl.metrics.F1(num_classes=2)

    def forward(self, b_input_ids, b_input_mask, b_labels=None):
        """Run the wrapped model and return its output unchanged.

        The steps below read ``output[0]`` as the loss and ``output[1]`` as the
        logits when labels are passed. ``b_labels`` may be omitted or ``None``
        for inference.
        """
        output = self.model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
        return output

    def training_step(self, batch, batch_idx):
        """Return the training loss for one (input_ids, mask, labels) batch."""
        b_input_ids, b_input_mask, b_labels = batch[0], batch[1], batch[2]
        z = self(b_input_ids, b_input_mask, b_labels)
        loss = z[0]
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and ``val_f1_score`` for one batch; return the loss."""
        b_input_ids, b_input_mask, b_labels = batch[0], batch[1], batch[2]
        z = self(b_input_ids, b_input_mask, b_labels)
        val_loss = z[0]
        logits = z[1]
        self.log('val_loss', val_loss, prog_bar=True)
        self.log('val_f1_score', self.f1_score(logits, b_labels), prog_bar=True)
        return val_loss

    def configure_optimizers(self):
        """Create AdamW plus a linear-decay schedule without warmup."""
        # Optimize this module's own parameters; do not rely on a global
        # variable that happens to be called `model`.
        optimizer = AdamW(self.parameters(), lr=self.lr)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=self.steps_per_epoch * CFG.EPOCHS,
        )
        # The schedule length is expressed in optimizer steps, so the scheduler
        # has to be stepped after every batch rather than once per epoch.
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    def flat_accuracy(self, preds, labels):
        """Return the fraction of rows whose argmax over axis 1 equals the label.

        ``preds`` and ``labels`` are handled with NumPy operations.
        """
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
# Train one model per fold, keeping the checkpoint with the best val_f1_score.
# `tokenizer` and the last `model` instance are reused by later cells.
tokenizer = ElectraTokenizer.from_pretrained(CFG.ELECTRA_MODEL)
for fold in range(5):
    # Seed the Python/NumPy/PyTorch RNGs so that each fold's weight
    # initialisation and batch order are reproducible on Restart & Run All.
    pl.seed_everything(42)

    dm = LitDataNLP(fold=fold, tokenizer=tokenizer, data_dir=CFG.ROOT_DIR, batch_size=CFG.BATCH_SIZE)
    # NOTE(review): run_inference loads
    # ./lightning_logs/version_{fold}/checkpoints/model_best.ckpt, so this loop
    # presumably has to start from a working directory with no earlier
    # Lightning runs — verify before re-running in a dirty directory.
    chk_callback = ModelCheckpoint(
        monitor='val_f1_score',
        filename='model_best',
        save_top_k=1,
        mode='max',
    )
    es_callback = EarlyStopping(
       monitor='val_f1_score',
       min_delta=0.001,
       patience=5,
       verbose=False,
       mode='max'
    )
    model = LitNLPModel()

    trainer = pl.Trainer(
        gpus=1,
        max_epochs=CFG.EPOCHS,
        callbacks=[chk_callback, es_callback]
    )

    trainer.fit(model, dm)

## Submission

In [None]:
def run_inference(data_dir, model, device, batch_size:int = 32, max_length: int = 128,
                  checkpoint_paths=None):
    """Predict labels for ``test.csv`` by voting over per-fold checkpoints.

    Uses the module-level ``tokenizer`` to encode the texts.

    Args:
        data_dir: Directory that contains ``test.csv``.
        model: Module whose weights are overwritten from each checkpoint's
            ``'state_dict'`` entry; called as ``model(input_ids, mask, None)``.
        device: Device the model and batches are moved to.
        batch_size: Batch size of the test loader.
        max_length: Token length every text is padded/truncated to.
        checkpoint_paths: Checkpoint files to ensemble. Defaults to
            ``./lightning_logs/version_{0..4}/checkpoints/model_best.ckpt``.

    Returns:
        Float NumPy array with one value per test row: the rounded mean of the
        per-checkpoint argmax votes. With an even number of checkpoints, an
        exact tie (mean 0.5) rounds to 0.
    """
    if checkpoint_paths is None:
        checkpoint_paths = [
            f'./lightning_logs/version_{fold}/checkpoints/model_best.ckpt'
            for fold in range(5)
        ]

    test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    # Raw text is tokenized as-is; the `preprocess` helper is not applied.
    texts = test_df['text'].tolist()

    encoded = tokenizer.batch_encode_plus(
        texts,
        max_length=max_length,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,
    )

    # Sequential sampling keeps predictions aligned with the rows of test.csv.
    test_data = TensorDataset(torch.tensor(encoded['input_ids']),
                              torch.tensor(encoded['attention_mask']))
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    print('Predicting labels...')

    model.to(device)
    fold_votes = []
    for path in checkpoint_paths:
        # map_location lets a checkpoint written on GPU load on a CPU-only
        # runtime; load_state_dict then copies into the model's own device.
        checkpoint = torch.load(path, map_location='cpu')
        model.load_state_dict(checkpoint['state_dict'])
        model.eval()

        fold_logits = []
        for batch in tqdm(test_dataloader, total=len(test_dataloader)):
            b_input_ids, b_input_mask = (t.to(device) for t in batch)

            with torch.no_grad():
                outputs = model(b_input_ids, b_input_mask, None)

            # Without labels the first output element holds the logits.
            fold_logits.append(outputs[0].detach().cpu().numpy())

        fold_votes.append(np.argmax(np.concatenate(fold_logits, axis=0), axis=1))

    return np.round(np.mean(fold_votes, axis=0), 0)

In [None]:
# Ensemble the per-fold checkpoints on the test set. `model` is the instance
# left over from the training loop; run_inference reloads its weights from
# each checkpoint before predicting.
preds = run_inference(
    data_dir=CFG.ROOT_DIR,
    model=model,
    device=CFG.DEVICE,
    batch_size=CFG.BATCH_SIZE,
)

In [None]:
# Pair the sample file's ids with the ensembled predictions and write the CSV.
# The path is built from CFG.ROOT_DIR so that the data location is configured
# in one place only.
# NOTE(review): assumes sample_submission.csv lists ids in the same order as
# test.csv — confirm.
sample_sub = pd.read_csv(os.path.join(CFG.ROOT_DIR, 'sample_submission.csv'))
sub = pd.DataFrame({
    'id': sample_sub['id'].values.tolist(),
    'target': preds.astype(int),
})
sub.to_csv('submission.csv', index=False)
sub