In [None]:
import pandas as pd
import glob
import json
import re
import plotly.express as exp
import os
import numpy as np
from tqdm.autonotebook import tqdm

In [None]:
import transformers
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
import gc

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import Callback

In [None]:
# Chunking and training constants shared by the cells below.
WINDOW_LENGTH = 512  # length of each slice taken by create_chunks
STRIDE_LENGTH = 256  # step between consecutive slice starts in create_chunks
SEQ_LEN = 30  # max_length passed to the tokenizer in CustomDataset.__getitem__
EPOCHS=10  # NOTE(review): not referenced in the visible cells; the Trainer hard-codes max_epochs=15
FOLDS=5  # number of KFold splits in the training loop
BATCH_SIZE=128  # batch size of both the training and validation DataLoader
PRED_LENGTH=20  # NOTE(review): not referenced in the visible cells

In [None]:
def jaccard(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated tokens of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting token sets.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains a token the
        two (empty) token sets are identical, so ``1.0`` is returned instead
        of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty or whitespace-only: avoid ZeroDivisionError.
        return 1.0
    return float(len(a & b)) / union_size

In [None]:
def clean_text(txt):
    """Normalise ``txt`` to lower-case ASCII alphanumerics separated by single spaces.

    The value is converted with ``str()``, lower-cased, every run of characters
    outside ``A-Za-z0-9`` is replaced by one space, and the result is stripped.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [None]:
# Load train.csv from the competition input directory.
# NOTE(review): `train` is rebound to a parquet frame in a later cell, so this frame
# is only used to derive `train_files` in the next cell.
train = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/train.csv")

In [None]:
# Path of the `train` directory in the competition input, and the `Id` column of
# train.csv as a NumPy array.
# NOTE(review): neither name is referenced in the visible cells.
base_path = "../input/coleridgeinitiative-show-us-the-data/train"
train_files = train.Id.to_numpy()

In [None]:
# Load sample_submission.csv from the competition input directory.
# NOTE(review): `submission` is not referenced in the visible cells.
submission = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/sample_submission.csv")

In [None]:
def mask_if_present(s,l):
    """Return the positions of ``l`` covered by any occurrence of the sub-sequence ``s``.

    Every start offset of ``l`` is tested for an exact match of ``s``; the
    indices spanned by each match (overlapping matches included) are collected
    without duplicates.

    Args:
        s: Sequence to look for.
        l: Sequence to search in.

    Returns:
        A list of unique integer indices into ``l``; empty when ``s`` never
        occurs (or is empty). The order comes from a ``set`` and is not
        guaranteed to be sorted.
    """
    width = len(s)
    covered = set()
    for start in range(len(l) - width + 1):
        stop = start + width
        if s == l[start:stop]:
            covered.update(range(start, stop))
    return list(covered)

In [None]:
# extra_dsets = pd.read_csv("../input/bigger-govt-dataset-list/data_set_26897.csv")
ext_2 = pd.read_csv("../input/bigger-govt-dataset-list/data_set_800.csv")

# Pool of replacement titles used by CustomDataset for label-swap augmentation:
# unique titles from the 800-row file that have between 4 and 7 whitespace-separated
# words. (The larger 26897-row file above is disabled and contributes nothing.)
filtered = np.array(
    list({title for title in ext_2.title if 3 < len(title.split()) < 8})
)

In [None]:
class CustomDataset(Dataset):
    """Dataset of (text, label string) pairs that yields token ids and a per-token 0/1 mask.

    Relies on the module-level ``tokenizer``, ``filtered``, ``SEQ_LEN`` and
    ``mask_if_present``.
    """

    def __init__(self, x_train, y_train,train=True):
        """Store the samples.

        Args:
            x_train: Indexable collection of texts.
            y_train: Indexable collection of label strings aligned with ``x_train``.
            train: When true, labels may be swapped for a random title (augmentation).
        """
        self.pre_x = x_train
        self.pre_y = y_train
        # Probability of applying the label swap to an eligible training sample.
        self.pro = 0.9
        self.train=train

    def __len__(self):
        """Number of samples."""
        return len(self.pre_x)

    def __getitem__(self, index):
        """Tokenise sample ``index`` and mark the tokens belonging to its label.

        Returns:
            ``{"inputs": {"input_ids", "attention_mask"}, "label"}`` where the
            inputs are long tensors and ``label`` is a float tensor with one
            entry per input token (1 where the label's tokens occur, else 0).
        """
        text = self.pre_x[index]
        target = self.pre_y[index]

        # Augmentation: replace the label inside the text by a random title.
        # The random draw happens before the `self.train` check, as written here.
        swap = len(target)>1 and np.random.binomial(True,self.pro) and self.train
        if swap:
            replacement = np.random.choice(filtered)
            text = text.replace(target,replacement)
            target = replacement

        enc_text = tokenizer(text,max_length=SEQ_LEN,padding="max_length",truncation=True)
        enc_target = tokenizer(target)
        token_ids = enc_text['input_ids']

        lab = torch.zeros(len(token_ids))
        # Drop the first and last id of the encoded label before matching
        # (presumably the special start/end tokens — confirm for this tokenizer).
        hits = mask_if_present(enc_target['input_ids'][1:-1],token_ids)
        if hits:
            lab[hits] = 1

        inputs = {
            "input_ids":torch.tensor(token_ids,dtype=torch.long),
            "attention_mask":torch.tensor(enc_text["attention_mask"],dtype=torch.long),
        }
        return {"inputs":inputs, "label": lab.float()}

In [None]:
def create_chunks(text,label,window_length=None,stride_length=None):
    """Split ``text`` into overlapping windows and tag each with ``label`` if it contains it.

    Windows of ``window_length`` items are taken every ``stride_length`` items
    while a full window ends strictly before the end of ``text``; whatever
    remains from the last start offset is appended as a final chunk.

    Args:
        text: String to split.
        label: String searched for in every chunk.
        window_length: Window size; defaults to the module constant ``WINDOW_LENGTH``.
        stride_length: Step between window starts; defaults to ``STRIDE_LENGTH``.

    Returns:
        ``(chunks, chunk_labels)``: two lists of equal length, where
        ``chunk_labels[i]`` is ``label`` when it occurs in ``chunks[i]`` and
        ``""`` otherwise.

    Raises:
        ValueError: If the stride is not positive (the loop would never end).
    """
    window = WINDOW_LENGTH if window_length is None else window_length
    stride = STRIDE_LENGTH if stride_length is None else stride_length
    if stride <= 0:
        raise ValueError("stride_length must be a positive integer")

    chunks = []
    start = 0
    while start + window < len(text):
        chunks.append(text[start:start + window])
        start += stride
    # Tail chunk: everything from the last start offset to the end of the text.
    chunks.append(text[start:])

    chunk_labels = [label if label in chunk else "" for chunk in chunks]
    return chunks, chunk_labels

In [None]:
# Tokenizer loaded from a local model directory (the same path DatasetFinder loads its
# encoder from). Used as a module-level global by CustomDataset.__getitem__.
tokenizer = transformers.AutoTokenizer.from_pretrained("../input/scibert-huggingface/coleridge-scibert-models/output")

In [None]:
# Load the prepared parquet data (presumably produced by a separate data-prep
# notebook — confirm). This rebinds `train`, replacing the train.csv frame loaded earlier.
# The cells below read its `text` and `label` columns.
train = pd.read_parquet('../input/bert-segmentation-data-prep/data.parq')
# Drop rows whose text is the empty string.
train = train[train.text!=""]

In [None]:
# Build the training pool: every labelled row plus a random sample of unlabelled rows.
mask = train.label!=""
texts = train[mask].text.to_list()
labels = train[mask].label.to_list()
# Aim for 5 unlabelled rows per labelled row, but never ask for more rows than exist:
# DataFrame.sample raises ValueError when n exceeds the population (replace=False).
unlabelled = train[~mask]
no_lab = unlabelled.sample(min(len(unlabelled), len(texts)*5))
texts.extend(no_lab.text.to_list())
labels.extend(no_lab.label.to_list())
len(texts)

In [None]:
class MetricsCallback(Callback):
    """Lightning callback that records ``val_loss`` and saves the best weights per fold."""

    def __init__(self,model,fold,hp=False):
        """
        Args:
            model: Module whose ``state_dict`` is written when ``val_loss`` improves.
            fold: Fold identifier embedded in the saved file name.
            hp: When true, only record the metric and never write to disk.
        """
        super().__init__()
        self.model = model
        # Lowest val_loss seen so far; 999 acts as the initial "no value yet" sentinel.
        self.current_low = 999
        self.metrics = []
        self.fold=fold
        self.hp = hp

    def on_validation_end(self,trainer,pl_module):
        """Record the latest ``val_loss`` and checkpoint the model if it is a new minimum."""
        logged = trainer.callback_metrics
        if 'val_loss' not in logged:
            return
        val_loss = logged['val_loss']
        self.metrics.append(val_loss)
        if self.hp:
            return
        if val_loss<self.current_low:
            print("saving state dict")
            self.current_low = val_loss
            torch.save(self.model.state_dict(),f'model_best_{self.fold}.state')

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score,roc_curve,fbeta_score

In [None]:
def Find_Optimal_Cutoff(target, pred):
    """Pick the threshold in ``[0, 1)`` (step 0.01) that maximises the F0.5 score.

    Args:
        target: Ground-truth binary labels.
        pred: Predicted scores; compared element-wise with each candidate
            threshold via ``pred > threshold``.

    Returns:
        The best threshold. The corresponding score is printed as a side effect.
    """
    candidates = np.arange(0,1,0.01)
    scores = [fbeta_score(target,pred>cut,beta=0.5) for cut in candidates]
    best = np.argmax(scores)
    print("best_beta score: ",scores[best])
    return candidates[best]

In [None]:
class DatasetFinder(pl.LightningModule):
    """Per-token tagger: frozen pretrained encoder -> Linear+ReLU -> BiLSTM -> Linear -> sigmoid.

    ``params`` must provide ``dropout``, ``lstm_inp_size``, ``hid_size``, ``lr``
    and ``loss_func``.
    """

    def __init__(self,params):
        super().__init__()
        self.model = transformers.AutoModel.from_pretrained("../input/scibert-huggingface/coleridge-scibert-models/output")
        # Freeze the encoder: only the layers defined below receive gradients.
        for param in self.model.parameters():
            param.requires_grad=False
        self.dr = torch.nn.Dropout(params['dropout'])
        # 768 is assumed to be the encoder's hidden size — TODO confirm for this checkpoint.
        self.fc1 = torch.nn.Linear(768,params['lstm_inp_size'])
        self.relu = torch.nn.ReLU(inplace=True)
        self.lstm = torch.nn.LSTM(input_size=params['lstm_inp_size'],hidden_size=params['hid_size'],bidirectional=True,batch_first=True)
        # Bidirectional LSTM output is twice the hidden size.
        self.fc = torch.nn.Linear(2*params['hid_size'],1)
        self.lr = params['lr']
        self.lossf = params['loss_func']
        # Counter of completed validation epochs, used only for printing.
        self.e=0

    def forward(self,inp):
        """Return a sigmoid score per token.

        Args:
            inp: Keyword arguments for the encoder (the training loop passes
                ``input_ids`` and ``attention_mask``).

        Returns:
            Tensor of probabilities with the trailing size-1 dimension removed
            (presumably shaped ``(batch, seq_len)`` — confirm).
        """
        inp = self.dr(self.model(**inp).last_hidden_state)
        inp = self.relu(self.fc1(inp))
        inp,_=self.lstm(inp)
        inp = self.fc(inp).squeeze(2)
        return torch.sigmoid(inp)

    def training_step(self,batch,batch_idx):
        """Compute the loss between predictions and ``batch['label']``."""
        x = batch['inputs']
        y = batch['label']
        loss = self.lossf(self(x),y)
        return {"loss": loss}

    def validation_step(self,batch,batch_idx):
        """Compute the validation loss and return flattened CPU predictions and targets."""
        x = batch['inputs']
        x = self(x)
        y = batch['label']
        loss = self.lossf(x,y)
        return {"valid_loss": loss,"pred":x.view(-1).detach().cpu(),"true":y.view(-1).detach().cpu()}

    def validation_epoch_end(self, outputs):
        """Print epoch diagnostics, step the LR scheduler and log ``val_loss``.

        When at least one positive target exists, prints the ROC AUC, the
        F0.5-optimal threshold and confusion matrices at that threshold and at
        the fixed thresholds 0.5, 0.1 and 0.9.
        """
        avg_loss = torch.stack([x["valid_loss"] for x in outputs]).mean().item()
        pred = torch.cat([x["pred"] for x in outputs],0)
        true = torch.cat([x["true"] for x in outputs],0).numpy()
        if true.sum()>0:
            thr = Find_Optimal_Cutoff(true,pred.numpy())
            print("roc_auc: ",roc_auc_score(true,pred.numpy()))
            print(thr," : ")
            print(confusion_matrix(true,(pred>thr).long().numpy()))
            # Each matrix is computed at the threshold named in its heading
            # (previously all three used 0.5).
            for cutoff in (0.5, 0.1, 0.9):
                print(f"{cutoff}: ")
                print(confusion_matrix(true,(pred>cutoff).long().numpy()))

        print(f'valid_loss epoch {self.e} : ',avg_loss)
        # The scheduler is not returned from configure_optimizers, so it is stepped manually here.
        self.lrreducer.step(avg_loss)
        self.e+=1
        self.log('val_loss',avg_loss)

    def training_epoch_end(self, outputs):
        """Print and log the mean training loss of the epoch."""
        avg_loss = torch.stack([x["loss"] for x in outputs]).mean().item()
        print('train_loss : ',avg_loss)
        self.log('train_loss',avg_loss)

    def configure_optimizers(self):
        """Create Adam plus a ReduceLROnPlateau scheduler; only the optimizer is returned."""
        self.optimizer = torch.optim.Adam(self.parameters(),lr = self.lr,weight_decay=1e-5)
        self.lrreducer = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer,factor=0.1,verbose=True,patience=3)
        return self.optimizer

In [None]:
# First CUDA device when available, otherwise CPU.
# NOTE(review): `DEVICE` is not referenced in the visible cells.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# Hyperparameters read by DatasetFinder.__init__.
# NOTE(review): the values look like the outcome of the commented-out Optuna search — confirm.
params = {
        'lr':0.000150074,  # learning rate given to Adam in configure_optimizers
        'loss_func':torch.nn.BCELoss(),  # applied to the model's sigmoid outputs and the 0/1 token labels
        'lstm_inp_size': 2**10,  # output width of fc1 and input width of the LSTM
        'dropout': 0.0024323,  # dropout probability applied to the encoder output
        'hid_size':2**12  # LSTM hidden size per direction (the LSTM is bidirectional)
    }  

In [None]:
# K-fold training: one fresh DatasetFinder per fold; MetricsCallback writes the best
# weights of each fold to model_best_<fold>.state.
skf = KFold(FOLDS,shuffle=True)
for fold, (idt, idv) in enumerate(skf.split(texts,labels)):
    # Drop every reference to the previous fold's model (the trainer and both callbacks
    # hold one too) BEFORE building the next model, then collect and return cached CUDA
    # blocks. Calling gc.collect() while these names are still bound frees nothing, so
    # two models would otherwise be alive at once. The last fold's objects stay bound
    # after the loop.
    model = trainer = esc = mc = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f'training fold: {fold}')
    mtrain, ltrain = [texts[i] for i in idt], [labels[i] for i in idt]
    mvalid, lvalid = [texts[i] for i in idv], [labels[i] for i in idv]
    tr_ds = CustomDataset(mtrain,ltrain)
    # Validation data is never augmented.
    val_ds  = CustomDataset(mvalid,lvalid,train=False)
    t_loader = DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True,num_workers=2)
    v_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False,num_workers=2)
    model = DatasetFinder(params)
    esc = EarlyStopping(
        monitor = 'val_loss',
        patience = 10,
        verbose = True
    )
    mc = MetricsCallback(model,fold,False)
    trainer = pl.Trainer(max_epochs=15,gpus=1,callbacks=[esc,mc],checkpoint_callback=False,logger=False)
    trainer.fit(model,t_loader,v_loader)

In [None]:
# import optuna

In [None]:
# ids = np.arange(len(texts))
# mask = np.random.rand(len(ids))<0.8
# idt = ids[mask]
# idv = ids[~mask]
# mtrain, ltrain = [texts[i] for i in idt], [labels[i] for i in idt]
# mvalid, lvalid = [texts[i] for i in idv], [labels[i] for i in idv]
# tr_ds = CustomDataset(mtrain,ltrain)
# val_ds  = CustomDataset(mvalid,lvalid,train=False)
# t_loader = DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True,num_workers=2)
# v_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False,num_workers=2)

In [None]:
# def objective(trail):
#     params = {
#         'lr':trail.suggest_loguniform("lr",0.00001,0.01),
#         'loss_func':torch.nn.BCELoss(),
#         'lstm_inp_size': 2**trail.suggest_int("lstm_inp_size",4,12),
#         'dropout': trail.suggest_float('dropout',0,0.5),
#         'hid_size':2**trail.suggest_int("hid_size",4,12)
#     }    
#     model = DatasetFinder(params)
#     esc = EarlyStopping(
#         monitor = 'val_loss',
#         patience = 10,
#         verbose = True
#     )
#     mc = MetricsCallback(model,0,False)
#     trainer = pl.Trainer(max_epochs=10,gpus=1,callbacks=[esc,mc],checkpoint_callback=False,logger=False)
#     trainer.fit(model,t_loader,v_loader)
#     return mc.current_low

In [None]:
# study = optuna.create_study()

In [None]:
# study.optimize(objective)