In [2]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import sys
sys.path.append('..')
import pandas as pd
pd.options.display.max_columns=1000
pd.options.display.max_rows=1000
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline


import torch
from torch.utils.data import DataLoader,Dataset
import transformers
from torch import nn
from torch import optim
import torch.nn.functional as F

from tqdm import tqdm
import gc
import glob
import os
import random
from sklearn.metrics import f1_score

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from dataset import BertDataV1
from Engine import GpuEngine
from utils import EarlyStopping,SlackWebhook

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Path prefix that train.csv and test.csv are read from.
DATA_DIR = '../'

In [4]:
# Seed Python's `random`, NumPy and PyTorch (CPU, plus every GPU when CUDA is
# available) from a single value.
seed = 100

random.seed(seed)
np.random.seed(seed)

torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Request deterministic cuDNN behaviour.
torch.backends.cudnn.deterministic = True

<torch._C.Generator at 0x7f4710d7a310>

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
# Read the Slack webhook from the SLACK_WEBHOOK_URL environment variable so the
# secret URL never has to be pasted into (and saved with) the notebook. The
# original placeholder string is kept as the fallback when the variable is unset.
webhook_url = os.environ.get("SLACK_WEBHOOK_URL", "YOUR SLACK WEBHOOK")

# The `slack_loggger` spelling is kept on purpose: it is the keyword name used
# when this object is handed to GpuEngine in the training cell.
slack_loggger = SlackWebhook(webhook_url,verbose=True)

In [7]:
# Load the raw train / test tables.
train = pd.read_csv(DATA_DIR + 'train.csv')
test = pd.read_csv(DATA_DIR + 'test.csv')

# Apply the same normalisation to both frames: newlines become spaces, the ends
# are stripped, and everything is lower-cased.
for frame in (train, test):
    for column in ('ABSTRACT', 'TITLE'):
        frame[column] = frame[column].str.replace('\n',' ').str.strip().str.lower()

# The model input is the cleaned title and abstract joined by '. '.
for frame in (train, test):
    frame['text'] = frame['TITLE']+'. '+frame['ABSTRACT']

In [12]:
# --- Label columns and loader settings ---------------------------------------
TARGET_COLS = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]
TRAIN_BATCH_SIZE = 16
VAL_BATCH_SIZE = 16
NUM_WORKERS = 0

# --- Pretrained checkpoint: config first, then tokenizer -----------------------
BERT_TYPE = 'allenai/scibert_scivocab_uncased'

BERT_CONFIG = transformers.AutoConfig.from_pretrained(BERT_TYPE)
# Set on the config that is later passed to the model constructor.
BERT_CONFIG.output_hidden_states = True

BERT_TOKENIZER = transformers.AutoTokenizer.from_pretrained(BERT_TYPE)
# Override the tokenizer's maximum length attribute.
BERT_TOKENIZER.model_max_length = 512

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…




In [13]:
class BertForPaperClassificationV1(nn.Module):
    """Pretrained encoder with one independent 2-way linear head per label.

    The encoder output is averaged over its second axis and the pooled vector is
    fed to six separate ``nn.Linear(hidden, 2)`` heads.

    Args:
        arch_type: Checkpoint name/path given to ``AutoModel.from_pretrained``.
        config: Config object forwarded to ``AutoModel.from_pretrained``.
    """

    # Head attribute names, in the order their logits appear in the output dict.
    # The names (and order) are unchanged so existing checkpoints still load and
    # positional consumers of ``output.values()`` keep working.
    HEAD_NAMES = ('cs', 'phy', 'math', 'stat', 'bio', 'fin')

    def __init__(self,arch_type,config):

        super().__init__()
        self.arch_type = arch_type
        self.config = config
        self.bert = transformers.AutoModel.from_pretrained(arch_type,config = config)

        # Size the heads from the loaded encoder instead of assuming 768, so
        # checkpoints with a different hidden width work too. 768 remains the
        # fallback when the encoder does not expose `config.hidden_size`.
        encoder_config = getattr(self.bert, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 768)

        for head_name in self.HEAD_NAMES:
            setattr(self, head_name, nn.Linear(hidden_size, 2))

        self.pool = nn.AdaptiveAvgPool1d(1)

    def forward(self,input_ids,token_type_ids,attention_mask):
        """Run the encoder and return ``{head_name: logits}`` for every head.

        Returns:
            dict keyed by ``HEAD_NAMES`` (in that order); each value is the output
            of the corresponding 2-unit linear head.
        """

        emb = self.bert(input_ids = input_ids, token_type_ids = token_type_ids , attention_mask = attention_mask)

        # emb[0] is presumably the last hidden state, (batch, seq_len, hidden) —
        # TODO confirm. Axes 1 and 2 are swapped so the 1-d average pool collapses
        # what was axis 1, then the size-1 axis is squeezed away.
        # NOTE(review): attention_mask is only passed to the encoder; it is not
        # applied to this average, so every position contributes to the mean.
        pooled_output = self.pool(emb[0].permute(0,2,1)).squeeze(-1)

        return {head_name: getattr(self, head_name)(pooled_output) for head_name in self.HEAD_NAMES}

In [14]:
def AvgLoss(preds,targets):
    """Mean of the per-head cross-entropy losses.

    `preds` and `targets` are iterated in lock-step; each pair is scored with
    `nn.CrossEntropyLoss()` and the resulting scalars are averaged.
    """
    criterion = nn.CrossEntropyLoss()
    per_head_losses = []
    for logits, labels in zip(preds, targets):
        per_head_losses.append(criterion(logits, labels))
    return torch.stack(per_head_losses).mean()
    

In [15]:
# Build the 5 (train_idx, val_idx) index pairs once.
# The splitter reuses the notebook's `seed` and TARGET_COLS instead of repeating
# the literal 100 and the literal column list, so the split cannot drift from
# the rest of the configuration. It also gets its own name, so `folds` only
# ever refers to the list of index pairs.
splitter = MultilabelStratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
folds = [(train_idx, val_idx) for train_idx, val_idx in splitter.split(train['ID'], train[TARGET_COLS])]



In [16]:
# (train size, validation size) for every fold.
[tuple(map(len, fold)) for fold in folds]

[(16768, 4204), (16787, 4185), (16767, 4205), (16810, 4162), (16756, 4216)]

In [17]:
# Show the first rows to check the lower-cased TITLE / ABSTRACT and the combined `text` column.
train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,text
0,1,reconstructing subject-specific effect maps,predictive models allow subject-specific infer...,1,0,0,0,0,0,reconstructing subject-specific effect maps. p...
1,2,rotation invariance neural network,rotation invariance and translation invariance...,1,0,0,0,0,0,rotation invariance neural network. rotation i...
2,3,spherical polyharmonics and poisson kernels fo...,we introduce and develop the notion of spheric...,0,0,1,0,0,0,spherical polyharmonics and poisson kernels fo...
3,4,a finite element approximation for the stochas...,the stochastic landau--lifshitz--gilbert (llg)...,0,0,1,0,0,0,a finite element approximation for the stochas...
4,5,comparative study of discrete wavelet transfor...,fourier-transform infra-red (ftir) spectra of ...,1,0,0,1,0,0,comparative study of discrete wavelet transfor...


In [18]:
# Hyper-parameters are identical for every fold, so they are defined once
# instead of being re-assigned inside the loop.
early_stopping_rounds=5
maximize=True
num_epochs=4
max_lr=0.00001

# NOTE(review): early_stopping_rounds (5) is larger than num_epochs (4); presumably
# early stopping can never trigger with these values — verify against EarlyStopping.
# NOTE(review): BertDataV2 is used here, but the import cell only brings in
# BertDataV1 from `dataset` — confirm the intended class and import it, otherwise
# this cell fails on a fresh kernel.

# Iterate over whatever folds were built rather than a hard-coded [0,1,2,3,4],
# so changing n_splits does not silently skip folds or raise IndexError.
for i, (train_idx, val_idx) in enumerate(folds):

    train_df = BertDataV2(tokenizer=BERT_TOKENIZER,df = train.iloc[train_idx], target_cols=TARGET_COLS)
    val_df = BertDataV2(tokenizer=BERT_TOKENIZER,df = train.iloc[val_idx], target_cols=TARGET_COLS)

    # Training batches are shuffled and the last partial batch is dropped;
    # validation keeps every row in its original order.
    train_dl = DataLoader(dataset=train_df,batch_size=TRAIN_BATCH_SIZE,num_workers=NUM_WORKERS,drop_last=True,shuffle=True)
    val_dl = DataLoader(dataset=val_df,batch_size=VAL_BATCH_SIZE,num_workers=NUM_WORKERS,drop_last=False,shuffle=False)

    # A fresh model is created from the pretrained checkpoint for every fold.
    mod = BertForPaperClassificationV1(arch_type=BERT_TYPE,config=BERT_CONFIG)

    # Assign the return value so the module repr is not echoed.
    _ = mod.to(device)

    optimizer = optim.AdamW(params=mod.parameters(),lr=max_lr)

    # mode='max' pairs with maximize=True: the LR is halved when the monitored value stops rising.
    schedular = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,patience=1,factor=0.5,mode='max',min_lr=0.0000001,verbose=True)

    # One checkpoint file per fold; the inference cells pick these up via '*.path'.
    es=EarlyStopping(patience=early_stopping_rounds,higher_is_better=maximize,tolerance=0.0001,
                    save_path='scibert_uncased_V1_fold{}.path'.format(i),model=mod)

    fitter = GpuEngine(model = mod,device = device,optimizer= optimizer,schedular=schedular, slack_header='ScibertV2 scivocab uncased maverick711 fold {}'.format(i),
                       es = es,log_path='log_fold{}.txt'.format(i),num_epochs=num_epochs,
                       slack_loggger=slack_loggger,criterion=AvgLoss)

    fitter.fit(train_dl,val_dl)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…




100%|██████████| 1048/1048 [16:24<00:00,  1.06it/s]
100%|██████████| 263/263 [01:35<00:00,  2.75it/s]


Epoch: 0 train loss: 0.17364191411077293 train_metric: 0.8206519083223378 val_loss: 0.15447639535712016 val_metric: 0.8453508602358399 time 18.01


  0%|          | 0/1048 [00:00<?, ?it/s]

200


100%|██████████| 1048/1048 [16:30<00:00,  1.06it/s]
100%|██████████| 263/263 [01:36<00:00,  2.71it/s]


Epoch: 1 train loss: 0.13292056568791621 train_metric: 0.8661966782172303 val_loss: 0.14978721136711076 val_metric: 0.8528061463506043 time 18.14


  0%|          | 0/1048 [00:00<?, ?it/s]

200


100%|██████████| 1048/1048 [16:36<00:00,  1.05it/s]
100%|██████████| 263/263 [01:37<00:00,  2.70it/s]


Epoch: 2 train loss: 0.10828899747233496 train_metric: 0.893944431251484 val_loss: 0.1616134744644392 val_metric: 0.8514375520014792 time 18.24


  0%|          | 0/1048 [00:00<?, ?it/s]

200


100%|██████████| 1048/1048 [16:38<00:00,  1.05it/s]
100%|██████████| 263/263 [01:37<00:00,  2.70it/s]


Epoch     4: reducing learning rate of group 0 to 5.0000e-06.
Epoch: 3 train loss: 0.08301891538337626 train_metric: 0.919222174677887 val_loss: 0.17295840416603378 val_metric: 0.8463181353886156 time 18.26
200


100%|██████████| 1049/1049 [16:42<00:00,  1.05it/s]
100%|██████████| 262/262 [01:37<00:00,  2.69it/s]


Epoch: 0 train loss: 0.1725667156634215 train_metric: 0.8222618553174855 val_loss: 0.15067211454344764 val_metric: 0.8504646038892614 time 18.33


  0%|          | 0/1049 [00:00<?, ?it/s]

200


100%|██████████| 1049/1049 [16:41<00:00,  1.05it/s]
100%|██████████| 262/262 [01:37<00:00,  2.70it/s]


Epoch: 1 train loss: 0.13267503474955336 train_metric: 0.8668227648492378 val_loss: 0.15196589135354566 val_metric: 0.8459907478797224 time 18.31


  0%|          | 0/1049 [00:00<?, ?it/s]

200


100%|██████████| 1049/1049 [16:38<00:00,  1.05it/s]
100%|██████████| 262/262 [01:35<00:00,  2.74it/s]


Epoch     3: reducing learning rate of group 0 to 5.0000e-06.
Epoch: 2 train loss: 0.10826743585195168 train_metric: 0.8937495542188725 val_loss: 0.16449267906242515 val_metric: 0.8470007593014427 time 18.24


  0%|          | 0/1049 [00:00<?, ?it/s]

200


100%|██████████| 1049/1049 [16:33<00:00,  1.06it/s]
100%|██████████| 262/262 [01:34<00:00,  2.76it/s]


Epoch: 3 train loss: 0.077515400667365 train_metric: 0.9258153204398939 val_loss: 0.17598938066045017 val_metric: 0.8473842679713964 time 18.14
200


100%|██████████| 1047/1047 [16:30<00:00,  1.06it/s]
100%|██████████| 263/263 [01:35<00:00,  2.74it/s]


Epoch: 0 train loss: 0.17483174086414072 train_metric: 0.8203972666487055 val_loss: 0.15157963182072862 val_metric: 0.8442880243508037 time 18.11


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:27<00:00,  1.06it/s]
100%|██████████| 263/263 [01:35<00:00,  2.75it/s]


Epoch: 1 train loss: 0.1336356851822895 train_metric: 0.8642164491799353 val_loss: 0.15161141619954466 val_metric: 0.8486268628737463 time 18.07


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:27<00:00,  1.06it/s]
100%|██████████| 263/263 [01:35<00:00,  2.76it/s]


Epoch: 2 train loss: 0.10958804124067853 train_metric: 0.8918565069389892 val_loss: 0.15656918620382845 val_metric: 0.8483342536351923 time 18.04


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:23<00:00,  1.06it/s]
100%|██████████| 263/263 [01:34<00:00,  2.77it/s]


Epoch     4: reducing learning rate of group 0 to 5.0000e-06.
Epoch: 3 train loss: 0.08617095627004517 train_metric: 0.9161143182952035 val_loss: 0.17615241101465667 val_metric: 0.8370840546258896 time 17.98
200


100%|██████████| 1050/1050 [16:25<00:00,  1.07it/s]
100%|██████████| 261/261 [01:34<00:00,  2.76it/s]


Epoch: 0 train loss: 0.17465337162216504 train_metric: 0.8200777487103005 val_loss: 0.15118475778745322 val_metric: 0.8438106325184324 time 18.01


  0%|          | 0/1050 [00:00<?, ?it/s]

200


100%|██████████| 1050/1050 [16:26<00:00,  1.06it/s]
100%|██████████| 261/261 [01:34<00:00,  2.77it/s]


Epoch: 1 train loss: 0.13333296378630968 train_metric: 0.865047427421673 val_loss: 0.15454587162772604 val_metric: 0.8436589263890195 time 18.01


  0%|          | 0/1050 [00:00<?, ?it/s]

200


100%|██████████| 1050/1050 [16:31<00:00,  1.06it/s]
100%|██████████| 261/261 [01:34<00:00,  2.75it/s]


Epoch: 2 train loss: 0.10853119161068683 train_metric: 0.8915846421506708 val_loss: 0.16450687152848673 val_metric: 0.8462273161413562 time 18.12


  0%|          | 0/1050 [00:00<?, ?it/s]

200


100%|██████████| 1050/1050 [16:34<00:00,  1.06it/s]
100%|██████████| 261/261 [01:34<00:00,  2.76it/s]


Epoch: 3 train loss: 0.08387430727836631 train_metric: 0.9203531399471718 val_loss: 0.17557749024725258 val_metric: 0.8420550038197097 time 18.14
200


100%|██████████| 1047/1047 [16:37<00:00,  1.05it/s]
100%|██████████| 264/264 [01:38<00:00,  2.69it/s]


Epoch: 0 train loss: 0.1750129254387181 train_metric: 0.8176097549031611 val_loss: 0.14930868155872573 val_metric: 0.8477429227237949 time 18.27


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:34<00:00,  1.05it/s]
100%|██████████| 264/264 [01:35<00:00,  2.77it/s]


Epoch: 1 train loss: 0.13373031443360314 train_metric: 0.8642128942708085 val_loss: 0.14858002723875036 val_metric: 0.851851851851852 time 18.17


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:27<00:00,  1.06it/s]
100%|██████████| 264/264 [01:38<00:00,  2.69it/s]


Epoch: 2 train loss: 0.11018770402288483 train_metric: 0.890945425132036 val_loss: 0.15841361499290765 val_metric: 0.8494694337496479 time 18.09


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:30<00:00,  1.06it/s]
100%|██████████| 264/264 [01:37<00:00,  2.71it/s]


Epoch     4: reducing learning rate of group 0 to 5.0000e-06.
Epoch: 3 train loss: 0.08374755422886258 train_metric: 0.9187341231214833 val_loss: 0.18024627024579998 val_metric: 0.8432649932937345 time 18.12
200


-------------------------------------

### Inferencing

In [9]:
def predict(test_dl,model,model_path=None):
    """Run `model` over every batch of `test_dl` and collect its raw outputs.

    Args:
        test_dl: Iterable of batches with a length; each batch is indexed with the
            keys 'input_ids', 'token_type_ids' and 'attention_mask'.
        model: Module called as ``model(input_ids, token_type_ids, attention_mask)``
            and returning a dict of tensors. It is used as-is; this function does
            not move it to `device`.
        model_path: Optional path to a state dict loaded into `model` first.

    Returns:
        A list with one entry per batch. Each entry is a list of NumPy arrays,
        one per value of the model's output dict, in the dict's order.
    """

    if model_path is not None:
        # map_location keeps checkpoints saved on a different device loadable here.
        model.load_state_dict(torch.load(model_path, map_location=device))

    model.eval()
    pred_list = []

    # The whole loop runs under no_grad. Previously only the list initialisation
    # was inside the context, so every forward pass still tracked gradients.
    with torch.no_grad():
        for data_ in tqdm(test_dl, total=len(test_dl)):
            input_ids = data_['input_ids'].to(device)
            token_type_ids = data_['token_type_ids'].to(device)
            attention_mask = data_['attention_mask'].to(device)

            preds = model(input_ids,token_type_ids,attention_mask)
            pred_list.append([x.detach().cpu().numpy() for x in preds.values()])

    return pred_list

In [None]:
# Collect the '*.path' checkpoint files in the working directory. The list is
# sorted so the ensemble order does not depend on the filesystem's listing order.
models = sorted(glob.glob('*.path'))
models

In [None]:
# Wrap the test frame for inference (no target columns are passed).
test_df = BertDataV2(tokenizer=BERT_TOKENIZER,df = test)

# Inference uses the evaluation batch size rather than the training one;
# order is preserved and no rows are dropped so predictions line up with `test`.
test_dl = DataLoader(dataset=test_df,batch_size=VAL_BATCH_SIZE,num_workers=NUM_WORKERS,drop_last=False,shuffle=False)

In [None]:
%%time
# Fail early with a clear message instead of an axis error from the argmax below.
if not models:
    raise FileNotFoundError("no '*.path' checkpoints found to ensemble")

# Build the inference model here instead of relying on `mod` left over from the
# training loop, so this section also runs on a fresh kernel.
infer_model = BertForPaperClassificationV1(arch_type=BERT_TYPE,config=BERT_CONFIG).to(device)

# One accumulator per target column; each collects one logits array per checkpoint.
fold_logits = [[] for _ in TARGET_COLS]

for ckpt_path in models:
    batches = predict(test_dl,infer_model,ckpt_path)
    for head_idx, per_head in enumerate(fold_logits):
        # Stitch this head's per-batch outputs back into one array over all test rows.
        per_head.append(np.concatenate([batch[head_idx] for batch in batches]))

# Keep the original names: the cell that builds `dict_` reads preds1..preds6.
preds1, preds2, preds3, preds4, preds5, preds6 = fold_logits

# Sum the logits across checkpoints and take the arg-max class as the label.
for col, per_head in zip(TARGET_COLS, fold_logits):
    test[col] = np.sum(per_head,axis=0).argmax(axis=1)

In [None]:
# Write the ID column plus the six predicted label columns as the submission file.
submission_cols = ['ID', *TARGET_COLS]
test[submission_cols].to_csv('submission_scibertv2_uncased.csv',index=False)

In [None]:
# Summed (pre-argmax) outputs per label, keyed by the label's column name.
dict_ = {
    'Computer Science': np.sum(preds1,axis=0),
    'Physics': np.sum(preds2,axis=0),
    'Mathematics': np.sum(preds3,axis=0),
    'Statistics': np.sum(preds4,axis=0),
    'Quantitative Biology': np.sum(preds5,axis=0),
    'Quantitative Finance': np.sum(preds6,axis=0),
}

In [None]:
import pickle
def save_obj(obj, name ):
    """Pickle `obj` into the file at path `name` using the highest pickle protocol."""
    with open(name, 'wb') as sink:
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(obj)

def load_obj(name ):
    """Return the object unpickled from the file at path `name`.

    Unpickling can execute arbitrary code, so only use this on files you trust.
    """
    with open(name , 'rb') as source:
        return pickle.Unpickler(source).load()
    
# Persist the per-label summed outputs next to the submission file.
save_obj(dict_,'test_raw_preds_scibertv2_uncased.pkl')