In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
# (This is why later cells assign unwanted return values to `_`.)
InteractiveShell.ast_node_interactivity = "all"
import sys
# Add the parent directory to the module search path.
sys.path.append('..')
import pandas as pd
# Raise the pandas display limits to 1000 columns / 1000 rows.
pd.options.display.max_columns=1000
pd.options.display.max_rows=1000
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline


import torch
from torch.utils.data import DataLoader,Dataset
import transformers
from torch import nn
from torch import optim
import torch.nn.functional as F

from tqdm import tqdm
import random
import gc
from sklearn.metrics import f1_score

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from dataset import BertDataV1
from Engine import GpuEngine
from utils import EarlyStopping,SlackWebhook

In [2]:
# Directory prefix prepended (by plain string concatenation) to the CSV file names read below.
DATA_DIR = '../'

In [3]:
# One seed shared by every random number generator used in this notebook.
seed = 100

# Pure-Python and NumPy generators.
random.seed(seed)
np.random.seed(seed)

# PyTorch: request deterministic cuDNN kernels, then seed the CPU generator
# and, when a GPU is present, every CUDA device.
torch.backends.cudnn.deterministic = True
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

<torch._C.Generator at 0x7f254b426310>

In [4]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
import os

# Read the webhook from the environment so a real URL never has to be saved inside
# the notebook. When SLACK_WEBHOOK_URL is unset the original placeholder string is
# used, so behaviour is unchanged for anyone who has not set the variable.
webhook_url = os.environ.get("SLACK_WEBHOOK_URL", "YOUR SLACK WEBHOOK")
# NOTE: the variable name (three g's) is kept as-is because later cells pass it by this name.
slack_loggger = SlackWebhook(webhook_url,verbose=True)

In [6]:
# Read the raw competition files.
train = pd.read_csv(DATA_DIR + 'train.csv')
test = pd.read_csv(DATA_DIR + 'test.csv')

# Apply the same clean-up to both frames: newlines become spaces, surrounding
# whitespace is stripped and everything is lower-cased. The model input `text`
# is then the title and the abstract joined by '. '.
for frame in (train, test):
    for col in ('ABSTRACT', 'TITLE'):
        frame[col] = frame[col].str.replace('\n',' ').str.strip().str.lower()
    frame['text'] = frame['TITLE']+'. '+frame['ABSTRACT']

In [12]:
# --- Task and data-loader constants ---
# Label columns, one binary target per subject area.
TARGET_COLS = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]
TRAIN_BATCH_SIZE = 16
VAL_BATCH_SIZE = 16
NUM_WORKERS = 0

# --- Pretrained encoder assets ---
BERT_TYPE = 'allenai/scibert_scivocab_uncased'
# Ask the encoder to return every layer's hidden states; the classifier below
# averages the last three of them.
BERT_CONFIG = transformers.AutoConfig.from_pretrained(BERT_TYPE, output_hidden_states=True)
BERT_TOKENIZER = transformers.AutoTokenizer.from_pretrained(BERT_TYPE)
BERT_TOKENIZER.model_max_length = 512

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…




In [13]:
class BertForPaperClassificationV2(nn.Module):
    """Pretrained encoder followed by six independent 2-way linear heads.

    The encoder is loaded with ``transformers.AutoModel.from_pretrained``. Its last
    three hidden-state layers are averaged, pooled over the sequence axis and fed
    to one ``nn.Linear(hidden_size, 2)`` head per subject area.

    Parameters
    ----------
    arch_type : name or path handed to ``AutoModel.from_pretrained``.
    config : encoder configuration handed to ``AutoModel.from_pretrained``. It must
        make the encoder return all hidden states (``output_hidden_states=True``),
        because ``forward`` reads them from position 2 of the encoder output.
    """

    # Head attribute names, in the order their outputs appear in forward()'s dict.
    _HEAD_NAMES = ('cs','phy','math','stat','bio','fin')

    def __init__(self,arch_type,config):

        super().__init__()
        self.arch_type = arch_type
        self.config = config
        self.bert = transformers.AutoModel.from_pretrained(arch_type,config = config)

        # Size the heads from the encoder config instead of a hard-coded 768 so other
        # encoder widths work too; 768 is kept as the fallback for configs that do
        # not expose `hidden_size`.
        hidden_size = getattr(config, 'hidden_size', 768)

        # Heads are created in a fixed order (attribute names are the state-dict keys).
        self.cs = nn.Linear(hidden_size,2)
        self.phy = nn.Linear(hidden_size,2)
        self.math = nn.Linear(hidden_size,2)
        self.stat = nn.Linear(hidden_size,2)
        self.bio = nn.Linear(hidden_size,2)
        self.fin = nn.Linear(hidden_size,2)
        self.pool = nn.AdaptiveAvgPool1d(1)

    def forward(self,input_ids,token_type_ids,attention_mask):
        """Return a dict mapping each head name to that head's output for the batch.

        Keys are 'cs', 'phy', 'math', 'stat', 'bio', 'fin', in that order.
        """
        emb = self.bert(input_ids = input_ids, token_type_ids = token_type_ids , attention_mask = attention_mask)

        # emb[2] is treated as the per-layer hidden states; average the last three layers.
        hidden_states = emb[2]
        emb_output = (hidden_states[-1]+hidden_states[-2]+hidden_states[-3])/3

        # Average over the last axis after the permute (assumes emb_output is
        # (batch, seq, hidden) — TODO confirm). attention_mask is not used here,
        # so every sequence position contributes to the average.
        pooled_output = self.pool(emb_output.permute(0,2,1)).squeeze(-1)

        return {name: getattr(self, name)(pooled_output) for name in self._HEAD_NAMES}

In [14]:
def AvgLoss(preds,targets):
    """Average cross-entropy over paired predictions and targets.

    `preds` and `targets` are iterated together with ``zip``; a cross-entropy loss
    (default settings) is computed for each pair and the mean of those scalar
    losses is returned as a 0-dim tensor.
    """
    criterion = nn.CrossEntropyLoss()
    per_head_losses = []
    for logits, labels in zip(preds, targets):
        per_head_losses.append(criterion(logits, labels))
    return torch.stack(per_head_losses).mean()
    

In [15]:
# Multilabel-stratified 5-fold split over the label columns.
# The seed and the label list come from the notebook-level `seed` and `TARGET_COLS`
# (same values as the literals previously repeated here), so the split cannot drift
# from the rest of the configuration. The splitter also gets its own name instead
# of being overwritten by its result.
splitter = MultilabelStratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
# Each entry is a (train_indices, validation_indices) pair of positional indices.
folds = [(train_idx, val_idx) for train_idx, val_idx in splitter.split(train['ID'], train[TARGET_COLS])]



In [16]:
# (train size, validation size) for every fold.
[(len(train_idx), len(val_idx)) for train_idx, val_idx in folds]

[(16768, 4204), (16787, 4185), (16767, 4205), (16810, 4162), (16756, 4216)]

In [17]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,text
0,1,reconstructing subject-specific effect maps,predictive models allow subject-specific infer...,1,0,0,0,0,0,reconstructing subject-specific effect maps. p...
1,2,rotation invariance neural network,rotation invariance and translation invariance...,1,0,0,0,0,0,rotation invariance neural network. rotation i...
2,3,spherical polyharmonics and poisson kernels fo...,we introduce and develop the notion of spheric...,0,0,1,0,0,0,spherical polyharmonics and poisson kernels fo...
3,4,a finite element approximation for the stochas...,the stochastic landau--lifshitz--gilbert (llg)...,0,0,1,0,0,0,a finite element approximation for the stochas...
4,5,comparative study of discrete wavelet transfor...,fourier-transform infra-red (ftir) spectra of ...,1,0,0,1,0,0,comparative study of discrete wavelet transfor...


In [18]:
# NOTE(review): the import cell only brings in BertDataV1, but this loop builds
# BertDataV2 datasets, which raises NameError on a fresh kernel. BertDataV2 is
# assumed to live in the same local `dataset` module — confirm.
from dataset import BertDataV2

# Hyper-parameters are the same for every fold, so they are set once up front.
early_stopping_rounds=5
maximize=True
num_epochs=4
max_lr=0.00001
# NOTE(review): early_stopping_rounds (5) exceeds num_epochs (4); presumably early
# stopping can never trigger with these values — verify against utils.EarlyStopping.

# Train one model per cross-validation fold.
for i, (train_idx, val_idx) in enumerate(folds):

    train_df = BertDataV2(tokenizer=BERT_TOKENIZER,df = train.iloc[train_idx], target_cols=TARGET_COLS)
    val_df = BertDataV2(tokenizer=BERT_TOKENIZER,df = train.iloc[val_idx], target_cols=TARGET_COLS)

    # Training batches are shuffled and the last incomplete batch is dropped;
    # validation keeps every row in its original order.
    train_dl = DataLoader(dataset=train_df,batch_size=TRAIN_BATCH_SIZE,num_workers=NUM_WORKERS,drop_last=True,shuffle=True)
    val_dl = DataLoader(dataset=val_df,batch_size=VAL_BATCH_SIZE,num_workers=NUM_WORKERS,drop_last=False,shuffle=False)

    # A fresh model is created for every fold.
    mod = BertForPaperClassificationV2(arch_type=BERT_TYPE,config=BERT_CONFIG)

    # Assigned to `_` so the notebook does not print the module repr.
    _ = mod.to(device)

    optimizer = optim.AdamW(params=mod.parameters(),lr=max_lr)

    # Halve the learning rate after more than one epoch without improvement of the
    # monitored value (mode='max'), never going below 1e-7.
    schedular = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,patience=1,factor=0.5,mode='max',min_lr=0.0000001,verbose=True)

    # One checkpoint path per fold; the inference section later globs for '*.path'.
    es=EarlyStopping(patience=early_stopping_rounds,higher_is_better=maximize,tolerance=0.0001,
                    save_path='scibertv2_uncased_mutlilayer_fold{}.path'.format(i),model=mod)

    fitter = GpuEngine(model = mod,device = device,optimizer= optimizer,schedular=schedular, slack_header='ScibertV2 scivocab uncased mutlilayer sahilv711 fold {}'.format(i),
                       es = es,log_path='log_fold{}.txt'.format(i),num_epochs=num_epochs,
                       slack_loggger=slack_loggger,criterion=AvgLoss)

    fitter.fit(train_dl,val_dl)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…




100%|██████████| 1048/1048 [16:08<00:00,  1.08it/s]
100%|██████████| 263/263 [01:34<00:00,  2.80it/s]


Epoch: 0 train loss: 0.17598136070091988 train_metric: 0.8175829557977226 val_loss: 0.1551786137409827 val_metric: 0.8427648703364503 time 17.71


  0%|          | 0/1048 [00:00<?, ?it/s]

200


100%|██████████| 1048/1048 [16:10<00:00,  1.08it/s]
100%|██████████| 263/263 [01:33<00:00,  2.80it/s]


Epoch: 1 train loss: 0.1342530194723282 train_metric: 0.8652539940686884 val_loss: 0.14890189777942753 val_metric: 0.8536995515695067 time 17.75


  0%|          | 0/1048 [00:00<?, ?it/s]

200


100%|██████████| 1048/1048 [16:10<00:00,  1.08it/s]
100%|██████████| 263/263 [01:33<00:00,  2.80it/s]


Epoch: 2 train loss: 0.11061272896001585 train_metric: 0.8905726704828865 val_loss: 0.16050376584992648 val_metric: 0.8513339466421345 time 17.74


  0%|          | 0/1048 [00:00<?, ?it/s]

200


100%|██████████| 1048/1048 [16:13<00:00,  1.08it/s]
100%|██████████| 263/263 [01:34<00:00,  2.78it/s]


Epoch     4: reducing learning rate of group 0 to 5.0000e-06.
Epoch: 3 train loss: 0.08690942781053142 train_metric: 0.9155967665240133 val_loss: 0.17049104814638308 val_metric: 0.8459825570606793 time 17.8
200


100%|██████████| 1049/1049 [16:15<00:00,  1.08it/s]
100%|██████████| 262/262 [01:33<00:00,  2.79it/s]


Epoch: 0 train loss: 0.17443385375423928 train_metric: 0.8200529256101146 val_loss: 0.15075738523311227 val_metric: 0.8503831417624521 time 17.83


  0%|          | 0/1049 [00:00<?, ?it/s]

200


100%|██████████| 1049/1049 [16:15<00:00,  1.07it/s]
100%|██████████| 262/262 [01:33<00:00,  2.79it/s]


Epoch: 1 train loss: 0.1334055203205779 train_metric: 0.8660695091365103 val_loss: 0.1518927074247791 val_metric: 0.8477152477152476 time 17.83


  0%|          | 0/1049 [00:00<?, ?it/s]

200


100%|██████████| 1049/1049 [16:15<00:00,  1.08it/s]
100%|██████████| 262/262 [01:34<00:00,  2.77it/s]


Epoch     3: reducing learning rate of group 0 to 5.0000e-06.
Epoch: 2 train loss: 0.1096222077348084 train_metric: 0.892128418549346 val_loss: 0.16333766091112026 val_metric: 0.845209780230235 time 17.83


  0%|          | 0/1049 [00:00<?, ?it/s]

200


100%|██████████| 1049/1049 [16:20<00:00,  1.07it/s]
100%|██████████| 262/262 [01:35<00:00,  2.75it/s]


Epoch: 3 train loss: 0.08035261306407425 train_metric: 0.9229566453447051 val_loss: 0.17461587704780424 val_metric: 0.845165525649442 time 17.93
200


100%|██████████| 1047/1047 [16:23<00:00,  1.06it/s]
100%|██████████| 263/263 [01:36<00:00,  2.72it/s]


Epoch: 0 train loss: 0.17629715865931572 train_metric: 0.8186764705882352 val_loss: 0.1511189392857546 val_metric: 0.8436432637571158 time 18.02


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:23<00:00,  1.06it/s]
100%|██████████| 263/263 [01:36<00:00,  2.72it/s]


Epoch: 1 train loss: 0.1343277106070473 train_metric: 0.8633083175486129 val_loss: 0.152165323999067 val_metric: 0.845793516957092 time 18.02


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:21<00:00,  1.07it/s]
100%|██████████| 263/263 [01:36<00:00,  2.73it/s]


Epoch: 2 train loss: 0.11145442034052942 train_metric: 0.8897992235692 val_loss: 0.15737161671789307 val_metric: 0.8473616943582253 time 17.98


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:22<00:00,  1.07it/s]
100%|██████████| 263/263 [01:35<00:00,  2.74it/s]


Epoch: 3 train loss: 0.08840366070351804 train_metric: 0.9141649249477485 val_loss: 0.17506567450255758 val_metric: 0.8351585014409222 time 17.97
200


100%|██████████| 1050/1050 [16:26<00:00,  1.06it/s]
100%|██████████| 261/261 [01:36<00:00,  2.71it/s]


Epoch: 0 train loss: 0.17721120255334036 train_metric: 0.8169661410585503 val_loss: 0.1510134445479187 val_metric: 0.8457508731082655 time 18.06


  0%|          | 0/1050 [00:00<?, ?it/s]

200


100%|██████████| 1050/1050 [16:27<00:00,  1.06it/s]
100%|██████████| 261/261 [01:35<00:00,  2.74it/s]


Epoch: 1 train loss: 0.1343531211572034 train_metric: 0.8642874251497007 val_loss: 0.15503615103679916 val_metric: 0.844644701023186 time 18.04


  0%|          | 0/1050 [00:00<?, ?it/s]

200


100%|██████████| 1050/1050 [16:26<00:00,  1.06it/s]
100%|██████████| 261/261 [01:34<00:00,  2.76it/s]


Epoch     3: reducing learning rate of group 0 to 5.0000e-06.
Epoch: 2 train loss: 0.11054362011452516 train_metric: 0.8903982269250018 val_loss: 0.16548120417327392 val_metric: 0.8447584494939852 time 18.01


  0%|          | 0/1050 [00:00<?, ?it/s]

200


100%|██████████| 1050/1050 [16:24<00:00,  1.07it/s]
100%|██████████| 261/261 [01:35<00:00,  2.73it/s]


Epoch: 3 train loss: 0.08119781460169526 train_metric: 0.9227770779359684 val_loss: 0.17629569246541532 val_metric: 0.8407606607760277 time 18.01
200


100%|██████████| 1047/1047 [16:25<00:00,  1.06it/s]
100%|██████████| 264/264 [01:36<00:00,  2.72it/s]


Epoch: 0 train loss: 0.17757137167448073 train_metric: 0.8135601541596091 val_loss: 0.151086445622132 val_metric: 0.8462348100660224 time 18.04


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:25<00:00,  1.06it/s]
100%|██████████| 264/264 [01:36<00:00,  2.72it/s]


Epoch: 1 train loss: 0.1353594944229725 train_metric: 0.8622622526709145 val_loss: 0.14911752143270376 val_metric: 0.8487889607256585 time 18.06


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:28<00:00,  1.06it/s]
100%|██████████| 264/264 [01:39<00:00,  2.66it/s]


Epoch: 2 train loss: 0.11340703614191316 train_metric: 0.8867574257425743 val_loss: 0.15704223534809797 val_metric: 0.8524804177545693 time 18.14


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:25<00:00,  1.06it/s]
100%|██████████| 264/264 [01:35<00:00,  2.75it/s]


Epoch: 3 train loss: 0.08742883727274686 train_metric: 0.9154481579946829 val_loss: 0.17529548559005617 val_metric: 0.8444571755943856 time 18.03
200


------------------------------

### Inference: ensembling the fold checkpoints on the test set

In [8]:
def predict(test_dl,model,model_path=None):
    """Run `model` over every batch of `test_dl` and collect its raw outputs.

    Parameters
    ----------
    test_dl : sized iterable of batches. Each batch is indexed with the keys
        'input_ids', 'token_type_ids' and 'attention_mask'; the values are moved
        to the notebook-level `device`.
    model : module called as ``model(input_ids, token_type_ids, attention_mask)``
        and returning a dict of tensors.
    model_path : optional path to a saved state dict. When given, it is loaded
        into `model` before predicting.

    Returns
    -------
    list
        One entry per batch. Each entry is a list of numpy arrays, one per value
        of the dict returned by `model`, in dict order.
    """
    if model_path is not None:
        # map_location lets a checkpoint saved on one device be restored on the current one.
        model.load_state_dict(torch.load(model_path, map_location=device))

    model.eval()
    pred_list = []

    # The whole loop must sit inside no_grad: previously only the list initialisation
    # did, so every forward pass still built an autograd graph.
    with torch.no_grad():
        for data_ in tqdm(test_dl, total=len(test_dl)):
            input_ids = data_['input_ids'].to(device)
            token_type_ids = data_['token_type_ids'].to(device)
            attention_mask = data_['attention_mask'].to(device)

            outputs = model(input_ids,token_type_ids,attention_mask)
            pred_list.append([x.detach().cpu().numpy() for x in outputs.values()])

    return pred_list

In [None]:
# `glob` is not imported anywhere else in the notebook, so import it where it is used.
import glob

# Checkpoint files in the working directory; sorted so the ensemble order is
# the same on every run.
models = sorted(glob.glob('*.path'))
models

In [None]:
# NOTE(review): the import cell only brings in BertDataV1; BertDataV2 is assumed
# to live in the same local `dataset` module — confirm.
from dataset import BertDataV2

# No target columns are passed for the test set.
test_df = BertDataV2(tokenizer=BERT_TOKENIZER,df = test)
# Inference uses the validation batch size (previously TRAIN_BATCH_SIZE; both are 16).
# Order is preserved and no row is dropped so predictions line up with `test`.
test_dl = DataLoader(dataset=test_df,batch_size=VAL_BATCH_SIZE,num_workers=NUM_WORKERS,drop_last=False,shuffle=False)

In [None]:
%%time
# Fail early with a clear message instead of an opaque numpy axis error further down.
if not models:
    raise FileNotFoundError("No '*.path' checkpoints found to ensemble.")

# Build a dedicated model for inference so this cell no longer depends on `mod`
# leaking out of the training loop; its weights are replaced by each checkpoint
# inside predict().
infer_model = BertForPaperClassificationV2(arch_type=BERT_TYPE,config=BERT_CONFIG)
_ = infer_model.to(device)

# One list per label head (same order as TARGET_COLS); each list receives one
# array of raw head outputs per checkpoint. The preds1..preds6 names are kept
# because later cells read them.
preds1, preds2, preds3, preds4, preds5, preds6 = [], [], [], [], [], []
per_head_preds = [preds1, preds2, preds3, preds4, preds5, preds6]

for checkpoint_path in models:
    batch_outputs = predict(test_dl,infer_model,checkpoint_path)
    for head_idx, head_preds in enumerate(per_head_preds):
        # Stitch the per-batch arrays of this head back into one array over the test set.
        head_preds.append(np.concatenate([x[head_idx] for x in batch_outputs]))

# Ensemble by summing the raw outputs over checkpoints, then take the argmax class per row.
for col, head_preds in zip(TARGET_COLS, per_head_preds):
    test[col] = np.sum(head_preds,axis=0).argmax(axis=1)

In [None]:
# Columns written to the submission file: the row ID followed by the six label columns.
submission_cols = [
    'ID',
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]
test[submission_cols].to_csv('submission_scibertv2_mutlilayer.csv',index=False)

In [None]:
# Raw ensemble outputs per label: the element-wise sum, over checkpoints, of each head's outputs.
dict_ = {
    'Computer Science': np.sum(preds1,axis=0),
    'Physics': np.sum(preds2,axis=0),
    'Mathematics': np.sum(preds3,axis=0),
    'Statistics': np.sum(preds4,axis=0),
    'Quantitative Biology': np.sum(preds5,axis=0),
    'Quantitative Finance': np.sum(preds6,axis=0),
}

In [None]:
import pickle
def save_obj(obj, name ):
    """Pickle `obj` to the file `name` using the highest available protocol."""
    with open(name, 'wb') as out_file:
        pickle.dump(obj, out_file, protocol=pickle.HIGHEST_PROTOCOL)


def load_obj(name ):
    """Return the object unpickled from the file `name`.

    Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(name , 'rb') as in_file:
        restored = pickle.load(in_file)
    return restored
    
# Persist the raw summed outputs per label to a pickle file.
save_obj(dict_,'test_raw_preds_scibertv2_mutlilayer.pkl')   