In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import sys
sys.path.append('..')
import pandas as pd
pd.options.display.max_columns=1000
pd.options.display.max_rows=1000
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline


import torch
from torch.utils.data import DataLoader,Dataset
import transformers
from torch import nn
from torch import optim
import torch.nn.functional as F

import gc
import glob
import os
import random

from tqdm import tqdm
from sklearn.metrics import f1_score

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from dataset import BertDataV1
from Engine import GpuEngine
from utils import EarlyStopping,SlackWebhook

In [2]:
seed = 100


def seed_everything(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and,
    when available, every CUDA device), then asks cuDNN for deterministic
    behaviour.

    Parameters
    ----------
    seed_value : int
        The seed applied to all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

    # `deterministic` restricts cuDNN to deterministic kernels; `benchmark`
    # must also be off, otherwise the autotuner may pick different kernels
    # from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(seed)

<torch._C.Generator at 0x7f79ff052330>

In [3]:
# Directory that holds train.csv / test.csv, relative to this notebook.
# Keep the trailing slash: file paths are built by plain string concatenation.
DATA_DIR = '../'

In [4]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
# Read the Slack webhook URL from the environment so the secret never has to be
# pasted into (and committed with) the notebook. The original placeholder
# string is kept as the fallback when SLACK_WEBHOOK_URL is not set.
webhook_url = os.environ.get("SLACK_WEBHOOK_URL", "YOUR SLACK WEBHOOK")
# The spelling `slack_loggger` is kept on purpose: the training cell passes it
# to GpuEngine through a keyword argument spelled the same way.
slack_loggger = SlackWebhook(webhook_url,verbose=True)

In [6]:
def clean_text(series):
    """Normalise a text column for the uncased tokenizer.

    Newlines are replaced by a single space, leading/trailing whitespace is
    stripped and the text is lower-cased.

    Parameters
    ----------
    series : pandas.Series
        Column of strings to clean.

    Returns
    -------
    pandas.Series
        A new series with the cleaned strings; the input is not modified.
    """
    # regex=False: '\n' is a literal character, not a pattern.
    return series.str.replace('\n', ' ', regex=False).str.strip().str.lower()


train = pd.read_csv(DATA_DIR + 'train.csv')
test = pd.read_csv(DATA_DIR + 'test.csv')

# Apply the identical cleaning to both text columns of both splits so the
# model sees train and test text in the same form.
for frame in (train, test):
    for text_col in ('ABSTRACT', 'TITLE'):
        frame[text_col] = clean_text(frame[text_col])

In [11]:
# Label columns of the training frame, one binary target per subject area.
TARGET_COLS = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]

# DataLoader settings shared by the training and inference cells.
TRAIN_BATCH_SIZE = 16
VAL_BATCH_SIZE = 16
NUM_WORKERS = 0

# Pretrained checkpoint used for both the encoder and its tokenizer.
BERT_TYPE = 'allenai/scibert_scivocab_uncased'
# Keyword arguments that name config attributes override the loaded values,
# so this is the same as setting `output_hidden_states` after loading.
BERT_CONFIG = transformers.AutoConfig.from_pretrained(BERT_TYPE, output_hidden_states=True)
BERT_TOKENIZER = transformers.AutoTokenizer.from_pretrained(BERT_TYPE)
BERT_TOKENIZER.model_max_length = 512

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…




In [9]:
class BertForPaperClassificationV1(nn.Module):
    """Pretrained encoder followed by six independent 2-way linear heads.

    The encoder's per-token outputs are averaged over the sequence axis and
    the resulting vector is fed to one ``nn.Linear(hidden_size, 2)`` head per
    label (``cs``, ``phy``, ``math``, ``stat``, ``bio``, ``fin``).

    Parameters
    ----------
    arch_type : str
        Name or path handed to ``transformers.AutoModel.from_pretrained``.
    config :
        Configuration object passed to ``from_pretrained``. Its
        ``hidden_size`` attribute, when present, sets the input width of the
        heads; 768 is used otherwise (the value that was hard-coded before).
    """

    def __init__(self,arch_type,config):

        super().__init__()
        self.arch_type = arch_type
        self.config = config
        self.bert = transformers.AutoModel.from_pretrained(arch_type,config = config)

        # Size the heads from the encoder config instead of assuming 768, so
        # encoders with a different hidden width work without editing the class.
        hidden_size = getattr(config, 'hidden_size', 768)

        # Attribute names are kept unchanged so existing checkpoints
        # (state_dict keys) still load.
        self.cs = nn.Linear(hidden_size,2)
        self.phy = nn.Linear(hidden_size,2)
        self.math = nn.Linear(hidden_size,2)
        self.stat = nn.Linear(hidden_size,2)
        self.bio = nn.Linear(hidden_size,2)
        self.fin = nn.Linear(hidden_size,2)
        self.pool = nn.AdaptiveAvgPool1d(1)

    def forward(self,input_ids,token_type_ids,attention_mask):
        """Run the encoder and return a dict of per-head outputs.

        Returns
        -------
        dict
            Keys ``'cs'``, ``'phy'``, ``'math'``, ``'stat'``, ``'bio'``,
            ``'fin'`` (in that insertion order), each mapping to the output of
            the corresponding linear head.
        """
        emb = self.bert(input_ids = input_ids, token_type_ids = token_type_ids , attention_mask = attention_mask)

        # emb[0] is the first encoder output; swap its last two axes so that
        # AdaptiveAvgPool1d(1) averages over what was axis 1, then drop the
        # pooled singleton axis.
        # NOTE(review): the average runs over every position; attention_mask is
        # only given to the encoder, so padded positions (if the dataset pads)
        # are included in the mean - confirm this is intended.
        pooled_output = self.pool(emb[0].permute(0,2,1)).squeeze(-1)

        output = {}

        output['cs'] = self.cs(pooled_output)
        output['phy'] = self.phy(pooled_output)
        output['math'] = self.math(pooled_output)
        output['stat'] = self.stat(pooled_output)
        output['bio'] = self.bio(pooled_output)
        output['fin'] = self.fin(pooled_output)

        return output

In [10]:
def AvgLoss(preds,targets):
    """Average the cross-entropy loss over paired predictions and targets.

    ``preds`` and ``targets`` are iterated in lock-step; each pair is scored
    with ``nn.CrossEntropyLoss`` and the per-pair losses are averaged.

    Returns
    -------
    torch.Tensor
        Scalar tensor holding the mean of the per-pair losses.
    """
    criterion = nn.CrossEntropyLoss()
    per_head_losses = []
    for logits, labels in zip(preds, targets):
        per_head_losses.append(criterion(logits, labels))
    return torch.stack(per_head_losses).mean()
    

In [11]:
# Stratify on all label columns at once so every fold keeps a similar label
# mix. The split reuses the notebook-wide `seed` and `TARGET_COLS` instead of
# repeating their values, so changing either in the config cells cannot leave
# the folds silently out of sync.
splitter = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Materialise the generator: later cells index `folds[i]` repeatedly. Each
# entry is a (train_positions, val_positions) pair consumed through
# `train.iloc[...]`.
folds = list(splitter.split(train['ID'], train[TARGET_COLS]))

In [12]:
# (train size, validation size) for each fold - a quick sanity check of the split.
[(len(train_idx), len(val_idx)) for train_idx, val_idx in folds]

[(16768, 4204), (16787, 4185), (16767, 4205), (16810, 4162), (16756, 4216)]

In [13]:
# Peek at the first rows of the training frame (ID, text columns and the label columns).
train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [17]:
# Hyper-parameters shared by every fold; hoisted because they never change
# inside the loop.
# NOTE(review): the early-stopping patience (5) is larger than num_epochs (4),
# so early stopping presumably can never fire and `es` mainly saves the
# checkpoint - confirm against utils.EarlyStopping.
early_stopping_rounds=5
maximize=True
num_epochs=4
max_lr=0.00001

# Iterate over whatever folds were built instead of a hard-coded [0..4], so
# changing n_splits in the split cell is picked up automatically.
for i, (train_idx, val_idx) in enumerate(folds):

    train_df = BertDataV1(tokenizer=BERT_TOKENIZER,df = train.iloc[train_idx], target_cols=TARGET_COLS)
    val_df = BertDataV1(tokenizer=BERT_TOKENIZER,df = train.iloc[val_idx], target_cols=TARGET_COLS)

    # Training batches are shuffled and the ragged last batch is dropped;
    # validation keeps every row in its original order.
    train_dl = DataLoader(dataset=train_df,batch_size=TRAIN_BATCH_SIZE,num_workers=NUM_WORKERS,drop_last=True,shuffle=True)
    val_dl = DataLoader(dataset=val_df,batch_size=VAL_BATCH_SIZE,num_workers=NUM_WORKERS,drop_last=False,shuffle=False)

    # A fresh model per fold, always starting from the pretrained weights.
    mod = BertForPaperClassificationV1(arch_type=BERT_TYPE,config=BERT_CONFIG)

    _ = mod.to(device)

    optimizer = optim.AdamW(params=mod.parameters(),lr=max_lr)

    # mode='max': the learning rate is halved when the monitored value stops increasing.
    schedular = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,patience=1,factor=0.5,mode='max',min_lr=0.0000001,verbose=True)

    es=EarlyStopping(patience=early_stopping_rounds,higher_is_better=maximize,tolerance=0.0001,
                    save_path='scibert_uncased_V1_fold{}.path'.format(i),model=mod)

    fitter = GpuEngine(model = mod,device = device,optimizer= optimizer,schedular=schedular, slack_header='Scibert scivocab uncased maverick711 fold {}'.format(i),
                       es = es,log_path='log_fold{}.txt'.format(i),num_epochs=num_epochs,
                       slack_loggger=slack_loggger,criterion=AvgLoss)

    fitter.fit(train_dl,val_dl)

    # Drop everything that still references this fold's model and optimizer
    # state, so the memory is released before the next fold allocates its own
    # model. `mod` itself is left bound because the inference cells below
    # reuse it.
    del fitter, es, schedular, optimizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…




100%|██████████| 1048/1048 [16:10<00:00,  1.08it/s]
100%|██████████| 263/263 [01:33<00:00,  2.80it/s]


Epoch: 0 train loss: 0.17350908507813365 train_metric: 0.8212357381935453 val_loss: 0.15358216777004366 val_metric: 0.8458260701142747 time 17.75


  0%|          | 0/1048 [00:00<?, ?it/s]

200


100%|██████████| 1048/1048 [16:12<00:00,  1.08it/s]
100%|██████████| 263/263 [01:34<00:00,  2.79it/s]


Epoch: 1 train loss: 0.13244181079079062 train_metric: 0.8660321408073465 val_loss: 0.1490715665266017 val_metric: 0.8537638445654214 time 17.79


  0%|          | 0/1048 [00:00<?, ?it/s]

200


100%|██████████| 1048/1048 [16:13<00:00,  1.08it/s]
100%|██████████| 263/263 [01:34<00:00,  2.77it/s]


Epoch: 2 train loss: 0.1075408216761354 train_metric: 0.8927161139921025 val_loss: 0.16031292103564138 val_metric: 0.8486297590583042 time 17.81


  0%|          | 0/1048 [00:00<?, ?it/s]

200


100%|██████████| 1048/1048 [16:14<00:00,  1.07it/s]
100%|██████████| 263/263 [01:34<00:00,  2.78it/s]


Epoch     4: reducing learning rate of group 0 to 5.0000e-06.
Epoch: 3 train loss: 0.08183456859542483 train_metric: 0.9206107596441993 val_loss: 0.17513194620665995 val_metric: 0.84477335800185 time 17.82
200


100%|██████████| 1049/1049 [16:16<00:00,  1.07it/s]
100%|██████████| 262/262 [01:34<00:00,  2.76it/s]


Epoch: 0 train loss: 0.17192953024894084 train_metric: 0.8240609583353686 val_loss: 0.15029854557989888 val_metric: 0.8504646038892614 time 17.87


  0%|          | 0/1049 [00:00<?, ?it/s]

200


100%|██████████| 1049/1049 [16:17<00:00,  1.07it/s]
100%|██████████| 262/262 [01:34<00:00,  2.77it/s]


Epoch: 1 train loss: 0.13200311879454282 train_metric: 0.8674364675225321 val_loss: 0.1524349519535632 val_metric: 0.8476236197791648 time 17.87


  0%|          | 0/1049 [00:00<?, ?it/s]

200


100%|██████████| 1049/1049 [16:20<00:00,  1.07it/s]
100%|██████████| 262/262 [01:34<00:00,  2.78it/s]


Epoch     3: reducing learning rate of group 0 to 5.0000e-06.
Epoch: 2 train loss: 0.10670405047971425 train_metric: 0.8958536295592253 val_loss: 0.166552125731107 val_metric: 0.8452018278750952 time 17.91


  0%|          | 0/1049 [00:00<?, ?it/s]

200


100%|██████████| 1049/1049 [16:22<00:00,  1.07it/s]
100%|██████████| 262/262 [01:35<00:00,  2.75it/s]


Epoch: 3 train loss: 0.07591896035395086 train_metric: 0.9279612756264237 val_loss: 0.176422429789779 val_metric: 0.8462336193080041 time 17.97
200


100%|██████████| 1047/1047 [16:20<00:00,  1.07it/s]
100%|██████████| 263/263 [01:34<00:00,  2.77it/s]


Epoch: 0 train loss: 0.17420810645351892 train_metric: 0.8212652532218226 val_loss: 0.15009901312386947 val_metric: 0.8455377574370709 time 17.93


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:19<00:00,  1.07it/s]
100%|██████████| 263/263 [01:35<00:00,  2.76it/s]


Epoch: 1 train loss: 0.13291787497393268 train_metric: 0.8653049495941189 val_loss: 0.15041573071734943 val_metric: 0.8477033132530121 time 17.92


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:15<00:00,  1.07it/s]
100%|██████████| 263/263 [01:34<00:00,  2.77it/s]


Epoch: 2 train loss: 0.10844716743709308 train_metric: 0.8926174496644296 val_loss: 0.15421771168992296 val_metric: 0.8485632449413287 time 17.86


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:16<00:00,  1.07it/s]
100%|██████████| 263/263 [01:34<00:00,  2.78it/s]


Epoch: 3 train loss: 0.08473418082654077 train_metric: 0.9173439802271971 val_loss: 0.16865629635299428 val_metric: 0.8413581431037791 time 17.86
200


100%|██████████| 1050/1050 [16:19<00:00,  1.07it/s]
100%|██████████| 261/261 [01:35<00:00,  2.75it/s]


Epoch: 0 train loss: 0.17356191553175448 train_metric: 0.8220109027794754 val_loss: 0.15057329516293966 val_metric: 0.8466233892064723 time 17.91


  0%|          | 0/1050 [00:00<?, ?it/s]

200


100%|██████████| 1050/1050 [16:20<00:00,  1.07it/s]
100%|██████████| 261/261 [01:34<00:00,  2.76it/s]


Epoch: 1 train loss: 0.13228348706981966 train_metric: 0.8673579348164469 val_loss: 0.1544725644075057 val_metric: 0.8452872699962448 time 17.91


  0%|          | 0/1050 [00:00<?, ?it/s]

200


100%|██████████| 1050/1050 [16:26<00:00,  1.06it/s]
100%|██████████| 261/261 [01:35<00:00,  2.74it/s]


Epoch: 2 train loss: 0.10694155103926148 train_metric: 0.8931645328183487 val_loss: 0.1616087316735462 val_metric: 0.8484156437339424 time 18.04


  0%|          | 0/1050 [00:00<?, ?it/s]

200


100%|██████████| 1050/1050 [16:26<00:00,  1.06it/s]
100%|██████████| 261/261 [01:35<00:00,  2.74it/s]


Epoch: 3 train loss: 0.08154494652968078 train_metric: 0.9229744199881024 val_loss: 0.17440256127928316 val_metric: 0.8418449833571089 time 18.03
200


100%|██████████| 1047/1047 [16:18<00:00,  1.07it/s]
100%|██████████| 264/264 [01:36<00:00,  2.74it/s]


Epoch: 0 train loss: 0.17502667943109984 train_metric: 0.817126909518214 val_loss: 0.14777489945264197 val_metric: 0.8496744542320949 time 17.93


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:18<00:00,  1.07it/s]
100%|██████████| 264/264 [01:36<00:00,  2.75it/s]


Epoch: 1 train loss: 0.1331591594629808 train_metric: 0.865315110462194 val_loss: 0.1475137992138202 val_metric: 0.8517059106198943 time 17.93


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:20<00:00,  1.07it/s]
100%|██████████| 264/264 [01:36<00:00,  2.75it/s]


Epoch: 2 train loss: 0.1089160827814878 train_metric: 0.8915708538646481 val_loss: 0.15766628654903433 val_metric: 0.8506694129763132 time 17.93


  0%|          | 0/1047 [00:00<?, ?it/s]

200


100%|██████████| 1047/1047 [16:20<00:00,  1.07it/s]
100%|██████████| 264/264 [01:35<00:00,  2.75it/s]


Epoch     4: reducing learning rate of group 0 to 5.0000e-06.
Epoch: 3 train loss: 0.08203736020371304 train_metric: 0.920665083135392 val_loss: 0.17405243096360673 val_metric: 0.8459260674371955 time 17.93
200


---------------------------------------------

### Inference

In [14]:
def predict(test_dl,model,model_path=None):
    """Run ``model`` over every batch of ``test_dl`` and collect its outputs.

    Parameters
    ----------
    test_dl : iterable with ``len()``
        Yields batches that are mappings with the keys ``'input_ids'``,
        ``'token_type_ids'`` and ``'attention_mask'``.
    model : torch.nn.Module
        Model whose forward pass returns a dict of tensors.
    model_path : str, optional
        When given, a state dict is loaded from this path into ``model``
        before predicting.

    Returns
    -------
    list
        One entry per batch; each entry is a list of NumPy arrays, one per
        value of the model's output dict, in the dict's order.
    """
    if model_path is not None:
        # map_location lets a checkpoint saved on one device be restored onto
        # whatever `device` this session is using.
        model.load_state_dict(torch.load(model_path, map_location=device))

    model.eval()
    pred_list=[]

    # The loop must sit inside no_grad: previously it ran outside the context
    # manager, so autograd still tracked every forward pass during inference.
    with torch.no_grad():
        for  data_ in tqdm(test_dl, total=len(test_dl)):
            input_ids = data_['input_ids'].to(device)
            token_type_ids = data_['token_type_ids'].to(device)
            attention_mask = data_['attention_mask'].to(device)

            preds = model(input_ids,token_type_ids,attention_mask)
            # Move each head's output to host memory as a NumPy array.
            preds = [x.cpu().detach().numpy() for x in preds.values()]
            pred_list.append(preds)

    return pred_list

In [None]:
# Collect the checkpoints written by the training cell (saved with a '.path'
# suffix). glob returns files in arbitrary order, so sort them to make the
# ensemble order reproducible.
# Note: the pattern matches every '*.path' file in the working directory,
# including leftovers from other experiments.
models = sorted(glob.glob('*.path'))
models

In [None]:
# Wrap the test frame for inference; no target columns are passed here.
test_df = BertDataV1(tokenizer=BERT_TOKENIZER,df = test)
# Inference uses the evaluation batch size rather than the training one, so
# tuning TRAIN_BATCH_SIZE does not silently change inference. No shuffling and
# no dropped batch: predictions must line up row-for-row with `test`.
test_dl = DataLoader(dataset=test_df,batch_size=VAL_BATCH_SIZE,num_workers=NUM_WORKERS,drop_last=False,shuffle=False)

In [None]:
%%time
preds1 = []
preds2 = []
preds3 = []
preds4 = []
preds5 = []
preds6 = []

for _mod in models:
    m = predict(test_dl,mod,_mod)
    preds1.append(np.concatenate([x[0] for x in m]))
    preds2.append(np.concatenate([x[1] for x in m]))
    preds3.append(np.concatenate([x[2] for x in m]))    
    preds4.append(np.concatenate([x[3] for x in m]))
    preds5.append(np.concatenate([x[4] for x in m]))
    preds6.append(np.concatenate([x[5] for x in m]))        

test['Computer Science'] = np.sum(preds1,axis=0).argmax(axis=1)
test['Physics'] = np.sum(preds2,axis=0).argmax(axis=1)
test['Mathematics'] = np.sum(preds3,axis=0).argmax(axis=1)
test['Statistics'] = np.sum(preds4,axis=0).argmax(axis=1)
test['Quantitative Biology'] = np.sum(preds5,axis=0).argmax(axis=1)
test['Quantitative Finance'] = np.sum(preds6,axis=0).argmax(axis=1)

In [None]:
# Columns written to the submission file: the ID followed by the six predicted labels.
submission_cols = ['ID','Computer Science', 'Physics', 'Mathematics',
                   'Statistics', 'Quantitative Biology', 'Quantitative Finance']
submission = test[submission_cols]
submission.to_csv('submission_scibert_uncased.csv',index=False)

In [None]:
import pickle


def save_obj(obj, name ):
    """Pickle ``obj`` into the file ``name`` using the highest pickle protocol."""
    with open(name, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)


def load_obj(name ):
    """Unpickle and return the object stored in the file ``name``."""
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    with open(name , 'rb') as handle:
        return pickle.load(handle)


# Per-label raw model outputs summed over all checkpoints, keyed by label name.
dict_ = dict(zip(
    ['Computer Science', 'Physics', 'Mathematics',
     'Statistics', 'Quantitative Biology', 'Quantitative Finance'],
    [np.sum(fold_outputs,axis=0) for fold_outputs in (preds1, preds2, preds3, preds4, preds5, preds6)],
))

save_obj(dict_,'test_raw_preds_scibert_uncased.pkl')