In [1]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import multiprocessing as mp
import pandas as pd
import numpy as np
import re
import time
from tqdm.notebook import tqdm
from util import pre_process
import pickle
import torch
import gc
import torch.nn as nn
import warnings
# Hide FutureWarning messages emitted by the imported libraries.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Blanket filter with no category argument: every other warning is suppressed
# as well for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
# --- Paths (relative to the notebook's working directory) ---
data_path = '../../data_2020/'
model_path = 'models/'
other_path = 'others/'

# --- Data settings ---
n = 3            # suffix selecting which merged bm25/tfidf CSV variant is read
MAX_LEN = 512    # token-id lists are truncated to this length before encoding
drop_no_content = True
MIN_LEN = 7
# Leave two cores free, but never fall below one worker on small machines
# (cpu_count() - 2 would be 0 or negative with one or two cores).
workers = max(1, mp.cpu_count() - 2)

# --- Hyper-parameters ---
lr = 1e-5
epochs = 3
colsample_bytree = 0.8
seed = 9487

# --- Device selection ---
use_cuda = torch.cuda.is_available()
no = "2"         # preferred CUDA device index
if use_cuda:
    print("using cuda!")
    # Fall back to the first GPU when the preferred index does not exist on
    # this host; otherwise the first `.to(device)` call would fail.
    if int(no) >= torch.cuda.device_count():
        no = "0"
device = torch.device("cuda:"+no) if use_cuda else torch.device("cpu")

using cuda!


In [3]:
# Merged bm25/tfidf tables; `n` picks which file variant is read.
train_data = pd.read_csv('{}train_data_merge_bm25_tfidf_{}.csv'.format(data_path, n))
test_data = pd.read_csv('{}test_data_merge_bm25_tfidf_{}.csv'.format(data_path, n))
# File used at prediction time.
test = pd.read_csv('{}validation.csv'.format(data_path))

In [4]:
# The file name has no placeholder, so the former `.format(n)` call was a no-op
# that wrongly suggested the path depends on `n`.
candidate = pd.read_csv(data_path+'candidate_paper_pre.csv')

In [5]:
# Fraction of training rows whose journal field is the 'no-content' placeholder.
# The mean of a boolean mask is count/total, computed vectorised instead of
# iterating the array with the built-in sum().
(train_data['journal'] == 'no-content').mean()

0.4207084693919729

In [6]:
# Fraction of test rows whose journal field is the 'no-content' placeholder.
# The mean of a boolean mask is count/total, computed vectorised instead of
# iterating the array with the built-in sum().
(test_data['journal'] == 'no-content').mean()

0.0313082497181351

In [7]:
# train_data_tfidf[train_data.groupby('description_id')['label']]

In [8]:
# train_data_tfidf=train_data.iloc[:528750]

In [9]:
# train_data_tfidf[train_data_tfidf['label']==1 ]

In [10]:
# train_data_tfidf[train_data_tfidf['label']!=1 ]

In [11]:
import math

class gelu(nn.Module):
    """Element-wise GELU activation using the tanh approximation."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``."""
        scale = math.sqrt(2 / math.pi)
        inner = x + 0.044715 * torch.pow(x, 3)
        gate = 0.5 * (1.0 + torch.tanh(scale * inner))
        return x * gate

    
class MutiLabelModel(nn.Module):
    """Scores a pair of token sequences with one shared encoder.

    Each sequence is encoded separately, the hidden vector at position
    ``cls_loc`` is taken from each, and the concatenation
    ``[u, v, |u - v|, u * v]`` is passed through a single linear layer.

    Args:
        encoder: module called as ``encoder(ids, token_type_ids=...)`` whose
            first output is indexed as ``[batch, seq, hidden]``.
        emb_size: hidden width of the encoder output.
        out_size: number of output units of the linear head (default 1).
    """

    def __init__(self, encoder, emb_size=768, out_size=1):
        super(MutiLabelModel, self).__init__()

        self.encoder = encoder
        self.fn_size = emb_size
        # `out_size` used to be ignored (the head was hard-coded to 1 output).
        # The attribute name `out_fn` is kept because it determines the
        # state_dict keys used when loading saved weights.
        self.out_fn = nn.Linear(self.fn_size * 4, out_size)

    def _pooled(self, inp, seg_inp, cls_loc):
        """Encode one sequence batch and return the hidden vector at ``cls_loc``."""
        embs = self.encoder(inp, token_type_ids=seg_inp)[0]  # [batch, seq, hidden]
        return embs[:, cls_loc, :]

    def forward(self, inp1, seg_inp1, inp2, seg_inp2, cat_emb=None, cls_loc=0):
        """Return the linear head's output for the pair, shape ``[batch, out_size]``.

        Args:
            inp1, inp2: token-id tensors for the two sequences.
            seg_inp1, seg_inp2: matching ``token_type_ids`` tensors.
            cat_emb: accepted for interface compatibility; not used.
            cls_loc: sequence position whose hidden vector represents the input.
        """
        outputs1 = self._pooled(inp1, seg_inp1, cls_loc)
        outputs2 = self._pooled(inp2, seg_inp2, cls_loc)
        features = torch.cat(
            [outputs1, outputs2, torch.abs(outputs1 - outputs2), outputs1 * outputs2],
            dim=1,
        )
        return self.out_fn(features)

In [12]:
print("\nloading bert...")
# Encoder weights and vocabulary are both resolved from the same
# 'scibert_scivocab_uncased' identifier.
encoder = BertModel.from_pretrained('scibert_scivocab_uncased')
tokenizer = BertTokenizer.from_pretrained('scibert_scivocab_uncased')


loading bert...


In [13]:
# Build the pair model around the pretrained encoder and put the whole module
# in evaluation mode: the notebook only runs inference, so dropout must be
# off regardless of the state the encoder was created in.
model = MutiLabelModel(encoder).to(device).eval()
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(torch.load(model_path+"model_9487", map_location=device))

<All keys matched successfully>

In [14]:
# thd=0.5
# batch_size=8
# result=[]
# epochs=1
# num_total_steps = np.ceil(train_data.shape[0] / batch_size)*epochs
# num_warmup_steps = int(num_total_steps * 0.5)
# model=model.to(device)
# optim = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
# scheduler = WarmupLinearSchedule(optim, warmup_steps=num_warmup_steps, t_total=num_total_steps)
# for ep in range(epochs):
#     model = model.train()
#     criterion = nn.BCEWithLogitsLoss()
#     optim.zero_grad()
#     total_loss = 0.0
#     pbar =tqdm(enumerate(train_data[['description_text','title_pro','label']].values),total=train_data.shape[0])
#     for sidx, r in pbar:
#         if (sidx+1) % batch_size == 0:
#             optim.step()
#             scheduler.step()
#             optim.zero_grad()
#         input_ids1 = tokenizer.encode('[CLS] '+r[0]+' [SEP] ')[:MAX_LEN]
#         input_ids2 = tokenizer.encode('[CLS] '+r[1]+' [SEP]')[:MAX_LEN]
#         input_ids1 = torch.tensor([input_ids1]).long().to(device)
#         input_ids2 = torch.tensor([input_ids2]).long().to(device)
#         segments_ids1 = [ 0 for i in range(len(input_ids1)) ]
#         segments_ids1 = torch.tensor([segments_ids1]).long().to(device)
#         segments_ids2 = [ 0 for i in range(len(input_ids2)) ]
#         segments_ids2 = torch.tensor([segments_ids2]).long().to(device)
#         target = torch.FloatTensor([r[2]]).to(device).view(1,-1)
#         out = model(input_ids1, segments_ids1,input_ids2, segments_ids2) 
#         l = criterion(out, target)
#         total_loss += l.item()
#         out = torch.sigmoid(out)
#         pred = (out >thd)
#         result.append((out.item(),pred.long().item(),r[2]))
#         l.backward()
#         if sidx!=0:
#             pbar.set_postfix({"loss: ": total_loss/sidx})
#     optim.step()
#     scheduler.step()

In [15]:
# torch.save(model.state_dict(), "model_{}".format(seed))

In [16]:
# candidate_bertembed={}
# for r in tqdm(candidate[['paper_id','title_pro']].values):
#     input_ids1 = tokenizer.encode('[CLS] '+r[1]+' [SEP] ')[:MAX_LEN]
#     input_ids1 = torch.tensor([input_ids1]).long().to(device)
#     segments_ids1 = [ 0 for i in range(len(input_ids1)) ]
#     segments_ids1 = torch.tensor([segments_ids1]).long().to(device)
#     embs1 = model.encoder(input_ids1, token_type_ids=segments_ids1)[0] # [batch, seq, hidden]
#     outputs1 = embs1[:, 0, :]
#     candidate_bertembed[r[0]]=outputs1.cpu().detach().numpy()
    

In [17]:
# import pickle

# with open('candidate_bertembed.pickle', 'wb') as f:
#     pickle.dump(candidate_bertembed, f)

In [18]:
# Preprocessed description tables; the embedding cells read their
# 'description_id' and 'description_text_pre' columns.
train_pre = pd.read_csv('{}train_pre.csv'.format(data_path))
test_pre = pd.read_csv('{}test_pre.csv'.format(data_path))

In [19]:
# Encode every training description with the fine-tuned encoder and cache the
# hidden vector at position 0, keyed by description_id.
train_pre_bertembed={}
with torch.no_grad():  # inference only: do not build autograd graphs
    for r in tqdm(train_pre[['description_id','description_text_pre']].values):
        token_ids = tokenizer.encode('[CLS] '+r[1]+' [SEP] ')[:MAX_LEN]
        input_ids1 = torch.tensor([token_ids]).long().to(device)  # [1, seq]
        # One segment id per token. The previous code took len() of the already
        # batched tensor (always 1) and produced a [1, 1] tensor that only
        # worked through broadcasting.
        segments_ids1 = torch.zeros_like(input_ids1)
        embs1 = model.encoder(input_ids1, token_type_ids=segments_ids1)[0] # [batch, seq, hidden]
        outputs1 = embs1[:, 0, :]
        train_pre_bertembed[r[0]]=outputs1.cpu().numpy()
# # with open('train_pre_bertembed.pickle', 'wb') as f:
# #     pickle.dump(train_pre_bertembed, f)

HBox(children=(FloatProgress(value=0.0, max=62974.0), HTML(value='')))




In [20]:
# Encode every test description with the fine-tuned encoder and cache the
# hidden vector at position 0, keyed by description_id.
test_pre_bertembed={}
with torch.no_grad():  # inference only: do not build autograd graphs
    for r in tqdm(test_pre[['description_id','description_text_pre']].values):
        token_ids = tokenizer.encode('[CLS] '+r[1]+' [SEP] ')[:MAX_LEN]
        input_ids1 = torch.tensor([token_ids]).long().to(device)  # [1, seq]
        # One segment id per token. The previous code took len() of the already
        # batched tensor (always 1) and produced a [1, 1] tensor that only
        # worked through broadcasting.
        segments_ids1 = torch.zeros_like(input_ids1)
        embs1 = model.encoder(input_ids1, token_type_ids=segments_ids1)[0] # [batch, seq, hidden]
        outputs1 = embs1[:, 0, :]
        test_pre_bertembed[r[0]]=outputs1.cpu().numpy()
# with open('test_pre_bertembed.pickle', 'wb') as f:
#     pickle.dump(test_pre_bertembed, f)

HBox(children=(FloatProgress(value=0.0, max=34428.0), HTML(value='')))




In [21]:
# A key of exact type float (e.g. a missing description_id read as NaN) gets a
# copy of its embedding stored under the string key 'none'; the float key stays.
for key, emb in list(test_pre_bertembed.items()):
    if type(key) is not float:
        continue
    test_pre_bertembed['none'] = emb.copy()
    print('hello')

In [22]:
# Merge both caches into one lookup keyed by "<description_id>_<split>".
description2embedding = {}
paper2embedding = {}

# Each cached value was produced from a single-sequence batch, so index 0
# selects that one embedding row.
for k in tqdm(list(train_pre_bertembed.keys())):
    # str() mirrors the test loop so a non-string id cannot raise on concatenation.
    description2embedding[str(k)+'_train'] = np.array(train_pre_bertembed[k][0])
for k in tqdm(list(test_pre_bertembed.keys())):
    description2embedding[str(k)+'_test'] = np.array(test_pre_bertembed[k][0])
    
# for k in tqdm(list(candidate_bertembed.keys())):
#     paper2embedding[k] = np.array(candidate_bertembed[k][0])
    
# with open(other_path+'paper2embedding_pre.pkl', 'wb') as f:
#     pickle.dump(paper2embedding, f)
# Persist the merged lookup as a pickle under other_path.
with open(other_path+'description2embedding_pre.pkl', 'wb') as f:
    pickle.dump(description2embedding, f)

HBox(children=(FloatProgress(value=0.0, max=62973.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=34428.0), HTML(value='')))




In [23]:
# Sanity check: total number of stored embeddings (train + test keys).
len(description2embedding)

97401