### imports
***

In [1]:
import torch
from torch.nn import functional as F

from fse.models.base_s2v import BaseSentence2VecModel
import sent2vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import fasttext

import pandas as pd
import numpy as np
import swifter
from tqdm.notebook import tqdm
import csv
import pickle
import time



### load data
***

In [2]:
# Locations used by the rest of the notebook (all relative to the working directory).
other_path = 'others/'  # prefix for the pickled embedding output
model_path = 'BioWordVec_PubMed_MIMICIII_d200.bin'  # file passed to fasttext.load_model
data_path = '../../data_2020/'  # prefix for train_pre.csv / test_pre.csv

In [3]:
# candidate_paper = pd.read_csv(data_path+"candidate_paper_pre.csv")
# Read both preprocessed splits; note the "valid" frame is loaded from test_pre.csv.
train_data, valid_data = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ("train_pre.csv", "test_pre.csv")
)
# candidate_paper = candidate_paper[~candidate_paper['paper_id'].isnull()]

In [4]:
# Replace every missing cell with the literal string 'none'; the embedding cells
# later compare against this sentinel to detect absent text.
train_data, valid_data = (
    split.fillna('none') for split in (train_data, valid_data)
)
# candidate_paper = candidate_paper.fillna('none')

In [5]:
# Preview the first rows of the training split (after the 'none' fill above).
train_data.head()

Unnamed: 0,description_id,paper_id,description_text,key_text,key_text_pre,description_text_pre
0,77bef2,5c0f7919da562944ac759a0f,Angiogenesis is reflected as newly formed vess...,"Moreover, Wnt-1-inducible secreted protein-1 (...",moreover wnt-1-inducible wnt 1 inducible secre...,angiogenesi reflect newly form vessel endothel...
1,42360e,5c1360beda56295a0896fda3,Cardiac fibrosis is a common process in remode...,There is evidence showing that the down-regula...,there evidence show down-regulation down regul...,cardiac fibrosi common process remodel heart M...
2,9bf5e0,5d1b36e83a55ac0a0e8bb84e,"Agmatine, formed by the decarboxylation of L-a...","Agmatine, formed by the decarboxylation of L-a...",agmatine form decarboxylation l-arginine l arg...,agmatine form decarboxylation l-arginine l arg...
3,22e485,5d2709fd3a55ac2cfc28108f,The ob gene product leptin has been demonstrat...,"The aminoguanidine carboxylate, BVT.12777 (Fig...",the aminoguanidine carboxylate BVT.12777 figur...,the ob gene product leptin demonstrate activat...
4,30856c,55a392d1c91b587b095b6fcc,"Lauterbach M et al., have concluded at the end...","Lauterbach M et , have concluded at the end of...",lauterbach M et conclude end study germany ana...,lauterbach M et al. conclude end study germany...


In [6]:
# Preview the first rows of the split loaded from test_pre.csv (it has no paper_id column).
valid_data.head()

Unnamed: 0,description_id,description_text,key_text,key_text_pre,description_text_pre
0,00032c,Refer to Table 2 or Methods for a brief descri...,Colons (:) indicated interaction terms..,colon indicate interaction terms..,refer table method brief description variable ...
1,000676,Sixty-nine female subjects with a mean age of ...,Handedness was evaluated according to the proc...,handedness evaluate accord procedure propose a...,sixty-nine sixty nine female subject mean age ...
2,000b24,Our behavioral and imaging findings differed f...,"Recently, Chiu et used a modified IGT, namel...",recently chiu et use modified IGT namely sooch...,our behavioral imaging finding differ previou ...
3,000c20,A novel Ehrlichia transmitted by Amblyomma ame...,"ruminantium, caused transient febrile illness,...",ruminantium cause transient febrile illness fo...,A novel ehrlichium transmit amblyomma american...
4,000c90,The dorsal fronto-striatal circuit plays an im...,"One of these functions is set-shifting, which ...",one function set-shifting set shifting refer a...,the dorsal fronto-striatal fronto striatal cir...


### preprocess
***

In [7]:
# Set form of NLTK's English stop-word list, used for the membership test below.
stop_words = set(stopwords.words('english'))

def preprocess_sentence(text):
    """Normalise ``text`` into a space-joined string of lower-cased tokens.

    Steps, in order:
      1. pad '/', '.-', '.' and the apostrophe with surrounding spaces,
      2. lower-case the whole string,
      3. tokenize with ``nltk.word_tokenize``,
      4. drop tokens found in ``string.punctuation`` or in ``stop_words``.

    Note that ``punctuation`` is a plain ``str``, so ``token in punctuation`` is a
    substring test: a multi-character token is dropped only if it occurs
    contiguously inside ``string.punctuation``.

    Args:
        text: the raw sentence (a ``str``).

    Returns:
        The surviving tokens joined by single spaces; ``''`` if none survive.
    """
    # Applied sequentially in this order: the '.-' rule runs before the bare '.' rule,
    # so the '.' inserted by the former is padded again by the latter.
    padding_rules = (
        ('/', ' / '),
        ('.-', ' .- '),
        ('.', ' . '),
        ('\'', ' \' '),
    )
    for needle, padded in padding_rules:
        text = text.replace(needle, padded)

    kept = []
    for token in word_tokenize(text.lower()):
        if token in punctuation or token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Re-clean the precomputed text columns of both splits, in the order
# train/key, train/description, valid/key, valid/description.
for frame in (train_data, valid_data):
    for column in ('key_text_pre', 'description_text_pre'):
        cleaned = frame[column].swifter.allow_dask_on_strings().apply(preprocess_sentence)
        # .values drops the index so the assignment is purely positional.
        frame[column] = cleaned.values
# candidate_paper['title_pro'] = candidate_paper['title_pro'].swifter.allow_dask_on_strings().apply(lambda x: preprocess_sentence(x)).values
# candidate_paper['abstract_pre'] = candidate_paper['abstract_pre'].swifter.allow_dask_on_strings().apply(lambda x: preprocess_sentence(x)).values

HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=96.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=96.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=96.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=96.0, style=ProgressStyle(description_wi…




### embedding
***

In [8]:
# Load the fastText binary at `model_path`; `get_vec_ft` below indexes this object by word.
model = fasttext.load_model(model_path)

def get_vec_ft(x):
    """Average the word vectors of a whitespace-tokenized string.

    Each token of ``x.split()`` is looked up in the module-level ``model`` and
    the resulting vectors are averaged element-wise.

    Args:
        x: a string of whitespace-separated tokens.

    Returns:
        The mean vector over all tokens, or the scalar ``np.nan`` when ``x``
        contains no tokens (empty or whitespace-only string).
    """
    word_vectors = [model[token] for token in x.split()]
    if not word_vectors:
        # Nothing to average: signal "no embedding" with a scalar NaN.
        return np.nan
    return np.asarray(word_vectors).mean(axis=0)

def random_vec(dim=200, scale=0.1):
    """Draw a random placeholder embedding from a zero-mean Gaussian.

    Used where a text is missing and no word vectors can be averaged. The
    defaults reproduce the original hard-coded call
    ``np.random.normal(0, 0.1, 200)``; ``dim`` should match the dimensionality
    of the word-vector model in use.

    The draw comes from NumPy's global random state, so call
    ``np.random.seed(...)`` beforehand for reproducible output.

    Args:
        dim: length of the returned vector (default 200).
        scale: standard deviation of each component (default 0.1).

    Returns:
        A 1-D ``numpy.ndarray`` of shape ``(dim,)``.
    """
    return np.random.normal(0, scale, dim)

# Empty lookup tables. description2embedding is filled by the train/test loops in a
# later cell; paper2embedding is only referenced by commented-out code in this notebook.
description2embedding = dict()
paper2embedding = dict()

# for i, r in tqdm(candidate_paper.iterrows(), total=candidate_paper.shape[0]):
#     paper2embedding[r['paper_id']] = {}
#     # abstract
#     ab = r['abstract_pre']
#     if ab == 'no_content' or ab == 'none' or ab == 'n o n e':
#         paper2embedding[r['paper_id']]['abstract'] = random_vec()
#     else:
#         paper2embedding[r['paper_id']]['abstract'] = get_vec_ft(ab)
#     # title
#     title = r['title_pro']
#     paper2embedding[r['paper_id']]['title'] = get_vec_ft(title)




In [9]:
# with open(other_path+'paper2embedding_s2v.pkl', 'wb') as f:
#     pickle.dump(paper2embedding, f)

In [10]:
def _embed_text(text):
    """Embed one preprocessed text, using a random vector for the missing-text sentinels."""
    # 'none' is the placeholder written by fillna('none'); 'n o n e' is presumably the
    # same placeholder after some character-level processing upstream — TODO confirm.
    if text == 'none' or text == 'n o n e':
        return random_vec()
    # NOTE(review): get_vec_ft returns a scalar np.nan when `text` has no tokens
    # (e.g. everything was filtered out as stop words); that value is stored as-is.
    return get_vec_ft(text)


def _embed_descriptions(frame, suffix):
    """Fill ``description2embedding`` with the embeddings of every row of ``frame``.

    For each row, the entry keyed by ``description_id + suffix`` is set to a dict
    with the embeddings of ``description_text_pre`` (under 'description_text')
    and ``key_text_pre`` (under 'key_text').

    Args:
        frame: DataFrame with 'description_id', 'description_text_pre' and
            'key_text_pre' columns.
        suffix: string appended to each description id to tell the splits apart.
    """
    # Column-wise zip avoids building a Series per row, as iterrows() does.
    rows = zip(frame['description_id'], frame['description_text_pre'], frame['key_text_pre'])
    for description_id, description_text, key_text in tqdm(rows, total=frame.shape[0]):
        # Dict entries are evaluated in order, so any random draws happen for the
        # description text first and the key text second.
        description2embedding[description_id + suffix] = {
            'description_text': _embed_text(description_text),
            'key_text': _embed_text(key_text),
        }


_embed_descriptions(train_data, '_train')
_embed_descriptions(valid_data, '_test')

HBox(children=(FloatProgress(value=0.0, max=62974.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=34428.0), HTML(value='')))




In [11]:
# Persist the description embeddings as a pickle under `other_path`.
embedding_file = other_path+'description2embedding_s2v.pkl'
with open(embedding_file, 'wb') as sink:
    pickle.dump(description2embedding, sink)