### prediction generating procedure
***
+ run `preprocess` cell (done)
+ run the following steps in parallel on the test data
    + recall
        + five kinds of recall
            + bm25: run `recall - BM25` cell (done)
            + idf: run `recall - idf` cell (done)
            + s2v: run `sent2vec_recall.ipynb` directly (done)
            + blue: run `sent_bert_blue.ipynb` in hpcuda then run `bluebert_recall.ipynb` directly (done)
            + key: run `keywords_recall.ipynb` directly (done)
        + run the next three cells to merge all of them
    + embeddings for feature
        + `gen_vectors.ipynb`: get Word2Vec, FastText, and SIF (done)
        + `bert.ipynb`: get `description2embedding_pre.pkl` (done)
        + `sent2vec_embedding.ipynb`: get `description2embedding_s2v.pkl` (done)
        + `sent_bert_sci.ipynb`: get `description2embedding.pkl` (done)
+ run the (very large) `get features` cell once all of the steps above have finished
+ run `lgb.ipynb` directly to get prediction

### import
***

In [1]:
from gensim.summarization.bm25 import BM25
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim import corpora, similarities, models
from gensim.summarization import bm25
import fasttext
from fse.models.base_s2v import BaseSentence2VecModel

import Levenshtein
import pandas as pd
import swifter
import numpy as np
import math

from tqdm.notebook import tqdm
import pickle
import os
import re
import csv
import multiprocessing as mp
from multiprocessing import Process, cpu_count, Manager, Pool
from sklearn.externals import joblib
import time
import h5py
import gc
import collections
import warnings
from util import *

warnings.filterwarnings('ignore')



### parameter
***

In [2]:
# Directory holding the raw competition CSVs and every derived CSV.
data_path = '../../data_2020/'
model_path = 'models/'
# Directory holding pickles and the gensim dictionary / tf-idf model / index.
other_path = 'others/'

# Number of candidates kept per recall source when the sources are merged.
ns = {'bm25': 40, 'idf': 10, 's2v': 20, 'blue': 5, 'key': 5}
# How many top-ranked ids of each source are scanned while filtering candidates.
thds = {'bm25': 400, 'idf': 400, 's2v': 400, 'blue': 400, 'key': 400}
# Total number of candidates per description across all sources.
n = sum(ns.values())

# Passed as `filter_content` to `get_recall` for the test split.
drop_no_content = True
# Skip the train branches / the test branches of the merge cells respectively.
test_only = True
train_only = False

# Minimum token count of `key_text_pre` before the full description is used
# instead; also the minimum word length in the pseudo-keyword features.
MIN_LEN = 7
# idf values below this threshold are replaced by the additive constant
# in `df2idf_thd`.
idf_thd = 5.0
# Only candidates whose row index in the candidate table is below this value
# survive the filtered recall.
paper_thd = 47500

# Use a third of the cores, but never fewer than one: a value of 0 would make
# `Pool(processes=0)` raise and the `// workers` chunk-size computations divide
# by zero on machines with fewer than three cores.
workers = max(1, mp.cpu_count() // 3)

In [3]:
# tf
def one(x):
    """Constant local term weight: every term frequency is mapped to 1.

    The value is produced as a quotient of the (slightly shifted) input with
    itself, so the result has the same shape as ``x`` and the function works
    for plain numbers and for NumPy arrays alike.
    """
    shifted = x + 1e-10
    return shifted / shifted

# idf
def df2idf_thd(docfreq, totaldocs, log_base=2.0, add=0.0):
    """Inverse document frequency with a lower cut-off.

    Computes ``add + log(totaldocs / docfreq) / log(log_base)``. The value is
    returned only when it reaches the module-level ``idf_thd``; otherwise the
    bare ``add`` constant is returned.
    """
    ratio = float(totaldocs) / docfreq
    score = add + np.log(ratio) / np.log(log_base)
    if score >= idf_thd:
        return score
    return add

### preprocess
***

In [4]:
# Load the three raw input tables.
train = pd.read_csv(data_path + 'train_release.csv')
test = pd.read_csv(data_path + 'validation.csv')
candidate = pd.read_csv(data_path + 'candidate_paper.csv')

# Keep only the rows that actually carry an id.
train = train[train['description_id'].notnull()]
candidate = candidate[candidate['paper_id'].notnull()]

# Sanity report: missing values per column, then the candidate table size.
for frame in (train, test):
    print(frame.isna().sum())
print(candidate.shape)

def digest(text):
    """Return the sentences of *text* that contain the ``[**##**]`` marker.

    The text is split on ``'. '`` after every ``'al.'`` has been removed
    (presumably so that "et al." does not create a sentence break). Fragments
    of at most three space-separated words are not kept on their own: they
    are glued, without a separator, onto the end of the sentence that
    precedes them. Leading fragments that have no preceding sentence are
    dropped, unless nothing at all would be kept, in which case the raw
    split is used instead.

    Every kept sentence containing the literal ``[**##**]`` has its citation
    markers stripped and is emitted followed by ``'. '``.

    Args:
        text: the description text (a ``str``).

    Returns:
        The concatenation of the selected sentences in their original order;
        the empty string when no sentence contains the marker.
    """
    sentences = text.replace('al.', '').split('. ')

    kept = []
    carry = ''  # short fragments waiting to be attached to an earlier sentence
    # Walk backwards so that a short fragment can be appended to whatever
    # sentence comes before it.
    for sentence in reversed(sentences):
        merged = sentence + carry
        if len(merged.split(' ')) <= 3:
            # `merged` already ends with the previous carry, so it *is* the
            # new carry; appending the old carry again would duplicate text.
            carry = merged
        else:
            kept.append(merged)
            carry = ''

    if kept:
        # `kept` was collected back to front; restore reading order.
        kept.reverse()
    else:
        # Nothing long enough: fall back to the raw split, which is already
        # in reading order and therefore must not be reversed.
        kept = sentences

    pieces = []
    for sentence in kept:
        if '[**##**]' in sentence:
            cleaned = re.sub(r'[\[|,]+\*\*\#\#\*\*[\]|,]+', '', sentence)
            pieces.append(cleaned + '. ')
    return ''.join(pieces)

# Cell-level transforms. A cell whose string form is 'nan' counts as missing.
def _citation_sentences(value):
    return '' if str(value) == 'nan' else digest(value)


def _tokenised(value):
    return ' '.join('' if str(value) == 'nan' else pre_process(value))


def _tokenised_abstract(value):
    # The literal 'NO_CONTENT' is handled exactly like a missing abstract.
    if str(value) == 'nan' or value == 'NO_CONTENT':
        return ' '.join('')
    return ' '.join(pre_process(value))


def _lowercased(value):
    return '' if str(value) == 'nan' else value.lower()


# Description tables: citation sentences, their tokenised form, and the
# tokenised full description. Each column is computed for train, then test.
for derived, source, transform in (
        ('key_text', 'description_text', _citation_sentences),
        ('key_text_pre', 'key_text', _tokenised),
        ('description_text_pre', 'description_text', _tokenised)):
    for frame in (train, test):
        frame[derived] = frame[source].swifter.allow_dask_on_strings().apply(transform)

train.to_csv(data_path + 'train_pre.csv', index=False)
test.to_csv(data_path + 'test_pre.csv', index=False)

# Candidate papers: tokenised title and abstract, lower-cased keywords.
candidate['title_pro'] = candidate['title'].swifter.allow_dask_on_strings().apply(_tokenised)
candidate['abstract_pre'] = candidate['abstract'].swifter.allow_dask_on_strings().apply(_tokenised_abstract)
candidate['keywords'] = candidate['keywords'].swifter.allow_dask_on_strings().apply(_lowercased)
candidate.to_csv(data_path + 'candidate_paper_pre.csv', index=False)

### prepare recall
***

In [5]:
# Build everything the lexical recall stages need from the candidate papers:
# the paper-id list, the tokenised paper texts, a gensim dictionary, a tf-idf
# model and a sparse similarity index. All artefacts are written to
# `other_path`.
papers = pd.read_csv(data_path+'candidate_paper.csv')
papers = papers[papers['paper_id'].notnull()]
print(papers.shape)

# Replace missing text fields by empty strings so they can be concatenated.
papers['abstract'] = papers['abstract'].fillna('')
papers['title'] = papers['title'].fillna('')
papers['keywords'] = papers['keywords'].fillna('')

# One string per paper: title, abstract and keywords (';' replaced by ' ').
# NOTE: from here on `train` no longer refers to the training DataFrame.
train = papers['title'].values + ' ' + \
        papers['abstract'].values + ' ' + \
        papers['keywords'].apply(lambda x: x.replace(';', ' ')).values
train_item_id = list(papers['paper_id'].values)

# Save the paper ids; position i corresponds to document i of the corpus.
with open(other_path+'paper_id.pkl', 'wb') as fw:
    pickle.dump(train_item_id, fw)

# Tokenise every paper in parallel and save the result.
# `pre_process` comes from the `util` star import; it is presumably a
# tokeniser returning a list of tokens -- TODO confirm.
with open(other_path+'train_content.pkl', 'wb') as fw:
    with Pool(processes=workers) as pool:
        train = pool.map(pre_process, tqdm(train))
    pickle.dump(train, fw)

# Vocabulary and bag-of-words representation of every paper.
dictionary = corpora.Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]

# tf-idf with custom weights: `one` as the local (term-frequency) weight and
# `df2idf_thd` as the global (idf) weight.
tfidf_model = models.TfidfModel(corpus, 
                                wlocal=one,
                                wglobal=df2idf_thd,
                                dictionary=dictionary)
corpus_tfidf = tfidf_model[corpus]

# Persist dictionary, model, the bag-of-words corpus (not the tf-idf one) and
# the similarity index built over the tf-idf vectors.
dictionary.save(other_path+'train_dictionary.dict')
tfidf_model.save(other_path+'train_idf.model')
corpora.MmCorpus.serialize(other_path+'train_corpuse.mm', corpus)
featurenum = len(dictionary.token2id.keys())
index = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=featurenum)
index.save(other_path+'train_index.index')

### recall - BM25
***

In [6]:
# run the whole cell except the train part
# Reload the artefacts written by the "prepare recall" cell.
dictionary = corpora.Dictionary.load(other_path+'train_dictionary.dict')
index = similarities.SparseMatrixSimilarity.load(other_path+'train_index.index')
# `paper_id.pkl` is written with `pickle.dump`, so it is read back with
# `pickle` as well instead of going through the deprecated
# `sklearn.externals.joblib` shim.
with open(other_path+'paper_id.pkl', 'rb') as fr:
    item_id_list = pickle.load(fr)
with open(other_path+'train_content.pkl', 'rb') as fr:
    corpus = pickle.load(fr)

# bm25 model over the tokenised candidate papers
print('get bm25')
bm25Model = BM25(corpus)

def get_recall_number_bm25(val, n):
    """Return the ``n`` best BM25 candidates for every description in *val*.

    Args:
        val: frame with a ``description_id`` column and a ``description_text``
            column. Each text is handed unchanged to
            ``bm25Model.get_scores`` (the callers in this notebook store
            token lists in that column).
        n: number of candidates to keep per description.

    Returns:
        A string array of shape ``(len(val), n + 1)``. Column 0 holds the
        description id, columns 1..n the paper ids taken from
        ``item_id_list``, ordered from highest to lowest score.
    """
    docs = val['description_text'].values
    ids = val['description_id'].values
    # `np.str` was only an alias of the builtin `str` and has been removed
    # from NumPy, so the builtin is used directly.
    # NOTE: casting the float array to str yields a fixed-width string dtype;
    # ids longer than that width would be truncated on assignment.
    submit = np.zeros((len(docs), n + 1)).astype(str)
    for i in tqdm(range(len(docs))):
        scores = np.array(bm25Model.get_scores(docs[i]))
        # argsort is ascending: take the last n entries in reverse order.
        related_doc_indices = scores.argsort()[:-n-1:-1]
        submit[i] = [ids[i]] + [item_id_list[doc_idx] for doc_idx in related_doc_indices]
    return submit

def pool_extract_bm25_tfidf(data, f, n, chunk_size, worker=5):
    """Apply ``f(chunk, n)`` to consecutive slices of *data* in a process pool.

    Args:
        data: sliceable collection supporting ``len`` and ``data[a:b]``
            (a DataFrame in this notebook).
        f: callable taking ``(chunk, n)`` and returning an array; it is run
            in the worker processes.
        n: forwarded unchanged to ``f``.
        chunk_size: positive number of rows per task.
        worker: number of processes; ``-1`` or any value above the CPU count
            means "use every CPU".

    Returns:
        The per-chunk results concatenated along axis 0, in input order.

    Raises:
        ValueError: if ``chunk_size`` is not positive (the slicing loop could
            never advance), or if *data* is empty (nothing to concatenate).
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be positive, got {}'.format(chunk_size))

    # os.cpu_count() may return None when the count cannot be determined.
    cpu_worker = os.cpu_count() or 1
    print('cpu: {}'.format(cpu_worker))
    if worker == -1 or worker > cpu_worker:
        worker = cpu_worker
    print('used cpu: {}'.format(worker))

    t1 = time.time()
    len_data = len(data)
    bounds = [(start, min(start + chunk_size, len_data))
              for start in range(0, len_data, chunk_size)]

    # One tick per finished task, so the total is the number of tasks rather
    # than the number of processes.
    pbar = tqdm(total=len(bounds))

    def update(*a):
        pbar.update()

    try:
        # The context manager terminates the pool even if submitting a task
        # raises; on the normal path all tasks are drained first.
        with Pool(worker) as p:
            res = [p.apply_async(f, (data[start:end], n), callback=update)
                   for start, end in bounds]
            p.close()
            p.join()
    finally:
        pbar.close()

    t2 = time.time()
    print((t2 - t1)/60)
    # .get() re-raises any exception that occurred inside a worker.
    results = np.concatenate([i.get() for i in res], axis=0)
    return results

# BM25 recall for the validation ("test") and the training descriptions.
# Both stanzas follow the same steps:
#   1. read the preprocessed table and split `key_text_pre` into tokens,
#   2. where fewer than MIN_LEN tokens are available, fall back to the
#      tokenised full description (citation markers stripped first),
#   3. rebuild a two-column frame (id, token list) and score it in parallel,
#   4. write one row per description: id followed by the n recalled paper ids.
# The two-column `valid` / `train` frames built here are reused by the
# "recall - idf" cell.
# NOTE(review): this cell writes `*_pairs_{n}number_bm25_2.csv`, whereas the
# "combine above" cell reads `*_pairs_1000number_bm25.csv` -- confirm which run
# produced the files consumed downstream.

# valid
valid = pd.read_csv(data_path+'test_pre.csv')
valid['key_text_pre'] = valid['key_text_pre'].apply(lambda x: x.split(' ') if str(x)!='nan' else [])
valid['key_text_pre_len'] = valid['key_text_pre'].apply(lambda x: len(x))
# too few citation-sentence tokens: use the whole description instead
valid.loc[valid['key_text_pre_len'] < MIN_LEN, 'key_text_pre'] = valid.loc[valid['key_text_pre_len'] < MIN_LEN][
    'description_text'].apply(lambda x: pre_process(re.sub(r'[\[|,]+\*\*\#\#\*\*[\]|,]+','',x))).values

ids = list(valid['description_id'].values)
docs = list(valid['key_text_pre'].values)
print(valid.shape)

# `description_text` now holds the token lists that are scored
valid = pd.DataFrame({'description_id': ids, 'description_text': docs})
# chunk size chosen so that there is at most one task per worker
submit = pool_extract_bm25_tfidf(valid, get_recall_number_bm25, n, valid.shape[0]//workers+1, worker=workers)
df = pd.DataFrame(submit)
df.to_csv(data_path+'test_pairs_{}number_bm25_2.csv'.format(n), header=None, index=False)

# train
train = pd.read_csv(data_path+'train_pre.csv')
train['key_text_pre'] = train['key_text_pre'].apply(lambda x: x.split(' ') if str(x)!='nan' else [])
train['key_text_pre_len'] = train['key_text_pre'].apply(lambda x: len(x))
# too few citation-sentence tokens: use the whole description instead
train.loc[train['key_text_pre_len'] < MIN_LEN, 'key_text_pre'] = train.loc[train['key_text_pre_len'] < MIN_LEN][
    'description_text'].apply(lambda x: pre_process(re.sub(r'[\[|,]+\*\*\#\#\*\*[\]|,]+', '', x))).values

ids = list(train['description_id'].values)
docs = list(train['key_text_pre'].values)
print(train.shape)

# `description_text` now holds the token lists that are scored
train = pd.DataFrame({'description_id': ids, 'description_text': docs})
submit = pool_extract_bm25_tfidf(train, get_recall_number_bm25, n, train.shape[0]//workers+1, worker=workers)
df = pd.DataFrame(submit)
df.to_csv(data_path+'train_pairs_{}number_bm25_2.csv'.format(n), header=None, index=False)

### recall - idf
***

In [7]:
# run the whole cell except the train part
# Load the tf-idf model saved by the "prepare recall" cell.
tfidf = models.TfidfModel.load(other_path+"train_idf.model")
# `index` is not loaded here: this reuses the similarity index that is already
# bound to `index` by an earlier cell ("prepare recall" or "recall - BM25").
train_index = index

def get_recall_number_tfidf(val, n):
    """Return the ``n`` most tf-idf-similar papers for every description.

    Args:
        val: frame with a ``description_id`` column and a ``description_text``
            column. Each text is handed unchanged to ``dictionary.doc2bow``
            (the callers in this notebook store token lists in that column).
        n: number of candidates to keep per description.

    Returns:
        A string array of shape ``(len(val), n + 1)``. Column 0 holds the
        description id, columns 1..n the paper ids taken from
        ``item_id_list``, ordered from highest to lowest similarity.
    """
    docs = val['description_text'].values
    ids = val['description_id'].values
    # `np.str` was only an alias of the builtin `str` and has been removed
    # from NumPy, so the builtin is used directly.
    # NOTE: casting the float array to str yields a fixed-width string dtype;
    # ids longer than that width would be truncated on assignment.
    submit = np.zeros((len(docs), n + 1)).astype(str)
    for i in tqdm(range(len(docs))):
        query_bow = dictionary.doc2bow(docs[i])
        sim = train_index.get_similarities(tfidf[query_bow])
        # argsort is ascending: take the last n entries in reverse order.
        related_doc_indices = sim.argsort()[:-n-1:-1]
        submit[i] = [ids[i]] + [item_id_list[doc_idx] for doc_idx in related_doc_indices]
    return submit

# tf-idf recall. `valid` and `train` are the two-column (id, token list)
# frames built in the "recall - BM25" cell, so that cell must have been run
# first. Output layout matches the BM25 files: id followed by n paper ids.

# valid
submit = pool_extract_bm25_tfidf(valid, get_recall_number_tfidf, n, valid.shape[0]//workers+1, worker=workers)
df = pd.DataFrame(submit)
df.to_csv(data_path+'test_pairs_{}number_idf_2.csv'.format(n), header=None, index=False)

# train
submit = pool_extract_bm25_tfidf(train, get_recall_number_tfidf, n, train.shape[0]//workers+1, worker=workers)
df = pd.DataFrame(submit)
df.to_csv(data_path+'train_pairs_{}number_idf_2.csv'.format(n), header=None, index=False)

### combine above
***

In [8]:
# # change `test_only` to be True and everything will work fine
train_pre = pd.read_csv(data_path + 'train_pre.csv')
test_pre = pd.read_csv(data_path + 'test_pre.csv')
candidate_pre = pd.read_csv(data_path + 'candidate_paper_pre.csv')


def _paper_lookup(column, fill=None):
    """Map paper_id -> `column` of the candidate table (optionally NaN-filled)."""
    pair = candidate_pre[['paper_id', column]]
    if fill is not None:
        pair = pair.fillna(fill)
    cells = pair.values
    return dict(zip(cells[:, 0], cells[:, 1]))


# Per-paper lookup tables used while filtering the recalled candidates.
pid2journal = _paper_lookup('journal')
pid2year = _paper_lookup('year', fill=-1)
pid2abstract = _paper_lookup('abstract_pre', fill='')
# Papers that occur as ground truth in the training set (before de-duplication).
pid2intrain = dict.fromkeys(train_pre['paper_id'].values, 1)
# Row position of every paper inside the candidate table.
pid2idx = dict(zip(candidate_pre['paper_id'], range(len(candidate_pre))))

# Every training row is a positive pair; keep one row per description.
train_pre = train_pre.assign(label=1).drop_duplicates('description_id')

def get_recall(path, take, filter_content=False):
    """Flatten one recall file into ``(description_id, pid)`` pairs.

    The file has no header; its first column is the description id, followed
    by 500 ranked paper ids for the ``'key'`` source and 1000 for every other
    source. Duplicate description ids are dropped.

    Args:
        path: CSV file to read.
        take: recall source name, used as key into ``ns`` and ``thds``.
        filter_content: when false, the first ``ns[take]`` ids of each row are
            taken as they are. When true, the first ``thds[take]`` ids are
            scanned and an id is kept only if it is not NaN, its journal is
            not ``'no-content'``, its year is not ``-1`` and its candidate
            index is below ``paper_thd``; scanning stops once ``ns[take]``
            ids have been kept.

    Returns:
        DataFrame with the columns ``description_id`` and ``pid``.

    Side effects:
        Increments the module-level ``count`` once for every row that ends up
        with fewer than ``ns[take]`` ids after filtering.
    """
    global count
    width = 500 if take == 'key' else 1000
    header = ['description_id'] + list(range(1, width + 1))
    table = pd.read_csv(path, header=None, names=header)
    rows = table.drop_duplicates('description_id').values

    description_id, pid = [], []
    for row in tqdm(rows, total=rows.shape[0]):
        quota = ns[take]
        if not filter_content:
            description_id.extend([row[0]] * quota)
            pid.extend(row[1:quota + 1])
            continue

        kept = []
        for cand in row[1:thds[take] + 1]:
            if quota == 0:
                break
            if pd.isna(cand):
                continue
            if pid2journal[cand] == 'no-content' \
                    or pid2year[cand] == -1 \
                    or pid2idx[cand] >= paper_thd:
                continue
            kept.append(cand)
            if len(kept) >= quota:
                break
        if len(kept) < quota:
            count += 1
        description_id.extend([row[0]] * len(kept))
        pid.extend(kept)
    return pd.DataFrame({'description_id': description_id, 'pid': pid})

# Load the candidate pairs of every recall source and attach them to the
# preprocessed descriptions (inner join on `description_id`).
#
# Each source has a test stanza (skipped when `train_only` is set) and a train
# stanza (skipped when `test_only` is set). The test stanzas pass
# `drop_no_content` as `filter_content`; the train stanzas always pass False.
# `count` is the module-level counter that `get_recall` increments for rows
# with fewer than `ns[source]` surviving candidates; it is reset before every
# call and printed as a fraction of the number of descriptions.

# bm25
print('recall bm25...')
count = 0
if not train_only:
    re_path = data_path+'test_pairs_1000number_bm25.csv'
    test_recall = get_recall(re_path, 'bm25', drop_no_content)
    test_data_bm25 = test_pre.merge(test_recall, on='description_id', how='inner')
    print(count/test_pre.shape[0])

count = 0
if not test_only:
    re_path = data_path+'train_pairs_1000number_bm25.csv'
    train_recall = get_recall(re_path, 'bm25', False)
    train_data_bm25 = train_pre.merge(train_recall, on='description_id', how='inner')
    print(count/train_pre.shape[0])
    
# idf
print('recall idf...')
count = 0
if not train_only:
    re_path = data_path+'test_pairs_1000number_idf.csv'
    test_recall = get_recall(re_path, 'idf', drop_no_content)
    test_data_idf = test_pre.merge(test_recall, on='description_id', how='inner')
    print(count/test_pre.shape[0])

count = 0
if not test_only:
    re_path = data_path+'train_pairs_1000number_idf.csv'
    train_recall = get_recall(re_path, 'idf', False)
    train_data_idf = train_pre.merge(train_recall, on='description_id', how='inner')
    print(count/train_pre.shape[0])

# s2v
print('recall s2v...')
count = 0
if not train_only:
    re_path = data_path+'test_pairs_1000number_s2v.csv'
    test_recall = get_recall(re_path, 's2v', drop_no_content)
    test_data_s2v = test_pre.merge(test_recall, on='description_id', how='inner')
    print(count/test_pre.shape[0])

count = 0
if not test_only:
    re_path = data_path+'train_pairs_1000number_s2v.csv'
    train_recall = get_recall(re_path, 's2v', False)
    train_data_s2v = train_pre.merge(train_recall, on='description_id', how='inner')
    print(count/train_pre.shape[0])
    
# blue
print('recall blue...')
count = 0
if not train_only:
    re_path = data_path+'test_pairs_1000number_bluebert.csv'
    test_recall = get_recall(re_path, 'blue', drop_no_content)
    test_data_blue = test_pre.merge(test_recall, on='description_id', how='inner')
    print(count/test_pre.shape[0])

count = 0
if not test_only:
    re_path = data_path+'train_pairs_1000number_bluebert.csv'
    train_recall = get_recall(re_path, 'blue', False)
    train_data_blue = train_pre.merge(train_recall, on='description_id', how='inner')
    print(count/train_pre.shape[0])

# key
print('recall key...')
count = 0
if not train_only:
    re_path = data_path+'test_pairs_500number_key.csv'
    test_recall = get_recall(re_path, 'key', drop_no_content)
    test_data_key = test_pre.merge(test_recall, on='description_id', how='inner')
    print(count/test_pre.shape[0])

count = 0
if not test_only:
    re_path = data_path+'train_pairs_500number_key.csv'
    train_recall = get_recall(re_path, 'key', False)
    train_data_key = train_pre.merge(train_recall, on='description_id', how='inner')
    print(count/train_pre.shape[0])

# Stack the per-source frames; a split that was skipped stays None.
train_data = None
test_data = None

if not test_only:
    train_data = pd.concat([train_data_bm25, train_data_idf, train_data_s2v, train_data_blue, train_data_key],
                           axis=0,
                           sort=True)
    # a pair is positive when the recalled paper (`pid`) equals the ground-truth `paper_id`
    train_data['label'] = list(map(lambda x,y: int(x==y),train_data['pid'],train_data['paper_id']))
    print(train_data.shape)
    
if not train_only:
    test_data = pd.concat([test_data_bm25, test_data_idf, test_data_s2v, test_data_blue, test_data_key],
                          axis=0,
                          sort=True)
    print(test_data.shape)

recall bm25...


HBox(children=(FloatProgress(value=0.0, max=34428.0), HTML(value='')))


0.2679214592773324
recall idf...


HBox(children=(FloatProgress(value=0.0, max=34428.0), HTML(value='')))


0.046589984896014874
recall s2v...


HBox(children=(FloatProgress(value=0.0, max=34428.0), HTML(value='')))


0.10889392355059835
recall blue...


HBox(children=(FloatProgress(value=0.0, max=34428.0), HTML(value='')))


0.00787149994190775
recall key...


HBox(children=(FloatProgress(value=0.0, max=34428.0), HTML(value='')))


0.7238294411525502
(2476883, 6)


In [9]:
# The same (description, paper) pair can be recalled by several sources;
# keep each row only once.
# drop duplicates
if not test_only:
    train_data = train_data.drop_duplicates()
    print(train_data.shape)
if not train_only:
    test_data = test_data.drop_duplicates()
    print(test_data.shape)
    
# Attach the candidate-paper columns. `paper_id` is overwritten with the
# recalled id (`pid`) first, so the join is on the recalled paper; for the
# training data the label was already derived from the original `paper_id`.
# merge candidate
if not test_only:
    train_data['paper_id'] = train_data['pid'].values
    train_data.pop('pid')
    train_data = train_data.merge(candidate_pre, on='paper_id', how='left')
    print(train_data.shape)
    train_data = train_data[train_data['paper_id'].notnull()]
    print(train_data.shape)
    # positives per distinct description
    print('recall:', sum(train_data['label'].values)/len(train_data['description_id'].unique()))

if not train_only:
    test_data['paper_id'] = test_data['pid'].values
    test_data.pop('pid')
    test_data = test_data.merge(candidate_pre, on='paper_id', how='left')
    print(test_data.shape)
    test_data = test_data[test_data['paper_id'].notnull()]
    print(test_data.shape)

# The candidate table is not referenced again in this cell; release it.
del candidate_pre
gc.collect()

# Same recall figure, restricted to descriptions whose ground-truth paper has
# a journal other than 'no-content'.
# journal != 'no-content'
if not test_only:
    dids = train_data['description_id'].unique()
    did2pid = {row[0]: row[1] for row in train_pre[['description_id', 'paper_id']].values}
    have_content_ids = []

    for did in dids:
        if pid2journal[did2pid[did]] != 'no-content':
            have_content_ids.append(did)

    tmp = train_data[train_data['description_id'].isin(have_content_ids)]
    print('recall (have content):', sum(tmp['label'].values)/len(tmp['description_id'].unique()))

(2079282, 6)
(2079282, 13)
(2079282, 13)


In [10]:
# ns = {'bm25': 50, 'idf': 10, 's2v': 20, 'blue': 5, 'key': 2}
# recall = 0.60436
# 1/0

In [11]:
# Keep only the training descriptions whose candidates carry two distinct
# label values (i.e. both 0 and 1); descriptions with a single label value
# are dropped.
if not test_only:
    # number of distinct labels per description
    temp = train_data[['description_id','label']].groupby('description_id')['label'].apply(lambda x:len(set(list(x)))).reset_index()
    temp.columns = ['description_id','label']
    ids = temp[temp['label'] == 2]['description_id'].values
    train_data = train_data[train_data['description_id'].isin(ids)]
    
# finally write to file
if not test_only:
    train_data.to_csv(data_path+'train_data_merge_{}.csv'.format(n), index=False)
    print(train_data.shape)
    # normalised label distribution
    print(train_data.label.value_counts(True))
if not train_only:
    # 'description_id' contains missing values; replace them with the
    # placeholder 'none' before writing
    test_data['description_id'] = test_data['description_id'].fillna('none')
    test_data.to_csv(data_path+'test_data_merge_{}_{}.csv'.format(n, paper_thd), index=False)
    print(test_data.shape)

(2079282, 13)


### get features
***

In [None]:
# Reload the lexical artefacts written by the "prepare recall" cell.
dictionary = corpora.Dictionary.load(other_path+'train_dictionary.dict')
tfidf = models.TfidfModel.load(other_path+"train_idf.model")
# Words whose idf exceeds 10. The token is looked up through the dictionary's
# item access instead of its `id2token` attribute: gensim builds that reverse
# map lazily, so reading the attribute directly can fail on a freshly loaded
# dictionary.
word2idf = {tfidf.id2word[k]: v for k, v in tfidf.idfs.items() if v > 10}
index = similarities.SparseMatrixSimilarity.load(other_path+'train_index.index')
# `paper_id.pkl` is written with `pickle.dump`, so it is read back with
# `pickle` as well instead of going through the deprecated
# `sklearn.externals.joblib` shim.
with open(other_path+'paper_id.pkl', 'rb') as fr:
    item_id_list = pickle.load(fr)
with open(other_path+'train_content.pkl', 'rb') as fr:
    corpus = pickle.load(fr)

# Precomputed embeddings, one paper-side and one description-side pickle each.
# NOTE: pickle executes code on load -- only use files produced by this
# project's own notebooks.
# bert embeddings
# # SciBERT
with open(other_path+'paper2embedding.pkl', 'rb') as f:
    paper2embedding = pickle.load(f)
with open(other_path+'description2embedding.pkl', 'rb') as f:
    description2embedding = pickle.load(f)
# # BlueBERT actually :-p
with open(other_path+'paper2embedding_blue.pkl', 'rb') as f:
    paper2embedding_bio = pickle.load(f)
with open(other_path+'description2embedding_blue.pkl', 'rb') as f:
    description2embedding_bio = pickle.load(f)
# # sent2vec
with open(other_path+'paper2embedding_s2v.pkl', 'rb') as f:
    paper2embedding_s2v = pickle.load(f)
with open(other_path+'description2embedding_s2v.pkl', 'rb') as f:
    description2embedding_s2v = pickle.load(f)
# # pre-trained SciBERT
with open(other_path+'paper2embedding_pre.pkl', 'rb') as f:
    paper2embedding_pre = pickle.load(f)
with open(other_path+'description2embedding_pre.pkl', 'rb') as f:
    description2embedding_pre = pickle.load(f)

# bm25 model over the tokenised candidate papers
bm25Model = BM25(corpus)

# The token lists are only needed to build the BM25 model; free them.
del corpus
gc.collect()

##################################features works##########################################################

def get_features(data_or, postpostfix, vec_model):
    def get_df_grams(train_sample, values, cols):
        def create_ngram_set(input_list, ngram_value=2):
            return set(zip(*[input_list[i:] for i in range(ngram_value)]))

        def get_n_gram(df, values=2):
            train_query = df.values
            train_query = [[word for word in str(sen).replace("'", '').split(' ')] for sen in train_query]
            train_query_n = []
            for input_list in train_query:
                train_query_n_gram = set()
                for value in range(2, values + 1):
                    train_query_n_gram = train_query_n_gram | create_ngram_set(input_list, value)
                train_query_n.append(train_query_n_gram)
            return train_query_n

        train_query = get_n_gram(train_sample[cols[0]], values)
        train_title = get_n_gram(train_sample[cols[1]], values)
        sim = list(map(lambda x, y: len(x) + len(y) - 2 * len(x & y),
                           train_query, train_title))
        sim_number_rate = list(map(lambda x, y: len(x&y) / (len(x)+1e-10) if len(x)!=0 else 0,
                           train_query, train_title))
        return sim, sim_number_rate

    def get_num_key(x, y):
        if y == '':
            return -1
        y = y.strip(';').split(';')
        num = 0
        for i in y:
            if i in x:
                num += 1
        return num

    def get_num_common_words_and_ratio(merge, col):
        # merge data
        merge = merge[col]
        merge.columns = ['q1', 'q2']

        q1_word_set = merge.q1.apply(lambda x: x.split(' ')).apply(set).values
        q2_word_set = merge.q2.apply(lambda x: x.split(' ')).apply(set).values

        q1_word_len = merge.q1.apply(lambda x: len(x.split(' '))).values
        q2_word_len = merge.q2.apply(lambda x: len(x.split(' '))).values

        q1_word_len_set = merge.q1.apply(lambda x: len(set(x.split(' ')))).values
        q2_word_len_set = merge.q2.apply(lambda x: len(set(x.split(' ')))).values

        result = [len(q1_word_set[i] & q2_word_set[i]) for i in range(len(q1_word_set))]
        result_ratio_q = [result[i] / (q1_word_len[i]+1e-10) for i in range(len(q1_word_set))]
        result_ratio_t = [result[i] / (q2_word_len[i]+1e-10) for i in range(len(q1_word_set))]

        result_ratio_q_set = [result[i] / (q1_word_len_set[i]+1e-10) for i in range(len(q1_word_set))]
        result_ratio_t_set = [result[i] / (q2_word_len_set[i]+1e-10) for i in range(len(q1_word_set))]

        return result, result_ratio_q, result_ratio_t, q1_word_len, q2_word_len, q1_word_len_set, q2_word_len_set, result_ratio_q_set, result_ratio_t_set

    def jaccard(x, y):
        x = set(x)
        y = set(y)
        return float(len(x & y) / (len(x | y)+1e-10))

    def get_sim(doc, corpus):
        corpus = corpus.split(' ')
        corpus_vec = [dictionary.doc2bow(corpus)]
        corpus_tfidf = tfidf[corpus_vec]
        featurenum = len(dictionary.token2id.keys())
        index_i = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=featurenum)
        doc = doc.split(' ')
        vec = dictionary.doc2bow(doc)
        vec_tfidf = tfidf[vec]
        sim = index_i.get_similarities(vec_tfidf)
        return sim[0]

    # tfidf
    def get_simlilary(query, title):
        def get_weight_counter_and_tf_idf(x, y):
            x = x.split()
            y = y.split()
            corups = x + y
            obj = dict(collections.Counter(corups))
            x_weight = []
            y_weight = []
            idfs = []
            for key in obj.keys():
                idf = 1
                w = obj[key]
                if key in x:
                    idf += 1
                    x_weight.append(w)
                else:
                    x_weight.append(0)
                if key in y:
                    idf += 1
                    y_weight.append(w)
                else:
                    y_weight.append(0)
                idfs.append(math.log(3.0 / idf) + 1)
            return [np.array(x_weight), np.array(y_weight), np.array(x_weight) * np.array(idfs),
                    np.array(y_weight) * np.array(idfs), np.array(list(obj.keys()))]

        weight = list(map(lambda x, y: get_weight_counter_and_tf_idf(x, y),
                          query, title))
        x_weight_couner = []
        y_weight_couner = []
        x_weight_tfidf = []
        y_weight_tfidf = []
        words = []
        for i in weight:
            x_weight_couner.append(i[0])
            y_weight_couner.append(i[1])
            x_weight_tfidf.append(i[2])
            y_weight_tfidf.append(i[3])
            words.append(i[4])

        def mhd_simlilary(x, y):
            return np.linalg.norm(x - y, ord=1)

        mhd_simlilary_counter = list(map(lambda x, y: mhd_simlilary(x, y),
                                         x_weight_couner, y_weight_couner))
        mhd_simlilary_tfidf = list(map(lambda x, y: mhd_simlilary(x, y),
                                       x_weight_tfidf, y_weight_tfidf))

        def cos_simlilary(x, y):
            return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

        cos_simlilary_counter = list(map(lambda x, y: cos_simlilary(x, y),
                                         x_weight_couner, y_weight_couner))
        cos_simlilary_tfidf = list(map(lambda x, y: cos_simlilary(x, y),
                                       x_weight_tfidf, y_weight_tfidf))

        def Euclidean_simlilary(x, y):
            return np.sqrt(np.sum(x - y) ** 2)

        Euclidean_simlilary_counter = list(map(lambda x, y: Euclidean_simlilary(x, y),
                                               x_weight_couner, y_weight_couner))
        Euclidean__simlilary_tfidf = list(map(lambda x, y: Euclidean_simlilary(x, y),
                                              x_weight_tfidf, y_weight_tfidf))

        return mhd_simlilary_counter, mhd_simlilary_tfidf, cos_simlilary_counter, \
               cos_simlilary_tfidf, Euclidean_simlilary_counter, Euclidean__simlilary_tfidf

    def get_vec(x):
        vec = []
        for word in x.split():
            if word in vec_model:
                vec.append(vec_model[word])
        if len(vec) == 0:
            return np.nan
        else:
            return np.mean(np.array(vec), axis=0)
        
    def get_vec_ft(x):
        vec = [fasttext_model[word] for word in x.split()]
        if len(vec) == 0:
            return np.nan
        else:
            return np.mean(np.array(vec), axis=0)
    
    def get_vec_sif(x, postfix):
        return sif_model.sv[tag2idx[x+postfix+postpostfix]]
    
    def get_vec_sif_paper(x, postfix):
        return sif_model.sv[tag2idx[x+postfix]]

    # get bm25
    def get_bm25(p_id, query):
        query = str(query).split(' ')
        score = bm25Model.get_score(query, item_id_list.index(p_id))
        return score

    def apply_fun(df):
        df.columns = ['d_id', 'key', 'doc']
        query_id_group = df.groupby(['d_id'])
        bm_list = []
        for name, group in query_id_group:
            corpus = group['doc'].values.tolist()
            corpus = [sentence.strip().split() if sentence != '' else ['none'] \
                      for sentence in corpus]
            query = group['key'].values[0].strip().split()
            bm25Model = BM25(corpus)
            bmscore = bm25Model.get_scores(query)
            bm_list.extend(bmscore)
        return bm_list
    
    def get_bert_embedding(id, take):
        if take == 'abstract' or take == 'title':
            return paper2embedding[id][take]
        else:
            return description2embedding[id+postpostfix][take]
        
    def get_bert_embedding_bio(id, take):
        if take == 'abstract' or take == 'title':
            return paper2embedding_bio[id][take]
        else:
            return description2embedding_bio[id+postpostfix][take]
        
    def get_bert_embedding_s2v(id, take):
        if take == 'abstract' or take == 'title':
            return paper2embedding_s2v[id][take]
        else:
            return description2embedding_s2v[id+postpostfix][take]
        
    def get_bert_embedding_pre(id, take):
        if take == 'title':
            return paper2embedding_pre[id]
        else:
            return description2embedding_pre[id+postpostfix]
        
    def get_num_upper(x, y, rate=True):
        if y == '':
            return -1
        x = [word for word in x.split() if word.isupper() and word != 'A']
        y = [word for word in y.split() if word.isupper() and word != 'A']
        set_y = set(y)
        ret = 0
        for word in x:
            if word in set_y:
                ret += 1
        ret = ret/(len(y)+1e-10) if rate else ret
        return ret
    
    def get_num_psu_key(x, y, thd, rate=True):
        def filter_len(string):
            ret = [word for word in string.split() if len(word) >= MIN_LEN and word in word2idf]
            ret = [word for word in ret if word2idf[word] > thd]
            return ret

        def top_k(string, k=10):
            ret = sorted(string.split(), key=lambda x: len(x), reverse=True)[:k]
            ret = [kw for kw in ret if len(kw) >= 8 and kw in word2idf]
            ret = [kw for kw in ret if word2idf[kw] > thd]
            return ret
        
        if y == '':
            return -1
        x = filter_len(x)
        y = top_k(y)
        set_y = set(y)
        ret = 0
        for word in x:
            if word in set_y:
                ret += 1
        ret = ret/(len(y)+1e-10) if rate else ret
        return ret
    
    data = data_or.copy()

    data['key_text_pre'] = data['key_text_pre'].apply(
        lambda x: x.replace('.', ' '))
    
    data['abstract_pre'] = data['abstract_pre'].apply(
        lambda x: '' if len(x) < 9 else x)

    data['abstract_pre'] = data['abstract_pre'].apply(
        lambda x: '' if x.split(' ') == ['n', 'o', 'n', 'e'] else x)
    
    prefix = 'num_'
    data[prefix + 'key_text_len'] = data['key_text_pre'].apply(lambda x: len(x.split(' ')))
    data[prefix + 'description_text_pre_len'] = data['description_text_pre'].apply(lambda x: len(x.split(' ')))

    data.loc[data[prefix + 'key_text_len'] < MIN_LEN, 'key_text_pre'] = data[data[prefix + 'key_text_len'] < MIN_LEN][
        'description_text'].apply(
        lambda x: ' '.join(pre_process(re.sub(r'[\[|,]+\*\*\#\#\*\*[\]|,]+', '', x)))).values

    # psu_key
    # Builds the 40 columns named
    #     key_in_psu_key_number_[rate_]<pair>_<threshold>
    # for thresholds 10..14.  For each threshold the four "rate" columns are
    # written first, then the four plain-count columns, each in the order
    # kt, ka, dt, da.
    psu_column_pairs = (
        ('kt', 'key_text_pre', 'title_pro'),
        ('ka', 'key_text_pre', 'abstract_pre'),
        ('dt', 'description_text_pre', 'title_pro'),
        ('da', 'description_text_pre', 'abstract_pre'),
    )
    # The "rate" variant leaves get_num_psu_key's fourth argument at its
    # default; the count variant passes False explicitly.
    psu_variants = (('rate_', ()), ('', (False,)))
    for psu_thd in (10, 11, 12, 13, 14):
        for psu_tag, psu_extra in psu_variants:
            for psu_pair, psu_left, psu_right in psu_column_pairs:
                psu_column = (prefix + 'key_in_psu_key_number_' + psu_tag
                              + psu_pair + '_' + str(psu_thd))
                data[psu_column] = [
                    get_num_psu_key(left_text, right_text, psu_thd, *psu_extra)
                    for left_text, right_text in zip(data[psu_left], data[psu_right])
                ]
    
    # upper
    # Builds the 8 columns named key_in_upper_number_[rate_]<pair>: the four
    # "rate" columns first, then the four plain-count columns, each in the
    # order kt, ka, dt, da.
    upper_column_pairs = (
        ('kt', 'key_text_pre', 'title_pro'),
        ('ka', 'key_text_pre', 'abstract_pre'),
        ('dt', 'description_text_pre', 'title_pro'),
        ('da', 'description_text_pre', 'abstract_pre'),
    )
    # The "rate" variant leaves get_num_upper's third argument at its default;
    # the count variant passes False explicitly.
    for upper_tag, upper_extra in (('rate_', ()), ('', (False,))):
        for upper_pair, upper_left, upper_right in upper_column_pairs:
            data[prefix + 'key_in_upper_number_' + upper_tag + upper_pair] = [
                get_num_upper(left_text, right_text, *upper_extra)
                for left_text, right_text in zip(data[upper_left], data[upper_right])
            ]
    
    # sent2vec
    data['key_text_s2v'] = data['description_id'].apply(lambda x: get_bert_embedding_s2v(x, 'key_text'))
    data['title_s2v'] = data['paper_id'].apply(lambda x: get_bert_embedding_s2v(x, 'title'))
    data['abstract_s2v'] = data['paper_id'].apply(lambda x: get_bert_embedding_s2v(x, 'abstract'))
    data['description_text_s2v'] = data['description_id'].apply(lambda x: get_bert_embedding_s2v(x, 'description_text'))
    # pretrained bert
    data['title_bert_pre'] = data['paper_id'].apply(lambda x: get_bert_embedding_pre(x, 'title'))
    data['description_text_bert_pre'] = data['description_id'].apply(lambda x: get_bert_embedding_pre(x, 'description_text'))
    # sif
    data['key_text_pre_sif'] = data['description_id'].apply(lambda x: get_vec_sif(x, '_key'))
    data['title_pro_sif'] = data['paper_id'].apply(lambda x: get_vec_sif_paper(x, '_title'))
    data['abstract_pre_sif'] = data['paper_id'].apply(lambda x: get_vec_sif_paper(x, '_abstract'))
    data['description_text_pre_sif'] = data['description_id'].apply(lambda x: get_vec_sif(x, '_description'))
    # word2vec
    data['key_text_pre_vec'] = data['key_text_pre'].apply(lambda x: get_vec(x))
    data['title_pro_vec'] = data['title_pro'].apply(lambda x: get_vec(x))
    data['abstract_pre_vec'] = data['abstract_pre'].apply(lambda x: get_vec(x))
    data['description_text_pre_vec'] = data['description_text_pre'].apply(lambda x: get_vec(x))
    # fasttext
    data['key_text_pre_fasttext'] = data['key_text_pre'].apply(lambda x: get_vec_ft(x))
    data['title_pro_fasttext'] = data['title_pro'].apply(lambda x: get_vec_ft(x))
    data['abstract_pre_fasttext'] = data['abstract_pre'].apply(lambda x: get_vec_ft(x))
    data['description_text_pre_fasttext'] = data['description_text_pre'].apply(lambda x: get_vec_ft(x))
    # scibert
    data['key_text_bert'] = data['description_id'].apply(lambda x: get_bert_embedding(x, 'key_text'))
    data['title_bert'] = data['paper_id'].apply(lambda x: get_bert_embedding(x, 'title'))
    data['abstract_bert'] = data['paper_id'].apply(lambda x: get_bert_embedding(x, 'abstract'))
    data['description_text_bert'] = data['description_id'].apply(lambda x: get_bert_embedding(x, 'description_text'))
    # biobert
    data['key_text_biobert'] = data['description_id'].apply(lambda x: get_bert_embedding_bio(x, 'key_text'))
    data['title_biobert'] = data['paper_id'].apply(lambda x: get_bert_embedding_bio(x, 'title'))
    data['abstract_biobert'] = data['paper_id'].apply(lambda x: get_bert_embedding_bio(x, 'abstract'))
    data['description_text_biobert'] = data['description_id'].apply(lambda x: get_bert_embedding_bio(x, 'description_text'))
    
    data[prefix + 'cate_pa_isnull'] = data['abstract_pre'].apply(lambda x: 1 if x == '' else 0)
    data[prefix + 'cate_pkeywords_isnull'] = data['keywords'].apply(lambda x: 1 if x == '' else 0)

    data[prefix + 'key_in_key_word_number'] = list(map(lambda x,y: get_num_key(x,y),data['key_text_pre'],data['keywords']))
    data[prefix + 'key_in_key_word_number_rate'] = list(map(lambda x,y: 0 if x==-1 else x/(len(y.strip(';').split(';'))+1e-10),data[prefix+'key_in_key_word_number'],
                                                data['keywords']))
    
    #append
    data[prefix + 'key_in_key_word_number2'] = list(map(lambda x,y: get_num_key(x,y),data['description_text_pre'],data['keywords']))
    data[prefix + 'key_in_key_word_number2_rate'] = list(map(lambda x,y: 0 if x==-1 else x/(len(y.strip(';').split(';'))+1e-10),data[prefix+'key_in_key_word_number2'],
                                                data['keywords']))
    
    data[prefix + 'common_words_k_pt'], \
    data[prefix + 'common_words_k_pt_k'], \
    data[prefix + 'common_words_k_pt_pt'], \
    data[prefix + 'k_len'], \
    data[prefix + 'pt_len'], \
    data[prefix + 'k_len_set'], \
    data[prefix + 'pt_len_set'], \
    data[prefix + 'common_words_k_pt_k_set'], \
    data[prefix + 'common_words_k_pt_pt_set'] = get_num_common_words_and_ratio(data, ['key_text_pre', 'title_pro'])

    data[prefix + 'common_words_k_at'], \
    data[prefix + 'common_words_k_at_k'], \
    data[prefix + 'common_words_k_at_at'], \
    data[prefix + 'k_len'], \
    data[prefix + 'at_len'], \
    data[prefix + 'k_len_set'], \
    data[prefix + 'at_len_set'], \
    data[prefix + 'common_words_k_at_k_set'], \
    data[prefix + 'common_words_k_at_at_set'] = get_num_common_words_and_ratio(data, ['key_text_pre', 'abstract_pre'])

    #append
    data[prefix + 'common_words_k_pt_2'], \
    data[prefix + 'common_words_k_pt_k_2'], \
    data[prefix + 'common_words_k_pt_pt_2'], \
    data[prefix + 'k_len_2'], \
    data[prefix + 'pt_len'], \
    data[prefix + 'k_len_set_2'], \
    data[prefix + 'pt_len_set'], \
    data[prefix + 'common_words_k_pt_k_set_2'], \
    data[prefix + 'common_words_k_pt_pt_set_2'] = get_num_common_words_and_ratio(data, ['description_text_pre', 'title_pro'])

    data[prefix + 'common_words_k_at_2'], \
    data[prefix + 'common_words_k_at_k_2'], \
    data[prefix + 'common_words_k_at_at_2'], \
    data[prefix + 'k_len_2'], \
    data[prefix + 'at_len'], \
    data[prefix + 'k_len_set_2'], \
    data[prefix + 'at_len_set'], \
    data[prefix + 'common_words_k_at_k_set_2'], \
    data[prefix + 'common_words_k_at_at_set_2'] = get_num_common_words_and_ratio(data, ['description_text_pre', 'abstract_pre'])

    data[prefix + 'jaccard_sim_k_pt'] = list(map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['title_pro']))
    data[prefix + 'jaccard_sim_k_pa'] = list(
        map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['abstract_pre']))

    #append
    # Description-text counterparts of the two jaccard features:
    #   jaccard_sim_k_pt2 : description_text_pre vs title_pro
    #   jaccard_sim_k_pa2 : description_text_pre vs abstract_pre
    data[prefix + 'jaccard_sim_k_pt2'] = list(map(lambda x, y: jaccard(x, y), data['description_text_pre'], data['title_pro']))
    # Fix: this column used to compare key_text_pre with description_text_pre,
    # i.e. two query-side fields, so the paper abstract never entered the
    # feature.  It now follows the same pairing as every other
    # description-vs-abstract feature in this function.
    data[prefix + 'jaccard_sim_k_pa2'] = list(
        map(lambda x, y: jaccard(x, y), data['description_text_pre'], data['abstract_pre']))

    data[prefix + 'edict_distance_k_pt'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1e-10), data['key_text_pre'], data['title_pro']))
    data[prefix + 'edict_jaro'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), data['key_text_pre'], data['title_pro']))
    data[prefix + 'edict_ratio'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), data['key_text_pre'], data['title_pro']))
    data[prefix + 'edict_jaro_winkler'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), data['key_text_pre'], data['title_pro']))

    data[prefix + 'edict_distance_k_pa'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1e-10), data['key_text_pre'],
            data['abstract_pre']))
    data[prefix + 'edict_jaro_pa'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), data['key_text_pre'], data['abstract_pre']))
    data[prefix + 'edict_ratio_pa'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), data['key_text_pre'], data['abstract_pre']))
    data[prefix + 'edict_jaro_winkler_pa'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), data['key_text_pre'], data['abstract_pre']))

    #append
    data[prefix + 'edict_distance_k_pt_2'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1e-10), data['description_text_pre'], data['title_pro']))
    data[prefix + 'edict_jaro_2'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), data['description_text_pre'], data['title_pro']))
    data[prefix + 'edict_ratio_2'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), data['description_text_pre'], data['title_pro']))
    data[prefix + 'edict_jaro_winkler_2'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), data['description_text_pre'], data['title_pro']))

    data[prefix + 'edict_distance_k_pa_2'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1e-10), data['description_text_pre'],
            data['abstract_pre']))
    data[prefix + 'edict_jaro_pa_2'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), data['description_text_pre'], data['abstract_pre']))
    data[prefix + 'edict_ratio_pa_2'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), data['description_text_pre'], data['abstract_pre']))
    data[prefix + 'edict_jaro_winkler_pa_2'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), data['description_text_pre'], data['abstract_pre']))

    data[prefix + 'sim'] = list(map(lambda x, y: get_sim(x, y), data['key_text_pre'], data['title_pro']))
    data[prefix + 'sim_pa'] = list(map(lambda x, y: get_sim(x, y), data['key_text_pre'], data['abstract_pre']))

    #append
    data[prefix + 'sim_2'] = list(map(lambda x, y: get_sim(x, y), data['description_text_pre'], data['title_pro']))
    data[prefix + 'sim_pa_2'] = list(map(lambda x, y: get_sim(x, y), data['description_text_pre'], data['abstract_pre']))


    data[prefix + 'mhd_similiary'], data[prefix + 'tf_mhd_similiary'], \
    data[prefix + 'cos_similiary'], data[prefix + 'tf_cos_similiary'], \
    data[prefix + 'os_similiary'], data[prefix + 'tf_os_similiary'] = get_simlilary(data['key_text_pre'],data['title_pro'])


    data[prefix + 'mhd_similiary_pa'], data[prefix + 'tf_mhd_similiary_pa'], \
    data[prefix + 'cos_similiary_pa'], data[prefix + 'tf_cos_similiary_pa'], \
    data[prefix + 'os_similiary_pa'], data[prefix + 'tf_os_similiary_pa'] = get_simlilary(data['key_text_pre'],data['abstract_pre'])
    
    # cos
    data[prefix + 'cos_mean_word2vec'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['key_text_pre_vec'], data['title_pro_vec']))
    data[prefix + 'cos_mean_word2vec'] = data[prefix + 'cos_mean_word2vec'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_word2vec'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['key_text_pre_vec'], data['title_pro_vec']))
    data[prefix + 'cos_mean_fasttext'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['key_text_pre_fasttext'], data['title_pro_fasttext']))
    data[prefix + 'cos_mean_fasttext'] = data[prefix + 'cos_mean_fasttext'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_fasttext'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['key_text_pre_fasttext'], data['title_pro_fasttext']))
    data[prefix + 'cos_mean_sif'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['key_text_pre_sif'], data['title_pro_sif']))
    data[prefix + 'cos_mean_sif'] = data[prefix + 'cos_mean_sif'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_sif'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['key_text_pre_sif'], data['title_pro_sif']))
    data[prefix + 'cos_mean_bert'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['key_text_bert'], data['title_bert']))
    data[prefix + 'cos_mean_bert'] = data[prefix + 'cos_mean_bert'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_bert'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['key_text_bert'], data['title_bert']))
    data[prefix + 'cos_mean_biobert'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['key_text_biobert'], data['title_biobert']))
    data[prefix + 'cos_mean_biobert'] = data[prefix + 'cos_mean_biobert'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_biobert'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['key_text_biobert'], data['title_biobert']))
    data[prefix + 'cos_mean_s2v'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['key_text_s2v'], data['title_s2v']))
    data[prefix + 'cos_mean_s2v'] = data[prefix + 'cos_mean_s2v'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_s2v'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['key_text_s2v'], data['title_s2v']))

    # mhd
    data[prefix + 'mhd_mean_word2vec'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['key_text_pre_vec'], data['title_pro_vec']))
    data[prefix + 'mhd_mean_fasttext'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['key_text_pre_fasttext'], data['title_pro_fasttext']))
    data[prefix + 'mhd_mean_sif'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['key_text_pre_sif'], data['title_pro_sif']))
    data[prefix + 'mhd_mean_bert'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['key_text_bert'], data['title_bert']))
    data[prefix + 'mhd_mean_biobert'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['key_text_biobert'], data['title_biobert']))
    data[prefix + 'mhd_mean_s2v'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['key_text_s2v'], data['title_s2v']))

    # cos
    data[prefix + 'cos_mean_word2vec_pa'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['key_text_pre_vec'], data['abstract_pre_vec']))
    data[prefix + 'cos_mean_word2vec_pa'] = data[prefix + 'cos_mean_word2vec_pa'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_word2vec_pa'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['key_text_pre_vec'], data['abstract_pre_vec']))
    data[prefix + 'cos_mean_fasttext_pa'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['key_text_pre_fasttext'], data['abstract_pre_fasttext']))
    data[prefix + 'cos_mean_fasttext_pa'] = data[prefix + 'cos_mean_fasttext_pa'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_fasttext_pa'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['key_text_pre_fasttext'], data['abstract_pre_fasttext']))
    data[prefix + 'cos_mean_sif_pa'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['key_text_pre_sif'], data['abstract_pre_sif']))
    data[prefix + 'cos_mean_sif_pa'] = data[prefix + 'cos_mean_sif_pa'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_sif_pa'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['key_text_pre_sif'], data['abstract_pre_sif']))
    data[prefix + 'cos_mean_bert_pa'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['key_text_bert'], data['abstract_bert']))
    data[prefix + 'cos_mean_bert_pa'] = data[prefix + 'cos_mean_bert_pa'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_bert_pa'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['key_text_bert'], data['abstract_bert']))
    data[prefix + 'cos_mean_biobert_pa'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['key_text_biobert'], data['abstract_biobert']))
    data[prefix + 'cos_mean_biobert_pa'] = data[prefix + 'cos_mean_biobert_pa'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_biobert_pa'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['key_text_biobert'], data['abstract_biobert']))
    data[prefix + 'cos_mean_s2v_pa'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['key_text_s2v'], data['abstract_s2v']))
    data[prefix + 'cos_mean_s2v_pa'] = data[prefix + 'cos_mean_s2v_pa'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_s2v_pa'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['key_text_s2v'], data['abstract_s2v']))

    # mhd
    # For each embedding family, mhd_mean_<family>_pa is
    # np.linalg.norm(key_vec - abstract_vec, ord=1) -- the Manhattan distance
    # for 1-D vectors -- or NaN when either vector contains a NaN.
    #
    # The operands come from one table so that both always belong to the same
    # embedding family.  Fix: the fasttext feature used to subtract the
    # fasttext abstract vector from the word2vec key-text vector
    # ('key_text_pre_vec'); it now uses 'key_text_pre_fasttext'.
    mhd_pa_sources = (
        ('word2vec', 'key_text_pre_vec', 'abstract_pre_vec'),
        ('fasttext', 'key_text_pre_fasttext', 'abstract_pre_fasttext'),
        ('sif', 'key_text_pre_sif', 'abstract_pre_sif'),
        ('bert', 'key_text_bert', 'abstract_bert'),
        ('biobert', 'key_text_biobert', 'abstract_biobert'),
        ('s2v', 'key_text_s2v', 'abstract_s2v'),
    )
    for mhd_family, mhd_key_col, mhd_abstract_col in mhd_pa_sources:
        data[prefix + 'mhd_mean_' + mhd_family + '_pa'] = [
            np.nan if np.isnan(key_vec).any() or np.isnan(abstract_vec).any()
            else np.linalg.norm(key_vec - abstract_vec, ord=1)
            for key_vec, abstract_vec in zip(data[mhd_key_col], data[mhd_abstract_col])
        ]

    #append
    data[prefix + 'cos_mean_word2vec_2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_pre_vec'], data['title_pro_vec']))
    data[prefix + 'cos_mean_word2vec_2'] = data[prefix + 'cos_mean_word2vec_2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_word2vec_2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_pre_vec'], data['title_pro_vec']))
    data[prefix + 'cos_mean_fasttext_2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_pre_fasttext'], data['title_pro_fasttext']))
    data[prefix + 'cos_mean_fasttext_2'] = data[prefix + 'cos_mean_fasttext_2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_fasttext_2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_pre_fasttext'], data['title_pro_fasttext']))
    data[prefix + 'cos_mean_sif_2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_pre_sif'], data['title_pro_sif']))
    data[prefix + 'cos_mean_sif_2'] = data[prefix + 'cos_mean_sif_2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_sif_2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_pre_sif'], data['title_pro_sif']))
    data[prefix + 'cos_mean_bert_2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_bert'], data['title_bert']))
    data[prefix + 'cos_mean_bert_2'] = data[prefix + 'cos_mean_bert_2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_bert_2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_bert'], data['title_bert']))
    data[prefix + 'cos_mean_biobert_2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_biobert'], data['title_biobert']))
    data[prefix + 'cos_mean_biobert_2'] = data[prefix + 'cos_mean_biobert_2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_biobert_2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_biobert'], data['title_biobert']))
    data[prefix + 'cos_mean_s2v_2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_s2v'], data['title_s2v']))
    data[prefix + 'cos_mean_s2v_2'] = data[prefix + 'cos_mean_s2v_2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_s2v_2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_s2v'], data['title_s2v']))
    data[prefix + 'cos_mean_bert_pre_2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_bert_pre'], data['title_bert_pre']))
    data[prefix + 'cos_mean_bert_pre_2'] = data[prefix + 'cos_mean_bert_pre_2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_bert_pre_2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_bert_pre'], data['title_bert_pre']))

    # mhd
    data[prefix + 'mhd_mean_word2vec_2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_pre_vec'], data['title_pro_vec']))
    data[prefix + 'mhd_mean_fasttext_2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_pre_fasttext'], data['title_pro_fasttext']))
    data[prefix + 'mhd_mean_sif_2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_pre_sif'], data['title_pro_sif']))
    data[prefix + 'mhd_mean_bert_2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_bert'], data['title_bert']))
    data[prefix + 'mhd_mean_biobert_2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_biobert'], data['title_biobert']))
    data[prefix + 'mhd_mean_s2v_2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_s2v'], data['title_s2v']))
    data[prefix + 'mhd_mean_bert_pre_2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_bert_pre'], data['title_bert_pre']))
    
    # cos
    data[prefix + 'cos_mean_word2vec_pa2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_pre_vec'], data['abstract_pre_vec']))
    data[prefix + 'cos_mean_word2vec_pa2'] = data[prefix + 'cos_mean_word2vec_pa2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_word2vec_pa2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_pre_vec'], data['abstract_pre_vec']))
    data[prefix + 'cos_mean_fasttext_pa2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_pre_fasttext'], data['abstract_pre_fasttext']))
    data[prefix + 'cos_mean_fasttext_pa2'] = data[prefix + 'cos_mean_fasttext_pa2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_fasttext_pa2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_pre_fasttext'], data['abstract_pre_fasttext']))
    data[prefix + 'cos_mean_sif_pa2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_pre_sif'], data['abstract_pre_sif']))
    data[prefix + 'cos_mean_sif_pa2'] = data[prefix + 'cos_mean_sif_pa2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_sif_pa2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_pre_sif'], data['abstract_pre_sif']))
    data[prefix + 'cos_mean_bert_pa2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_bert'], data['abstract_bert']))
    data[prefix + 'cos_mean_bert_pa2'] = data[prefix + 'cos_mean_bert_pa2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_bert_pa2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_bert'], data['abstract_bert']))
    data[prefix + 'cos_mean_biobert_pa2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_biobert'], data['abstract_biobert']))
    data[prefix + 'cos_mean_biobert_pa2'] = data[prefix + 'cos_mean_biobert_pa2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_biobert_pa2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_biobert'], data['abstract_biobert']))
    data[prefix + 'cos_mean_s2v_pa2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  data['description_text_s2v'], data['abstract_s2v']))
    data[prefix + 'cos_mean_s2v_pa2'] = data[prefix + 'cos_mean_s2v_pa2'].apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_s2v_pa2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 data['description_text_s2v'], data['abstract_s2v']))

    # mhd
    data[prefix + 'mhd_mean_word2vec_pa2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_pre_vec'], data['abstract_pre_vec']))
    data[prefix + 'mhd_mean_fasttext_pa2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_pre_fasttext'], data['abstract_pre_fasttext']))
    data[prefix + 'mhd_mean_sif_pa2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_pre_sif'], data['abstract_pre_sif']))
    data[prefix + 'mhd_mean_bert_pa2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_bert'], data['abstract_bert']))
    data[prefix + 'mhd_mean_biobert_pa2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_biobert'], data['abstract_biobert']))
    data[prefix + 'mhd_mean_s2v_pa2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), data['description_text_s2v'], data['abstract_s2v']))

    data[prefix + 'n_gram_sim'], data[prefix + 'sim_numeber_rate'] = get_df_grams(data,2,['key_text_pre','title_pro'])
    data[prefix + 'n_gram_sim_pa'], data[prefix + 'sim_numeber_rate_pa'] = get_df_grams(data,2,['key_text_pre','abstract_pre'])
    data[prefix + 'n_gram_sim_tri'], data[prefix + 'sim_numeber_rate_tri'] = get_df_grams(data,3,['key_text_pre','title_pro'])
    data[prefix + 'n_gram_sim_pa_tri'], data[prefix + 'sim_numeber_rate_pa_tri'] = get_df_grams(data,3,['key_text_pre','abstract_pre'])

    #append
    data[prefix + 'n_gram_sim_2'], data[prefix + 'sim_numeber_rate_2'] = get_df_grams(data,2,['description_text_pre','title_pro'])
    data[prefix + 'n_gram_sim_pa_2'], data[prefix + 'sim_numeber_rate_pa_2'] = get_df_grams(data,2,['description_text_pre','abstract_pre'])
    data[prefix + 'n_gram_sim_tri_2'], data[prefix + 'sim_numeber_rate_tri_2'] = get_df_grams(data,3,['description_text_pre','title_pro'])
    data[prefix + 'n_gram_sim_pa_tri_2'], data[prefix + 'sim_numeber_rate_pa_tri_2'] = get_df_grams(data,3,['description_text_pre','abstract_pre'])

    data[prefix + 'bm_25_all'] = list(map(lambda x, y: get_bm25(x, y), data['paper_id'], data['key_text_pre']))
    #append
    data[prefix + 'bm_25_all_2'] = list(map(lambda x, y: get_bm25(x, y), data['paper_id'], data['description_text_pre']))

    data[prefix + 'bm25'] = apply_fun(data[['description_id', 'key_text_pre', 'title_pro']])
    data[prefix + 'bm25_pa'] = apply_fun(data[['description_id', 'key_text_pre', 'abstract_pre']])

    #append
    data[prefix + 'bm25_2'] = apply_fun(data[['description_id', 'description_text_pre', 'title_pro']])
    data[prefix + 'bm25_pa_2'] = apply_fun(data[['description_id', 'description_text_pre', 'abstract_pre']])

    feat = []
    for col in data.columns:
        if re.match(prefix, col) != None:
            feat.append(col)
    data = data[feat]

    return data

# Locations of the pretrained embedding models; all of them live directly
# under `model_path`, which is concatenated as-is (no separator is inserted).
word2vec_path, fasttext_path, sif_path = (
    model_path + name
    for name in ('word2vec.model', 'fasttext2.bin', 'sif.model')
)

# Load every model once, before any feature extraction is started.
# Only `vec_model` is handed to `pool_extract` explicitly below; the others
# are presumably read as globals by `get_features` -- TODO confirm.
vec_model = Word2Vec.load(word2vec_path)
fasttext_model = fasttext.load_model(fasttext_path)
sif_model = BaseSentence2VecModel.load(sif_path)

# NOTE: unpickling runs arbitrary code, so this file must come from a
# trusted source.
with open(model_path + 'tag2idx.pkl', 'rb') as f:
    tag2idx = pickle.load(f)

# Wall-clock timer covering both extraction passes below.
t1 = time.time()

# Columns kept from the merged recall files; everything else is dropped.
cols = [
    'description_id',
    'description_text',
    'description_text_pre',
    'key_text_pre',
    'paper_id',
    'keywords',
    'title_pro',
    'abstract_pre',
]

# ---- training split (skipped when `test_only` is set) ----
if not test_only:
    train_data = pd.read_csv(data_path + 'train_data_merge_{}.csv'.format(n))
    train_data = train_data[cols]
    print(train_data.shape)

    # Missing values are replaced by empty strings before extraction, and
    # the chunk size is chosen so that the rows fit into `workers` chunks.
    train_feat = pool_extract(
        data=train_data.fillna(''),
        f=get_features,
        vec_model=vec_model,
        postpostfix='_train',
        chunk_size=len(train_data) // workers + 1,
        worker=workers,
    )
    train_feat.to_csv(
        data_path + 'train_data_merge_{}_featall.csv'.format(n),
        index=False,
    )
    print(train_feat.shape)

    # Drop the training frames and force a collection before the test
    # split is loaded.
    del train_data
    del train_feat
    gc.collect()

# ---- testing split (skipped when `train_only` is set) ----
if not train_only:
    test_data = pd.read_csv(
        data_path + 'test_data_merge_{}_{}.csv'.format(n, paper_thd))
    test_data = test_data[cols]
    print(test_data.shape)

    test_feat = pool_extract(
        data=test_data.fillna(''),
        f=get_features,
        vec_model=vec_model,
        postpostfix='_test',
        chunk_size=len(test_data) // workers + 1,
        worker=workers,
    )
    test_feat.to_csv(
        data_path + 'test_data_merge_{}_{}_featall.csv'.format(n, paper_thd),
        index=False,
    )
    print(test_feat.shape)

print('success')
t2 = time.time()
# Elapsed time in minutes.
print((t2 - t1) / 60)