In [121]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import difflib
from fuzzywuzzy import fuzz
import re
from collections import Counter
from nltk.corpus import stopwords

In [122]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
information_train = pd.read_csv('information_train.csv', sep = '\t')
information_test = pd.read_csv('information_test.csv', sep = '\t')
train_info = train.merge(information_train, on = 'pmid', how = 'inner')
train_info['pub_date'] = pd.to_datetime(train_info['pub_date'])
test_info = test.merge(information_test, on = 'pmid', how = 'inner')
test_info['pub_date'] = pd.to_datetime(test_info['pub_date'])
ref_all = [int(j) for i in train_info['ref_list'].values for j in eval(i)]

In [3]:
train_info.head(2)

Unnamed: 0,pmid,ref_list,abstract,article_title,author_str,pub_date,set,full_Text
0,17074820,"['15153999', '15213210', '7668302']","Among bioethicists and members of the public, ...",The routinisation of genomics and genetics: im...,"M W Foster, C D M Royal, R R Sharp",2006-11-01,13,
1,15153999,"['12721363', '9096352', '10788337', '9114021',...",Genomics resources that use samples from ident...,Integrating ethics and science in the Internat...,,2008-02-25,13,


# Preparing Training Set

In [46]:
train_info['ref_list_new'] = train_info['ref_list'].apply(lambda x : eval(x))
exp_train_info = train_info['ref_list_new'].apply(pd.Series).stack().rename('ref_list_new') .to_frame().reset_index(1, drop=True).join(train_info['pmid']).reset_index(drop=True).drop_duplicates()

In [47]:
exp_train_info.shape, train_info.shape

((11517, 2), (3522, 9))

In [69]:
exp_train_info['ref_list_new'] = pd.to_numeric(pd.Series(exp_train_info['ref_list_new']),errors='coerce')
exp_train_info['label'] = 1
exp_train_info.head()

Unnamed: 0,ref_list_new,pmid,label
0,15153999,17074820,1
1,15213210,17074820,1
2,7668302,17074820,1
3,12721363,15153999,1
4,9096352,15153999,1


In [62]:
exp_train_info['ref_list_new'].dtype

dtype('int64')

In [70]:
non_ref_list = []
pmid_list = []
for i in train_info.index:
    rw = train_info.iloc[i,:]
    dt = rw['pub_date']
    st = rw['set']
    ref = eval(rw['ref_list'])
    pmids = train_info[(train_info['pub_date'] < dt) & (train_info['set'] == st)]['pmid']
    req_pmids = list(set(pmids) - set(ref))
    for vv in req_pmids:
        non_ref_list.append(vv)
        pmid_list.append(int(rw['pmid']))

In [71]:
non_exp_train_info = pd.DataFrame({'pmid':pmid_list,'ref_list_new':non_ref_list})
non_exp_train_info['label'] = 0

In [72]:
non_exp_train_info.head()

Unnamed: 0,pmid,ref_list_new,label
0,17074820,11466240,0
1,17074820,8524801,0
2,17074820,8651264,0
3,17074820,9529345,0
4,17074820,9311748,0


In [74]:
final_train = exp_train_info.append(non_exp_train_info)
exp_train_info.shape, non_exp_train_info.shape, final_train.shape

((11517, 3), (857252, 3), (868769, 3))

In [77]:
train_full = final_train.merge(information_train, on = 'pmid', how = 'inner').merge(information_train, left_on = 'ref_list_new', right_on = 'pmid', how = 'inner')
train_full.head(2)

Unnamed: 0,label,pmid_x,ref_list_new,abstract_x,article_title_x,author_str_x,pub_date_x,set_x,full_Text_x,abstract_y,article_title_y,author_str_y,pmid_y,pub_date_y,set_y,full_Text_y
0,1,17074820,15153999,"Among bioethicists and members of the public, ...",The routinisation of genomics and genetics: im...,"M W Foster, C D M Royal, R R Sharp",2006-11-01,13,,Genomics resources that use samples from ident...,Integrating ethics and science in the Internat...,,15153999,2008-02-25,13,
1,1,17074820,15213210,"Among bioethicists and members of the public, ...",The routinisation of genomics and genetics: im...,"M W Foster, C D M Royal, R R Sharp",2006-11-01,13,,Alleviating health disparities in the United S...,Genetic Research and Health Disparities,"Pamela Sankar, Mildred K. Cho, Celeste M. Cond...",15213210,2008-02-20,13,


In [88]:
train_full.to_csv('train_full.csv', index = False)

# Preparing Test Set

In [81]:
test.shape

(2034, 1)

In [87]:
final_test = pd.DataFrame([(int(i), int(j)) for i in test['pmid'] for j in test['pmid'] if i != j])
final_test.columns = ['pmid', 'ref_list_new']
print(final_test.shape)
final_test.head()

(4135122, 2)


Unnamed: 0,pmid,ref_list_new
0,14058267,4550818
1,14058267,14222809
2,14058267,4164675
3,14058267,6211173
4,14058267,4180008


In [89]:
test_full = final_test.merge(information_test, on = 'pmid', how = 'inner').merge(information_test, left_on = 'ref_list_new', right_on = 'pmid', how = 'inner')
test_full.head(2)

Unnamed: 0,pmid_x,ref_list_new,abstract_x,article_title_x,author_str_x,pub_date_x,set_x,full_Text_x,abstract_y,article_title_y,author_str_y,pmid_y,pub_date_y,set_y,full_Text_y
0,14058267,4550818,A technique is described for collecting thorac...,The absorption of oleic acid in the bile fistu...,"D. R. Saunders, A. M. Dawson",1963-09-01,15,,F-merogenotes derived from F14 by transduction...,Ordering of Mutant Sites in the Isoleucine-Val...,"Nancy J. Marsh, D. E. Duggan",4550818,1972-02-01,9,
1,14222809,4550818,Direct electron microscopic evidence is report...,ULTRASTRUCTURE OF ISOLATED KIDNEY MITOCHONDRIA...,"Mario H. Burgos, Agustin Aoki, Fabio L. Sacerdote",1964-11-01,19,,F-merogenotes derived from F14 by transduction...,Ordering of Mutant Sites in the Isoleucine-Val...,"Nancy J. Marsh, D. E. Duggan",4550818,1972-02-01,9,


In [90]:
test_full.to_csv('test_full.csv', index = False)

In [95]:
test_full_new = test_full[(test_full['set_x'] == test_full['set_y']) & (test_full['pub_date_x']>test_full['pub_date_y'])]
test_full.shape, test_full_new.shape

((4135122, 15), (274877, 15))

In [98]:
test_full_new.to_csv('test_full_new.csv', index = False)

In [96]:
train_test = train_full.append(test_full_new)

In [97]:
train_test.columns

Index(['abstract_x', 'abstract_y', 'article_title_x', 'article_title_y',
       'author_str_x', 'author_str_y', 'full_Text_x', 'full_Text_y', 'label',
       'pmid_x', 'pmid_y', 'pub_date_x', 'pub_date_y', 'ref_list_new', 'set_x',
       'set_y'],
      dtype='object')

# Similarity Calculation

In [117]:
def author_name_match(li_a, li_b):
    lia = [re.sub(r'\W+', ' ', i).lower().strip() for i in li_a.split(',')]
    lib = [re.sub(r'\W+', ' ', i).lower().strip() for i in li_b.split(',')]
    print(lia)
    print(lib)
    return len(set(lia) & set(lib))

In [124]:
stop = set(stopwords.words('english'))
def similarity_extract(a,b):
    a = re.sub(r'\W+', ' ', a).lower().strip()
    b = re.sub(r'\W+', ' ', b).lower().strip()
    a = ' '.join(kk for kk in [stemmer.stem(wd) for wd in a.split(' ') if wd not in stop and wd != ''])
    b = ' '.join(vv for vv in [stemmer.stem(wd) for wd in b.split(' ') if wd not in stop and wd != ''])
    return [fuzz.ratio(a,b),fuzz.partial_ratio(a,b),fuzz.token_sort_ratio(a,b),fuzz.token_set_ratio(a,b)]

In [125]:
similarity_extract('A technique is described for collecting thorac', 'F-merogenotes derived from F14 by transduction...')

[29, 35, 13, 13]

In [100]:
abstract_x, article_title_x, full_Text_x

[43, 43, 43, 20]

In [None]:
train_test['abstract_score'] = train_test[['abstract_x', 'abstract_y']].apply(lambda x : similarity_extract(x[0], x[1]), axis = 1)
train_test['article_title_score'] = train_test[['article_title_x', 'article_title_y']].apply(lambda x : similarity_extract(x[0], x[1]), axis = 1)
train_test['full_text_score'] = train_test[['article_title_x', 'article_title_y']].apply(lambda x : similarity_extract(x[0], x[1]), axis = 1)

In [None]:
for i in [0,1,2,3]:
    train_test['abstract_score' + str(i)] = train_test['abstract_score'].apply(lambda x :  x[i])
    train_test['article_title_score' + str(i)] = train_test['article_title_score'].apply(lambda x :  x[i])
    train_test['full_text_score' + str(i)] = train_test['full_text_score'].apply(lambda x :  x[i])

In [None]:
train_test['author_str_similarity'] = train_test[['author_str_x', 'author_str_y']].apply(lambda x : author_name_match(x[0], x[1]), axis = 1)
train_test['date_diff'] = train_test['pub_date_x'] - train_test['pub_date_y']

In [None]:
train_test.head()

In [None]:
train_test[['pmid_x', 'ref_list_new','abstract_score0', 'abstract_score1', 'abstract_score2', 'abstract_score3', 'article_title_score0', 'article_title_score1', 'article_title_score2', 'article_title_score3', 'full_text_score0', 'full_text_score1', 'full_text_score2', 'full_text_score3', 'author_str_similarity', 'date_diff', 'label']].to_csv('train_test_v1.csv', index = False)

In [None]:
#Create 'all_x' from these 3 columns : abstract_x, article_title_x, full_Text_x
train_test['all_x'] = train_test['abstract_x'] + train_test['article_title_x'] + train_test['full_Text_x']

#Create 'all_y' from these 3 columns : abstract_y, article_title_y, full_Text_y
train_test['all_y'] = train_test['abstract_y'] + train_test['article_title_y'] + train_test['full_Text_y']


In [None]:
1) tfidf, fasttext, word2vec of 'all' column : finding cosine similarity between 'all_x' & 'all_y' vectors
2) using deep lstm siamese network for text similarity between 'all_x' & 'all_y'

# Modeling

# Function for calculating F1 score