In [1]:
import ast
import difflib
import re
import warnings
from collections import Counter

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz

warnings.filterwarnings('ignore')



In [2]:
# Load the training table from train.csv and preview its first rows.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,pmid,ref_list
0,17074820,"['15153999', '15213210', '7668302']"
1,15153999,"['12721363', '9096352', '10788337', '9114021',..."
2,15213210,"['11466240', '12184798']"
3,7668302,['1539589']
4,12721363,"['9465087', '11842208', '11309498', '9465125',..."


In [3]:
train.shape

(3522, 2)

In [4]:
# Load the test table from test.csv and preview its first rows.
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,pmid
0,14058267
1,4550818
2,14222809
3,4164675
4,6211173


In [5]:
test.shape

(2034, 1)

In [6]:
# Load the per-article metadata for both splits. Despite the .csv extension these
# files are read as tab-separated (sep='\t').
information_train = pd.read_csv('information_train.csv', sep = '\t')
information_test = pd.read_csv('information_test.csv', sep = '\t')
information_train.head()

Unnamed: 0,abstract,article_title,author_str,pmid,pub_date,set,full_Text
0,"Among bioethicists and members of the public, ...",The routinisation of genomics and genetics: im...,"M W Foster, C D M Royal, R R Sharp",17074820,2006-11-01,13,
1,Genomics resources that use samples from ident...,Integrating ethics and science in the Internat...,,15153999,2008-02-25,13,
2,Alleviating health disparities in the United S...,Genetic Research and Health Disparities,"Pamela Sankar, Mildred K. Cho, Celeste M. Cond...",15213210,2008-02-20,13,
3,Protecting the confidentiality of genetic rese...,Certificates of confidentiality: a valuable to...,"C L Earley, L C Strong",7668302,1995-09-01,13,
4,Whereas the human linkage map appears on limit...,Linkage disequilibrium in human populations,"Christine Lonjou, Weihua Zhang, Andrew Collins...",12721363,2003-05-13,13,


In [7]:
information_train.shape

(3522, 7)

In [8]:
# Join every training row with its article metadata on pmid (inner join), then
# convert pub_date to datetimes so dates can be compared later on.
train_info = pd.merge(train, information_train, how='inner', on='pmid').assign(
    pub_date=lambda frame: pd.to_datetime(frame['pub_date'])
)
train_info.shape

(3522, 8)

In [9]:
train_info.head()

Unnamed: 0,pmid,ref_list,abstract,article_title,author_str,pub_date,set,full_Text
0,17074820,"['15153999', '15213210', '7668302']","Among bioethicists and members of the public, ...",The routinisation of genomics and genetics: im...,"M W Foster, C D M Royal, R R Sharp",2006-11-01,13,
1,15153999,"['12721363', '9096352', '10788337', '9114021',...",Genomics resources that use samples from ident...,Integrating ethics and science in the Internat...,,2008-02-25,13,
2,15213210,"['11466240', '12184798']",Alleviating health disparities in the United S...,Genetic Research and Health Disparities,"Pamela Sankar, Mildred K. Cho, Celeste M. Cond...",2008-02-20,13,
3,7668302,['1539589'],Protecting the confidentiality of genetic rese...,Certificates of confidentiality: a valuable to...,"C L Earley, L C Strong",1995-09-01,13,
4,12721363,"['9465087', '11842208', '11309498', '9465125',...",Whereas the human linkage map appears on limit...,Linkage disequilibrium in human populations,"Christine Lonjou, Weihua Zhang, Andrew Collins...",2003-05-13,13,


In [10]:
information_test.head()

Unnamed: 0,abstract,article_title,author_str,pmid,pub_date,set,full_Text
0,Cell lines selected in multiple steps for incr...,"The gene for a novel protein, a member of the ...","M M Chaudhuri, P N Tonin, W H Lewis, P R Srini...",1311171,1992-02-01,17,
1,Prolyl 4-hydroxylase (EC 1.14.11.2) is an esse...,Inhibition of prolyl 4-hydroxylase by hydroxya...,"C J Cunliffe, T J Franklin",3028370,1986-10-15,17,
2,From the structure-activity relationships of k...,Time-dependent inactivation of chick-embryo pr...,"V Gunzler, H M Hanauske-Abel, R Myllyla, J Moh...",3036081,1987-02-15,17,
3,The anthracyclines doxorubicin and daunorubici...,Syncatalytic inactivation of prolyl 4-hydroxyl...,"V Gunzler, H M Hanauske-Abel, R Myllyla, D D K...",2840891,1988-04-15,17,
4,The levels of lysine hydroxylase protein and t...,Minoxidil specifically decreases the expressio...,"T Hautala, J Heikkinen, K I Kivirikko, R Myllyla",1314568,1992-04-01,17,


In [11]:
# Join every test row with its article metadata on pmid (inner join), then
# convert pub_date to datetimes so dates can be compared later on.
test_info = pd.merge(test, information_test, how='inner', on='pmid').assign(
    pub_date=lambda frame: pd.to_datetime(frame['pub_date'])
)
test_info.shape

(2034, 7)

In [12]:
test_info.head()

Unnamed: 0,pmid,abstract,article_title,author_str,pub_date,set,full_Text
0,14058267,A technique is described for collecting thorac...,The absorption of oleic acid in the bile fistu...,"D. R. Saunders, A. M. Dawson",1963-09-01,15,
1,4550818,F-merogenotes derived from F14 by transduction...,Ordering of Mutant Sites in the Isoleucine-Val...,"Nancy J. Marsh, D. E. Duggan",1972-02-01,9,
2,14222809,Direct electron microscopic evidence is report...,ULTRASTRUCTURE OF ISOLATED KIDNEY MITOCHONDRIA...,"Mario H. Burgos, Agustin Aoki, Fabio L. Sacerdote",1964-11-01,19,
3,4164675,In vitro synthesis of β1C and immune globulins...,β1C and immune globulin formation in vitro by ...,"Vera J. Stecher, G. Jeanette Thorbecke",1967-04-01,15,
4,6211173,1. A substantial increase of the initial rate ...,Kinetic mechanism of mitochondrial adenosine t...,"E A Vasilyeva, I B Minkov, A F Fitin, A D Vino...",1982-01-15,19,


**Check that every pmid cited in a ref_list is also present as a pmid in the training data**

In [13]:
# Flatten every training reference list into a single list of integer pmids.
# ref_list holds the string repr of a Python list, so it is parsed with
# ast.literal_eval, which accepts only literals, instead of eval, which would
# execute any expression found in the file.
ref_all = [
    int(ref)
    for refs in train_info['ref_list'].values
    for ref in ast.literal_eval(refs)
]
ref_all[:10]

[15153999,
 15213210,
 7668302,
 12721363,
 9096352,
 10788337,
 9114021,
 10330360,
 11466240,
 12184798]

In [19]:
set(ref_all) - set(train_info['pmid'].values)

set()

In [20]:
len(set(ref_all))

3086

In [21]:
# Number of distinct referenced pmids times the number of training papers,
# computed from the data rather than retyped by hand from earlier outputs.
# NOTE(review): presumably the size of the full (paper, candidate reference) grid.
len(set(ref_all)) * len(train_info)

10868892

**Check whether any reference list cites a paper from a different set**

In [14]:
train_info['set'].unique()

array([13, 18, 16, 14,  3,  2,  6,  8,  5])

In [15]:
# For each set, print the cited pmids that do NOT belong to a paper of that same
# set; an empty set means every citation stays inside the set.
for set_id in train_info['set'].unique():
    # Filter once per set and reuse it for both the citations and the pmids.
    in_set = train_info[train_info['set'] == set_id]
    # Distinct names for the set id and the per-row reference string: the original
    # reused `i` for both, which only worked because of comprehension scoping.
    # ast.literal_eval parses the stored list repr without executing code.
    cited = {
        int(ref)
        for refs in in_set['ref_list'].values
        for ref in ast.literal_eval(refs)
    }
    print("for set" + str(set_id))
    print(cited - set(in_set['pmid'].values))

for set13
set()
for set18
set()
for set16
set()
for set14
set()
for set3
set()
for set2
set()
for set6
set()
for set8
set()
for set5
set()


# Remember 
#1) a paper can only cite publications dated before its own publication date
#2) a paper only cites publications from its own set

# Trivial Analysis

In [25]:
# Baseline predictor: for every test paper, score the test papers of the same set
# that were published strictly earlier by fuzzy text similarity, and keep the ones
# whose total score is strictly above the SCORE_QUANTILE quantile.
SCORE_QUANTILE = .99
TEXT_FIELDS = ['abstract', 'article_title', 'author_str', 'full_Text']


def _field_scores(candidates, query):
    """Score every value of the Series `candidates` against the single value `query`.

    Values are compared case-insensitively with fuzz.ratio. A missing value on
    either side scores 0: str(NaN) is 'nan', so comparing two missing fields as
    strings would otherwise count as an exact match.

    Returns a Series of scores aligned with `candidates.index`.
    """
    if pd.isna(query):
        return pd.Series(0, index=candidates.index)
    query_text = str(query).lower()  # lower-cased once, not once per candidate
    return candidates.apply(
        lambda text: 0 if pd.isna(text) else fuzz.ratio(str(text).lower(), query_text)
    )


li = []
for _, rw in test_info.iterrows():
    # Candidates: same set as the query paper and published strictly before it.
    is_candidate = (test_info['pub_date'] < rw['pub_date']) & (test_info['set'] == rw['set'])
    # .copy() so the score column is written to an independent frame instead of a
    # slice of test_info (the original triggered SettingWithCopyWarning here).
    reqd_dt = test_info[is_candidate].copy()
    if reqd_dt.empty:
        # No earlier paper in this set: emit an empty list rather than taking a
        # quantile of nothing.
        li.append(str([]))
        continue
    reqd_dt['score'] = sum(_field_scores(reqd_dt[field], rw[field]) for field in TEXT_FIELDS)
    th = reqd_dt['score'].quantile(SCORE_QUANTILE)
    reqd_dt_new = reqd_dt[reqd_dt['score'] > th]
    # Each entry is the string repr of a list of pmid strings, in ascending score order.
    li.append(str([str(kk) for kk in reqd_dt_new.sort_values('score')['pmid']]))


In [17]:
li[:3]

["['13398971', '14461382', '13035721']",
 "['4887520', '4945194', '14047236', '4887519', '4935321', '4907879', '5337773', '5327904', '13987374', '4912521', '4907880', '4895215', '4930242']",
 "['14208516', '14898034', '13398436', '14066417', '14381435']",
 "['5844380', '13761024', '13912024', '14461382']"]

In [19]:
train_info['pub_date'].dtype

dtype('<M8[ns]')

In [26]:
# Attach the predicted reference lists to the test frame and export the two
# submission columns, without the index, to sub_4.csv.
test_info['ref_list'] = li
submission_columns = ['pmid', 'ref_list']
test_info[submission_columns].to_csv('sub_4.csv', index=False)

In [81]:
# Histogram of how many references each training paper has. The stored list repr
# is parsed with ast.literal_eval (literals only) rather than eval.
Counter([len(ast.literal_eval(refs)) for refs in train_info['ref_list'].values])

Counter({1: 1232,
         2: 721,
         3: 470,
         4: 322,
         5: 200,
         6: 156,
         7: 111,
         8: 98,
         9: 58,
         10: 30,
         11: 37,
         12: 17,
         13: 11,
         14: 13,
         15: 9,
         16: 7,
         17: 12,
         18: 2,
         19: 4,
         20: 3,
         21: 1,
         23: 2,
         24: 2,
         26: 1,
         27: 1,
         48: 1,
         54: 1})

In [84]:
# Mean number of references per training paper. The stored list repr is parsed
# with ast.literal_eval (literals only) rather than eval.
np.mean([len(ast.literal_eval(refs)) for refs in train_info['ref_list'].values])

3.270868824531516

In [47]:
test_info.shape

(2034, 7)

In [46]:
test_info[test_info['pub_date'] < pd.to_datetime('1982-01-15')].shape

(1582, 7)

In [23]:
# Training set preparation: for a given publication, every publication dated before it is labelled 1 if it is referenced and 0 otherwise