In [1]:
import matplotlib
%matplotlib inline

# library imports
import numpy as np
import pandas as pd
import scipy as sc

import matplotlib.pyplot as plt
import nltk, string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.metrics.pairwise import cosine_similarity
#from sklearn.metrics.pairwise import polynomial_kernel

import time


In [2]:
# Load Train and Test CSV
#
# Two kinds of files are read from `prefix`:
#   * information_{train,test}.csv - per-article metadata (abstract, title, ...)
#   * {train,test}.csv             - pmid -> ref_list table

headerNames = ["abstract","article_title","author_str","pmid","pub_date","set","full_Text"]
summary_headerNames = ["pmid","ref_list"]
prefix = "./data/"

# The information files are read with a tab separator and no header row; the
# column names are supplied explicitly. No index_col is set here, so `pmid`
# stays an ordinary column that later cells can filter on.
info_traindf = pd.read_csv(prefix + "information_train.csv", header=None, sep='\t', names=headerNames)
info_testdf = pd.read_csv(prefix + "information_test.csv", header=None, sep='\t', names=headerNames)

# The summary files are comma separated. index_col=0 moves `pmid` into the
# index, leaving `ref_list` as the only data column.
# `delim_whitespace=False` was removed: it is redundant next to an explicit
# `sep` and the keyword is deprecated in recent pandas releases.
traindf = pd.read_csv(prefix + "train.csv", sep=',', header=None, names=summary_headerNames, index_col=0)
testdf = pd.read_csv(prefix + "test.csv", sep=',', header=None, names=summary_headerNames, index_col=0)

#sample data for a quick run
#traindf = traindf.sample(frac=0.25, replace=True)

print("info_traindf.shape >> ",info_traindf.shape)
print("info_testdf.shape >> ",info_testdf.shape)
print("traindf.shape >> ",traindf.shape)
print("testdf.shape >> ",testdf.shape)


info_traindf.shape >>  (3522, 7)
info_testdf.shape >>  (2034, 7)
traindf.shape >>  (3522, 1)
testdf.shape >>  (2034, 1)


In [3]:
# Preview the first rows of the training metadata (head() defaults to 5 rows).
info_traindf.head()

Unnamed: 0,abstract,article_title,author_str,pmid,pub_date,set,full_Text
0,"Among bioethicists and members of the public, ...",The routinisation of genomics and genetics: im...,"M W Foster, C D M Royal, R R Sharp",17074820,2006-11-01,13,
1,Genomics resources that use samples from ident...,Integrating ethics and science in the Internat...,,15153999,2008-02-25,13,
2,Alleviating health disparities in the United S...,Genetic Research and Health Disparities,"Pamela Sankar, Mildred K. Cho, Celeste M. Cond...",15213210,2008-02-20,13,
3,Protecting the confidentiality of genetic rese...,Certificates of confidentiality: a valuable to...,"C L Earley, L C Strong",7668302,1995-09-01,13,
4,Whereas the human linkage map appears on limit...,Linkage disequilibrium in human populations,"Christine Lonjou, Weihua Zhang, Andrew Collins...",12721363,2003-05-13,13,


In [4]:
# Disabled experiment: the triple-quoted string below holds code that would drop
# the 'full_Text' column. Because it is only a string literal, nothing in it is
# executed; the notebook merely echoes the string as this cell's output.
'''
if 'full_Text' in info_traindf.columns:
    info_traindf = info_traindf.drop('full_Text', axis=1)
'''    
# Disabled spot checks that look up individual articles by pmid.
#print(info_traindf[info_traindf['pmid'] ==(19325880)])
#print(info_traindf[info_traindf['pmid'] ==(14762000)])
#print(info_traindf[info_traindf['pmid'] ==(20581186)])




"\nif 'full_Text' in info_traindf.columns:\n    info_traindf = info_traindf.drop('full_Text', axis=1)\n"

In [5]:
# DataFrame.fillna returns a new frame and leaves the original untouched, so the
# result has to be assigned back. Without the assignment the NaN values stay in
# place and the later `.map(str)` calls turn them into the literal text "nan".
info_traindf = info_traindf.fillna('')

# Show a small sample instead of the whole frame.
info_traindf.head(10)

Unnamed: 0,abstract,article_title,author_str,pmid,pub_date,set,full_Text
0,"Among bioethicists and members of the public, ...",The routinisation of genomics and genetics: im...,"M W Foster, C D M Royal, R R Sharp",17074820,2006-11-01,13,
1,Genomics resources that use samples from ident...,Integrating ethics and science in the Internat...,,15153999,2008-02-25,13,
2,Alleviating health disparities in the United S...,Genetic Research and Health Disparities,"Pamela Sankar, Mildred K. Cho, Celeste M. Cond...",15213210,2008-02-20,13,
3,Protecting the confidentiality of genetic rese...,Certificates of confidentiality: a valuable to...,"C L Earley, L C Strong",7668302,1995-09-01,13,
4,Whereas the human linkage map appears on limit...,Linkage disequilibrium in human populations,"Christine Lonjou, Weihua Zhang, Andrew Collins...",12721363,2003-05-13,13,
5,We have examined differences in diversity at 6...,Microsatellite diversity and the demographic h...,"Lynn B. Jorde, Alan R. Rogers, Michael Bamshad...",9096352,1997-04-01,13,
6,Because defects in the phenylalanine hydroxyla...,Haplotypes and linkage disequilibrium at the p...,"J R Kidd, A J Pakstis, H Zhao, R B Lu, F E Oko...",10788337,2000-06-01,13,
7,It is often taken for granted that the human s...,An apportionment of human DNA diversity,"Guido Barbujani, Arianna Magagni, Eric Minch, ...",9114021,1997-04-29,13,
8,The practicality and moral value of community ...,The role of community review in evaluating the...,"M W Foster, R R Sharp, W L Freeman, M Chino, D...",10330360,1999-06-01,13,
9,One widely used measure of familial aggregatio...,"Inflation of sibling recurrence-risk ratio, du...",S W Guo,9634526,1998-07-01,13,


In [6]:
# Distinct values of the `set` column; each one is processed separately by the
# similarity loop further down.
unique_trn_sets = pd.unique(info_traindf['set'])
print("Unique Sets in Training document ", unique_trn_sets)

Unique Sets in Training document  [13 18 16 14  3  2  6  8  5]


In [7]:
# Minimum cosine similarity for a document to be reported as a reference.
overall_threshold = 0.25 # 0.25 0.20 #0.1725 #16.25

# Text used for the similarity search: article title followed by the abstract.
# The set, pmid, author_str, pub_date and full_Text columns are deliberately
# left out of the combined text.
# fillna('') is applied right here so that a missing title or abstract
# contributes an empty string rather than the literal token "nan".
info_traindf['combined'] = (
    info_traindf['article_title'].fillna('').map(str)
    + " "
    + info_traindf['abstract'].fillna('').map(str)
)

#print("newinfo_traindf[combined] >> ", info_traindf['combined'])

# Spot check of a single article; kept as the last expression of the cell so
# that the notebook actually displays it.
info_traindf[info_traindf['pmid'] == 19325880]

In [8]:

def stem_tokens(tokens):
    """Return a list with the stem of every item in ``tokens``, in order.

    Uses the module-level ``stemmer`` object (its ``stem`` method), which must
    exist by the time a non-empty ``tokens`` is processed.
    """
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

def normalize(text):
    """Lowercase ``text``, remove punctuation, tokenize it and stem the tokens.

    Reads the module-level ``remove_punctuation_map`` (a ``str.translate``
    table) and delegates stemming to ``stem_tokens``.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(without_punctuation)
    return stem_tokens(tokens)



def find_similar(tfidf_matrix, index, top_n = 5, threshold = 0.1):
    """Find the rows of ``tfidf_matrix`` that are similar to row ``index``.

    Parameters:
        tfidf_matrix: matrix with one row per document.
        index: position of the query row.
        top_n: accepted for compatibility but currently NOT applied - every
            match above the threshold is returned.
        threshold: a row is kept only if its cosine similarity to the query
            row is strictly greater than this value.

    Returns:
        A list of ``(row_position, similarity)`` tuples, ordered from the most
        to the least similar. The query row itself is never included.
    """
    query_row = tfidf_matrix[index:index+1]
    scores = cosine_similarity(query_row, tfidf_matrix).flatten() #linear_kernel

    matches = []
    # argsort is ascending, so walk it backwards to get best matches first.
    for candidate in scores.argsort()[::-1]:
        if candidate == index:
            continue
        if scores[candidate] > threshold:
            matches.append((candidate, scores[candidate]))
    return matches



In [9]:
# Build, for every training article, the list of pmids of the articles in the
# SAME set whose TF-IDF cosine similarity exceeds `overall_threshold`.
# Results are collected in the parallel lists `pmid` and `reflists`.
nltk.download('punkt') # first-time only
outputdf = pd.DataFrame()
pmid = list()
reflists = list()

# Globals read by normalize() / stem_tokens(). They do not depend on the set
# being processed, so they are created once instead of on every iteration.
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

for set_id in unique_trn_sets:
    # Documents of the current set only; similarities are computed within a set.
    set_rows = info_traindf[info_traindf['set'] == set_id]

    trndf13 = pd.DataFrame()
    trndf13['all'] = set_rows['combined']
    trndf13['set'] = set_id
    trndf13['pmid'] = set_rows['pmid']
    # Replace every non-word character with a space. The pattern is a raw string
    # ('\W' is an invalid escape in a normal literal) and regex=True is explicit
    # because pandas >= 2.0 otherwise treats the pattern as literal text.
    trndf13['all'] = trndf13['all'].str.replace(r'\W', ' ', regex=True)

    print("Working on set number ................ ", set_id)

    vectorizer = TfidfVectorizer(tokenizer=normalize, analyzer='word', stop_words='english')
    tfidf = vectorizer.fit_transform(trndf13['all'].tolist())

    # Row i of `tfidf` corresponds to set_pmids[i]; looking pmids up by name in
    # this list avoids the hard-coded column position and the manual counter.
    set_pmids = trndf13['pmid'].tolist()

    for position, source_pmid in enumerate(set_pmids):
        print("Source document .... ", source_pmid)

        pmid.append(source_pmid)

        # top_n is passed as in the original call, but find_similar ignores it.
        refLst = [
            str(set_pmids[match])
            for match, _score in find_similar(tfidf, index=position, top_n = 25, threshold = overall_threshold)
        ]
        reflists.append(refLst)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ShayAnwesha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Working on set number ................  13
Source document ....  17074820
Source document ....  15153999
Source document ....  15213210
Source document ....  7668302
Source document ....  12721363
Source document ....  9096352
Source document ....  10788337
Source document ....  9114021
Source document ....  10330360
Source document ....  9634526
Source document ....  11466240
Source document ....  12184798
Source document ....  14561327
Source document ....  12900503
Source document ....  1539589
Source document ....  9465087
Source document ....  11842208
Source document ....  11309498
Source document ....  9465125
Source document ....  9990074
Source document ....  12486239
Source document ....  8454213
Source document ....  8986803
Source document ....  8849901
Source document ....  8524801
Source document ....  8651264
Source

Source document ....  10884408
Source document ....  12649434
Source document ....  11823218
Source document ....  9647837
Source document ....  9683472
Source document ....  14532045
Source document ....  10759521
Source document ....  7592351
Source document ....  9294455
Source document ....  9294456
Source document ....  9515921
Source document ....  9573181
Source document ....  7608084
Source document ....  9435261
Source document ....  9501228
Source document ....  8990197
Source document ....  8096622
Source document ....  8631663
Source document ....  9006020
Source document ....  9573179
Source document ....  9624497
Source document ....  9765585
Source document ....  7927711
Source document ....  6811555
Source document ....  6973759
Source document ....  7022452
Source document ....  7022453
Source document ....  1906066
Source document ....  1339408
Source document ....  8419308
Source document ....  8432589
Source document ....  8432618
Source document ....  8449892
Sourc

Source document ....  14602910
Source document ....  15968079
Source document ....  9603824
Source document ....  22201529
Working on set number ................  16
Source document ....  8944203
Source document ....  1370845
Source document ....  8380182
Source document ....  1691208
Source document ....  2155917
Source document ....  2157735
Source document ....  2159539
Source document ....  2161866
Source document ....  2549095
Source document ....  2819870
Source document ....  2986127
Source document ....  6097599
Source document ....  3404526
Source document ....  2326177
Source document ....  2160802
Source document ....  1847008
Source document ....  1850635
Source document ....  6325492
Source document ....  4148753
Source document ....  6269067
Source document ....  284385
Source document ....  3060247
Source document ....  2919166
Source document ....  2991576
Source document ....  2447830
Source document ....  2993351
Source document ....  2852677
Source document ....  632

Source document ....  7022155
Source document ....  6287227
Source document ....  6300437
Source document ....  287063
Source document ....  6340110
Source document ....  6575390
Source document ....  3156376
Source document ....  3951988
Source document ....  6812046
Source document ....  2582352
Source document ....  3889841
Source document ....  3748808
Source document ....  3774550
Source document ....  3588303
Source document ....  3267215
Source document ....  6289302
Source document ....  6262809
Source document ....  6457158
Source document ....  6172778
Source document ....  6278410
Source document ....  6174938
Source document ....  7099970
Source document ....  6292491
Source document ....  6184674
Source document ....  4594039
Source document ....  183379
Source document ....  198578
Source document ....  149110
Source document ....  209219
Source document ....  228055
Source document ....  6244590
Source document ....  6246534
Source document ....  6255202
Source document 

Source document ....  6185921
Source document ....  2426657
Source document ....  1069307
Source document ....  4600264
Source document ....  4530273
Source document ....  1090574
Source document ....  6154930
Source document ....  7029468
Source document ....  6253883
Source document ....  6164798
Source document ....  6296764
Source document ....  6304692
Source document ....  6193288
Source document ....  6312090
Source document ....  6961439
Source document ....  6306269
Source document ....  6310157
Source document ....  6312421
Source document ....  6304629
Source document ....  6308556
Source document ....  3018280
Source document ....  2433467
Source document ....  2983123
Source document ....  2999190
Source document ....  6294322
Source document ....  6310521
Source document ....  283395
Source document ....  6801656
Source document ....  6641721
Source document ....  6316273
Source document ....  230501
Source document ....  7008030
Source document ....  7312622
Source docum

Source document ....  12754337
Source document ....  18379440
Source document ....  18434538
Source document ....  18535185
Source document ....  18506455
Source document ....  18546284
Source document ....  16567625
Source document ....  16613895
Source document ....  17008334
Source document ....  18625456
Source document ....  18631953
Source document ....  18684309
Source document ....  18590784
Source document ....  18634762
Source document ....  18378771
Source document ....  18474722
Source document ....  15615853
Source document ....  15851728
Source document ....  16195246
Source document ....  9811325
Source document ....  17291356
Source document ....  17012339
Source document ....  18090434
Source document ....  18226108
Source document ....  18234899
Source document ....  16151530
Source document ....  11438710
Source document ....  15657137
Source document ....  16174740
Source document ....  16344478
Source document ....  17646382
Source document ....  17721864
Source do

Source document ....  16201997
Source document ....  18222580
Source document ....  18307477
Source document ....  18511716
Source document ....  18711139
Source document ....  19299256
Source document ....  15016247
Source document ....  19528519
Source document ....  20662955
Working on set number ................  3
Source document ....  6246891
Source document ....  1827140
Source document ....  7699333
Source document ....  4299129
Source document ....  4299130
Source document ....  2521952
Source document ....  2787384
Source document ....  2787380
Source document ....  2660142
Source document ....  2965006
Source document ....  3572301
Source document ....  1689065
Source document ....  2153073
Source document ....  6412070
Source document ....  2358782
Source document ....  6086308
Source document ....  7831294
Source document ....  2324686
Source document ....  8315381
Source document ....  8415665
Source document ....  8026467
Source document ....  7964471
Source document ...

Source document ....  2822007
Source document ....  4214106
Source document ....  6571733
Source document ....  3481263
Source document ....  2883656
Source document ....  2945890
Source document ....  3496417
Source document ....  6969782
Source document ....  6175720
Source document ....  6206184
Source document ....  2417941
Source document ....  2995979
Source document ....  3088206
Source document ....  6425835
Source document ....  2421253
Source document ....  3083415
Source document ....  3088570
Source document ....  3030728
Source document ....  6438633
Source document ....  6512493
Source document ....  3079910
Source document ....  2939459
Source document ....  3011406
Source document ....  3011412
Source document ....  6216259
Source document ....  3720730
Source document ....  6330526
Source document ....  6438631
Source document ....  2998757
Source document ....  3090517
Source document ....  3092186
Source document ....  3462701
Source document ....  3095838
Source doc

Source document ....  370833
Source document ....  386282
Source document ....  6933438
Source document ....  6117826
Source document ....  6401795
Source document ....  6445398
Source document ....  6966320
Source document ....  6265934
Source document ....  6270681
Source document ....  6172538
Source document ....  6174677
Source document ....  6180121
Working on set number ................  2
Source document ....  16927957
Source document ....  16277606
Source document ....  3488550
Source document ....  8019746
Source document ....  11532965
Source document ....  8170966
Source document ....  10385247
Source document ....  9484853
Source document ....  7679023
Source document ....  6958342
Source document ....  14392240
Source document ....  6172582
Source document ....  6132387
Source document ....  2043924
Source document ....  411921
Source document ....  926021
Source document ....  5049809
Source document ....  2850055
Source document ....  2158369
Source document ....  25381

Source document ....  409448
Source document ....  23372174
Source document ....  23488511
Source document ....  20577717
Source document ....  17514069
Source document ....  17433863
Source document ....  17434706
Source document ....  17931822
Source document ....  18565693
Source document ....  15726103
Source document ....  16168145
Source document ....  16517548
Source document ....  15726102
Source document ....  17147789
Working on set number ................  8
Source document ....  7542571
Source document ....  7618941
Source document ....  8094737
Source document ....  7763061
Source document ....  1315049
Source document ....  7905016
Source document ....  8642250
Source document ....  7685670
Source document ....  7923883
Source document ....  7906508
Source document ....  3115202
Source document ....  1900405
Source document ....  8450067
Source document ....  2462353
Source document ....  2479030
Source document ....  2303780
Source document ....  2117641
Source document 

Source document ....  6300879
Source document ....  6326110
Source document ....  3878084
Source document ....  3084562
Source document ....  3534886
Source document ....  6204001
Source document ....  3538015
Source document ....  3493319
Source document ....  2580936
Source document ....  2409209
Source document ....  2427637
Source document ....  3040667
Source document ....  3819645
Source document ....  3596805
Source document ....  3035555
Source document ....  3624442
Source document ....  3624443
Source document ....  2951480
Source document ....  3005467
Source document ....  2846743
Source document ....  3263646
Source document ....  3040885
Source document ....  2902638
Source document ....  2789518
Source document ....  2583098
Source document ....  2462609
Source document ....  3490536
Source document ....  2466767
Source document ....  3128632
Source document ....  1694221
Source document ....  2460872
Source document ....  3141930
Source document ....  2476570
Source doc

Source document ....  6222136
Source document ....  6238120
Source document ....  2581259
Source document ....  3860874
Source document ....  2413153
Source document ....  6946459
Source document ....  3135800
Source document ....  3131772
Source document ....  5922092
Source document ....  15776568
Source document ....  1168693
Source document ....  4390900
Source document ....  4507613
Source document ....  4631990
Source document ....  4178112
Source document ....  5387730
Source document ....  4390899
Source document ....  5011103
Source document ....  4334720
Source document ....  14223932
Source document ....  4162237
Source document ....  4703226
Source document ....  5487541
Source document ....  4264575
Source document ....  4641855
Source document ....  6025317
Source document ....  4226271
Source document ....  4992784
Source document ....  6019133
Source document ....  4960742
Source document ....  14151101
Source document ....  4226164
Source document ....  4911896
Source 

In [10]:
# Sanity check: both lists must have one entry per source document.
print(len(pmid), len(reflists), sep="\n")

# Assemble the result table: source pmid next to its list of similar pmids.
outputdf['pmid'] = pmid
outputdf['ref_list'] = reflists
print(outputdf)

# The current timestamp is appended to the file name so earlier runs are kept.
outputdf.to_csv(f"output/output_trn_base{time.time()}.csv", sep=",", index=False)

3522
3522
          pmid                                           ref_list
0     17074820                                         [15153999]
1     15153999                                         [17074820]
2     15213210                                                 []
3      7668302                                                 []
4     12721363  [11842208, 12486239, 10430595, 9770501, 104459...
5      9096352              [10400923, 7668280, 9326337, 9114021]
6     10788337  [8178829, 8105688, 2569271, 7913583, 2569272, ...
7      9114021                                [11932244, 9096352]
8     10330360                                                 []
9      9634526                                          [8651264]
10    11466240                                                 []
11    12184798                                                 []
12    14561327                                         [12486239]
13    12900503                     [10022981, 10655242, 10611357]


In [11]:
# Disabled debugging output of the expected and the predicted reference lists.
#print(traindf['ref_list'].tolist())
#print(outputdf['ref_list'].tolist())

# Disabled evaluation: the triple-quoted string below holds code that would
# binarize the reference lists with MultiLabelBinarizer and compute a weighted
# F1 score. As a plain string literal it is never executed; the notebook only
# echoes the string as this cell's output.
'''
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

binarizer = MultiLabelBinarizer()

binarizer.fit(traindf['ref_list'].tolist())

f1_score(binarizer.transform(traindf['ref_list'].tolist()), 
         binarizer.transform(outputdf['ref_list'].tolist()), 
         average='weighted')
'''
# Earlier disabled attempt that passed the raw lists straight to f1_score.
#f1_score(traindf['ref_list'].tolist(), outputdf['ref_list'].tolist(), average='weighted') 


"\nfrom sklearn.preprocessing import MultiLabelBinarizer\nfrom sklearn.metrics import f1_score\n\nbinarizer = MultiLabelBinarizer()\n\nbinarizer.fit(traindf['ref_list'].tolist())\n\nf1_score(binarizer.transform(traindf['ref_list'].tolist()), \n         binarizer.transform(outputdf['ref_list'].tolist()), \n         average='weighted')\n"

In [12]:
# Distinct values of the `set` column; each one is processed separately by the
# similarity loop further down.
unique_tst_sets = info_testdf['set'].unique()

# DataFrame.fillna returns a new frame, so the result must be assigned back;
# otherwise the NaN values remain and later become the literal text "nan"
# when the columns are converted with `.map(str)`.
info_testdf = info_testdf.fillna('')

print("Unique Sets in Test document ", unique_tst_sets)

Unique Sets in Test document  [17 10  4  9 11  1 15  7 19]


In [13]:
# Text used for the similarity search: article title followed by the abstract.
# The set, pmid, author_str, pub_date and full_Text columns are deliberately
# left out of the combined text.
# fillna('') is applied right here so that a missing title or abstract
# contributes an empty string rather than the literal token "nan".
info_testdf['combined'] = (
    info_testdf['article_title'].fillna('').map(str)
    + " "
    + info_testdf['abstract'].fillna('').map(str)
)

#print("info_testdf[combined] >> ", info_testdf['combined'])

# Spot check of a single article.
info_testdf[info_testdf['pmid'] == 5812469]

Unnamed: 0,abstract,article_title,author_str,pmid,pub_date,set,full_Text,combined
327,The two morphologically different constituents...,THE ELASTIC FIBER I. The Separation and Partia...,"Russell Ross, Paul Bornstein",5812469,1969-02-01,10,,THE ELASTIC FIBER I. The Separation and Partia...


In [14]:
# Build, for every test article, the list of pmids of the articles in the SAME
# set whose TF-IDF cosine similarity exceeds `overall_threshold`.
# Results are collected in the parallel lists `pmidtst` and `refliststst`.
nltk.download('punkt') # first-time only
outputdftst = pd.DataFrame()
pmidtst = list()
refliststst = list()

# Globals read by normalize() / stem_tokens(). They do not depend on the set
# being processed, so they are created once instead of on every iteration.
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

for set_id in unique_tst_sets:
    # Documents of the current set only; similarities are computed within a set.
    set_rows = info_testdf[info_testdf['set'] == set_id]

    testdf1 = pd.DataFrame()
    testdf1['all'] = set_rows['combined']
    testdf1['set'] = set_id
    testdf1['pmid'] = set_rows['pmid']
    # Replace every non-word character with a space. The pattern is a raw string
    # ('\W' is an invalid escape in a normal literal) and regex=True is explicit
    # because pandas >= 2.0 otherwise treats the pattern as literal text.
    testdf1['all'] = testdf1['all'].str.replace(r'\W', ' ', regex=True)

    print("Working on set number ................ ", set_id)

    vectorizer = TfidfVectorizer(tokenizer=normalize, analyzer='word', stop_words='english')
    tfidf = vectorizer.fit_transform(testdf1['all'].tolist())

    # Row i of `tfidf` corresponds to set_pmids[i]; looking pmids up by name in
    # this list avoids the hard-coded column position and the manual counter.
    set_pmids = testdf1['pmid'].tolist()

    for position, source_pmid in enumerate(set_pmids):
        print("Source document .... ", source_pmid)

        pmidtst.append(source_pmid)

        # top_n is passed as in the original call, but find_similar ignores it.
        refLst = [
            str(set_pmids[match])
            for match, _score in find_similar(tfidf, index=position, top_n = 25, threshold = overall_threshold)
        ]
        refliststst.append(refLst)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ShayAnwesha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Working on set number ................  17
Source document ....  1311171
Source document ....  3028370
Source document ....  3036081
Source document ....  2840891
Source document ....  1314568
Source document ....  2025220
Source document ....  2025222
Source document ....  3036064
Source document ....  1987130
Source document ....  8615825
Source document ....  1339284
Source document ....  2994628
Source document ....  3140780
Source document ....  2557001
Source document ....  1329722
Source document ....  486087
Source document ....  3931636
Source document ....  8503862
Source document ....  230821
Source document ....  1850989
Source document ....  8198555
Source document ....  6779811
Source document ....  8192668
Source document ....  3025593
Source document ....  4504350
Source document ....  3034602
Source document .... 

Source document ....  6572915
Source document ....  2650680
Source document ....  1318023
Source document ....  1318033
Source document ....  6817329
Source document ....  6418245
Source document ....  1715916
Source document ....  1726797
Source document ....  2451020
Source document ....  2457089
Source document ....  2484210
Source document ....  1703572
Source document ....  1898359
Source document ....  5340539
Source document ....  4372600
Source document ....  238033
Source document ....  49421
Source document ....  5808318
Source document ....  1156396
Source document ....  4722896
Source document ....  4333037
Source document ....  5055788
Source document ....  4200724
Source document ....  5685862
Source document ....  12754
Source document ....  167718
Source document ....  4900611
Source document ....  5812469
Source document ....  4327463
Source document ....  1054517
Source document ....  241998
Source document ....  14228505
Source document ....  564692
Source document .

Working on set number ................  4
Source document ....  6788894
Source document ....  6809100
Source document ....  3311265
Source document ....  3509
Source document ....  6229546
Source document ....  70433
Source document ....  71738
Source document ....  624905
Source document ....  702050
Source document ....  571893
Source document ....  288080
Source document ....  6449521
Source document ....  5771188
Source document ....  5771189
Source document ....  6833369
Source document ....  7049214
Source document ....  2813399
Source document ....  2426709
Source document ....  2824656
Source document ....  187622
Source document ....  3085757
Source document ....  6977612
Source document ....  309919
Source document ....  4805001
Source document ....  6924936
Source document ....  5049426
Source document ....  5080705
Source document ....  4362915
Source document ....  574143
Source document ....  6987662
Source document ....  7328117
Source document ....  58959
Source documen

Source document ....  1117030
Source document ....  1141379
Source document ....  201645
Source document ....  618900
Source document ....  150426
Source document ....  115892
Source document ....  511941
Source document ....  511942
Source document ....  7350172
Source document ....  162790
Source document ....  10605441
Source document ....  914894
Source document ....  511932
Source document ....  4356571
Source document ....  7358795
Source document ....  7017713
Source document ....  5697984
Source document ....  4372293
Source document ....  1059102
Source document ....  4530317
Source document ....  4360946
Source document ....  1097576
Source document ....  650151
Source document ....  627836
Source document ....  1069997
Source document ....  1244421
Source document ....  328516
Source document ....  217006
Source document ....  329287
Source document ....  701356
Source document ....  268636
Source document ....  925079
Source document ....  1054823
Source document ....  3313

Source document ....  4942764
Source document ....  4942914
Source document ....  4945181
Source document ....  4945191
Source document ....  4945192
Source document ....  4945194
Source document ....  4928016
Source document ....  4994599
Source document ....  4338582
Source document ....  14292991
Source document ....  4587600
Source document ....  4565418
Source document ....  4598307
Source document ....  4598002
Source document ....  4551145
Source document ....  4556462
Source document ....  4555414
Source document ....  4555413
Source document ....  4555412
Source document ....  4869214
Source document ....  4342813
Source document ....  4550819
Source document ....  4552226
Source document ....  4553837
Source document ....  4558662
Source document ....  4560697
Source document ....  4562404
Source document ....  4591470
Source document ....  4591486
Source document ....  14205619
Source document ....  14047236
Source document ....  14127571
Source document ....  5322723
Source

Source document ....  19870525
Source document ....  19870523
Source document ....  19870465
Source document ....  19870463
Source document ....  19869620
Source document ....  19870740
Source document ....  19870335
Source document ....  19871401
Source document ....  19870596
Source document ....  19870814
Source document ....  19871016
Source document ....  19871042
Source document ....  19871138
Source document ....  16744547
Source document ....  16744548
Source document ....  19868830
Source document ....  19869085
Source document ....  19870145
Source document ....  19869556
Source document ....  19870029
Source document ....  19869455
Source document ....  19869454
Source document ....  19870613
Source document ....  19869862
Source document ....  19870386
Source document ....  19870380
Source document ....  19870226
Source document ....  19869648
Source document ....  19869586
Source document ....  19870232
Source document ....  19870414
Source document ....  19870324
Source d

Working on set number ................  7
Source document ....  11161411
Source document ....  10987769
Source document ....  8520272
Source document ....  7950739
Source document ....  7640237
Source document ....  7669587
Source document ....  8260381
Source document ....  6716424
Source document ....  3362861
Source document ....  1320125
Source document ....  8439512
Source document ....  8460634
Source document ....  3443812
Source document ....  1757764
Source document ....  5312521
Source document ....  2297482
Source document ....  2390471
Source document ....  7059469
Source document ....  2736211
Source document ....  7427299
Source document ....  3741766
Source document ....  6614001
Source document ....  4039107
Source document ....  6849791
Source document ....  3814490
Source document ....  3655620
Source document ....  3426932
Source document ....  2713247
Source document ....  2713248
Source document ....  7459241
Source document ....  7041938
Source document ....  3094

In [15]:
# Sanity check: both lists must have one entry per source document.
print(len(pmidtst), len(refliststst), sep="\n")

# Assemble the result table: source pmid next to its list of similar pmids.
outputdftst['pmid'] = pmidtst
outputdftst['ref_list'] = refliststst
print(outputdftst)

# The current timestamp is appended to the file name so earlier runs are kept.
outputdftst.to_csv(f"output/output_tst_base{time.time()}.csv", sep=",", index=False)

2034
2034
          pmid                                           ref_list
0      1311171               [3025593, 2851999, 3034602, 2543975]
1      3028370                         [2994628, 230821, 3036064]
2      3036081                                 [2840891, 2994628]
3      2840891  [3036081, 230821, 2994628, 2557001, 6289804, 6...
4      1314568        [2557001, 219849, 3034602, 7753822, 949336]
5      2025220                        [2025221, 6615425, 4004778]
6      2025222  [6615425, 2025221, 1327760, 3896234, 3566712, ...
7      3036064  [2851981, 2994628, 2557001, 6293467, 230821, 2...
8      1987130                        [1694525, 2644235, 2107074]
9      8615825  [1323838, 7753822, 2552442, 8385607, 2557001, ...
10     1339284                                 [3467361, 3927292]
11     2994628  [230821, 3028370, 3036064, 6274310, 1850989, 3...
12     3140780                                 [6274310, 2851981]
13     2557001  [3034602, 1323838, 7753822, 204299, 1327760, 8...
