In [1]:
import pandas as pd
import re
import random
import uuid

random.seed(42)

In [2]:
data = pd.read_parquet("data/mlt_data_publications.parquet")
data = data[~data.s2FieldsOfStudy.isna()]

### Create (author, wrote, paper) triples & QA pairs with id for training

In [3]:
TOTAL_UNIQUE_AUTHORS = 3000 # max unique authors: 44,574

In [4]:
authors = []
for row in data.itertuples():
    for author in row.authors:
        names = [author.get('name')]
        
        try:
            aliases = author.get('aliases').tolist()
        except AttributeError:
            aliases = []
        
        names += aliases
        
        names = [' '.join(set(re.sub(r'\s+', ' ', name).split(' ')) )for name in names]

        for name in list(set(names)):
            authors.append({'author': name, 'publication': row.title, 
                            'paperId': row.paperId, 'authorId': author.get('authorId')})

In [5]:
authors = pd.DataFrame(authors)

## Reduce the dataset! This should be removed in the final version (?)
authors['len_name'] = authors.author.apply(len)
authors.sort_values(['paperId','authorId','len_name'], ascending=False, inplace=True)
authors.drop_duplicates(subset=['paperId','authorId'], keep='first', inplace=True)

authors = authors.sample(TOTAL_UNIQUE_AUTHORS, random_state=42).reset_index(drop=True)
len(authors)

3000

In [6]:
authors.rename(columns={'author': 'subject','publication':'object','authorId':'subjectId','paperId':'objectId'}, inplace=True)
authors['property'] = 'wrote'
authors = authors[['subject','property','object','subjectId','objectId']]
authors

Unnamed: 0,subject,property,object,subjectId,objectId
0,N. Flyer,wrote,Solving PDEs with radial basis functions *,1751907,d397570eef10925f7ebc2da644e54f0a55ba2f13
1,Christopher Anthony O'callaghan,wrote,Direct Visualization of Antigen-specific CD8+T...,79187876,8c134d5124bb3eaae85e09d7d7cb60cc296a0ab7
2,Fisher Susan H.,wrote,Bacillus subtilis 168 Contains Two Differentia...,2220105,38677364e0210277add24a01ebabaed982455145
3,Morilla Antonio González,wrote,Speculative execution via address prediction a...,144359098,542d7ddb6a2efa4a9a55c63bdc1e5fbae129df56
4,Till Ballendat,wrote,Proxemic interaction: designing for a proximit...,2254514,be4ddea1bf8ee8f803b90425257892b31f6f5b87
...,...,...,...,...,...
2995,Zhang Ji-feng,wrote,Adaptive Tracking Control of Linear Systems Wi...,2108131021,9e7acd24a26de60bb59cf5cccf3fff537015129f
2996,Chen-Yu Lee,wrote,Sliced Wasserstein Discrepancy for Unsupervise...,50521003,865100f1b248723f48fc5d2c68be0421fd24ff48
2997,Yang Ou Xin,wrote,An Ethnographic Understanding of Software (In)...,1932524,70824eba2ff8c76b132dab6251a8558a1c7524a9
2998,Govindan R.,wrote,Neighborhood-Centric Congestion Control for Mu...,1747970,94841f75c189a6f2313213eea461e9a8f62d20fa


In [7]:
qa_author_papers = [
    (f'Who wrote the paper titled "{row.object}"?', row.subject, row.subjectId)
    for row in authors.itertuples()
]
len(qa_author_papers), qa_author_papers[:3]

(3000,
 [('Who wrote the paper titled "Solving PDEs with radial basis functions *"?',
   'N. Flyer',
   '1751907'),
  ('Who wrote the paper titled "Direct Visualization of Antigen-specific CD8+T Cells during the Primary Immune Response to Epstein-Barr Virus In Vivo"?',
   "Christopher Anthony O'callaghan",
   '79187876'),
  ('Who wrote the paper titled "Bacillus subtilis 168 Contains Two Differentially Regulated Genes Encoding l-Asparaginase"?',
   'Fisher Susan H.',
   '2220105')])

In [8]:
pd.to_pickle(qa_author_papers, 'data/qa_subsets/qa_authors.pkl')

### Create (paper1, related with, paper2) triples & QA pairs with id for training

In [9]:
papers = data[data.paperId.isin(authors.objectId.unique())]

titles_map = {row.paperId:row.title for row in data.itertuples()}

papers.query('con_type!="base"', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers.query('con_type!="base"', inplace=True)


In [10]:
papers['object'] = papers.source.map(titles_map)

papers.rename(columns={'source':'objectId','paperId':'subjectId','title':'subject'}, inplace=True)
papers['property'] = 'related with'

papers = papers[['subject','property','object','subjectId','objectId']]
papers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers['object'] = papers.source.map(titles_map)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers.rename(columns={'source':'objectId','paperId':'subjectId','title':'subject'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers['property'] = 'related with'


Unnamed: 0,subject,property,object,subjectId,objectId
3006,A lossless text compression technique using sy...,related with,A genetic algorithm approach for verification ...,abc60da0c3eea5e1519cc4f2eb97008187898777,ffe288376b4e62a9d84f28b249b0c92f7116686a
3014,A Formal Look at Dependency Grammars and Phras...,related with,D-Tree Substitution Grammars,575543a479c42f997f07b65b61c2639f94fe30de,ffb5f1cd15a620ba6f77f3368f71606b5940cba0
3019,Developmental Constraints and Evolution: A Per...,related with,The innovation triad: an EvoDevo agenda.,12cb50be697a13e76e0b11cc1ec3aaaa8bf41312,ff952b688cdb7eb8baf780507171ce6e122d8af4
3023,Characterization of early pathogenic effects a...,related with,Genomic sequence analysis identifies Jembrana ...,732786b565985999c1cccfcdb88aa7f3dc4184ca,ff8a238d958ed44f4fae78e5ef99d5383083deb5
3029,The Linear Programming Approach to Approximate...,related with,Duality and linear programs for stability and ...,8a14ac38f66996913c4d7f3a3141294a602fd8f3,ff351f5cccba47c7ae38b38a310279fb07232cab
...,...,...,...,...,...
13163,Sensitivity of HIV-1 to entry inhibitors corre...,related with,Comprehensive Cross-Clade Neutralization Analy...,83d108dfa0f1d8c01e07fd1354c3a72f12465f91,031a0bd9cf17a9acff2d56ae934a3451998e3925
13167,Schema Creation in Programming,related with,The automated tutoring of introductory compute...,ab450671629c22c233c56001e858a0e9c4c24979,01d5a49f2e0cf6b583d2d0c4edf327ff55a9f18f
13175,The price of stability for network design with...,related with,On Nash Equilibria for a Network Creation Game,be868a4e978129f5b43316c389ea11f6eff5ef45,0122bee5707e8062c40c593f3dac02ccf6008f98
13176,On a network creation game,related with,On Nash Equilibria for a Network Creation Game,2cf44c2edd3e0da932ca3e94cbe385998fdecbeb,0122bee5707e8062c40c593f3dac02ccf6008f98


In [11]:
qa_related_papers = [
    (f"Which paper is cited or referenced in the paper titled '{row.object}'?", row.subject, row.subjectId)
    for row in papers.itertuples()
]

len(qa_related_papers), qa_related_papers[:3]

(2009,
 [("Which paper is cited or referenced in the paper titled 'A genetic algorithm approach for verification of the syllable-based text compression technique'?",
   'A lossless text compression technique using syllable based morphology',
   'abc60da0c3eea5e1519cc4f2eb97008187898777'),
  ("Which paper is cited or referenced in the paper titled 'D-Tree Substitution Grammars'?",
   'A Formal Look at Dependency Grammars and Phrase-Structure Grammars, with Special Consideration of Word-Order Phenomena',
   '575543a479c42f997f07b65b61c2639f94fe30de'),
  ("Which paper is cited or referenced in the paper titled 'The innovation triad: an EvoDevo agenda.'?",
   'Developmental Constraints and Evolution: A Perspective from the Mountain Lake Conference on Development and Evolution',
   '12cb50be697a13e76e0b11cc1ec3aaaa8bf41312')])

In [12]:
pd.to_pickle(qa_related_papers, 'data/qa_subsets/qa_cites_refs.pkl')

### Create (author, published in, venue) triples & QA pairs with id for training

In [13]:
venue_id_map = {v:str(uuid.uuid3(uuid.NAMESPACE_OID, v)) for v in data.venue.unique()}

In [14]:
authors_venues = authors.merge(data[['paperId','venue']], left_on='objectId', right_on='paperId', how='left')
authors_venues = authors_venues[['subject','property','venue','subjectId']].rename(columns={'venue':'object'})
authors_venues['objectId'] = authors_venues.object.map(venue_id_map)
authors_venues['property'] = 'published in'
authors_venues

Unnamed: 0,subject,property,object,subjectId,objectId
0,N. Flyer,published in,Acta Numerica,1751907,a141218b-2f0b-31b0-8127-b7b1b626bbf3
1,Christopher Anthony O'callaghan,published in,Journal of Experimental Medicine,79187876,878d96ac-106b-38c0-88a2-dbf7a7d33fae
2,Fisher Susan H.,published in,Journal of Bacteriology,2220105,79862a52-98a5-3529-8348-6124894a92ea
3,Morilla Antonio González,published in,International Conference on Supercomputing,144359098,8df687ec-6d98-31e9-ad8a-16a2838e6436
4,Till Ballendat,published in,International Conference on Intelligent Tutori...,2254514,e8365676-c4aa-3cab-aa6b-70ab4a213f23
...,...,...,...,...,...
2995,Zhang Ji-feng,published in,IEEE Transactions on Automatic Control,2108131021,aee98688-4078-3a5e-80e5-381f5a00e475
2996,Chen-Yu Lee,published in,Computer Vision and Pattern Recognition,50521003,ecad6aa0-9345-3f5d-8712-d49f5222c18e
2997,Yang Ou Xin,published in,SOUPS @ USENIX Security Symposium,1932524,fc0d5184-e930-35b5-ad38-92acca823d84
2998,Govindan R.,published in,IEEE/ACM Transactions on Networking,1747970,84f11ece-2e6e-3069-954b-6b302611876d


In [15]:
qa_author_venue = [
    (f'"In which venue has the author {row.subject} published?"', row.object, row.objectId)
    for row in authors_venues.itertuples()
]

len(qa_author_venue), qa_author_venue[:3]

(3000,
 [('"In which venue has the author N. Flyer published?"',
   'Acta Numerica',
   'a141218b-2f0b-31b0-8127-b7b1b626bbf3'),
  ('"In which venue has the author Christopher Anthony O\'callaghan published?"',
   'Journal of Experimental Medicine',
   '878d96ac-106b-38c0-88a2-dbf7a7d33fae'),
  ('"In which venue has the author Fisher Susan H. published?"',
   'Journal of Bacteriology',
   '79862a52-98a5-3529-8348-6124894a92ea')])

In [16]:
pd.to_pickle(qa_author_venue, 'data/qa_subsets/qa_author_venue.pkl')

### Create (author, works in field, topic) triples & QA pairs with id for training

In [17]:
topics = data[['paperId','s2FieldsOfStudy']].explode('s2FieldsOfStudy')
topics = topics[~topics.s2FieldsOfStudy.isna()]
topics['object'] = [i.get('category') for i in topics.s2FieldsOfStudy]

topic_id_map = {v:str(uuid.uuid3(uuid.NAMESPACE_OID, v)) for v in topics.object.unique()}

In [18]:
authors_topics = authors[['subject','subjectId','objectId']].merge(topics[['paperId','object']], left_on='objectId', right_on='paperId', how='left')

authors_topics.drop(columns=['objectId','paperId'], inplace=True)
authors_topics.drop_duplicates(inplace=True)
authors_topics['objectId'] = authors_topics.object.map(topic_id_map)
authors_topics['property'] = 'works in field'

authors_topics

Unnamed: 0,subject,subjectId,object,objectId,property
0,N. Flyer,1751907,Computer Science,5c7c2038-be47-3681-94f9-921a75cfb5df,works in field
1,N. Flyer,1751907,Mathematics,54b1834c-8853-358a-83c2-b959e3d18092,works in field
2,N. Flyer,1751907,Physics,98b63cd9-ce2e-30dc-b7d8-4e7972d092a6,works in field
3,N. Flyer,1751907,Engineering,f312a29c-d83f-3882-ac6d-28e92749d432,works in field
4,Christopher Anthony O'callaghan,79187876,Biology,7f1bb586-b024-3edf-8198-7bec8a37d19a,works in field
...,...,...,...,...,...
9306,Chen-Yu Lee,50521003,Computer Science,5c7c2038-be47-3681-94f9-921a75cfb5df,works in field
9307,Chen-Yu Lee,50521003,Mathematics,54b1834c-8853-358a-83c2-b959e3d18092,works in field
9309,Yang Ou Xin,1932524,Computer Science,5c7c2038-be47-3681-94f9-921a75cfb5df,works in field
9314,Asoh Hideki,7142317,Computer Science,5c7c2038-be47-3681-94f9-921a75cfb5df,works in field


In [19]:
qa_author_topic = [
    (f'"In which field of study does the author {row.subject} work?"', row.object, row.objectId)
    for row in authors_topics.itertuples()
]
len(qa_author_topic), qa_author_topic[:3]

(5990,
 [('"In which field of study does the author N. Flyer work?"',
   'Computer Science',
   '5c7c2038-be47-3681-94f9-921a75cfb5df'),
  ('"In which field of study does the author N. Flyer work?"',
   'Mathematics',
   '54b1834c-8853-358a-83c2-b959e3d18092'),
  ('"In which field of study does the author N. Flyer work?"',
   'Physics',
   '98b63cd9-ce2e-30dc-b7d8-4e7972d092a6')])

In [20]:
pd.to_pickle(qa_author_topic, 'data/qa_subsets/qa_author_topic.pkl')

### Create (author1, co-authored with, author2) triples & QA pairs with id for training

In [21]:
coauthors = authors.merge(authors, on='objectId', suffixes=('_1','_2'))
coauthors = coauthors[coauthors.subjectId_1 != coauthors.subjectId_2]
coauthors['property'] = 'collaborates with'
coauthors = coauthors[['subject_1','property','subject_2','subjectId_1','subjectId_2']]
coauthors.rename(columns={'subject_1':'subject','subjectId_1':'subjectId','subject_2':'object','subjectId_2':'objectId'}, inplace=True)
coauthors.reset_index(drop=True, inplace=True)
coauthors

Unnamed: 0,subject,property,object,subjectId,objectId
0,L. Smith Rachel,collaborates with,L. Clarke,2157686591,1712081
1,Iavarone A.,collaborates with,Zhang Wei,2700430,2155468112
2,Iavarone A.,collaborates with,Shen R.,2700430,2261512066
3,Iavarone A.,collaborates with,Jaegil Kim,2700430,46454427
4,Iavarone A.,collaborates with,A. Unterberg,2700430,3290065
...,...,...,...,...,...
2128,Harrington R.,collaborates with,W. Rużyłło,1932201,152556367
2129,Blair Olkin Catherine,collaborates with,Mckinnon Beall William,5752279,145741171
2130,Blair Olkin Catherine,collaborates with,Heather Alison Elliott,5752279,49039752
2131,Blair Olkin Catherine,collaborates with,J.j. Kavelaars,5752279,100889387


In [22]:
qa_coauthors = [
    (f'"Which author has co-authored with {row.subject}?"', row.object, row.objectId)
    for row in coauthors.itertuples()
]
len(qa_coauthors), qa_coauthors[:3]

(2133,
 [('"Which author has co-authored with L. Smith Rachel?"',
   'L. Clarke',
   '1712081'),
  ('"Which author has co-authored with Iavarone A.?"',
   'Zhang Wei',
   '2155468112'),
  ('"Which author has co-authored with Iavarone A.?"',
   'Shen R.',
   '2261512066')])

In [23]:
pd.to_pickle(qa_coauthors, 'data/qa_subsets/qa_coauthors.pkl')

### Create (venue, published, paper) triples & QA pairs with id for training

In [24]:
papers_venue = data[data.paperId.isin(authors.objectId.unique())]
papers_venue = papers_venue[['paperId','title','venue']]
papers_venue.rename(columns={'paperId':'objectId','title':'object', 'venue':'subject'}, inplace=True)
papers_venue['subjectId'] = papers_venue.subject.map(venue_id_map)
papers_venue['property'] = 'published'   
papers_venue

Unnamed: 0,objectId,object,subject,subjectId,property
5,ac9748ea3945eb970cc32a37db7cfdfd0f22e74c,Ridge-based vessel segmentation in color image...,IEEE Transactions on Medical Imaging,503ae1ae-49f2-38e2-a863-43cae297e161,published
16,bf5a4480f09d97cb27402cda19fd126101fe0a44,Protein homology detection by HMM?CHMM comparison,Bioinform.,746503af-d318-3a57-a0cb-2e2c7b2bb16e,published
21,10028f490a9dc0e4c024ab40e0bee9f3e027f875,Using mutual information for selecting feature...,IEEE Trans. Neural Networks,54532f94-1c58-3c31-b392-dba0aabed500,published
28,7bffc397f8a82a23862d2bacee7bb7bbfac2417e,Cross-Layer combining of adaptive Modulation a...,IEEE Transactions on Wireless Communications,4152a496-e184-36b5-bc13-dea0c364b6bd,published
34,287b6e2ad1225b5a8291d8ccf150ab1ecdf2af7f,Trajectory clustering: a partition-and-group f...,ACM SIGMOD Conference,23b04e45-9408-3589-9c53-e174b24175b0,published
...,...,...,...,...,...
13163,83d108dfa0f1d8c01e07fd1354c3a72f12465f91,Sensitivity of HIV-1 to entry inhibitors corre...,Proceedings of the National Academy of Science...,ee93573d-bbfe-3687-b713-01a9cd1d9ac2,published
13167,ab450671629c22c233c56001e858a0e9c4c24979,Schema Creation in Programming,Cognitive Sciences,ad03f33f-3ac9-3268-b3ec-d34c3ab0bdc2,published
13175,be868a4e978129f5b43316c389ea11f6eff5ef45,The price of stability for network design with...,45th Annual IEEE Symposium on Foundations of C...,92ea820d-24a7-32fb-9985-da0935560256,published
13176,2cf44c2edd3e0da932ca3e94cbe385998fdecbeb,On a network creation game,ACM SIGACT-SIGOPS Symposium on Principles of D...,832d458a-420f-326e-9d35-d5da7ccef936,published


In [25]:
qa_venue_paper = [
    (f'Which paper was published in the venue "{row.subject}"?', row.object, row.objectId)
    for row in papers_venue.itertuples()
]

len(qa_venue_paper), qa_venue_paper[:3]

(2573,
 [('Which paper was published in the venue "IEEE Transactions on Medical Imaging"?',
   'Ridge-based vessel segmentation in color images of the retina',
   'ac9748ea3945eb970cc32a37db7cfdfd0f22e74c'),
  ('Which paper was published in the venue "Bioinform."?',
   'Protein homology detection by HMM?CHMM comparison',
   'bf5a4480f09d97cb27402cda19fd126101fe0a44'),
  ('Which paper was published in the venue "IEEE Trans. Neural Networks"?',
   'Using mutual information for selecting features in supervised neural net learning',
   '10028f490a9dc0e4c024ab40e0bee9f3e027f875')])

In [26]:
pd.to_pickle(qa_venue_paper, 'data/qa_subsets/qa_venue_paper.pkl')

#### Join triplets & all QA training pairs

In [27]:
final_corpus = pd.concat([authors, papers, authors_venues, authors_topics, coauthors, papers_venue], ignore_index=True)
final_corpus

Unnamed: 0,subject,property,object,subjectId,objectId
0,N. Flyer,wrote,Solving PDEs with radial basis functions *,1751907,d397570eef10925f7ebc2da644e54f0a55ba2f13
1,Christopher Anthony O'callaghan,wrote,Direct Visualization of Antigen-specific CD8+T...,79187876,8c134d5124bb3eaae85e09d7d7cb60cc296a0ab7
2,Fisher Susan H.,wrote,Bacillus subtilis 168 Contains Two Differentia...,2220105,38677364e0210277add24a01ebabaed982455145
3,Morilla Antonio González,wrote,Speculative execution via address prediction a...,144359098,542d7ddb6a2efa4a9a55c63bdc1e5fbae129df56
4,Till Ballendat,wrote,Proxemic interaction: designing for a proximit...,2254514,be4ddea1bf8ee8f803b90425257892b31f6f5b87
...,...,...,...,...,...
18700,Proceedings of the National Academy of Science...,published,Sensitivity of HIV-1 to entry inhibitors corre...,ee93573d-bbfe-3687-b713-01a9cd1d9ac2,83d108dfa0f1d8c01e07fd1354c3a72f12465f91
18701,Cognitive Sciences,published,Schema Creation in Programming,ad03f33f-3ac9-3268-b3ec-d34c3ab0bdc2,ab450671629c22c233c56001e858a0e9c4c24979
18702,45th Annual IEEE Symposium on Foundations of C...,published,The price of stability for network design with...,92ea820d-24a7-32fb-9985-da0935560256,be868a4e978129f5b43316c389ea11f6eff5ef45
18703,ACM SIGACT-SIGOPS Symposium on Principles of D...,published,On a network creation game,832d458a-420f-326e-9d35-d5da7ccef936,2cf44c2edd3e0da932ca3e94cbe385998fdecbeb


In [28]:
final_corpus.to_parquet('data/triples_corpus.parquet')

In [29]:
qa = qa_related_papers + qa_author_papers + qa_author_venue + qa_author_topic + qa_coauthors + qa_venue_paper

In [30]:
len(qa)

18705

In [31]:
pd.to_pickle(qa, 'data/qa_training.pkl')

#### Save evaluation set

In [32]:
evaluation = random.choices(qa_related_papers, k=250) +\
             random.choices(qa_author_papers, k=250) +\
             random.choices(qa_author_venue, k=250) +\
             random.choices(qa_author_topic, k=250) +\
             random.choices(qa_coauthors, k=250) +\
             random.choices(qa_venue_paper, k=250) 

In [33]:
evaluation = pd.DataFrame(evaluation, columns=['question', 'answer', 'answerId'])
evaluation.drop_duplicates('question', inplace=True)
evaluation.dropna(inplace=True)
evaluation.reset_index(inplace=True, names='id')
evaluation

Unnamed: 0,id,question,answer,answerId
0,0,Which paper is cited or referenced in the pape...,Hybrid Genetic Algorithms: A Review,340b165fd2108f29e9b03629c1e1373cd4163ecf
1,1,Which paper is cited or referenced in the pape...,Identification of a gene encoding an acyl CoA:...,c5776d7726c055461f8b18faaf72459ed419cf12
2,2,Which paper is cited or referenced in the pape...,Online Walking Motion Generation with Automati...,c230eb7ddc823258f52d5e1555ea568a0829cd48
3,3,Which paper is cited or referenced in the pape...,The Network Architecture of the Connection Mac...,07fcb2a39e6c57eb3949292cdfd8b9a736b9d695
4,4,Which paper is cited or referenced in the pape...,NeuS: Learning Neural Implicit Surfaces by Vol...,cf5647cb2613f5f697729eab567383006dcd4913
...,...,...,...,...
1271,1489,"Which paper was published in the venue ""Confer...",Stability issues in OSPF routing,70203dfc30bb61c2c6a6c52b0d6cfb4b85c80be7
1272,1490,"Which paper was published in the venue ""Fourti...",Optimal operation of distribution system with ...,a4aa72a46bf0b47156655562ae9ca5a8818080f3
1273,1493,"Which paper was published in the venue ""Evolut...",PERSPECTIVE: A CRITIQUE OF SEWALL WRIGHT'S SHI...,eebfa46949f6a841aef2924b6e1fb307a0bdf551
1274,1497,"Which paper was published in the venue ""Intern...",Opinion Word Expansion and Target Extraction t...,4fe594f1f0358a00b19e2bb950e45988decab9b7


In [34]:
corpus = {row.answerId:row.answer for row in evaluation.itertuples()}
queries = {str(row.id):row.question for row in evaluation.itertuples()}
relevant_docs = {str(row.id):row.answerId for row in evaluation.itertuples()}

In [35]:
evaluation = dict(corpus=corpus, queries=queries, relevant_docs=relevant_docs)

In [36]:
pd.to_pickle(evaluation, 'data/qa_evaluation.pkl')