Import libraries

In [9]:
import pandas as pd
import re
import random
import uuid


Set a random seed for reproducibility and read the data from a Parquet file

In [10]:
random.seed(42)

data = pd.read_parquet("data/mlt_data_publications.parquet")
data = data[~data.s2FieldsOfStudy.isna()]

In [11]:
data.head(3)

Unnamed: 0,paperId,title,abstract,venue,referenceCount,citationCount,influentialCitationCount,s2FieldsOfStudy,publicationDate,authors,con_type,source
0,bb01353f818ca226b53433163893efc56c3df32d,RADAR: an in-building RF-based user location a...,The proliferation of mobile computing devices ...,Proceedings IEEE INFOCOM 2000. Conference on C...,29.0,8786.0,1048.0,"[{'category': 'Computer Science', 'source': 'e...",2000-03-26,"[{'affiliations': [], 'aliases': ['P. Bahl', '...",base,bb01353f818ca226b53433163893efc56c3df32d
1,f99ae00d1244aea3623471a24ee94f1f408398ab,Model selection and model averaging in phyloge...,Model selection is a topic of special relevanc...,Systematic Biology,180.0,3738.0,807.0,"[{'category': 'Biology', 'source': 'external'}...",2004-10-01,"[{'affiliations': [], 'aliases': ['D Posada', ...",base,f99ae00d1244aea3623471a24ee94f1f408398ab
2,b26f2037f769d5ffc5f7bdcec2de8da28ec14bee,Dense Passage Retrieval for Open-Domain Questi...,Open-domain question answering relies on effic...,Conference on Empirical Methods in Natural Lan...,55.0,2082.0,631.0,"[{'category': 'Computer Science', 'source': 'e...",2020-04-10,"[{'affiliations': [], 'aliases': ['Vlad Karpuk...",base,b26f2037f769d5ffc5f7bdcec2de8da28ec14bee


### Create (author, wrote, paper) triples & QA pairs with id for training

Define a constant for the total number of unique authors

In [12]:
TOTAL_UNIQUE_AUTHORS = 2000 # max unique authors: 44,574

Iterate through each row of the dataframe, extract author information, clean and standardize names and aliases, then append unique author-publication records to the list with author name, publication title, paper ID, and author ID.

In [13]:
authors = []
for row in data.itertuples():
    for author in row.authors:
        names = [author.get('name')]
        
        try:
            aliases = author.get('aliases').tolist()
        except AttributeError:
            aliases = []
        
        names += aliases
        
        names = [' '.join(set(re.sub(r'\s+', ' ', name).split(' ')) )for name in names]

        for name in list(set(names)):
            authors.append({'author': name, 'publication': row.title, 
                            'paperId': row.paperId, 'authorId': author.get('authorId')})

Trim down the dataset by removing duplicate author-publication records, keeping only the first occurrence, and then randomly sample to get a subset of unique authors for further analysis.

In [14]:
authors = pd.DataFrame(authors)

## Reduce the dataset! This should be removed in the final version (?)
authors['len_name'] = authors.author.apply(len)
authors.sort_values(['paperId','authorId','len_name'], ascending=False, inplace=True)
authors.drop_duplicates(subset=['paperId','authorId'], keep='first', inplace=True)

authors = authors.sample(TOTAL_UNIQUE_AUTHORS, random_state=42).reset_index(drop=True)
len(authors)

2000

Rename columns to represent subject and object entities, assign 'wrote' as the property between authors and publications, and reorganize the dataframe to adhere to a subject-property-object structure.

In [15]:
authors.rename(columns={'author': 'subject','publication':'object','authorId':'subjectId','paperId':'objectId'}, inplace=True)
authors['property'] = 'wrote'
authors = authors[['subject','property','object','subjectId','objectId']]
authors

Unnamed: 0,subject,property,object,subjectId,objectId
0,N. Flyer,wrote,Solving PDEs with radial basis functions *,1751907,d397570eef10925f7ebc2da644e54f0a55ba2f13
1,Anthony Christopher O'callaghan,wrote,Direct Visualization of Antigen-specific CD8+T...,79187876,8c134d5124bb3eaae85e09d7d7cb60cc296a0ab7
2,Susan H. Fisher,wrote,Bacillus subtilis 168 Contains Two Differentia...,2220105,38677364e0210277add24a01ebabaed982455145
3,Antonio González Morilla,wrote,Speculative execution via address prediction a...,144359098,542d7ddb6a2efa4a9a55c63bdc1e5fbae129df56
4,Ballendat Till,wrote,Proxemic interaction: designing for a proximit...,2254514,be4ddea1bf8ee8f803b90425257892b31f6f5b87
...,...,...,...,...,...
1995,Atzori L.,wrote,Trustworthiness Management in the Social Inter...,1720529,5a97f384f7614848bcee66a5865bdd32dbf4e1ac
1996,Bhoedjang A. F. Raoul,wrote,MagPIe: MPI's collective communication operati...,1680452,dd25b7916d46b0b329d8965bc4e205b818ce6ac1
1997,R. Jain,wrote,"Comprehensive, Integrative Genomic Analysis of...",144004586,e76642c7e25784cd3bfe6c3c7d4d48d43ed94ace
1998,Chen Wen-Tsuen,wrote,Secure Broadcasting Using the Secure Lock,2109090572,ba8e57476a61bad5cfeb18b4acd66633cbcc46df


Generate questions in the form of "Who wrote the paper titled '...'?" for each author-publication pair in the dataframe.

In [16]:
qa_author_papers = [
    (f'Who wrote the paper titled "{row.object}"?', row.subject, row.subjectId)
    for row in authors.itertuples()
]
len(qa_author_papers), qa_author_papers[:3]

(2000,
 [('Who wrote the paper titled "Solving PDEs with radial basis functions *"?',
   'N. Flyer',
   '1751907'),
  ('Who wrote the paper titled "Direct Visualization of Antigen-specific CD8+T Cells during the Primary Immune Response to Epstein-Barr Virus In Vivo"?',
   "Anthony Christopher O'callaghan",
   '79187876'),
  ('Who wrote the paper titled "Bacillus subtilis 168 Contains Two Differentially Regulated Genes Encoding l-Asparaginase"?',
   'Susan H. Fisher',
   '2220105')])

Save the generated questions about authors and their papers to a pickle file.

In [17]:
pd.to_pickle(qa_author_papers, 'data/qa_subsets/qa_authors.pkl')

### Create (paper1, related with, paper2) triples & QA pairs with id for training

Filter the papers dataframe based on the unique object IDs (paper IDs) present in the authors dataframe. Then, create a mapping of paper IDs to titles. Finally, filter the papers dataframe further to exclude entries with 'base' content type.

In [18]:
papers = data[data.paperId.isin(authors.objectId.unique())]

titles_map = {row.paperId:row.title for row in data.itertuples()}

papers.query('con_type!="base"', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers.query('con_type!="base"', inplace=True)


Update the 'papers' dataframe by mapping the paper IDs to their respective titles using the 'titles_map'. Then, rename the columns to match the format of the 'authors' dataframe. Add a new column 'property' with the value 'related with'. Finally, select and reorder the columns accordingly.

In [19]:
papers['object'] = papers.source.map(titles_map)

papers.rename(columns={'source':'objectId','paperId':'subjectId','title':'subject'}, inplace=True)
papers['property'] = 'related with'

papers = papers[['subject','property','object','subjectId','objectId']]
papers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers['object'] = papers.source.map(titles_map)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers.rename(columns={'source':'objectId','paperId':'subjectId','title':'subject'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers['property'] = 'related with'


Unnamed: 0,subject,property,object,subjectId,objectId
3006,A lossless text compression technique using sy...,related with,A genetic algorithm approach for verification ...,abc60da0c3eea5e1519cc4f2eb97008187898777,ffe288376b4e62a9d84f28b249b0c92f7116686a
3014,A Formal Look at Dependency Grammars and Phras...,related with,D-Tree Substitution Grammars,575543a479c42f997f07b65b61c2639f94fe30de,ffb5f1cd15a620ba6f77f3368f71606b5940cba0
3019,Developmental Constraints and Evolution: A Per...,related with,The innovation triad: an EvoDevo agenda.,12cb50be697a13e76e0b11cc1ec3aaaa8bf41312,ff952b688cdb7eb8baf780507171ce6e122d8af4
3023,Characterization of early pathogenic effects a...,related with,Genomic sequence analysis identifies Jembrana ...,732786b565985999c1cccfcdb88aa7f3dc4184ca,ff8a238d958ed44f4fae78e5ef99d5383083deb5
3029,The Linear Programming Approach to Approximate...,related with,Duality and linear programs for stability and ...,8a14ac38f66996913c4d7f3a3141294a602fd8f3,ff351f5cccba47c7ae38b38a310279fb07232cab
...,...,...,...,...,...
13149,All One Needs to Know about Metaverse: A Compl...,related with,Multimodal feedback for the acquisition of sma...,b00cc531f4872dcff8577fc238f70a088eef3b56,05953fab70c47eff45154bdebd004539bfa4da45
13150,Semantic pointing: improving target acquisitio...,related with,Multimodal feedback for the acquisition of sma...,f86a29b66ab93bcf9e2e51718c47e738b719e095,05953fab70c47eff45154bdebd004539bfa4da45
13159,Full-Length Human Immunodeficiency Virus Type ...,related with,Comprehensive Cross-Clade Neutralization Analy...,30d19c81318b9c0db8ab9564b62472f6e1a4ab2a,031a0bd9cf17a9acff2d56ae934a3451998e3925
13162,HIV-1 Nomenclature Proposal,related with,Comprehensive Cross-Clade Neutralization Analy...,25e27805d2c0e031dd48fb8d06cfb90623e1fb2f,031a0bd9cf17a9acff2d56ae934a3451998e3925


In [20]:
qa_related_papers = [
    (f"Which paper is cited or referenced in the paper titled '{row.object}'?", row.subject, row.subjectId)
    for row in papers.itertuples()
]

len(qa_related_papers), qa_related_papers[:3]

(1381,
 [("Which paper is cited or referenced in the paper titled 'A genetic algorithm approach for verification of the syllable-based text compression technique'?",
   'A lossless text compression technique using syllable based morphology',
   'abc60da0c3eea5e1519cc4f2eb97008187898777'),
  ("Which paper is cited or referenced in the paper titled 'D-Tree Substitution Grammars'?",
   'A Formal Look at Dependency Grammars and Phrase-Structure Grammars, with Special Consideration of Word-Order Phenomena',
   '575543a479c42f997f07b65b61c2639f94fe30de'),
  ("Which paper is cited or referenced in the paper titled 'The innovation triad: an EvoDevo agenda.'?",
   'Developmental Constraints and Evolution: A Perspective from the Mountain Lake Conference on Development and Evolution',
   '12cb50be697a13e76e0b11cc1ec3aaaa8bf41312')])

Save the 'qa_related_papers' list as a pickle file.

In [21]:
pd.to_pickle(qa_related_papers, 'data/qa_subsets/qa_cites_refs.pkl')

### Create (author, published in, venue) triples & QA pairs with id for training

Generate unique identifiers for venues, merge author and venue data, rename columns appropriately, generate unique identifiers for venues, and update the property accordingly to 'published in'.

In [22]:
venue_id_map = {v:str(uuid.uuid3(uuid.NAMESPACE_OID, v)) for v in data.venue.unique()}

In [23]:
authors_venues = authors.merge(data[['paperId','venue']], left_on='objectId', right_on='paperId', how='left')
authors_venues = authors_venues[['subject','property','venue','subjectId']].rename(columns={'venue':'object'})
authors_venues['objectId'] = authors_venues.object.map(venue_id_map)
authors_venues['property'] = 'published in'
authors_venues

Unnamed: 0,subject,property,object,subjectId,objectId
0,N. Flyer,published in,Acta Numerica,1751907,a141218b-2f0b-31b0-8127-b7b1b626bbf3
1,Anthony Christopher O'callaghan,published in,Journal of Experimental Medicine,79187876,878d96ac-106b-38c0-88a2-dbf7a7d33fae
2,Susan H. Fisher,published in,Journal of Bacteriology,2220105,79862a52-98a5-3529-8348-6124894a92ea
3,Antonio González Morilla,published in,International Conference on Supercomputing,144359098,8df687ec-6d98-31e9-ad8a-16a2838e6436
4,Ballendat Till,published in,International Conference on Intelligent Tutori...,2254514,e8365676-c4aa-3cab-aa6b-70ab4a213f23
...,...,...,...,...,...
1995,Atzori L.,published in,IEEE Transactions on Knowledge and Data Engine...,1720529,f9d2dcbd-106f-346d-af17-eabece646fb0
1996,Bhoedjang A. F. Raoul,published in,ACM SIGPLAN Symposium on Principles & Practice...,1680452,5be572f0-f0f5-3229-a061-3140b08f3527
1997,R. Jain,published in,New England Journal of Medicine,144004586,aa6c8145-f281-309c-af23-1ad8f0ad667e
1998,Chen Wen-Tsuen,published in,IEEE Trans. Software Eng.,2109090572,a0023640-b9d1-3c30-84c1-3a128326ec61


Create a list of questions and answers about authors' publication venues, extracting data from the merged DataFrame of authors and venues, then generate unique identifiers for venues and form questions such as "In which venue has the author {author_name} published?"

In [24]:
qa_author_venue = [
    (f'"In which venue has the author {row.subject} published?"', row.object, row.objectId)
    for row in authors_venues.itertuples()
]

len(qa_author_venue), qa_author_venue[:3]

(2000,
 [('"In which venue has the author N. Flyer published?"',
   'Acta Numerica',
   'a141218b-2f0b-31b0-8127-b7b1b626bbf3'),
  ('"In which venue has the author Anthony Christopher O\'callaghan published?"',
   'Journal of Experimental Medicine',
   '878d96ac-106b-38c0-88a2-dbf7a7d33fae'),
  ('"In which venue has the author Susan H. Fisher published?"',
   'Journal of Bacteriology',
   '79862a52-98a5-3529-8348-6124894a92ea')])

In [25]:
pd.to_pickle(qa_author_venue, 'data/qa_subsets/qa_author_venue.pkl')

### Create (author, works in field, topic) triples & QA pairs with id for training

Extracts topics from the dataset and maps them to unique identifiers using UUIDs.

In [26]:
topics = data[['paperId','s2FieldsOfStudy']].explode('s2FieldsOfStudy')
topics = topics[~topics.s2FieldsOfStudy.isna()]
topics['object'] = [i.get('category') for i in topics.s2FieldsOfStudy]

topic_id_map = {v:str(uuid.uuid3(uuid.NAMESPACE_OID, v)) for v in topics.object.unique()}

Segment links authors with the topics they work in, using unique identifiers for topics.

In [27]:
authors_topics = authors[['subject','subjectId','objectId']].merge(topics[['paperId','object']], left_on='objectId', right_on='paperId', how='left')

authors_topics.drop(columns=['objectId','paperId'], inplace=True)
authors_topics.drop_duplicates(inplace=True)
authors_topics['objectId'] = authors_topics.object.map(topic_id_map)
authors_topics['property'] = 'works in field'

authors_topics

Unnamed: 0,subject,subjectId,object,objectId,property
0,N. Flyer,1751907,Computer Science,5c7c2038-be47-3681-94f9-921a75cfb5df,works in field
1,N. Flyer,1751907,Mathematics,54b1834c-8853-358a-83c2-b959e3d18092,works in field
2,N. Flyer,1751907,Physics,98b63cd9-ce2e-30dc-b7d8-4e7972d092a6,works in field
3,N. Flyer,1751907,Engineering,f312a29c-d83f-3882-ac6d-28e92749d432,works in field
4,Anthony Christopher O'callaghan,79187876,Biology,7f1bb586-b024-3edf-8198-7bec8a37d19a,works in field
...,...,...,...,...,...
6140,Bhoedjang A. F. Raoul,1680452,Engineering,f312a29c-d83f-3882-ac6d-28e92749d432,works in field
6141,R. Jain,144004586,Biology,7f1bb586-b024-3edf-8198-7bec8a37d19a,works in field
6142,R. Jain,144004586,Medicine,3179df26-84b2-3e99-95c0-93f5e29a96b4,works in field
6145,Chen Wen-Tsuen,2109090572,Computer Science,5c7c2038-be47-3681-94f9-921a75cfb5df,works in field


Creates questions about the field of study each author works in, based on the provided data.

In [28]:
qa_author_topic = [
    (f'"In which field of study does the author {row.subject} work?"', row.object, row.objectId)
    for row in authors_topics.itertuples()
]
len(qa_author_topic), qa_author_topic[:3]

(3952,
 [('"In which field of study does the author N. Flyer work?"',
   'Computer Science',
   '5c7c2038-be47-3681-94f9-921a75cfb5df'),
  ('"In which field of study does the author N. Flyer work?"',
   'Mathematics',
   '54b1834c-8853-358a-83c2-b959e3d18092'),
  ('"In which field of study does the author N. Flyer work?"',
   'Physics',
   '98b63cd9-ce2e-30dc-b7d8-4e7972d092a6')])

In [29]:
pd.to_pickle(qa_author_topic, 'data/qa_subsets/qa_author_topic.pkl')

### Create (author1, co-authored with, author2) triples & QA pairs with id for training

Generate pairs of authors who collaborate on papers to denote their collaborative relationships.

In [30]:
coauthors = authors.merge(authors, on='objectId', suffixes=('_1','_2'))
coauthors = coauthors[coauthors.subjectId_1 != coauthors.subjectId_2]
coauthors['property'] = 'collaborates with'
coauthors = coauthors[['subject_1','property','subject_2','subjectId_1','subjectId_2']]
coauthors.rename(columns={'subject_1':'subject','subjectId_1':'subjectId','subject_2':'object','subjectId_2':'objectId'}, inplace=True)
coauthors.reset_index(drop=True, inplace=True)
coauthors

Unnamed: 0,subject,property,object,subjectId,objectId
0,A. Iavarone,collaborates with,Zhang Wei,2700430,2155468112
1,A. Iavarone,collaborates with,R. Shen,2700430,2261512066
2,A. Iavarone,collaborates with,Kim Jaegil,2700430,46454427
3,A. Iavarone,collaborates with,A. Unterberg,2700430,3290065
4,A. Iavarone,collaborates with,Getz G.,2700430,2110594
...,...,...,...,...,...
978,R. Jain,collaborates with,Zhang Jianan,144004586,101594815
979,R. Jain,collaborates with,Berger M.,144004586,2113458299
980,R. Jain,collaborates with,E. Lisle Mose,144004586,3100330
981,R. Jain,collaborates with,Crain D.,144004586,40612256


Create questions to identify authors who have collaborated with specific individuals.

In [31]:
qa_coauthors = [
    (f'"Which author has co-authored with {row.subject}?"', row.object, row.objectId)
    for row in coauthors.itertuples()
]
len(qa_coauthors), qa_coauthors[:3]

(983,
 [('"Which author has co-authored with A. Iavarone?"',
   'Zhang Wei',
   '2155468112'),
  ('"Which author has co-authored with A. Iavarone?"',
   'R. Shen',
   '2261512066'),
  ('"Which author has co-authored with A. Iavarone?"',
   'Kim Jaegil',
   '46454427')])

In [32]:
pd.to_pickle(qa_coauthors, 'data/qa_subsets/qa_coauthors.pkl')

### Create (venue, published, paper) triples & QA pairs with id for training

Extract papers along with their venue information and format them for further processing.

In [33]:
papers_venue = data[data.paperId.isin(authors.objectId.unique())]
papers_venue = papers_venue[['paperId','title','venue']]
papers_venue.rename(columns={'paperId':'objectId','title':'object', 'venue':'subject'}, inplace=True)
papers_venue['subjectId'] = papers_venue.subject.map(venue_id_map)
papers_venue['property'] = 'published'   
papers_venue

Unnamed: 0,objectId,object,subject,subjectId,property
5,ac9748ea3945eb970cc32a37db7cfdfd0f22e74c,Ridge-based vessel segmentation in color image...,IEEE Transactions on Medical Imaging,503ae1ae-49f2-38e2-a863-43cae297e161,published
16,bf5a4480f09d97cb27402cda19fd126101fe0a44,Protein homology detection by HMM?CHMM comparison,Bioinform.,746503af-d318-3a57-a0cb-2e2c7b2bb16e,published
21,10028f490a9dc0e4c024ab40e0bee9f3e027f875,Using mutual information for selecting feature...,IEEE Trans. Neural Networks,54532f94-1c58-3c31-b392-dba0aabed500,published
28,7bffc397f8a82a23862d2bacee7bb7bbfac2417e,Cross-Layer combining of adaptive Modulation a...,IEEE Transactions on Wireless Communications,4152a496-e184-36b5-bc13-dea0c364b6bd,published
50,921d07635bfc4aaf7bdf6646844a7a771e6f070f,Optimization by Direct Search: New Perspective...,SIAM Review,31c36d56-7fc8-31e0-a185-4fd559e5bae1,published
...,...,...,...,...,...
13149,b00cc531f4872dcff8577fc238f70a088eef3b56,All One Needs to Know about Metaverse: A Compl...,arXiv.org,6c3a99f1-e1f4-3d27-9a01-85948c3c2108,published
13150,f86a29b66ab93bcf9e2e51718c47e738b719e095,Semantic pointing: improving target acquisitio...,International Conference on Human Factors in C...,4aa50f84-3032-3672-a81e-834d9eae9d09,published
13159,30d19c81318b9c0db8ab9564b62472f6e1a4ab2a,Full-Length Human Immunodeficiency Virus Type ...,Journal of Virology,dda431ea-b236-3c6c-8775-ccc799ebfdd4,published
13162,25e27805d2c0e031dd48fb8d06cfb90623e1fb2f,HIV-1 Nomenclature Proposal,Science,69b09919-4f6d-379f-bcb9-b36eef733a4a,published


Generate questions about papers and their respective venues.

In [34]:
qa_venue_paper = [
    (f'Which paper was published in the venue "{row.subject}"?', row.object, row.objectId)
    for row in papers_venue.itertuples()
]

len(qa_venue_paper), qa_venue_paper[:3]

(1789,
 [('Which paper was published in the venue "IEEE Transactions on Medical Imaging"?',
   'Ridge-based vessel segmentation in color images of the retina',
   'ac9748ea3945eb970cc32a37db7cfdfd0f22e74c'),
  ('Which paper was published in the venue "Bioinform."?',
   'Protein homology detection by HMM?CHMM comparison',
   'bf5a4480f09d97cb27402cda19fd126101fe0a44'),
  ('Which paper was published in the venue "IEEE Trans. Neural Networks"?',
   'Using mutual information for selecting features in supervised neural net learning',
   '10028f490a9dc0e4c024ab40e0bee9f3e027f875')])

In [35]:
pd.to_pickle(qa_venue_paper, 'data/qa_subsets/qa_venue_paper.pkl')

#### Join triplets & all QA training pairs

Merge all relevant dataframes, including authors, papers, venue details, author-topic relationships, co-authors, and papers published in venues, into a single comprehensive corpus, ensuring seamless integration for comprehensive analysis.

In [36]:
final_corpus = pd.concat([authors, papers, authors_venues, authors_topics, coauthors, papers_venue], ignore_index=True)
final_corpus

Unnamed: 0,subject,property,object,subjectId,objectId
0,N. Flyer,wrote,Solving PDEs with radial basis functions *,1751907,d397570eef10925f7ebc2da644e54f0a55ba2f13
1,Anthony Christopher O'callaghan,wrote,Direct Visualization of Antigen-specific CD8+T...,79187876,8c134d5124bb3eaae85e09d7d7cb60cc296a0ab7
2,Susan H. Fisher,wrote,Bacillus subtilis 168 Contains Two Differentia...,2220105,38677364e0210277add24a01ebabaed982455145
3,Antonio González Morilla,wrote,Speculative execution via address prediction a...,144359098,542d7ddb6a2efa4a9a55c63bdc1e5fbae129df56
4,Ballendat Till,wrote,Proxemic interaction: designing for a proximit...,2254514,be4ddea1bf8ee8f803b90425257892b31f6f5b87
...,...,...,...,...,...
12100,arXiv.org,published,All One Needs to Know about Metaverse: A Compl...,6c3a99f1-e1f4-3d27-9a01-85948c3c2108,b00cc531f4872dcff8577fc238f70a088eef3b56
12101,International Conference on Human Factors in C...,published,Semantic pointing: improving target acquisitio...,4aa50f84-3032-3672-a81e-834d9eae9d09,f86a29b66ab93bcf9e2e51718c47e738b719e095
12102,Journal of Virology,published,Full-Length Human Immunodeficiency Virus Type ...,dda431ea-b236-3c6c-8775-ccc799ebfdd4,30d19c81318b9c0db8ab9564b62472f6e1a4ab2a
12103,Science,published,HIV-1 Nomenclature Proposal,69b09919-4f6d-379f-bcb9-b36eef733a4a,25e27805d2c0e031dd48fb8d06cfb90623e1fb2f


Save the integrated corpus data as a Parquet file named "triples_corpus.parquet" for efficient storage and future retrieval.

In [37]:
final_corpus.to_parquet('data/triples_corpus.parquet')


Combine all question-answer pairs related to the dataset into a single list.

In [38]:
qa = qa_related_papers + qa_author_papers + qa_author_venue + qa_author_topic + qa_coauthors + qa_venue_paper

In [39]:
len(qa)

12105

In [40]:
pd.to_pickle(qa, 'data/qa_training.pkl')

#### Save evaluation set

Create an evaluation set by randomly selecting 250 question-answer pairs from each category: related papers, author papers, author venues, author topics, co-authors, and venue papers. Combine them into a single list named "evaluation" for assessment purposes.

In [41]:
evaluation = random.choices(qa_related_papers, k=250) +\
             random.choices(qa_author_papers, k=250) +\
             random.choices(qa_author_venue, k=250) +\
             random.choices(qa_author_topic, k=250) +\
             random.choices(qa_coauthors, k=250) +\
             random.choices(qa_venue_paper, k=250) 

Generate an evaluation DataFrame by converting the list of question-answer pairs into a DataFrame named "evaluation". Ensure uniqueness of questions, remove any rows with missing values, and reset the index. Assign column names "question", "answer", and "answerId" to the DataFrame.

In [42]:
evaluation = pd.DataFrame(evaluation, columns=['question', 'answer', 'answerId'])
evaluation.drop_duplicates('question', inplace=True)
evaluation.dropna(inplace=True)
evaluation.reset_index(inplace=True, names='id')
evaluation

Unnamed: 0,id,question,answer,answerId
0,0,Which paper is cited or referenced in the pape...,Cramer-Rao bounds for non-linear filtering wit...,055ac0372dd10db1871fae986bf7fb4bad518104
1,1,Which paper is cited or referenced in the pape...,Dynamic voltage and frequency management for a...,37793d57b862d322404308fcc54b7027d77d6061
2,2,Which paper is cited or referenced in the pape...,Longitudinal observation of parechovirus in st...,0a0c696ccd1344785a656b10d6237a23ab3f451f
3,3,Which paper is cited or referenced in the pape...,Soli: ubiquitous gesture sensing with millimet...,00f0a8788ed43c534a8888b2b2e4edbafafe8bf3
4,4,Which paper is cited or referenced in the pape...,Multidimensional independent component analysis,8676573fb87797b0e744f1fd62d230c3fb9903ad
...,...,...,...,...
1211,1493,"Which paper was published in the venue ""Procee...",On-line fault detection of sensor measurements,62a64b9717d01706b0f82a5f20cdd576dd98b2a7
1212,1494,"Which paper was published in the venue ""Brain ...",The functional anatomy of attention to visual ...,7c406b8547c080a1b2aa987d084fc5080d35174c
1213,1496,"Which paper was published in the venue ""CSUR""?",The Recovery Manager of the System R Database ...,7a9abc36f336750f4c0679f0b4ef87c9dc12133c
1214,1497,"Which paper was published in the venue ""Journa...",Rainbow triangles and the Caccetta‐Häggkvist c...,26a14afe49062eb660d3aa3e3d1ea53d5b104dd4


Create dictionaries from the evaluation DataFrame.

In [43]:
corpus = {row.answerId:row.answer for row in evaluation.itertuples()}
queries = {str(row.id):row.question for row in evaluation.itertuples()}
relevant_docs = {str(row.id):row.answerId for row in evaluation.itertuples()}

In [44]:
evaluation = dict(corpus=corpus, queries=queries, relevant_docs=relevant_docs)

In [45]:
pd.to_pickle(evaluation, 'data/qa_evaluation.pkl')

d