In [1]:
import pandas as pd
import re
import random

random.seed(42)

In [2]:
data = pd.read_parquet("data/mlt_data_publications.parquet")

### Authors QA pairs with id for training

In [3]:
authors = []
for row in data.itertuples():
    for author in row.authors:
        names = [author.get('name')]
        
        try:
            aliases = author.get('aliases').tolist()
        except AttributeError:
            aliases = []
        
        names += aliases
        
        names = [' '.join(set(re.sub(r'\s+', ' ', name).split(' ')) )for name in names]

        for name in list(set(names)):
            authors.append({'author': name, 'publication': row.title, 
                            'paperId': row.paperId, 'authorId': author.get('authorId')})

In [4]:
authors = pd.DataFrame(authors)

## Reduce the dataset! This should be removed in the final version
authors['len_name'] = authors.author.apply(len)
authors.sort_values(['paperId','authorId','len_name'], ascending=False, inplace=True)
authors.drop_duplicates(subset=['paperId','authorId'], keep='first', inplace=True)

authors = authors.sample(1000, random_state=42).reset_index(drop=True)
len(authors)

1000

In [5]:
question_answer_pairs_authors = [
    (f'Who wrote the paper titled "{row.publication}"?', row.author, row.authorId)
    for row in authors.itertuples()
]

In [6]:
pd.to_pickle(question_answer_pairs_authors, 'data/qa_subsets/qa_authors.pkl')

### Create (paper1, cite/ref, paper2) triples & QA pairs with id for training

In [7]:
papers = data[data.paperId.isin(authors.paperId)]

titles_map = {row.paperId:row.title for row in data.itertuples()}

related_pubs = [(titles_map[row.paperId], 'relatedWith', titles_map[row.source], row.paperId) 
                for row in papers.itertuples() if row.con_type!='base']

question_answer_pairs = [
    (f'Which paper is cited or referenced in the paper titled "{triple[0]}"?', triple[2], triple[3])
    for triple in related_pubs
]

len(question_answer_pairs)

719

In [8]:
pd.to_pickle(question_answer_pairs, 'data/qa_subsets/qa_cites_refs.pkl')

#### Join all qa training pairs

In [9]:
qa = question_answer_pairs + question_answer_pairs_authors

In [10]:
len(qa)

1719

In [11]:
pd.to_pickle(qa, 'data/qa_training.pkl')

#### Save evaluation set

In [17]:
evaluation = random.choices(question_answer_pairs, k=300) +\
             random.choices(question_answer_pairs_authors, k=300)

In [18]:
evaluation = pd.DataFrame(evaluation, columns=['question', 'answer', 'answerId']).reset_index(names='id')
evaluation.drop_duplicates('question', inplace=True)
evaluation.dropna(inplace=True)

evaluation

Unnamed: 0,id,question,answer,answerId
0,0,Which paper is cited or referenced in the pape...,A multivariate analysis of 59 candidate genes ...,1f585073d6f5bfc9f9b38a5656b02bdde108c916
1,1,Which paper is cited or referenced in the pape...,Current Assessment of Docking into GPCR Crysta...,ee681b0d9bac1b5a0f03b82bec75bf14a99d087b
2,2,Which paper is cited or referenced in the pape...,Real-time and accurate segmentation of moving ...,43a408eae98bd8a8e7363ba47dafe363a01efb5a
3,3,Which paper is cited or referenced in the pape...,Impaired face discrimination in acquired proso...,96a0bd3baa22fa3330b87943fed790160b0af7fe
4,4,Which paper is cited or referenced in the pape...,Toward Optimal Data Aggregation in Random Wire...,7ac5d873742fb30a04725f19a61f0820466c55bb
...,...,...,...,...
591,591,"Who wrote the paper titled ""Intelligent Adapta...",Alvarez Lluc,2295602
594,594,"Who wrote the paper titled ""The Frankencamera:...",K. Pulli,1704409
595,595,"Who wrote the paper titled ""Low-status compens...",Hippler Jochim,2259108636
596,596,"Who wrote the paper titled ""DTW-D: time series...",Chen Yanping,32289348


In [19]:
corpus = {row.answerId:row.answer for row in evaluation.itertuples()}
queries = {str(row.id):row.question for row in evaluation.itertuples()}
relevant_docs = {str(row.id):row.answerId for row in evaluation.itertuples()}

In [20]:
evaluation = dict(corpus=corpus, queries=queries, relevant_docs=relevant_docs)

In [21]:
pd.to_pickle(evaluation, 'data/qa_evaluation.pkl')