In [4]:
import pandas as pd
import re
import random

random.seed(42)

In [5]:
data = pd.read_parquet("data/mlt_data_publications.parquet")

### Authors QA pairs with id for training

In [6]:
TOTAL_UNIQUE_AUTHORS = 2500

In [7]:
authors = []
for row in data.itertuples():
    for author in row.authors:
        names = [author.get('name')]
        
        try:
            aliases = author.get('aliases').tolist()
        except AttributeError:
            aliases = []
        
        names += aliases
        
        names = [' '.join(set(re.sub(r'\s+', ' ', name).split(' ')) )for name in names]

        for name in list(set(names)):
            authors.append({'author': name, 'publication': row.title, 
                            'paperId': row.paperId, 'authorId': author.get('authorId')})

In [8]:
authors = pd.DataFrame(authors)

## Reduce the dataset! This should be removed in the final version
authors['len_name'] = authors.author.apply(len)
authors.sort_values(['paperId','authorId','len_name'], ascending=False, inplace=True)
authors.drop_duplicates(subset=['paperId','authorId'], keep='first', inplace=True)

authors = authors.sample(TOTAL_UNIQUE_AUTHORS, random_state=42).reset_index(drop=True)
len(authors)

2500

In [9]:
question_answer_pairs_authors = [
    (f'Who wrote the paper titled "{row.publication}"?', row.author, row.authorId)
    for row in authors.itertuples()
]

In [10]:
pd.to_pickle(question_answer_pairs_authors, 'data/qa_subsets/qa_authors.pkl')

### Create (paper1, cite/ref, paper2) triples & QA pairs with id for training

In [11]:
papers = data[data.paperId.isin(authors.paperId)]

titles_map = {row.paperId:row.title for row in data.itertuples()}

related_pubs = [(titles_map[row.paperId], 'relatedWith', titles_map[row.source], row.paperId) 
                for row in papers.itertuples() if row.con_type!='base']

question_answer_pairs = [
    (f'Which paper is cited or referenced in the paper titled "{triple[0]}"?', triple[2], triple[3])
    for triple in related_pubs
]

len(question_answer_pairs)

1700

In [12]:
pd.to_pickle(question_answer_pairs, 'data/qa_subsets/qa_cites_refs.pkl')

#### Join all qa training pairs

In [13]:
qa = question_answer_pairs + question_answer_pairs_authors

In [14]:
len(qa)

4200

In [15]:
pd.to_pickle(qa, 'data/qa_training.pkl')

#### Save evaluation set

In [16]:
evaluation = random.choices(question_answer_pairs, k=500) +\
             random.choices(question_answer_pairs_authors, k=500)

In [17]:
evaluation = pd.DataFrame(evaluation, columns=['question', 'answer', 'answerId']).reset_index(names='id')
evaluation.drop_duplicates('question', inplace=True)
evaluation.dropna(inplace=True)

evaluation

Unnamed: 0,id,question,answer,answerId
0,0,Which paper is cited or referenced in the pape...,PerfExplorer: A Performance Data Mining Framew...,6af3228141a9891e57f879c6ea2b48787e56e17f
1,1,Which paper is cited or referenced in the pape...,A Novel Adaptive Design Methodology for Minimu...,37793d57b862d322404308fcc54b7027d77d6061
2,2,Which paper is cited or referenced in the pape...,Improving Grid-based SLAM with Rao-Blackwelliz...,6b581dac06dfe4d0221412044fbd843b9af124da
3,3,Which paper is cited or referenced in the pape...,Free-Steering Relaxation Methods for Problems ...,dd3eefc22d78467971b28f0b0f9b5b09e838fd56
4,4,Which paper is cited or referenced in the pape...,PAC-Bayes Risk Bounds for Stochastic Averages ...,3505a49f85e7b7bfc3de0cc4c33cf4fd3a5d2bb7
...,...,...,...,...
992,992,"Who wrote the paper titled ""A Verilog RTL synt...",Rose Jonathan,144172650
994,994,"Who wrote the paper titled ""Wavelet-based affi...",W. Boles,2170172
995,995,"Who wrote the paper titled ""An Efficient Data ...",Zhu Chuanqi,3353152
997,997,"Who wrote the paper titled ""Immunoglobulin gen...",N. Maizels,4155046


In [18]:
corpus = {row.answerId:row.answer for row in evaluation.itertuples()}
queries = {str(row.id):row.question for row in evaluation.itertuples()}
relevant_docs = {str(row.id):row.answerId for row in evaluation.itertuples()}

In [19]:
evaluation = dict(corpus=corpus, queries=queries, relevant_docs=relevant_docs)

In [20]:
pd.to_pickle(evaluation, 'data/qa_evaluation.pkl')