In [17]:
import pandas as pd
from semanticscholar import SemanticScholar
import time
from tqdm import tqdm
import glob
import os

tqdm.pandas()

In [18]:
ASREVIEW_PATH=os.path.abspath("") + '/asreview/'
if not os.path.isdir(ASREVIEW_PATH):
    os.mkdir(ASREVIEW_PATH)

In [19]:
%env ASREVIEW_PATH={ASREVIEW_PATH}

env: ASREVIEW_PATH=C:\Users\korzh\Documents\projects\Semi-automatic-literature-review/asreview/


In [20]:
!echo %ASREVIEW_PATH%

C:\Users\korzh\Documents\projects\Semi-automatic-literature-review/asreview/


In [22]:
%%writefile seed.csv
ID
CorpusID:19100865
CorpusID:21698746
CorpusID:17421999
CorpusID:13529313
CorpusID:220793511
CorpusID:88514331
CorpusID:46761881
CorpusID:68034559
CorpusID:236447602
CorpusID:253523568

Overwriting seed.csv


In [23]:
fields = [
        'abstract',
        'authors',
        'citationCount',
        'externalIds',
        'fieldsOfStudy',
        'influentialCitationCount',
        'isOpenAccess',
        'journal',
        'paperId',
        'publicationDate',
        'publicationTypes',
        'referenceCount',
        's2FieldsOfStudy',
        'title',
        'url',
        'venue',
        'year'
    ]

In [24]:
def GetPaperAttributes(paper):
    ret = {}
    for field in fields:
        if field in paper.keys():
            ret[field] = paper[field]
        else:
            ret[field] = None
    return ret

In [25]:
def GetASReviewProjectFile():
    list_of_files = glob.glob(ASREVIEW_PATH + '*.csv') 
    
    if len(list_of_files) == 0:
        return None
    
    latest_file = max(list_of_files, key=os.path.getctime)
    if os.path.basename(latest_file).startswith('asreview_dataset_'):
        return latest_file
    return None

def FirstIteration():
    return GetASReviewProjectFile() == None

In [26]:
sch = SemanticScholar()
sch.timeout = 100000
    
added_papers = set()
linked_papers = []
row_id = 0

if FirstIteration():
    seed_df = pd.read_csv('seed.csv')
    df_papers = pd.DataFrame(columns=fields + ['Included'])
    
    for paper_id in tqdm(seed_df['ID']):
        paper = sch.get_paper(paper_id)
        d = GetPaperAttributes(paper)
        d['Included'] = '1'
        
        df_papers.loc[row_id] = d
 
        if paper.references != None:
            for linked_paper in paper.references:
                if linked_paper.paperId not in added_papers:
                    linked_papers.append(linked_paper)
                    added_papers.add(linked_paper.paperId)
                
        if paper.citations != None:   
            for linked_paper in paper.citations:
                if linked_paper.paperId not in added_papers:
                    linked_papers.append(linked_paper)
                    added_papers.add(linked_paper.paperId)

        row_id = row_id + 1

else:
    df_papers = pd.read_csv(GetASReviewProjectFile())
    df_papers.drop(columns=['asreview_ranking', 'record_id'], inplace=True)
    df_papers = df_papers[(df_papers.Included == 0) | (df_papers.Included == 1)].reset_index(drop=True)
    
    added_papesrs = set()
    
    for paperId in df_papers['paperId']:
        added_papesrs.add(paperId)

    for paper_inc, paperId in tqdm(list(zip(df_papers['Included'], df_papers['paperId']))):

        # if paper is not included we don't consider its references and citations
        if paper_inc != 1:
            continue
        
        paper = sch.get_paper(paperId)
        
        if paper.references != None:
            for linked_paper in paper.references:
                if linked_paper.paperId not in added_papers:
                    linked_papers.append(linked_paper)
                    added_papers.add(linked_paper.paperId)
                
        if paper.citations != None:   
            for linked_paper in paper.citations:
                if linked_paper.paperId not in added_papers:
                    linked_papers.append(linked_paper)
                    added_papers.add(linked_paper.paperId)
            
row_id = len(df_papers)
    
for linked_paper in linked_papers:
    d = GetPaperAttributes(linked_paper)
    d['Included'] = ''
    df_papers.loc[row_id] = d
    row_id = row_id + 1
    
len(df_papers)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:56<00:00,  1.76it/s]


3351

In [27]:
df_papers.drop_duplicates(subset=['paperId'], keep='first', inplace=True)
len(df_papers)

3252

In [28]:
df_papers.dropna(subset=['abstract', 'title'], inplace=True)
len(df_papers)

2704

In [31]:
df_papers.to_csv(ASREVIEW_PATH + 'asreview_dataset_partially_labeled_2.csv', index=False)

In [32]:
!asreview lab

^C
