In [12]:
from typing import Any, Dict, List
import os
import json
import requests

# Directory (relative to the notebook's working directory) where the dataset is cached.
DATA_LOCATION = "../data/"
# File name of the cached dataset inside DATA_LOCATION.
DEFAULT_DATASET = "s3_arxiv-papers-1000.json"
# Remote copy of the dataset, downloaded when the local cache cannot be read.
S3_DATA_URL = "https://arxiv-search.s3.us-east-2.amazonaws.com/arxiv-papers-1000.json"

def read_paper_json() -> List[Dict[str, Any]]:
    """
    Load the JSON array of arXiv papers and embeddings.

    The local cache file (``DATA_LOCATION/DEFAULT_DATASET``) is tried first.
    If it is missing, unreadable, or not valid JSON, the dataset is downloaded
    from ``S3_DATA_URL`` and, when ``DATA_LOCATION`` is an existing directory,
    written back to the cache path for the next run.

    Returns:
        The parsed JSON payload (a list of paper records).

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: if the downloaded body is not valid JSON.
    """
    print("Loading papers dataset from disk")
    path = os.path.join(DATA_LOCATION, DEFAULT_DATASET)
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError: corrupt cache
        # (json.JSONDecodeError and UnicodeDecodeError both derive from it).
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        print("File not in location => getting from s3")

    # A timeout keeps a stalled connection from hanging the kernel forever.
    res = requests.get(S3_DATA_URL, timeout=60)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    res.raise_for_status()
    data = res.json()

    # Only cache when the target directory already exists; otherwise the data
    # is simply returned without touching the filesystem.
    if os.path.isdir(DATA_LOCATION):
        print("writing to disk")
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f)

    return data

In [13]:
# Load the papers: read_paper_json tries the local cache file first and
# downloads from S3 when that file cannot be read.
data = read_paper_json()

Loading papers dataset from disk
File not in location => getting from s3
writing to disk


In [3]:
# Peek at the first record to check which fields each paper carries.
data[0]

{'id': '1701.07125',
 'title': 'jsCoq: Towards Hybrid Theorem Proving Interfaces',
 'year': 2017,
 'authors': "Emilio Jes\\'us Gallego Arias (MINES ParisTech, PSL Research\n  University, France), Beno\\^it Pin (MINES ParisTech, PSL Research University,\n  France), Pierre Jouvelot (MINES ParisTech, PSL Research University, France)",
 'categories': 'cs.PL,cs.HC,cs.LG,cs.LO',
 'abstract': '  We describe jsCcoq, a new platform and user environment for the Coq\ninteractive proof assistant. The jsCoq system targets the HTML5-ECMAScript 2015\nspecification, and it is typically run inside a standards-compliant browser,\nwithout the need of external servers or services. Targeting educational use,\njsCoq allows the user to start interaction with proof scripts right away,\nthanks to its self-contained nature. Indeed, a full Coq environment is packed\nalong the proof scripts, easing distribution and installation. Starting to use\njsCoq is as easy as clicking on a link. The current release ships mo

In [9]:
from arxivsearch.db import redis_helpers
from arxivsearch.db import load
from redisvl.index import AsyncSearchIndex

In [5]:
# Build the async search index from the schema and client exposed by the
# project's redis_helpers module (presumably a Redis connection — confirm there).
index = AsyncSearchIndex(redis_helpers.schema, redis_helpers.client)


In [10]:
# Write the loaded papers into the index via the project helper load.write_async.
# NOTE(review): the shape of `res` is defined by that helper; it is not used in
# the cells visible here.
res = await load.write_async(index, data)

In [15]:
if await index.exists() and len((await index.search("*")).docs) > 0:
    print("Index loaded successfully")

Index loaded successfully
