# Import Libraries

In [7]:
import json
import os
from neomodel import (StructuredNode, StringProperty, IntegerProperty, RelationshipTo, RelationshipFrom, config)

# Load Data

In [2]:
# The sample file holds one standalone JSON document per line; parse each
# line into a record. JSON is conventionally UTF-8, so decode explicitly
# instead of relying on the platform's default encoding.
with open("archive/sample_500.json", encoding="utf-8") as user_file:
    data = [json.loads(line) for line in user_file]

# print(data)

# Defining Nodes

In [3]:
# Connection string used by neomodel. It can be overridden through the
# NEO4J_BOLT_URL environment variable so real credentials stay out of the
# notebook; the fallback is the local development default.
config.DATABASE_URL = os.environ.get(
    "NEO4J_BOLT_URL", 'bolt://neo4j:password@localhost:7687'
)

In [4]:
class Paper(StructuredNode):
    """Graph node holding the metadata of one paper record.

    Every field is declared as a string property. ``uid`` carries a unique
    index; ``authors`` and ``versions`` are outgoing relationships to
    ``Author`` and ``Version`` nodes.
    """

    # Identifier (declared with a unique index).
    uid = StringProperty(unique_index=True)

    # Descriptive metadata.
    submitter = StringProperty()
    title = StringProperty()
    comments = StringProperty()
    journal_ref = StringProperty()
    doi = StringProperty()
    report_no = StringProperty()
    categories = StringProperty()
    abstract = StringProperty()
    update_date = StringProperty()

    # Outgoing relationships.
    authors = RelationshipTo("Author", "AUTHORED_BY")
    versions = RelationshipTo("Version", "HAS_VERSION")


class Author(StructuredNode):
    """Graph node for an author, carrying a uniquely indexed ``name``."""
    name = StringProperty(unique_index=True)


class Version(StructuredNode):
    """Graph node for one version entry of a paper.

    Both the version label and its creation timestamp are stored as strings.
    """
    version = StringProperty()
    created = StringProperty()

In [5]:
def create_nodes_and_relationships(data):
    """Persist one paper record together with its authors and versions.

    A ``Paper`` node is saved first, then connected to one ``Author`` node per
    entry of ``authors_parsed`` and one ``Version`` node per entry of
    ``versions``.

    Args:
        data: Mapping for a single record. The keys read here are ``id``,
            ``submitter``, ``title``, ``comments``, ``journal-ref``, ``doi``,
            ``report-no``, ``categories``, ``abstract``, ``update_date``,
            ``authors_parsed`` (an iterable of string sequences) and
            ``versions`` (an iterable of mappings with ``version`` and
            ``created``).

    Raises:
        KeyError: If ``data`` lacks one of the keys listed above.
    """
    paper = Paper(
        uid=data['id'],
        submitter=data['submitter'],
        title=data['title'],
        comments=data['comments'],
        journal_ref=data['journal-ref'],
        doi=data['doi'],
        report_no=data['report-no'],
        categories=data['categories'],
        abstract=data['abstract'],
        # Keyword matches the ``update_date`` property declared on ``Paper``.
        update_date=data['update_date'],
    ).save()

    for name_parts in data['authors_parsed']:
        # ``Author.name`` is uniquely indexed, so look the author up by name
        # and only create a node when none exists yet; the same person can
        # then be shared by several papers.
        # NOTE(review): empty name parts would leave stray spaces in the
        # joined name - confirm whether they should be trimmed.
        author_node = Author.get_or_create({"name": " ".join(name_parts)})[0]
        paper.authors.connect(author_node)

    for version in data['versions']:
        version_node = Version(
            version=version['version'],
            created=version['created'],
        ).save()
        paper.versions.connect(version_node)



# Creating Graph

In [22]:
# create_nodes_and_relationships(data[0])
# Stream the full snapshot, one JSON record per line. Ingestion is
# best-effort: a record that fails to parse or save is reported together with
# its line number and skipped, so the remaining records are still processed.
with open("archive/arxiv-metadata-oai-snapshot.json", encoding="utf-8") as user_file:
    for line_number, line in enumerate(user_file, start=1):
        try:
            create_nodes_and_relationships(json.loads(line))
        except Exception as e:
            print(f"line {line_number}: {e}")

Expecting ',' delimiter: line 2 column 1 (char 1695)
Extra data: line 1 column 12 (char 11)
Existing exports of data: object cannot be re-sized


# Creating Paper Embeddings 

In [6]:
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
import os 
from dotenv import load_dotenv
# os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

load_dotenv()

# Vector store over the existing Paper nodes: the "abstract" and "title"
# properties are the text to embed, and the vectors live in the
# "paper_embedding" property behind the "paper_index" index.
# Connection settings come from the environment (populated by load_dotenv()
# above when a .env file is present) and fall back to the local defaults, so
# real credentials do not have to be written into the notebook.
paper_graph = Neo4jVector.from_existing_graph(
    embedding=OpenAIEmbeddings(),
    url=os.getenv("NEO4J_URI", "bolt://localhost:7687"),
    username=os.getenv("NEO4J_USERNAME", "neo4j"),
    password=os.getenv("NEO4J_PASSWORD", "password"),
    index_name="paper_index",
    node_label="Paper",
    text_node_properties=["abstract", "title"],
    embedding_node_property="paper_embedding",
)

In [8]:
from pprint import pprint

# Run a free-text similarity query against the paper store and print the
# text of the first returned document.
result = paper_graph.similarity_search(query="dark matter field fluid model")
pprint(result[0].page_content)

('\n'
 'abstract:   The evolution of Earth-Moon system is described by the dark '
 'matter field\n'
 'fluid model proposed in the Meeting of Division of Particle and Field 2004,\n'
 'American Physical Society. The current behavior of the Earth-Moon system '
 'agrees\n'
 'with this model very well and the general pattern of the evolution of the\n'
 'Moon-Earth system described by this model agrees with geological and fossil\n'
 'evidence. The closest distance of the Moon to Earth was about 259000 km at '
 '4.5\n'
 "billion years ago, which is far beyond the Roche's limit. The result "
 'suggests\n'
 'that the tidal friction may not be the primary cause for the evolution of '
 'the\n'
 'Earth-Moon system. The average dark matter field fluid constant derived '
 'from\n'
 'Earth-Moon system data is 4.39 x 10^(-22) s^(-1)m^(-1). This model predicts\n'
 "that the Mars's rotation is also slowing with the angular acceleration rate\n"
 'about -4.38 x 10^(-22) rad s^(-2).\n'
 '\n'
 'title: The e

# Using Existing Index

In [24]:
# Attach to the already-populated "paper_index" vector index, reading the
# document text from the "abstract" property.
# Connection settings come from the environment and fall back to the local
# defaults, so real credentials do not have to be written into the notebook.
paper_store = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(),
    url=os.getenv("NEO4J_URI", "bolt://localhost:7687"),
    username=os.getenv("NEO4J_USERNAME", "neo4j"),
    password=os.getenv("NEO4J_PASSWORD", "password"),
    index_name="paper_index",
    text_node_property="abstract",
)

# Query the store with the opening sentences of an abstract and print the
# text of the first returned document.
result = paper_store.similarity_search(
    "We discuss the results from the combined IRAC and MIPS c2d Spitzer Legacy "
    "observations of the Serpens star-forming region. In particular we present"
)
pprint(result[0].page_content)

('  We discuss the results from the combined IRAC and MIPS c2d Spitzer Legacy\n'
 'observations of the Serpens star-forming region. In particular we present a '
 'set\n'
 "of criteria for isolating bona fide young stellar objects, YSO's, from the\n"
 'extensive background contamination by extra-galactic objects. We then '
 'discuss\n'
 "the properties of the resulting high confidence set of YSO's. We find 235 "
 'such\n'
 'objects in the 0.85 deg^2 field that was covered with both IRAC and MIPS. '
 'An\n'
 "additional set of 51 lower confidence YSO's outside this area is identified\n"
 'from the MIPS data combined with 2MASS photometry. We describe two sets of\n'
 'results, color-color diagrams to compare our observed source properties '
 'with\n'
 'those of theoretical models for star/disk/envelope systems and our own '
 'modeling\n'
 'of the subset of our objects that appear to be star+disks. These objects\n'
 'exhibit a very wide range of disk properties, from many that can be fit '

('  We discuss the results from the combined IRAC and MIPS c2d Spitzer Legacy\n'
 'observations of the Serpens star-forming region. In particular we present a '
 'set\n'
 "of criteria for isolating bona fide young stellar objects, YSO's, from the\n"
 'extensive background contamination by extra-galactic objects. We then '
 'discuss\n'
 "the properties of the resulting high confidence set of YSO's. We find 235 "
 'such\n'
 'objects in the 0.85 deg^2 field that was covered with both IRAC and MIPS. '
 'An\n'
 "additional set of 51 lower confidence YSO's outside this area is identified\n"
 'from the MIPS data combined with 2MASS photometry. We describe two sets of\n'
 'results, color-color diagrams to compare our observed source properties '
 'with\n'
 'those of theoretical models for star/disk/envelope systems and our own '
 'modeling\n'
 'of the subset of our objects that appear to be star+disks. These objects\n'
 'exhibit a very wide range of disk properties, from many that can be fit '

In [None]:
result.