In [1]:
from typing import List

from s2ag_corpus.database_catalogue import CorpusDatabaseCatalogue, production_connection

In [2]:
connection = production_connection()
catalogue = CorpusDatabaseCatalogue(connection)

In [3]:
CITATION_SQL = """
select citationid from papers, citations
                    where citedcorpusid = %s
                    and corpusid = citingcorpusid
                    """

In [4]:
def find_citations_for(corpus_id: int, catalogue: CorpusDatabaseCatalogue, filter: str = None) -> List[int]:
    if filter is None:
        filter = ''
    rows = catalogue.fetch(CITATION_SQL + filter, (corpus_id,))
    return [row[0] for row in rows]

In [5]:
find_citations_for(161134357, catalogue, 'and isinfluential')

[]

In [15]:
def find_corpus_id_from(sha: str, catalogue:CorpusDatabaseCatalogue):
    rows = catalogue.fetch("select corpusid from paperids where sha = %s", (sha,))
    if len(rows) == 0:
        return None
    else:
        return rows[0][0]

In [16]:
id = find_corpus_id_from('ded3b33fa8b8d6cdf890478bf22721c7fa7cef6f', catalogue)

In [17]:
id

244184633

In [18]:
find_citations_for(id, catalogue)

[4103410700, 4464566537, 4202070000, 4119774780, 4555215429]

In [19]:
REFERENCE_SQL = """
select citationid from papers, citations
                    where citingcorpusid = %s
                    and corpusid = citedcorpusid
                    """

In [20]:
def find_references_for(corpus_id: int, catalogue: CorpusDatabaseCatalogue, filter: str = None) -> List[int]:
    if filter is None:
        filter = ''
    rows = catalogue.fetch(REFERENCE_SQL + filter, (corpus_id,))
    return [row[0] for row in rows]

In [21]:
find_references_for(id, catalogue)

[]

In [36]:
def transitive_closure(corpusid, catalogue, filter: str = None):
    related_corpus_ids = set()

    def add_related_ids(corpusid):
        related_corpus_ids.add(corpusid)
        for related_id in find_citations_for(corpusid, catalogue, filter=filter):
            if related_id not in related_corpus_ids:
                add_related_ids(related_id)

    add_related_ids(corpusid)

    return related_corpus_ids

In [37]:
transitive_closure(id, catalogue)

{244184633, 4103410700, 4119774780, 4202070000, 4464566537, 4555215429}

In [38]:
newid = find_corpus_id_from('57e849d0de13ed5f91d086936296721d4ff75a75', catalogue)

In [32]:
tc = transitive_closure(newid, catalogue)

In [34]:
len(tc)

5242

In [42]:
tc = transitive_closure(newid, catalogue, 'and isinfluential = True')


In [41]:
connection.rollback()

In [43]:
len(tc)

798

In [44]:
def find_links(corpusid, catalogue):
    related_corpus_ids = set()
    links = set()

    def add_related_ids_and_links(corpusid):
        related_ids = find_citations_for(corpusid, catalogue)
        for related_id in related_ids:
            if related_id not in related_corpus_ids:
                related_corpus_ids.add(related_id)
                links.add((corpusid, related_id))
                add_related_ids_and_links(related_id)

    add_related_ids_and_links(corpusid)

    return links

In [45]:
links = find_links(newid, catalogue)

In [46]:
def write_dot_file(links, file_path):
    with open(file_path, "w") as f:
        f.write("digraph {\n")
        for corpusid1, corpusid2 in links:
            f.write(f'    "{corpusid1}" -> "{corpusid2}";\n')
        f.write("}\n")

In [47]:
write_dot_file(links, 'llama-links.dot')