In [1]:
import datetime
import os
import json
import csv
from io import StringIO

from dotenv import load_dotenv

from s2ag_corpus.database_catalogue import local_connection
from s2ag_corpus.sql import CREATE_EXTENDED_CITATIONS_TABLE_WITHOUT_INDICES

In [2]:
# Load settings from a .env file (if one is found) into the process environment.
load_dotenv()
base_dir = os.getenv("BASE_DIR")
# Fail fast: os.getenv returns None when the variable is missing, and this
# notebook drops the citations table before base_dir is first used, so an
# unset BASE_DIR would otherwise only surface after the table is gone.
if not base_dir:
    raise RuntimeError(
        "BASE_DIR is not set; define it in the environment or in a .env file"
    )

In [3]:

# Single module-level database connection shared by the helper functions in
# this notebook (table creation, bulk copy, commit/rollback).
connection = local_connection()

def drop_and_replace_citations_table():
    """Recreate the ``citations`` table from scratch.

    Drops the table if it exists, then runs
    CREATE_EXTENDED_CITATIONS_TABLE_WITHOUT_INDICES on the shared module-level
    ``connection``. No commit is issued here; the statements stay in whatever
    transaction the connection currently has open.
    """
    with connection.cursor() as db_cursor:
        for statement in (
            'drop table if exists citations',
            CREATE_EXTENDED_CITATIONS_TABLE_WITHOUT_INDICES,
        ):
            db_cursor.execute(statement)

In [4]:
# WARNING: destructive - drops any existing `citations` table before recreating it.
drop_and_replace_citations_table()
    

In [5]:
def read_records_from_file(file_path, timing_path='timimg.csv', report_every=100000):
    """Yield each citation in a JSON-lines file as one comma-separated text line.

    Every non-blank line of ``file_path`` is parsed as a JSON object and its
    ``citationid``, ``citingcorpusid``, ``citedcorpusid``, ``isinfluential``,
    ``contexts`` and ``intents`` values are written, in that order, by a
    ``csv.writer`` configured with ``QUOTE_NONE`` and a backslash escape
    character (so delimiters inside a value are escaped rather than quoted).
    ``None`` becomes an empty field and other non-string values are written
    using ``str()``. Each yielded string ends with the csv module's default
    line terminator (``'\\r\\n'``).

    As a side effect, a timing CSV is (re)written: after every
    ``report_every`` records a row ``[records_so_far, time_for_last_batch]``
    is appended.

    Args:
        file_path: Path of the JSON-lines input file (read as UTF-8).
        timing_path: Where to write the timing CSV. The default keeps the
            historical name. NOTE(review): 'timimg.csv' looks like a typo for
            'timing.csv' - kept so existing runs still find the file.
        report_every: Number of records between timing rows; must be a
            positive integer.

    Yields:
        str: One delimited line per input record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the six expected keys.

    Because this is a generator, both files stay open until it is exhausted
    or explicitly closed.
    """
    output = StringIO()
    writer = csv.writer(output, delimiter=',', quoting=csv.QUOTE_NONE, escapechar='\\')
    count = 0
    # newline='' is what the csv module requires for files it writes to;
    # the input is opened as UTF-8 rather than the platform default encoding.
    with open(timing_path, 'w', newline='') as tf, \
            open(file_path, 'r', encoding='utf-8') as file:
        timing_writer = csv.writer(tf)
        start = datetime.datetime.now()
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no record and would make json.loads raise.
                continue
            jd = json.loads(line)
            record = (jd['citationid'],
                      jd['citingcorpusid'],
                      jd['citedcorpusid'],
                      jd['isinfluential'],
                      jd['contexts'],
                      jd['intents'])
            # Reuse one buffer: reset it so it holds only the current row.
            output.seek(0)
            output.truncate(0)
            writer.writerow(record)
            count += 1
            if count % report_every == 0:
                end = datetime.datetime.now()
                timing_writer.writerow([count, end - start])
                start = end
            yield output.getvalue()

In [6]:
class JsonFileInserter:
    """Minimal read-only, file-like adapter over an iterable of text lines.

    Exposes ``read(size)`` so that a stream of already-formatted lines can be
    handed to an API that expects a file object. Every chunk taken from the
    source is guaranteed to be newline-terminated in the output.
    """

    def __init__(self, generator):
        """Wrap ``generator``, any iterable yielding ``str`` chunks.

        A ``None`` item is treated as end of input.
        """
        # iter() is a no-op for generators/iterators and lets plain iterables
        # such as lists be used as well (next() needs a real iterator).
        self.generator = iter(generator)
        self.buffer = ''  # text pulled from the source but not yet returned
        self.count = 0    # not used by this class; kept for compatibility

    def read(self, size=-1):
        """Return up to ``size`` characters, or everything that is left.

        Args:
            size: Maximum number of characters to return. ``None`` or a
                negative value drains the source completely.

        Returns:
            str: The next slice of text; ``''`` once the source is exhausted
            and the buffer is empty.
        """
        # File objects treat size=None like a negative size ("read it all");
        # comparing None with an int would otherwise raise TypeError.
        read_all = size is None or size < 0

        # Pull chunks until the request can be satisfied or the source ends.
        while read_all or len(self.buffer) < size:
            chunk = next(self.generator, None)
            if chunk is None:
                break
            self.buffer += chunk
            if not self.buffer.endswith('\n'):
                self.buffer += '\n'  # Ensure each chunk ends with a newline

        if read_all or len(self.buffer) <= size:
            to_return, self.buffer = self.buffer, ''
        else:
            # Hand back exactly `size` characters and keep the remainder.
            to_return, self.buffer = self.buffer[:size], self.buffer[size:]

        return to_return

In [7]:
def copy_json_to_papers(test_file):
    """Bulk-load one citations JSON-lines file into the ``citations`` table.

    Despite the function's name, the target table is ``citations``. Records
    are streamed from ``read_records_from_file`` through a
    ``JsonFileInserter`` into ``cursor.copy_from`` on the shared module-level
    ``connection`` (comma-separated, empty string meaning NULL), and the
    transaction is committed on success.

    Args:
        test_file: Path of the JSON-lines file to load.

    Raises:
        Exception: Anything raised while reading or copying is re-raised
            after the transaction has been rolled back.
    """
    records = read_records_from_file(test_file)
    adapter = JsonFileInserter(records)
    try:
        with connection.cursor() as cursor:
            cursor.copy_from(adapter, 'citations', sep=',', null='')
            connection.commit()
    except Exception:
        # A failed COPY leaves the transaction aborted; roll back so the
        # shared connection can be used again without manual cleanup.
        connection.rollback()
        raise
    finally:
        # Closing the generator releases the files it holds open when the
        # copy stops before the input is exhausted (no-op once it finished).
        records.close()
    print('done')

In [8]:
# Path of the input file, built under BASE_DIR (presumably the first file of
# the 2024-04-02 citations download - confirm against the data layout).
test_file = base_dir+'/2024-04-02/citations/citations0'
# Stream the file into the `citations` table; prints 'done' after the commit.
copy_json_to_papers(test_file)

done


In [9]:
# Discard any uncommitted work on the shared connection (presumably kept as a
# manual reset after a failed load, since a commit is issued on success).
connection.rollback()