In [1]:
import os
import json
import csv
from io import StringIO

from dotenv import load_dotenv

from s2ag_corpus.sql import CREATE_PAPERS_TABLE_WITH_KEYS

In [2]:
# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()
# Directory prefix used later to build the input file path.
# os.getenv returns None when BASE_DIR is not set.
base_dir = os.getenv("BASE_DIR")

In [3]:
from s2ag_corpus.database_catalogue import local_connection

# Module-level database connection shared by the functions defined below.
connection = local_connection()

def drop_and_replace_papers_table():
    """Recreate the ``papers`` table from scratch.

    Drops ``papers`` if it exists, then executes
    ``CREATE_PAPERS_TABLE_WITH_KEYS`` on the module-level ``connection``.

    NOTE(review): no commit is issued here; unless the connection autocommits,
    the change presumably only becomes permanent at a later commit -- confirm.
    """
    statements = ('drop table if exists papers', CREATE_PAPERS_TABLE_WITH_KEYS)
    with connection.cursor() as cur:
        for statement in statements:
            cur.execute(statement)

In [4]:
# Destructive: drops any existing `papers` table before recreating it.
drop_and_replace_papers_table()
    

In [5]:
def read_records_from_file(file_path):
    """Yield one ``corpusid,<json>`` row per JSON line of ``file_path``.

    Every non-blank line is parsed as JSON to extract its ``corpusid``.  The
    row is that id followed by the original (stripped) line text, written by
    ``csv.writer`` in ``QUOTE_NONE`` mode: fields are never quoted, and
    delimiter, quote and escape characters inside the JSON text are prefixed
    with a backslash instead.  Each yielded string ends with the csv module's
    default line terminator (carriage return + line feed).

    Args:
        file_path: Path of a UTF-8 text file holding one JSON object per line.
            Blank lines are skipped.

    Yields:
        str: One escaped, comma-separated row per record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``corpusid`` key.
    """
    # A single StringIO is reused as scratch space for the csv writer.
    output = StringIO()
    writer = csv.writer(output, delimiter=',', quoting=csv.QUOTE_NONE, escapechar='\\')
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines carry no record and would make json.loads fail.
                continue
            jd = json.loads(line)
            record = (jd['corpusid'], line)
            # Reset the scratch buffer so it only ever holds the current row.
            output.seek(0)
            output.truncate(0)
            writer.writerow(record)
            yield output.getvalue()

In [6]:
class GeneratorFileAdapter:
    """Read-only, file-like view over an iterator of text chunks.

    Each chunk pulled from the source is treated as one line: a newline is
    appended when the chunk does not already end with one.  The class offers
    both ``read`` and ``readline`` so it can be handed to consumers that
    expect that pair on their input (psycopg2's ``cursor.copy_from`` is
    documented as requiring both).
    """

    def __init__(self, generator):
        """Wrap ``generator``, an iterable that yields ``str`` chunks."""
        # iter() returns generators/iterators unchanged and additionally lets
        # plain iterables (lists, tuples, ...) be used as the source.
        self.generator = iter(generator)
        self.buffer = ''  # text pulled from the source but not yet returned
        self.count = 0  # never updated by this class; kept for compatibility

    def _pull(self, want):
        """Pull chunks until ``want`` characters are buffered or the source ends.

        A negative ``want`` drains the source completely.
        """
        # Collect pieces in a list and join once, rather than growing a string
        # chunk by chunk (which is quadratic when draining a large source).
        parts = [self.buffer]
        have = len(self.buffer)
        while want < 0 or have < want:
            chunk = next(self.generator, None)
            if chunk is None:
                break
            if not chunk.endswith('\n'):
                chunk += '\n'  # ensure each chunk ends with a newline
            parts.append(chunk)
            have += len(chunk)
        if len(parts) > 1:
            self.buffer = ''.join(parts)

    def read(self, size=-1):
        """Return up to ``size`` characters; everything left if ``size`` is negative or None.

        Returns ``''`` once the source is exhausted and the buffer is empty.
        """
        if size is None:
            size = -1
        self._pull(size)
        if size < 0 or len(self.buffer) <= size:
            to_return, self.buffer = self.buffer, ''
        else:
            to_return, self.buffer = self.buffer[:size], self.buffer[size:]
        return to_return

    def readline(self, size=-1):
        """Return the next line including its newline, or ``''`` at end of data.

        A positive ``size`` caps the number of characters returned; the rest
        of the line stays buffered for the next call.
        """
        if size is None:
            size = -1
        if size == 0:
            return ''
        end = self.buffer.find('\n') + 1
        if not end:
            # No complete line buffered: one more chunk supplies one, because
            # every chunk is newline-terminated when it is pulled.
            self._pull(len(self.buffer) + 1)
            end = self.buffer.find('\n') + 1 or len(self.buffer)
        if 0 < size < end:
            end = size
        line, self.buffer = self.buffer[:end], self.buffer[end:]
        return line

In [7]:
def copy_json_to_papers(test_file):
    """Bulk-load the JSON-lines file ``test_file`` into the ``papers`` table.

    Rows produced by ``read_records_from_file`` are streamed through a
    ``GeneratorFileAdapter`` into ``cursor.copy_from`` with ``,`` as the column
    separator and the empty string as the NULL marker.  The transaction on the
    module-level ``connection`` is committed once the copy has succeeded.

    Args:
        test_file: Path of the input file, one JSON record per line.

    Raises:
        Exception: Any error raised while reading or copying is re-raised
            after the open transaction has been rolled back.
    """
    adapter = GeneratorFileAdapter(read_records_from_file(test_file))
    try:
        with connection.cursor() as cursor:
            cursor.copy_from(adapter, 'papers', sep=',', null='')
            connection.commit()
    except Exception:
        # End the failed transaction so the shared connection stays usable;
        # PostgreSQL rejects further statements on an aborted transaction.
        connection.rollback()
        raise


# NOTE: this runs when the cell is executed (i.e. when the function above is
# defined), not when a copy finishes.
print('done')

done


In [8]:
# Fail with a clear message instead of the TypeError that `None + str` raises
# when BASE_DIR is missing from the environment / .env file.
if base_dir is None:
    raise RuntimeError('BASE_DIR is not set; define it in the environment or in .env')
test_file = base_dir+'/2024-04-02/papers/file'
copy_json_to_papers(test_file)