In [23]:
import sys
# Add ../src (relative to the notebook's working directory) to the import path,
# presumably so the s2ag_corpus package imported below can be found.
sys.path.append('../src')


In [24]:
from datetime import datetime
import json
import csv
import os
import socket
from io import StringIO

from s2ag_corpus.database_catalogue import CorpusDatabaseCatalogue
from s2ag_corpus.database_catalogue import test_connection

In [25]:
from dotenv import load_dotenv
# Load settings from a .env file (if one is found) into the process environment.
load_dotenv()
# Root directory under which this notebook looks for its input files.
base_dir = os.getenv("BASE_DIR")
if not base_dir:
    # Fail here with a clear message: otherwise the value None would be
    # interpolated into later paths ("None/test-data") and the error would
    # only surface after the papers table has already been emptied.
    raise RuntimeError(
        "BASE_DIR is not set; define it in the environment or in a .env file"
    )

In [26]:
# Name of the machine running the notebook; it becomes part of the timing file name.
hostname = socket.gethostname()

In [27]:
# Connection used directly by this notebook for its DELETE and COPY statements.
connection = test_connection()
# NOTE(review): test_connection() is called a second time here instead of
# reusing `connection` — presumably the catalogue is meant to have its own
# connection; confirm this is intended.
catalogue = CorpusDatabaseCatalogue(test_connection())


def delete_papers_from_local_db():
    """Delete every row from the ``papers`` table and commit the change.

    Operates on the module-level ``connection``. If the statement or the
    commit fails, the transaction is rolled back before the error is
    re-raised, so the shared connection is not left in a failed transaction
    for the cells that run afterwards.

    Raises:
        Exception: whatever the database driver raised, re-raised unchanged.
    """
    try:
        with connection.cursor() as cursor:
            cursor.execute('DELETE FROM papers')
            connection.commit()
    except Exception:
        # Leave the connection usable for subsequent statements.
        connection.rollback()
        raise


In [28]:
# Empty the papers table before the timed load further down.
delete_papers_from_local_db()

In [29]:
# Release identifier (a date string) — presumably the release whose files are
# loaded by this notebook; confirm against the files in test_dir.
release_id = '2024-04-02'
# Directory, under base_dir, that holds the files to load.
test_dir = f"{base_dir}/test-data"

In [30]:
def read_records_from_file(file_path):
    """Yield one CSV-formatted row for each JSON line in ``file_path``.

    Every non-blank line is stripped and parsed as JSON to obtain its
    ``corpusid``. The row produced is ``(corpusid, <stripped line>)`` written
    by a ``csv.writer`` configured with ``csv.QUOTE_NONE`` and a backslash
    escape character, so it ends with the csv module's default line
    terminator. Blank lines are skipped.

    Args:
        file_path: path of a text file, read as UTF-8, with one JSON object
            per line.

    Yields:
        str: the formatted row for each record, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no ``corpusid`` key.
    """
    output = StringIO()
    writer = csv.writer(output, delimiter=',', quoting=csv.QUOTE_NONE, escapechar='\\')
    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line carries no record; json.loads would reject it.
                continue
            jd = json.loads(line)
            record = (jd['corpusid'], line)
            # Reuse the single buffer: rewind and clear it before each row.
            output.seek(0)
            output.truncate(0)
            writer.writerow(record)
            yield output.getvalue()

In [31]:
# NOTE(review): lists the current working directory (no path argument is given)
# and `files` is not used by any cell visible here — possibly a leftover;
# confirm before removing.
files = os.listdir()

In [32]:
class GeneratorFileAdapter:
    """Minimal read-only, file-like wrapper around a generator of text chunks.

    Only ``read`` is provided. Each chunk pulled from the generator is
    newline-terminated if it is not already, and ``count`` records how many
    chunks have been pulled so far.
    """

    def __init__(self, generator):
        """Wrap ``generator``, an iterator yielding ``str`` chunks."""
        # Source of text chunks; consumed lazily by read().
        self.generator = generator
        # Text already pulled from the generator but not yet returned.
        self.buffer = ''
        # Number of chunks pulled from the generator so far.
        self.count = 0

    def read(self, size=-1):
        """Return up to ``size`` characters, or everything left if ``size`` is negative or None.

        Chunks are pulled from the generator only until at least ``size``
        characters are available; anything beyond ``size`` stays buffered for
        the next call.

        Returns:
            str: the text read; ``''`` once the generator is exhausted and the
            buffer is empty.
        """
        if size is None:
            # Follow the usual file-object convention: None means "read all".
            size = -1

        # Collect pieces in a list and join once instead of repeatedly
        # concatenating onto the buffer string.
        pieces = [self.buffer]
        available = len(self.buffer)
        while size < 0 or available < size:
            chunk = next(self.generator, None)
            if chunk is None:
                break
            self.count += 1
            if not chunk.endswith('\n'):
                chunk += '\n'  # Ensure each chunk ends with a newline
            pieces.append(chunk)
            available += len(chunk)

        data = ''.join(pieces)
        if size < 0 or available <= size:
            self.buffer = ''
            return data

        self.buffer = data[size:]
        return data[:size]


In [33]:
# Load every file of the release into the papers table via COPY, streaming the
# reformatted records straight from the source file, and record how long each
# file took in a per-session timing CSV.
count = 0
session_start = datetime.now()
# NOTE(review): the timestamp is interpolated as-is, so the file name contains
# spaces and colons; some filesystems may reject that.
timing_file_name = f'timing-without-csv-file-{hostname}-{session_start}.csv'
# newline='' is what the csv module asks for on files handed to csv.writer;
# the with-block closes the timing file even if a load raises.
with open(timing_file_name, 'w', newline='') as timing_file:
    print(f"opening timing file {timing_file_name}")
    timing_writer = csv.writer(timing_file)
    timing_writer.writerow(['filename','count','start','end','duration'])
    for filename in sorted(os.listdir(test_dir)):
        # Only files belonging to the configured release are loaded.
        if not filename.startswith(release_id):
            continue
        print(f"processing: {filename} at {datetime.now()}")
        adapter = GeneratorFileAdapter(read_records_from_file(f"{test_dir}/{filename}"))
        start = datetime.now()
        try:
            with connection.cursor() as cursor:
                cursor.copy_from(adapter, 'papers', sep=',', null='')
                connection.commit()
        except Exception:
            # Do not leave the shared connection in a failed transaction.
            connection.rollback()
            raise
        end = datetime.now()
        print(f"loaded: in {end - start}")
        # NOTE(review): `count` is never updated in this cell, so the count
        # column is always 0.
        timing_writer.writerow([filename,count,start,end, end-start])
print('done')


opening timing file timing-without-csv-file-treliske-2024-04-21 17:22:48.940202.csv
processing: 2024-04-02-papers000 at 2024-04-21 17:22:48.941612
loaded: in 0:01:29.520131
processing: 2024-04-02-papers001 at 2024-04-21 17:24:18.461888
loaded: in 0:01:29.079421
processing: 2024-04-02-papers002 at 2024-04-21 17:25:47.541423
loaded: in 0:01:29.619652
processing: 2024-04-02-papers003 at 2024-04-21 17:27:17.161180
loaded: in 0:01:36.377192
processing: 2024-04-02-papers004 at 2024-04-21 17:28:53.538508
loaded: in 0:01:32.550572
done
