In [17]:
# Extend the import search path with the sibling `src` directory (relative to the
# notebook's working directory) — presumably so the `s2ag_corpus` imports below resolve.
import sys
sys.path.append('../src')


In [18]:
from datetime import datetime
import json
import csv
import os
import socket

from s2ag_corpus.database_catalogue import DatabaseCatalogue
from s2ag_corpus.database_catalogue import test_connection

In [19]:
from dotenv import load_dotenv

# Load settings from a .env file (if one is found) into the process environment.
load_dotenv()

# BASE_DIR is the root directory under which the test data lives.
base_dir = os.getenv("BASE_DIR")
if not base_dir:
    # Fail here with a clear message: otherwise later cells would silently build
    # paths such as "None/test-data" and fail with a confusing error.
    raise RuntimeError(
        "BASE_DIR is not set; define it in the environment or in a .env file"
    )

In [20]:
# Name of the machine running the notebook; used below to label the timing CSV file.
hostname = socket.gethostname()

In [21]:
# Database connection used by the cells below for the delete and the bulk loads.
connection = test_connection()
# NOTE(review): test_connection() is called a second time here, so the catalogue
# presumably gets its own connection rather than sharing `connection` — confirm intended.
catalogue = DatabaseCatalogue(test_connection())


def delete_papers_from_local_db():
    """Delete every row from the ``papers`` table and commit.

    Operates on the module-level ``connection``; takes no arguments and
    returns nothing.
    """
    with connection.cursor() as cur:
        cur.execute('DELETE FROM papers')
        # Commit while the cursor is still open so the delete is made permanent.
        connection.commit()


In [22]:
# Destructive: removes all rows from `papers` so the timed load below starts from an empty table.
delete_papers_from_local_db()

In [23]:
# Identifier of the data release; it matches the file-name prefix that the load loop processes.
release_id = '2024-04-02'
# Directory that holds the input files and the intermediate transfer CSV.
test_dir = f"{base_dir}/test-data"

In [24]:
def read_records_from_file(file_path):
    """Yield one ``(corpusid, raw_json)`` tuple per JSON line in a file.

    Each non-blank line of the file is expected to be a JSON object containing
    a ``corpusid`` key. The second tuple element is the line itself, stripped
    of surrounding whitespace, i.e. the record's original JSON text.

    Args:
        file_path: Path of the JSON-lines file to read.

    Yields:
        Tuples ``(corpusid, line)``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``corpusid`` key.
    """
    # JSON text is UTF-8; do not depend on the platform's locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines carry no record; skip them instead of
                # crashing in json.loads('').
                continue
            jd = json.loads(line)
            yield jd['corpusid'], line

In [25]:
# NOTE(review): os.listdir() with no argument lists the current working directory, and
# `files` is not used by any cell visible here — possibly a leftover; confirm before removing.
files = os.listdir()

In [26]:
# Bulk-load every file of the release into `papers`, recording per-file timings.
#
# For each input file: (1) rewrite its records as a two-column CSV transfer file,
# (2) COPY that file into the `papers` table, (3) append a row to the timing CSV.
count = 0  # running total of records written across all files processed so far
session_start = datetime.now()
timing_file_name = f'timing-without_key-{hostname}-{session_start}.csv'
# The transfer file is overwritten for each input file, so its path is loop-invariant.
transfer_file = f"{test_dir}/transfer.csv"

# `with` guarantees the timing file is flushed and closed even if a load fails
# part-way through; newline='' is what the csv module expects of files it writes.
with open(timing_file_name, 'w', newline='') as timing_file:
    print(f"opening timing file {timing_file_name}")
    timing_writer = csv.writer(timing_file)
    timing_writer.writerow(['filename','count','start','end','duration'])
    for filename in sorted(os.listdir(test_dir)):
        # Only files whose name starts with the configured release id are loaded.
        if not filename.startswith(release_id):
            continue
        print(f"processing: {filename} at {datetime.now()}")
        # Step 1: write (corpusid, json) records to the transfer CSV.
        with open(transfer_file, 'w', newline='', encoding='utf-8') as csvf:
            writer = csv.writer(csvf, delimiter=',', quoting=csv.QUOTE_NONE, escapechar='\\')
            for record in read_records_from_file(f"{test_dir}/{filename}"):
                writer.writerow(record)
                count += 1
        print(f"processed: {count} at {datetime.now()}")
        print(f"copying from csv file {transfer_file}")
        # Step 2: time only the COPY into the database, not the CSV conversion.
        start = datetime.now()
        with open(transfer_file, 'r', encoding='utf-8') as csvf:
            with connection.cursor() as cursor:
                cursor.copy_from(csvf, 'papers', sep=',', null='')
            connection.commit()
        end = datetime.now()
        print(f"loaded: {count} in {end - start}")
        # Step 3: `count` here is the cumulative total, `duration` is for this file only.
        timing_writer.writerow([filename,count,start,end, end-start])
print('done')


opening timing file timing-without_key-treliske-2024-04-21 14:10:52.665683.csv
processing: 2024-04-02-papers000 at 2024-04-21 14:10:52.666403
processed: 100000 at 2024-04-21 14:10:57.790196
copying from csv file /media/romilly/ss-corpus/test-data/transfer.csv
loaded: 100000 in 0:00:08.891173
processing: 2024-04-02-papers001 at 2024-04-21 14:11:06.681548
processed: 200000 at 2024-04-21 14:11:11.318589
copying from csv file /media/romilly/ss-corpus/test-data/transfer.csv
loaded: 200000 in 0:00:08.916455
processing: 2024-04-02-papers002 at 2024-04-21 14:11:20.235199
processed: 300000 at 2024-04-21 14:11:24.858718
copying from csv file /media/romilly/ss-corpus/test-data/transfer.csv
loaded: 300000 in 0:00:08.876208
processing: 2024-04-02-papers003 at 2024-04-21 14:11:33.735254
processed: 400000 at 2024-04-21 14:11:38.292185
copying from csv file /media/romilly/ss-corpus/test-data/transfer.csv
loaded: 400000 in 0:00:08.894806
processing: 2024-04-02-papers004 at 2024-04-21 14:11:47.187330
pr