In [1]:
import sys
# Add ../src to the module search path so packages located there can be imported.
sys.path.append('../src')


In [2]:
from datetime import datetime
import json
import csv
import os
import socket

from s2ag_corpus.database_catalogue import CorpusDatabaseCatalogue
from s2ag_corpus.database_catalogue import local_connection

In [3]:
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()
# os.getenv returns None when BASE_DIR is not set; the test_dir path in a
# later cell is built from this value.
base_dir = os.getenv("BASE_DIR")

In [4]:
# Name of the machine running the notebook; it is embedded in the timing CSV filename.
hostname = socket.gethostname()

In [5]:
# Connection used by the cells below for the DELETE and the COPY into `papers`.
connection = local_connection()
# NOTE(review): local_connection() is called a second time here, so the catalogue
# may hold a different connection from `connection` above — confirm this is intended.
catalogue = CorpusDatabaseCatalogue(local_connection())


def delete_papers_from_local_db():
    """Delete every row from the ``papers`` table and commit the change.

    Operates on the module-level ``connection``.
    """
    with connection.cursor() as cur:
        cur.execute('DELETE FROM papers')
        connection.commit()


In [6]:
# Empty the papers table before the load cell further down copies rows into it.
delete_papers_from_local_db()

In [7]:
# Release identifier; the input files selected by the load loop carry this
# value as a filename prefix.
release_id = '2024-04-02'
# Directory that holds the input files and the intermediate transfer.csv.
test_dir = f"{base_dir}/test-data"

In [8]:
def read_records_from_file(file_path):
    """Yield one ``(corpusid, json_text)`` tuple per JSON line in a file.

    The file is read as UTF-8 (the encoding JSON text is expected to use)
    rather than the platform's locale default. Blank or whitespace-only
    lines, such as a trailing empty line, are skipped instead of being
    passed to the JSON parser.

    Args:
        file_path: Path to a file containing one JSON object per line.

    Yields:
        tuple: ``(corpusid, line)`` where ``corpusid`` is the value of the
        object's ``'corpusid'`` key and ``line`` is the stripped source text
        of that line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a JSON object has no ``'corpusid'`` key.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Nothing to parse; json.loads('') would raise.
                continue
            jd = json.loads(line)
            yield jd['corpusid'], line

In [9]:
# NOTE(review): with no argument this lists the current working directory, not
# test_dir, and `files` is not referenced in the visible cells — confirm it is needed.
files = os.listdir()

In [10]:
# Convert each release file to a CSV transfer file, COPY it into the `papers`
# table, and record how long each COPY took in a per-session timing CSV.
count = 0  # running total of records across all files handled in this session
session_start = datetime.now()
# The with-block closes the timing file even if a load fails part-way through.
# newline='' is what the csv module requires for files handed to csv.writer.
with open(f'timing-without_key-{hostname}-{session_start}.csv', 'w', newline='') as timing_file:
    timing_writer = csv.writer(timing_file)
    timing_writer.writerow(['filename','count','start','end','duration'])
    for filename in sorted(os.listdir(test_dir)):
        # Select the files of the configured release instead of repeating the literal.
        if not filename.startswith(release_id):
            continue
        print(f"processing: {filename} at {datetime.now()}")
        transfer_file = f"{test_dir}/transfer.csv"
        # The transfer file is written and read back with the same explicit
        # encoding so the round trip does not depend on the locale.
        with open(transfer_file, 'w', newline='', encoding='utf-8') as csvf:
            writer = csv.writer(csvf, delimiter=',', quoting=csv.QUOTE_NONE, escapechar='\\')
            for record in read_records_from_file(f"{test_dir}/{filename}"):
                writer.writerow(record)
                count += 1
        print(f"processed: {count} at {datetime.now()}")
        print(f"copying from csv file {transfer_file}")
        # Only the COPY step is timed; the CSV conversion above is excluded.
        start = datetime.now()
        with open(transfer_file, 'r', encoding='utf-8') as csvf:
            with connection.cursor() as cursor:
                cursor.copy_from(csvf, 'papers', sep=',', null='')
            connection.commit()
        end = datetime.now()
        print(f"loaded: {count} in {end - start}")
        timing_writer.writerow([filename,count,start,end, end-start])
print('done')


processing: 2024-04-02-papers000 at 2024-04-20 09:05:07.149633
processed: 1000000 at 2024-04-20 09:05:47.061821
copying from csv file /home/romilly/data/ss-corpus/test-data/transfer.csv
loaded: 1000000 in 0:01:46.086628
processing: 2024-04-02-papers001 at 2024-04-20 09:07:33.148859
processed: 2000000 at 2024-04-20 09:09:10.462125
copying from csv file /home/romilly/data/ss-corpus/test-data/transfer.csv
loaded: 2000000 in 0:00:55.963326
processing: 2024-04-02-papers002 at 2024-04-20 09:10:06.425716
processed: 3000000 at 2024-04-20 09:12:34.862762
copying from csv file /home/romilly/data/ss-corpus/test-data/transfer.csv
loaded: 3000000 in 0:01:49.794610
processing: 2024-04-02-papers003 at 2024-04-20 09:14:24.657636
processed: 4000000 at 2024-04-20 09:17:35.724016
copying from csv file /home/romilly/data/ss-corpus/test-data/transfer.csv
loaded: 4000000 in 0:01:48.428091
processing: 2024-04-02-papers004 at 2024-04-20 09:19:24.152399
processed: 5000000 at 2024-04-20 09:22:13.369785
copying 