In [1]:
import datetime
import json
import csv
import os
from s2ag_corpus.database_catalogue import CorpusDatabaseCatalogue
from s2ag_corpus.database_catalogue import local_connection

In [2]:
from dotenv import load_dotenv
# load_dotenv() (python-dotenv) is expected to populate the process environment from a
# .env file, so BASE_DIR can be supplied without exporting it in the shell.
load_dotenv()
# Base directory under which the test data lives. os.getenv returns None when BASE_DIR
# is not set, so later path building will silently use the string "None".
base_dir = os.getenv("BASE_DIR")

In [3]:
# Module-level connection used by the cells below for the DELETE and the bulk COPY.
connection = local_connection()
# NOTE(review): local_connection() is called a second time here rather than passing
# `connection`; presumably this opens a separate database connection — confirm that is intended.
catalogue = CorpusDatabaseCatalogue(local_connection())


def delete_papers_from_local_db():
    """Empty the ``papers`` table.

    Issues a DELETE through a cursor on the module-level ``connection`` and
    commits it before the cursor's ``with`` block exits.
    """
    delete_all_papers = 'DELETE FROM papers'
    with connection.cursor() as db_cursor:
        db_cursor.execute(delete_all_papers)
        connection.commit()


In [4]:
# Destructive: removes every row from the papers table before the load below runs.
delete_papers_from_local_db()

In [5]:
# Release identifier; same value as the filename prefix the load loop filters on.
release_id = '2024-04-02'
# Directory holding the input files and the intermediate transfer.csv.
# NOTE: if BASE_DIR is unset, base_dir is None and this evaluates to "None/test-data".
test_dir = f"{base_dir}/test-data"

In [6]:
def read_records_from_file(file_path):
    """Yield one ``(corpusid, raw_json_line)`` tuple per JSON line in a file.

    Each non-blank line is stripped of surrounding whitespace and parsed as
    JSON only to extract its ``corpusid``; the stripped text itself is passed
    through unchanged as the second element of the tuple.

    Args:
        file_path: Path to a file containing one JSON object per line.

    Yields:
        tuple: ``(corpusid, line)`` for every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed record has no ``corpusid`` key.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry no record
                # and would otherwise make json.loads raise.
                continue
            yield json.loads(line)['corpusid'], line

In [7]:
# NOTE(review): with no argument this lists the current working directory, not test_dir;
# `files` is not referenced by any other visible cell (the load loop lists test_dir itself).
files = os.listdir()

In [8]:
# For every file of the release in test_dir: rewrite it as a two-column CSV
# (corpusid, raw JSON), bulk-load that CSV into the `papers` table with COPY, and
# record how long the COPY + commit took in timing-without_key.csv.

# Running total of records across ALL files processed so far (not per file); this is
# the figure that is printed and written to the timing file.
count = 0
# Context manager so the timing file is flushed and closed even if a load fails part-way.
# newline='' is what the csv module asks for on files handed to csv.writer.
with open('timing-without_key.csv', 'w', newline='') as timing_file:
    timing_writer = csv.writer(timing_file)
    timing_writer.writerow(['filename','count','start','end','duration'])
    for filename in sorted(os.listdir(test_dir)):
        # Only files of the release under test; this also skips transfer.csv itself.
        if not filename.startswith(release_id):
            continue
        print(f"processing: {filename}")
        # Scratch file, overwritten for each input file.
        transfer_file = f"{test_dir}/transfer.csv"
        with open(transfer_file, 'w', newline='') as csvf:
            # QUOTE_NONE + escapechar: fields are never quoted; delimiter characters
            # inside the JSON text are backslash-escaped instead.
            writer = csv.writer(csvf, delimiter=',', quoting=csv.QUOTE_NONE, escapechar='\\')
            for record in read_records_from_file(f"{test_dir}/{filename}"):
                writer.writerow(record)
                count += 1
        print(f"processed: {count}")
        print(f"copying from csv file {transfer_file}")
        # The timer deliberately starts here: the recorded duration covers only the
        # COPY and commit, not the JSON -> CSV conversion above.
        start = datetime.datetime.now()
        with open(transfer_file,'r') as csvf:
            with connection.cursor() as cursor:
                cursor.copy_from(csvf, 'papers', sep=',', null='')
            connection.commit()
        end = datetime.datetime.now()
        print(f"loaded: {count} in {end - start}")
        timing_writer.writerow([filename,count,start,end, end-start])
print('done')


processing: 2024-04-02-papers0
processed: 1000000
copying from csv file /media/romilly/ss-corpus/test-data/transfer.csv
loaded: 1000000 in 0:00:15.384123
processing: 2024-04-02-papers1
processed: 2000000
copying from csv file /media/romilly/ss-corpus/test-data/transfer.csv
loaded: 2000000 in 0:00:14.340832
processing: 2024-04-02-papers10
processed: 3000000
copying from csv file /media/romilly/ss-corpus/test-data/transfer.csv
loaded: 3000000 in 0:00:17.092344
processing: 2024-04-02-papers11
processed: 4000000
copying from csv file /media/romilly/ss-corpus/test-data/transfer.csv
loaded: 4000000 in 0:00:17.562724
processing: 2024-04-02-papers12
processed: 5000000
copying from csv file /media/romilly/ss-corpus/test-data/transfer.csv
loaded: 5000000 in 0:00:17.620399
processing: 2024-04-02-papers13
processed: 5810613
copying from csv file /media/romilly/ss-corpus/test-data/transfer.csv
loaded: 5810613 in 0:00:15.419921
processing: 2024-04-02-papers14
processed: 6810613
copying from csv file