In [1]:
import sys
sys.path.append('../src')


In [2]:
from datetime import datetime
import json
import csv
import os
import socket

from s2ag_corpus.database_catalogue import CorpusDatabaseCatalogue
from s2ag_corpus.database_catalogue import production_connection

In [3]:
from dotenv import load_dotenv

# Load settings from a local .env file (if present) into the environment,
# then read the root data directory from BASE_DIR.
load_dotenv()
base_dir = os.getenv("BASE_DIR")
if not base_dir:
    # Fail here with a clear message instead of letting later cells build a
    # path such as "None/<release>/papers" and fail on a confusing listdir.
    raise RuntimeError(
        "BASE_DIR is not set; define it in the environment or in a .env file"
    )

In [4]:
# Name of the machine running this notebook; it is interpolated into the
# timing CSV file name so runs from different hosts can be told apart.
hostname = socket.gethostname()

In [5]:
# Connection used by the load loop for cursor.copy_from(...) and commit().
connection = production_connection()
# NOTE(review): production_connection() is called a second time here rather
# than passing `connection`; if each call opens a new database connection the
# catalogue holds its own one — confirm whether sharing was intended.
catalogue = CorpusDatabaseCatalogue(production_connection())


In [6]:
# Release identifier; it names the sub-directory under base_dir to load from.
release_id = '2024-04-02'
# Directory scanned by the load loop: <base_dir>/<release_id>/papers.
production_dir = f"{base_dir}/{release_id}/papers"
print('using', production_dir)

using /home/romilly/data/ss-corpus/2024-04-02/papers


In [7]:
def read_records_from_file(file_path):
    """Yield one ``(corpusid, raw_json_line)`` tuple per JSON line in a file.

    Every non-blank line is stripped of surrounding whitespace and parsed as
    JSON solely to extract its ``corpusid``; the stripped line text itself is
    passed through unchanged as the second tuple element.

    Args:
        file_path: Path to a text file holding one JSON object per line.

    Yields:
        tuple: ``(corpusid, line)`` for each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed record has no ``corpusid`` key.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a stray empty line at the end of the
                # file) carry no record and would make json.loads fail.
                continue
            yield json.loads(line)['corpusid'], line

In [8]:
# NOTE(review): os.listdir() with no argument lists the current working
# directory, not production_dir, and `files` is not referenced by any cell
# visible here — confirm whether this cell is still needed.
files = os.listdir()

In [None]:
# Load every "file*" shard found in production_dir into the `papers` table.
# Each shard is rewritten as a two-column CSV (corpusid, raw JSON line) and
# bulk-loaded with cursor.copy_from; the time taken by each load is appended
# to a per-session timing CSV.
count = 0  # running total of records across all shards, not a per-shard count
session_start = datetime.now()
timing_path = f'timing-production-without_key-{hostname}-{session_start}.csv'
# Scratch CSV, overwritten for every shard.
transfer_file = f"{production_dir}/transfer.csv"

# The `with` block closes the timing file even if a shard fails part-way
# through; newline='' is what the csv module expects for files it writes.
with open(timing_path, 'w', newline='') as timing_file:
    timing_writer = csv.writer(timing_file)
    timing_writer.writerow(['filename','count','start','end','duration'])
    for filename in sorted(os.listdir(production_dir)):
        if not filename.startswith("file"):
            continue
        print(f"processing: {filename} at {datetime.now()}")
        # An explicit encoding keeps the write and the read-back below in
        # agreement regardless of the platform's default encoding.
        with open(transfer_file, 'w', newline='', encoding='utf-8') as csvf:
            # No quoting: delimiter and quote characters inside the JSON text
            # are backslash-escaped instead.
            writer = csv.writer(csvf, delimiter=',', quoting=csv.QUOTE_NONE, escapechar='\\')
            for record in read_records_from_file(f"{production_dir}/{filename}"):
                writer.writerow(record)
                count += 1
        print(f"processed: {count} at {datetime.now()}")
        print(f"copying from csv file {transfer_file}")
        # Timing covers only the database load, not the CSV rewrite above.
        start = datetime.now()
        try:
            with open(transfer_file, 'r', encoding='utf-8') as csvf:
                with connection.cursor() as cursor:
                    cursor.copy_from(csvf, 'papers', sep=',', null='')
            connection.commit()
        except Exception:
            # Undo the failed transaction so the connection stays usable when
            # the cell is re-run, then surface the original error.
            connection.rollback()
            raise
        end = datetime.now()
        print(f"loaded: {count} in {end - start}")
        timing_writer.writerow([filename,count,start,end, end-start])
        # Flush after each shard so timings survive an interrupted session.
        timing_file.flush()
print('done')


processing: file at 2024-04-20 12:28:52.738506
processed: 4965667 at 2024-04-20 12:34:11.174971
copying from csv file /home/romilly/data/ss-corpus/2024-04-02/papers/transfer.csv
loaded: 4965667 in 0:11:21.221421
processing: file0 at 2024-04-20 12:45:32.396742
processed: 7385768 at 2024-04-20 12:49:00.654992
copying from csv file /home/romilly/data/ss-corpus/2024-04-02/papers/transfer.csv
loaded: 7385768 in 0:05:13.558437
processing: file1 at 2024-04-20 12:54:14.213734
processed: 12351115 at 2024-04-20 13:02:51.189908
copying from csv file /home/romilly/data/ss-corpus/2024-04-02/papers/transfer.csv
loaded: 12351115 in 0:10:59.707430
processing: file10 at 2024-04-20 13:13:50.897607
processed: 17317075 at 2024-04-20 13:23:20.246948
copying from csv file /home/romilly/data/ss-corpus/2024-04-02/papers/transfer.csv
loaded: 17317075 in 0:11:32.311852
processing: file11 at 2024-04-20 13:34:52.559131
processed: 22281742 at 2024-04-20 13:46:38.743494
copying from csv file /home/romilly/data/ss-c