In [13]:
from multiprocessing import Process, Queue
from sqlalchemy import create_engine, text
from sqlalchemy.pool import NullPool
from configparser import ConfigParser
from tqdm.auto import tqdm, trange
import uuid
import time

In [14]:
def config(filename='database.ini', mode="test"):
    # create a parser
    parser = ConfigParser()
    # read config file
    parser.read(filename)
    section='postgresql'
    if mode == "production":
        section = "cah_production"
    # get section, default to postgresql
    db = {}
    if parser.has_section(section):
        params = parser.items(section)
        for param in params:
            db[param[0]] = param[1]
    else:
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))
    return db

In [15]:
def dump_3m(j, workers, engine, jobtype, cycles, queue, path):
    engine.dispose()
    with engine.connect() as eng:
        conn = engine.raw_connection()
        for i in range(cycles):
            file = uuid.uuid4()
            # clipped out
            if jobtype == "clipped":
                select_stmt1 = f"COPY (DELETE FROM dataset WHERE sampleid % {cycles*workers} = {cycles*j+i} and status = 2 RETURNING *) TO '{path}/clipped/ok-{file}.csv' DELIMITER '|' CSV HEADER;"
            # rejected out
            elif jobtype == "rejected":
                select_stmt1 = f"COPY (DELETE FROM dataset WHERE sampleid % {cycles*workers} = {cycles*j+i} and status > 8 RETURNING *) TO '{path}/rejected/bad-{file}.csv' DELIMITER '|' CSV HEADER;"
            # todo nolang out
            elif jobtype == "todo_nolang":
                select_stmt1 = f"COPY (DELETE FROM dataset WHERE sampleid % {cycles*workers} = {cycles*j+i} and status = 0 and language = '' RETURNING *) TO '{path}/todo/nolang/nolang-{file}.csv' DELIMITER '|' CSV HEADER;"
            # todo intl out
            elif jobtype == "todo_intl":
                select_stmt1 = f"COPY (DELETE FROM dataset WHERE sampleid % {cycles*workers} = {cycles*j+i} and status = 0 and language not in ('','en') RETURNING *) TO '{path}/todo/intl/intl-{file}.csv' DELIMITER '|' CSV HEADER;"
            # todo english out
            elif jobtype == "todo_en":
                select_stmt1 = f"COPY (DELETE FROM dataset WHERE sampleid % {cycles*workers} = {cycles*j+i} and status = 0 and language = 'en' RETURNING *) TO '{path}/todo/english/eng-{file}.csv' DELIMITER '|' CSV HEADER;"
            else:
                continue
            try:
                cur = conn.cursor()
                cur.execute(select_stmt1)
                conn.commit()
            except Exception as e:
                print(f"error: {e}")
            queue.put(1)
    return

In [16]:
mode = "production"
params = config(mode=mode)
engine = create_engine(f'postgresql://{params["user"]}:{params["password"]}@{params["host"]}:5432/{params["database"]}', pool_pre_ping=True, poolclass=NullPool)

path = "/home/cah"
if mode == "production":
    path = "/mnt/md1/export"


with engine.connect() as eng:
    conn = engine.raw_connection()
    #calculate number of cycles for each type of job
    #clipped
    clipped = 0
    select_stmt1 = f"SELECT count(*) from dataset where status = 2 and language = 'en';"
    try:
        cur = conn.cursor()
        cur.execute(select_stmt1)
        conn.commit()
        clipped = cur.fetchone()[0]
    except Exception as e:
        print(f"clipped error: {e}")

    rejected = 0
    select_stmt1 = f"SELECT count(*) from dataset where status > 8;"
    try:
        cur = conn.cursor()
        cur.execute(select_stmt1)
        conn.commit()
        rejected = cur.fetchone()[0]
    except Exception as e:
        print(f"rejected error: {e}")

    todonolang = 0
    select_stmt1 = f"SELECT count(*) from dataset where status = 0 and language = '';"
    try:
        cur = conn.cursor()
        cur.execute(select_stmt1)
        conn.commit()
        todonolang = cur.fetchone()[0]
    except Exception as e:
        print(f"todonolang error: {e}")

    todointl = 0
    select_stmt1 = f"SELECT count(*) from dataset where status = 0 and language not in ('','en');"
    try:
        cur = conn.cursor()
        cur.execute(select_stmt1)
        conn.commit()
        todointl = cur.fetchone()[0]
    except Exception as e:
        print(f"todointl error: {e}")

    todoen = 0
    select_stmt1 = f"SELECT count(*) from dataset where status = 0 and language = 'en';"
    try:
        cur = conn.cursor()
        cur.execute(select_stmt1)
        conn.commit()
        todoen = cur.fetchone()[0]
    except Exception as e:
        print(f"todoen error: {e}")

# calculate workers distribution
total = clipped + rejected + todonolang + todointl # + todoen
clipped_proc = int(round(clipped*10 / total, 0))
rejected_proc = int(round(rejected*10 / total, 0))
todonolang_proc = int(round(todonolang*10 / total, 0))
todointl_proc = int(round(todointl*10 / total, 0))
#todoen_proc = int(round(todoen*10 / total, 0))
cycles = int(round(total / 50000000, 0))

workers = []
for _ in range(clipped_proc):
    workers.append("clipped")
for _ in range(rejected_proc):
    workers.append("rejected")
for _ in range(todonolang_proc):
    workers.append("todo_nolang")
for _ in range(todointl_proc):
    workers.append("todo_intl")
#for j in range(todoen_proc):
#    workers.append("todo_en")

print(f"total={total}, cycles={cycles} processed by {len(workers)} workers")

iterations = cycles * len(workers)
processes = []
pbars = []
pbar = tqdm(total=iterations)
q = Queue()

for i, worker in enumerate(workers):
    print(f"[{i}] {worker}")
    j = 0
    num = 0
    if worker == "clipped":
        j = i
        num = clipped_proc
    elif worker == "rejected":
        j = i - clipped_proc
        num = rejected_proc
    elif worker == "todo_nolang":
        j = i - clipped_proc - rejected_proc
        num = todonolang_proc
    elif worker == "todo_intl":
        j = i - clipped_proc - rejected_proc - todonolang_proc
        num = todointl_proc
    else:
        j = i - clipped_proc - rejected_proc - todonolang_proc - todointl_proc
        num = todoen_proc

    p = Process(target=dump_3m, args = [j, num, engine, worker, cycles, q, path], daemon=False)
    try:
        p.start()
        processes.append(p)
    except:
        pass

progress = 0
while progress < iterations:
    if not q.empty():
        q.get()
        pbar.update(1)
        progress += 1
    time.sleep(0.2)

for proc in processes:
    proc.join()

print (f"Job ended")


total=55545859, cycles=1 processed by 9 workers


  0%|          | 0/9 [00:00<?, ?it/s]

[0] todo_nolang
[1] todo_nolang
[2] todo_nolang
[3] todo_intl
[4] todo_intl
[5] todo_intl
[6] todo_intl
[7] todo_intl
[8] todo_intl
Job ended
