Approach 1: Row-by-row inserts (works, but slow: throughput is about 600 records per minute)

In [None]:
import psycopg2
import os
from dotenv import load_dotenv
import uuid
import pandas as pd

# Load environment variables
load_dotenv()

# Database connection parameters
dbname = os.getenv("DB_NAME")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")

# Path to your taxons TSV file
tsv_file_path = '../data/taxons.tsv'  # Update to the correct path

# TSV columns to load, listed in the same order as the INSERT column list below
# (taxon_id comes first and is generated per row).
taxon_source_columns = [
    'scientificName', 'canonicalName', 'genericName', 'taxonomicStatus',
    'kingdom', 'phylum', 'class', 'order', 'family', 'genus',
]

# Parameterized INSERT: psycopg2 quotes every value itself, so a name containing
# a quote character can neither break the statement nor inject SQL into it.
sql_statement = """
INSERT INTO taxons (
    taxon_id, scientific_name, canonical_name, generic_name, taxonomic_status,
    kingdom, phylum, class, "order", family, genus
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
"""

# Read TSV file into DataFrame with chunking
df_taxons = pd.read_csv(tsv_file_path, sep='\t', chunksize=100)  # Adjust chunk size as needed

# Pre-bind both handles so the finally block is safe even when connect() or
# cursor() raises (otherwise the names are unbound, or stale from an earlier run).
conn = None
cursor = None

# Establishing Database Connection
try:
    conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host, port=port)
    cursor = conn.cursor()

    for chunk in df_taxons:
        for index, row in chunk.iterrows():
            # Generate UUID for each taxon
            taxon_id = uuid.uuid4()

            # Missing cells become '' (as before); str() keeps a cell that pandas
            # parsed as a number from failing, and is a no-op for strings.
            params = [str(taxon_id)]
            for column in taxon_source_columns:
                cell = row[column]
                params.append(str(cell) if pd.notna(cell) else '')

            # Execute the SQL statement
            cursor.execute(sql_statement, params)

        # Commit after each chunk
        conn.commit()
        print("Chunk inserted successfully")

except Exception as e:
    print(f"Error: {e}")
finally:
    # Close the cursor and connection to clean up; closing the connection
    # discards whatever part of the current chunk was not committed.
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()


Batch parallelization with a DB connection pool, without retries (1 file)

- 2 million (before the error-handling line)

In [None]:
import psycopg2
from psycopg2 import pool, extras
import os
from dotenv import load_dotenv
import uuid
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load environment variables
load_dotenv()

# Database connection parameters
db_params = {
    "dbname": os.getenv("DB_NAME"),
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "host": os.getenv("DB_HOST"),
    "port": os.getenv("DB_PORT")
}

# Initialize the connection pool (1 to 10 connections).
# The pool is shared by the ThreadPoolExecutor workers that run
# process_and_insert_chunk, so it has to be the thread-safe variant:
# psycopg2 documents SimpleConnectionPool as not shareable across threads.
# ThreadedConnectionPool exposes the same getconn/putconn/closeall interface.
db_pool = psycopg2.pool.ThreadedConnectionPool(1, 10, **db_params)

def truncate(value, length):
    """Return ``value`` cut down to at most ``length`` characters.

    Any falsy ``value`` (``None``, ``''`` ...) is normalised to the empty string.
    """
    if not value:
        return ''
    if len(value) > length:
        return value[:length]
    return value


def process_and_insert_chunk(chunk):
    """Insert one DataFrame chunk into the ``taxons`` table.

    A connection is borrowed from the module-level ``db_pool`` and always
    returned to it. The chunk is first sent as a single batch; if that fails,
    the batch is rolled back and every row is retried (and committed) on its
    own so that one bad row does not discard the rest of the chunk.

    Errors are printed, never raised.

    Args:
        chunk: DataFrame holding the source columns listed in ``column_limits``.
    """
    # (source column, maximum stored length), in the order of the INSERT column
    # list below (taxon_id comes first and is generated per row).
    column_limits = (
        ('scientificName', 255),
        ('canonicalName', 255),
        ('genericName', 255),
        ('taxonomicStatus', 50),
        ('kingdom', 100),
        ('phylum', 100),
        ('class', 100),
        ('order', 100),
        ('family', 100),
        ('genus', 100),
    )

    sql_statement = """
        INSERT INTO taxons (
            taxon_id, scientific_name, canonical_name, generic_name, taxonomic_status,
            kingdom, phylum, class, "order", family, genus
        ) VALUES %s;
        """

    conn = db_pool.getconn()  # Get a connection from the pool
    try:
        # Prepare the values for insertion: a fresh UUID (as string) followed by
        # the truncated text columns, with missing cells stored as ''.
        values = [
            (str(uuid.uuid4()),) + tuple(
                truncate(row[column] if pd.notnull(row[column]) else '', limit)
                for column, limit in column_limits
            )
            for _, row in chunk.iterrows()
        ]

        # The context manager closes the cursor on every exit path.
        with conn.cursor() as cursor:
            try:
                # Attempt to insert the chunk as a batch
                extras.execute_values(cursor, sql_statement, values, template=None, page_size=100)
                conn.commit()
            except Exception as batch_error:
                print(f"Batch insert failed, attempting row-by-row insertion. Error: {batch_error}")
                conn.rollback()
                # Fallback to row-by-row insertion if batch fails. The whole row
                # tuple is bound to the single %s placeholder.
                for value in values:
                    try:
                        cursor.execute(sql_statement, (value,))
                        conn.commit()
                    except Exception as row_error:
                        print(f"Error inserting row: {row_error}")
                        conn.rollback()
    except Exception as e:
        print(f"Error processing chunk: {e}")
        conn.rollback()
    finally:
        db_pool.putconn(conn)  # Return the connection to the pool

# Path to your taxons TSV file and read it into DataFrame with chunking
tsv_file_path = '../data/taxons.tsv'
# on_bad_lines replaces error_bad_lines / warn_bad_lines (deprecated in pandas 1.3,
# removed in 2.0). 'warn' keeps the previous behaviour: malformed lines are skipped
# and a warning is emitted for each of them.
df_taxons = pd.read_csv(tsv_file_path, sep='\t', chunksize=100, on_bad_lines='warn')

try:
    # Use ThreadPoolExecutor to parallelize the insertion
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(process_and_insert_chunk, chunk) for chunk in df_taxons]

        for future in as_completed(futures):
            # result() re-raises anything the worker did not handle itself (for
            # example a failure to obtain a pooled connection); without this
            # call such errors would disappear silently.
            try:
                future.result()
            except Exception as error:
                print(f"Chunk failed: {error}")
            else:
                print("Chunk processed successfully")
finally:
    # Cleanup: close the connection pool even if reading or submitting failed
    db_pool.closeall()

File parallelization with a DB connection pool and file- and batch-level retry mechanisms (700 files). Worked!

In [1]:
import psycopg2
from psycopg2 import extras, pool
import os
from dotenv import load_dotenv
import uuid
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import time

# Load environment variables and return database connection parameters
def load_db_params():
    """Load the .env file and return the database connection settings as a dict.

    Keys are ``dbname``, ``user``, ``password``, ``host`` and ``port``; each value
    is read from the matching ``DB_*`` environment variable and is ``None`` when
    that variable is not set.
    """
    load_dotenv()
    env_names = (
        ("dbname", "DB_NAME"),
        ("user", "DB_USER"),
        ("password", "DB_PASSWORD"),
        ("host", "DB_HOST"),
        ("port", "DB_PORT"),
    )
    return {key: os.getenv(env_name) for key, env_name in env_names}

# Truncate string fields to their maximum lengths
def truncate(value, length):
    """Clip ``value`` to ``length`` characters; falsy input yields ``''``."""
    if value:
        too_long = len(value) > length
        return value[:length] if too_long else value
    return ''

# Log the name of processed files
def log_processed_file(file_path, log_file_path='processed_files.log'):
    """Append ``file_path`` as one line to the processed-files log.

    Args:
        file_path: Path of the finished file, as a string.
        log_file_path: Log file to append to; opened in append mode.
    """
    with open(log_file_path, mode='a') as log_handle:
        log_handle.write(file_path + '\n')

# Get the set of already processed files
def get_processed_files(log_file_path='processed_files.log'):
    """Return the set of entries recorded in the processed-files log.

    Each line is stripped of surrounding whitespace. A missing log file is
    treated as "nothing processed yet" and yields an empty set.
    """
    try:
        with open(log_file_path, 'r') as log_handle:
            return {entry.strip() for entry in log_handle}
    except FileNotFoundError:
        return set()

# Process a single chunk of data from a CSV file
def process_and_insert_chunk(chunk, db_pool):
    """Insert one DataFrame chunk into ``taxons`` as a single committed batch.

    Args:
        chunk: DataFrame holding the source columns listed in ``column_limits``.
        db_pool: Connection pool; a connection is borrowed with ``getconn()``
            and always handed back with ``putconn()``.

    Raises:
        Exception: Whatever building the rows or the insert raised. The
            transaction is rolled back first, so a failed chunk leaves no
            partial data and can be retried by the caller.
    """
    # (source column, maximum stored length), in the order of the INSERT column
    # list below (taxon_id comes first and is generated per row).
    column_limits = (
        ('scientificName', 255),
        ('canonicalName', 255),
        ('genericName', 255),
        ('taxonomicStatus', 50),
        ('kingdom', 100),
        ('phylum', 100),
        ('class', 100),
        ('order', 100),
        ('family', 100),
        ('genus', 100),
    )

    conn = db_pool.getconn()
    try:
        # One tuple per row: a fresh UUID (as string) followed by the truncated
        # text columns, with missing cells stored as ''.
        values = [
            (str(uuid.uuid4()),) + tuple(
                truncate(row[column] if pd.notnull(row[column]) else '', limit)
                for column, limit in column_limits
            )
            for _, row in chunk.iterrows()
        ]
        # The context manager closes the cursor on success and on error alike.
        with conn.cursor() as cursor:
            extras.execute_values(cursor, """
                INSERT INTO taxons (
                    taxon_id, scientific_name, canonical_name, generic_name, taxonomic_status,
                    kingdom, phylum, class, "order", family, genus
                ) VALUES %s;
            """, values)
        conn.commit()
    except Exception:
        conn.rollback()
        raise  # bare raise keeps the original traceback intact
    finally:
        db_pool.putconn(conn)

# Process an entire file with retry logic
def process_file_with_batch_retry(file_path, db_pool, max_file_retries=3, max_chunk_retries=3, wait_seconds=5):
    """Load one CSV file into the database, retrying failed chunks and the file.

    The file is read in chunks of 100 rows (malformed lines are skipped). Each
    chunk is passed to ``process_and_insert_chunk`` up to ``max_chunk_retries``
    times; a chunk that still fails is reported and skipped. If reading the file
    or logging it fails, the whole file is retried up to ``max_file_retries``
    times. Once every chunk has been attempted the file is appended to the
    processed-files log.

    Args:
        file_path: Path of the CSV file to load.
        db_pool: Connection pool handed through to ``process_and_insert_chunk``.
        max_file_retries: Maximum number of attempts for the file as a whole.
        max_chunk_retries: Maximum number of attempts for each chunk.
        wait_seconds: Pause between two consecutive attempts.
    """
    # Progress survives file-level retries. Re-reading the file must not
    # re-insert chunks that were already handled: every insert generates fresh
    # UUIDs, so a repeated chunk would be stored as duplicate rows.
    chunks_done = 0
    failed_chunks = 0

    for file_attempt in range(1, max_file_retries + 1):
        try:
            # The context manager closes the underlying file handle even when
            # parsing fails part-way through.
            with pd.read_csv(file_path, sep=',', chunksize=100, on_bad_lines='skip') as reader:
                for chunk_index, chunk in enumerate(reader):
                    if chunk_index < chunks_done:
                        continue  # handled during an earlier file attempt

                    for chunk_attempt in range(1, max_chunk_retries + 1):
                        try:
                            process_and_insert_chunk(chunk, db_pool)
                            break  # chunk stored; leave the chunk retry loop
                        except Exception as chunk_error:
                            print(f"Error processing chunk in {file_path}, attempt {chunk_attempt}: {chunk_error}")
                            # No point in waiting after the final attempt.
                            if chunk_attempt < max_chunk_retries:
                                time.sleep(wait_seconds)
                    else:
                        # Loop ended without break: every attempt failed.
                        failed_chunks += 1
                        print(f"Max chunk retries reached for a chunk in {file_path}.")

                    chunks_done = chunk_index + 1

            # Log the file as processed after all chunks are attempted
            log_processed_file(str(file_path))
            if failed_chunks:
                print(f"{file_path} processed, but {failed_chunks} chunk(s) could not be inserted.")
            else:
                print(f"{file_path} processed successfully with retries.")
            return
        except Exception as file_error:
            print(f"Error processing file {file_path}, attempt {file_attempt}: {file_error}")
            if file_attempt < max_file_retries:
                time.sleep(wait_seconds)  # Wait before retrying the file

    print(f"Failed to process {file_path} after {max_file_retries} attempts.")

# Parallel file processing with file and batch retries
# Parallel file processing with file and batch retries
if __name__ == "__main__":
    db_params = load_db_params()
    # The pool is shared by the worker threads below, so it has to be the
    # thread-safe variant: psycopg2 documents SimpleConnectionPool as not
    # shareable across threads. The getconn/putconn/closeall interface is the same.
    db_pool = psycopg2.pool.ThreadedConnectionPool(1, 20, **db_params)

    try:
        data_directory = Path('../data/temp')
        # sorted() only makes the submission order deterministic between runs.
        csv_files = sorted(data_directory.glob('taxons_chunk_*.csv'))
        processed_files = get_processed_files()
        # Files already listed in the processed-files log are not loaded again.
        pending_files = [file_path for file_path in csv_files if str(file_path) not in processed_files]

        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = {
                executor.submit(process_file_with_batch_retry, file_path, db_pool): file_path
                for file_path in pending_files
            }

            for future in as_completed(futures):
                file_path = futures[future]
                # result() re-raises anything that escaped the worker; without
                # this call such an error would be dropped silently.
                try:
                    future.result()
                except Exception as error:
                    print(f"Unhandled error while processing {file_path}: {error}")
    finally:
        # Release every pooled connection, even when setup or submission failed.
        db_pool.closeall()

    print("Processing complete.")


Processing complete.
