In [1]:
import os

import pandas as pd
import psycopg


def create_db_connections(db_configs):
    """
    Create and return connections to multiple PostgreSQL databases.

    A database that cannot be reached (``psycopg.OperationalError``) is reported
    on stdout and skipped, so the result may hold fewer entries than
    ``db_configs``.

    :param db_configs: A list of dictionaries, where each dictionary contains
                       the database connection parameters.
                       Example:
                       [
                           {
                               "dbname": "db1",
                               "user": "user1",
                               "password": "password1",
                               "host": "localhost",
                               "port": "5432"
                           },
                           {
                               "dbname": "db2",
                               "user": "user2",
                               "password": "password2",
                               "host": "localhost",
                               "port": "5432"
                           }
                       ]
    :return: A dictionary where the key is the database name and the value is the connection object.
             When two configs share a ``dbname`` the later connection wins and
             the earlier one is closed instead of being leaked.
    :raises KeyError: If a config is missing one of the required keys. Every
                      connection opened so far is closed before the error
                      propagates.
    """
    connections = {}
    try:
        for config in db_configs:
            dbname = config["dbname"]
            try:
                conn = psycopg.connect(
                    dbname=dbname,
                    user=config["user"],
                    password=config["password"],
                    host=config["host"],
                    port=config["port"],
                )
            except psycopg.OperationalError as e:
                print(f"Error connecting to database {dbname}: {e}")
                continue

            # The dict is keyed by dbname, so a duplicate name would silently
            # drop the only reference to the earlier connection; close it first.
            displaced = connections.get(dbname)
            if displaced is not None:
                displaced.close()

            connections[dbname] = conn
            print(f"Connected to database {dbname} successfully.")
    except BaseException:
        # The caller never receives the dict on failure, so nobody else could
        # close these connections. Cleanup is best-effort; the original error
        # is the one that matters and is re-raised below.
        for opened in connections.values():
            try:
                opened.close()
            except Exception:
                pass
        raise

    return connections


def close_db_connections(connections):
    """
    Close all the database connections.

    Closing is best-effort: if one connection fails to close, the failure is
    reported on stdout and the remaining connections are still closed.

    :param connections: Dictionary of database connections.
                        The keys should be the database names and the values should be the connection objects.
    """
    for dbname, conn in connections.items():
        try:
            conn.close()
        except Exception as e:
            # Keep going: aborting here would leave every later connection open.
            print(f"Error closing connection to database {dbname}: {e}")
        else:
            print(f"Connection to database {dbname} closed.")


def fetch_column_names(conn, table_name):
    """
    Fetch the column names for a given table.

    The names are read from ``information_schema.columns`` in ordinal order and
    are also printed to stdout. The lookup filters on the table name only, so a
    table of the same name in another schema contributes its columns too.

    :param conn: A psycopg connection object.
    :param table_name: The name of the table to fetch column names from.
    :return: A list of column names.
    """
    # The table name is sent as a bound parameter rather than spliced into the
    # SQL text, so quotes or SQL fragments in it cannot alter the statement.
    query = """
    SELECT column_name
    FROM information_schema.columns
    WHERE table_name = %s
    ORDER BY ordinal_position;
    """
    with conn.cursor() as cursor:
        cursor.execute(query, (table_name,))
        columns = [row[0] for row in cursor.fetchall()]
        print(columns)
    return columns


def fetch_data_in_chunks(conn, query, chunk_size=10000, params=None):
    """
    Fetch data in chunks to handle large datasets.

    Each chunk is retrieved by re-running ``query`` with a ``LIMIT``/``OFFSET``
    clause appended, so ``query`` must be a single SELECT that does not already
    end in its own LIMIT/OFFSET. Because every page is an independent
    execution, the query should carry an ``ORDER BY`` that produces a unique
    row order; otherwise pages may overlap or skip rows.

    :param conn: A psycopg connection object.
    :param query: SQL query to execute. A trailing semicolon is tolerated.
    :param chunk_size: Number of rows to fetch per chunk (positive integer).
    :param params: Optional parameters for the SQL query.
    :return: A generator that yields chunks of data (non-empty lists of rows).
    :raises ValueError: If ``chunk_size`` is not a positive integer. As this is
                        a generator, the error surfaces on first iteration.
    """
    # chunk_size is interpolated into the SQL text, so force it to a real int.
    chunk_size = int(chunk_size)
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")

    # A trailing ";" would terminate the statement before the appended LIMIT.
    base_query = query.strip().rstrip(";").rstrip()

    offset = 0
    while True:
        # LIMIT goes on its own line: if the query ends in a "--" comment, text
        # appended on the same line would be commented out and every page would
        # return the full result, so the loop would never terminate.
        chunk_query = f"{base_query}\nLIMIT {chunk_size} OFFSET {offset};"
        with conn.cursor() as cursor:
            cursor.execute(chunk_query, params)
            data = cursor.fetchall()
        if not data:
            break
        yield data
        # A short page is necessarily the last one; skip the extra round trip.
        if len(data) < chunk_size:
            break
        offset += chunk_size


def run_query_with_dynamic_columns(
    db_configs, table_name, user_query, chunk_size=10000, params=None
):
    """
    Execute a user query across multiple databases and return the results as a pandas DataFrame.

    Column labels are taken from ``table_name``'s column list, not from the
    query itself, so the query must return exactly those columns (for example
    ``SELECT * FROM <table_name>``). A query whose column count differs is
    reported as an error for that database.

    A failure in one database is printed and does not stop the remaining
    databases from being queried. All connections are closed before returning.

    :param db_configs: List of dictionaries containing database connection parameters.
    :param table_name: The table name to fetch column names from.
    :param user_query: SQL query provided by the user.
    :param chunk_size: Number of rows to fetch per chunk.
    :param params: Optional parameters for the SQL query.
    :return: A pandas DataFrame with the combined results from all databases.
             The DataFrame includes a column for the database name.
             An empty DataFrame is returned when no database produced rows.
    """
    # Establish database connections
    connections = create_db_connections(db_configs)

    try:
        # List to hold individual DataFrames from each database
        dfs = []

        # For each database, fetch column names and execute the user query
        for dbname, conn in connections.items():
            try:
                column_names = fetch_column_names(conn, table_name)

                # Tracked per database so an empty result can be reported once
                # the chunk loop has finished.
                received_rows = False

                # Fetch data in chunks and append to DataFrames list
                for chunk in fetch_data_in_chunks(conn, user_query, chunk_size, params):
                    if not chunk:
                        continue
                    received_rows = True

                    if len(chunk[0]) != len(column_names):
                        raise ValueError(
                            f"Number of columns in result from {dbname} does not match the provided column names."
                        )

                    df = pd.DataFrame(chunk, columns=column_names)
                    df["database"] = dbname
                    dfs.append(df)

                if not received_rows:
                    print(f"No data returned from {dbname}.")

            except Exception as e:
                # Deliberately broad: one failing database must not prevent the
                # others from being queried.
                print(f"Error in {dbname}: {e}")

        # Concatenate all DataFrames into a single DataFrame
        combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

        return combined_df
    finally:
        # Ensure connections are closed even if an error occurs
        close_db_connections(connections)


# Example usage
# Credentials and endpoint are read from the environment (same variable names
# that libpq uses) so real secrets never have to be committed with the
# notebook. The literals are fallbacks for a local development database only.
db_configs = [
    {
        "dbname": "Employees",
        "user": os.environ.get("PGUSER", "postgres"),
        "password": os.environ.get("PGPASSWORD", "root"),
        "host": os.environ.get("PGHOST", "localhost"),
        "port": os.environ.get("PGPORT", "5432"),
    }
]

In [3]:
user_query = "SELECT * FROM employees ORDER BY emp_no ASC"  # Adjust the query as needed
table_name = "employees"  # Replace with your table name

# Run the function with the user query
df = run_query_with_dynamic_columns(db_configs, table_name, user_query)

# Leave the DataFrame as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump produced by print().
df


Connected to database Employees successfully.
['emp_no', 'birth_date', 'first_name', 'last_name', 'gender', 'hire_date']
Connection to database Employees closed.
        emp_no  birth_date first_name last_name gender   hire_date   database
0        10001  1953-09-02     Georgi   Facello      M  1986-06-26  Employees
1        10002  1964-06-02    Bezalel    Simmel      F  1985-11-21  Employees
2        10003  1959-12-03      Parto   Bamford      M  1986-08-28  Employees
3        10004  1954-05-01  Chirstian   Koblick      M  1986-12-01  Employees
4        10005  1955-01-21    Kyoichi  Maliniak      M  1989-09-12  Employees
...        ...         ...        ...       ...    ...         ...        ...
300019  499995  1958-09-24     Dekang  Lichtner      F  1993-01-12  Employees
300020  499996  1953-03-07       Zito      Baaz      M  1990-09-27  Employees
300021  499997  1961-08-03    Berhard    Lenart      M  1986-04-21  Employees
300022  499998  1956-09-05   Patricia   Breugel      M  19

In [4]:
user_query = """SELECT * FROM titles WHERE emp_no = '10006' ORDER BY emp_no ASC"""  # Adjust the query as needed
table_name = "titles"  # Replace with your table name

# Run the function with the user query
df = run_query_with_dynamic_columns(db_configs, table_name, user_query)

# Leave the DataFrame as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump produced by print().
df

Connected to database Employees successfully.
['emp_no', 'title', 'from_date', 'to_date']
Connection to database Employees closed.
   emp_no            title   from_date     to_date   database
0   10006  Senior Engineer  1990-08-05  9999-01-01  Employees


In [5]:
import pandas as pd
import psycopg

def fetch_column_names_for_query(conn, query, params=None):
    """
    Fetch the column names a query produces, without fetching any rows.

    The query is wrapped in a ``LIMIT 0`` subquery and the names are read from
    the cursor description.

    :param conn: A psycopg connection object.
    :param query: SQL query to analyze. A trailing semicolon is tolerated.
    :param params: Optional parameters for the SQL query; needed whenever the
                   query contains placeholders.
    :return: A list of column names.
    """
    # A trailing ";" would be a syntax error inside the parentheses.
    inner_query = query.strip().rstrip(";").rstrip()
    # The closing parenthesis sits on its own line so that a "--" comment at the
    # end of the query cannot comment it out.
    probe_query = f"SELECT * FROM (\n{inner_query}\n) AS subquery LIMIT 0;"
    with conn.cursor() as cursor:
        cursor.execute(probe_query, params)
        return [desc[0] for desc in cursor.description]


def run_query_with_dynamic_columns(db_configs, user_query, chunk_size=10000, params=None):
    """
    Execute a user query across multiple databases and return the results as a pandas DataFrame.

    Column labels are derived from the query itself, so aliases and computed
    columns are labelled as the database reports them.

    A failure in one database is printed and does not stop the remaining
    databases from being queried. All connections are closed before returning.

    :param db_configs: List of dictionaries containing database connection parameters.
    :param user_query: SQL query provided by the user.
    :param chunk_size: Number of rows to fetch per chunk.
    :param params: Optional parameters for the SQL query.
    :return: A pandas DataFrame with the combined results from all databases.
             The DataFrame includes a column for the database name.
             An empty DataFrame is returned when no database produced rows.
    """
    # Establish database connections
    connections = create_db_connections(db_configs)

    try:
        # List to hold individual DataFrames from each database
        dfs = []

        # For each database, fetch column names and execute the user query
        for dbname, conn in connections.items():
            try:
                # The probe must receive the same params as the real query,
                # otherwise a query containing placeholders cannot be executed.
                column_names = fetch_column_names_for_query(conn, user_query, params)

                # Tracked per database so an empty result can be reported once
                # the chunk loop has finished.
                received_rows = False

                # Fetch data in chunks and append to DataFrames list
                for chunk in fetch_data_in_chunks(conn, user_query, chunk_size, params):
                    if not chunk:
                        continue
                    received_rows = True

                    if len(chunk[0]) != len(column_names):
                        raise ValueError(f"Number of columns in result from {dbname} does not match the provided column names.")

                    df = pd.DataFrame(chunk, columns=column_names)
                    df['database'] = dbname
                    dfs.append(df)

                if not received_rows:
                    print(f"No data returned from {dbname}.")

            except Exception as e:
                # Deliberately broad: one failing database must not prevent the
                # others from being queried.
                print(f"Error in {dbname}: {e}")

        # Concatenate all DataFrames into a single DataFrame
        combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

        return combined_df
    finally:
        # Ensure connections are closed even if an error occurs
        close_db_connections(connections)

# Example usage
# Credentials and endpoint are read from the environment (same variable names
# that libpq uses) so real secrets never have to be committed with the
# notebook. The literals are fallbacks for a local development database only.
db_configs = [
    {
        "dbname": "Employees",
        "user": os.environ.get("PGUSER", "postgres"),
        "password": os.environ.get("PGPASSWORD", "root"),
        "host": os.environ.get("PGHOST", "localhost"),
        "port": os.environ.get("PGPORT", "5432"),
    }
]



In [6]:
# Define the user's query
user_query = """SELECT emp_no AS "Employee #" FROM "employees" ORDER BY emp_no ASC"""

# Run the function with the user query
df = run_query_with_dynamic_columns(db_configs, user_query)

# Leave the DataFrame as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump produced by print().
df


Connected to database Employees successfully.
Connection to database Employees closed.
        Employee #   database
0            10001  Employees
1            10002  Employees
2            10003  Employees
3            10004  Employees
4            10005  Employees
...            ...        ...
300019      499995  Employees
300020      499996  Employees
300021      499997  Employees
300022      499998  Employees
300023      499999  Employees

[300024 rows x 2 columns]


In [7]:
# Query three employee columns under quoted display aliases, ordered by emp_no.
# The aliases become the DataFrame's column labels.
user_query = """SELECT emp_no AS "Employee #", birth_date AS "Birthday", first_name AS "First name" FROM "employees" ORDER BY emp_no ASC"""

# Execute the query against every database listed in db_configs
df = run_query_with_dynamic_columns(db_configs, user_query)

# Last expression of the cell, so the notebook renders the combined DataFrame
df

Connected to database Employees successfully.
Connection to database Employees closed.


Unnamed: 0,Employee #,Birthday,First name,database
0,10001,1953-09-02,Georgi,Employees
1,10002,1964-06-02,Bezalel,Employees
2,10003,1959-12-03,Parto,Employees
3,10004,1954-05-01,Chirstian,Employees
4,10005,1955-01-21,Kyoichi,Employees
...,...,...,...,...
300019,499995,1958-09-24,Dekang,Employees
300020,499996,1953-03-07,Zito,Employees
300021,499997,1961-08-03,Berhard,Employees
300022,499998,1956-09-05,Patricia,Employees


In [8]:
# Query every column of the employees table, ordered by emp_no
user_query = """SELECT * FROM "employees" ORDER BY emp_no ASC"""

# Execute the query against every database listed in db_configs
df = run_query_with_dynamic_columns(db_configs, user_query)

# Last expression of the cell, so the notebook renders the combined DataFrame
df

Connected to database Employees successfully.
Connection to database Employees closed.


Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date,database
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26,Employees
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21,Employees
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28,Employees
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01,Employees
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12,Employees
...,...,...,...,...,...,...,...
300019,499995,1958-09-24,Dekang,Lichtner,F,1993-01-12,Employees
300020,499996,1953-03-07,Zito,Baaz,M,1990-09-27,Employees
300021,499997,1961-08-03,Berhard,Lenart,M,1986-04-21,Employees
300022,499998,1956-09-05,Patricia,Breugel,M,1993-10-13,Employees


In [9]:
# Build one "<emp_no> is a <title>" string per row of the titles table.
# An employee can hold several titles, so emp_no alone does not give a unique
# row order; title and from_date are added as tie-breakers because the result
# is fetched page by page and an ambiguous order could repeat or skip rows
# at page boundaries.
user_query = """SELECT CONCAT(emp_no, ' is a ', title) AS "Employee Title" FROM "titles" ORDER BY emp_no ASC, title ASC, from_date ASC"""

# Run the function with the user query
df = run_query_with_dynamic_columns(db_configs, user_query)

# Display the combined DataFrame
df

Connected to database Employees successfully.
Connection to database Employees closed.


Unnamed: 0,Employee Title,database
0,10001 is a Senior Engineer,Employees
1,10002 is a Staff,Employees
2,10003 is a Senior Engineer,Employees
3,10004 is a Engineer,Employees
4,10004 is a Senior Engineer,Employees
...,...,...
443303,499997 is a Engineer,Employees
443304,499997 is a Senior Engineer,Employees
443305,499998 is a Senior Staff,Employees
443306,499998 is a Staff,Employees


In [10]:
# Concatenate first and last name into a single "Name" column, ordered by emp_no
user_query = """SELECT CONCAT(first_name, ' ', last_name) AS "Name" FROM "employees" ORDER BY emp_no ASC"""

# Execute the query against every database listed in db_configs
df = run_query_with_dynamic_columns(db_configs, user_query)

# Last expression of the cell, so the notebook renders the combined DataFrame
df

Connected to database Employees successfully.
Connection to database Employees closed.


Unnamed: 0,Name,database
0,Georgi Facello,Employees
1,Bezalel Simmel,Employees
2,Parto Bamford,Employees
3,Chirstian Koblick,Employees
4,Kyoichi Maliniak,Employees
...,...,...
300019,Dekang Lichtner,Employees
300020,Zito Baaz,Employees
300021,Berhard Lenart,Employees
300022,Patricia Breugel,Employees


In [11]:
# Count the employees. The aggregate has no alias, so the column label is
# whatever name the database assigns to it.
# The space before the closing triple quote keeps the final '"' of "employees"
# from merging with the string delimiter.
user_query = """SELECT COUNT(emp_no) FROM "employees" """

# Execute the query against every database listed in db_configs
df = run_query_with_dynamic_columns(db_configs, user_query)

# Last expression of the cell, so the notebook renders the combined DataFrame
df

Connected to database Employees successfully.
Connection to database Employees closed.


Unnamed: 0,count,database
0,300024,Employees


In [12]:
# Highest employee number. The aggregate has no alias, so the column label is
# whatever name the database assigns to it.
user_query = """SELECT max(emp_no) FROM "employees" """

# Execute the query against every database listed in db_configs
df = run_query_with_dynamic_columns(db_configs, user_query)

# Last expression of the cell, so the notebook renders the combined DataFrame
df

Connected to database Employees successfully.
Connection to database Employees closed.


Unnamed: 0,max,database
0,499999,Employees


In [13]:
# Highest salary recorded in the salaries table (unaliased aggregate)
user_query = """SELECT MAX(salary) FROM "salaries" """

# Execute the query against every database listed in db_configs
df = run_query_with_dynamic_columns(db_configs, user_query)

# Last expression of the cell, so the notebook renders the combined DataFrame
df

Connected to database Employees successfully.
Connection to database Employees closed.


Unnamed: 0,max,database
0,158220,Employees


In [14]:
# Total of all salaries in the salaries table (unaliased aggregate),
# written as a multi-line SQL string
user_query = """
                SELECT SUM(salary)
                FROM "salaries" 
             """

# Execute the query against every database listed in db_configs
df = run_query_with_dynamic_columns(db_configs, user_query)

# Last expression of the cell, so the notebook renders the combined DataFrame
df

Connected to database Employees successfully.
Connection to database Employees closed.


Unnamed: 0,sum,database
0,181480757419,Employees


In [15]:
# Demonstrates SQL comments embedded in the query text: "--" single-line
# comments and a "/* ... */" block comment. The query filters employees on
# first name AND last name.
# Keep the closing triple quote on its own line: the last SQL line ends in a
# "--" comment, and any SQL appended on that same line would be commented out.
user_query = """
               -- select statement to filter Mayumi Schueller
               SELECT *
               FROM "employees"
               /*
               filter on first name AND last name to limit the amount of data returned
               and focus the filtering on a single person
               */
               WHERE first_name = 'Mayumi' AND last_name = 'Schueller' -- filter here on Mayumi Schueller
             """

# The block comment above, rewritten as SQL single-line comments:
# -- filter on first name AND last name to limit the amount of data returned
# -- and focus the filtering on a single person

# Execute the query against every database listed in db_configs
df = run_query_with_dynamic_columns(db_configs, user_query)

# Last expression of the cell, so the notebook renders the combined DataFrame
df

Connected to database Employees successfully.
Connection to database Employees closed.


Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date,database
0,10054,1957-04-04,Mayumi,Schueller,M,1995-03-13,Employees


In [16]:
# All employees whose gender is 'F'.
# The result spans many pages and each page is fetched by a separate
# LIMIT/OFFSET query; without an ORDER BY the database may return rows in a
# different order per page, which can duplicate or drop rows. Ordering by
# emp_no keeps the pages consistent.
user_query = """
               SELECT *
               FROM "employees"
               WHERE "gender" = 'F'
               ORDER BY emp_no ASC
             """

# Run the function with the user query
df = run_query_with_dynamic_columns(db_configs, user_query)

# Display the combined DataFrame
df

Connected to database Employees successfully.
Connection to database Employees closed.


Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date,database
0,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21,Employees
1,10006,1953-04-20,Anneke,Preusig,F,1989-06-02,Employees
2,10007,1957-05-23,Tzvetan,Zielinski,F,1989-02-10,Employees
3,10009,1952-04-19,Sumant,Peac,F,1985-02-18,Employees
4,10010,1963-06-01,Duangkaew,Piveteau,F,1989-08-24,Employees
...,...,...,...,...,...,...,...
120046,499988,1962-09-28,Bangqing,Kleiser,F,1986-06-06,Employees
120047,499991,1962-02-26,Pohua,Sichman,F,1989-01-12,Employees
120048,499992,1960-10-12,Siamak,Salverda,F,1987-05-10,Employees
120049,499994,1952-02-26,Navin,Argence,F,1990-04-24,Employees
