In [1]:
import pandas as pd
import psycopg
from contextlib import contextmanager


@contextmanager
def get_connection(config):
    """
    Context manager to handle PostgreSQL connection lifecycle.
    Ensures that the connection is closed after use.

    :param config: Dictionary containing database connection parameters.
    """
    conn = None
    try:
        conn = psycopg.connect(
            dbname=config["dbname"],
            user=config["user"],
            password=config["password"],
            host=config["host"],
            port=config["port"],
        )
        yield conn
        print(f"Connected to database {config['dbname']} successfully.")
    except psycopg.Error as e:
        print(f"Error connecting to database {config['dbname']}: {e}")
        raise
    finally:
        if conn:
            conn.close()
            print(f"Connection to database {config['dbname']} closed.")
        else:
            print(f"Failed to connect to database {config['dbname']}.")


def fetch_column_names_for_query(conn, query):
    """
    Fetch column names based on the query.

    :param conn: A psycopg connection object.
    :param query: SQL query to analyze.
    :return: A list of column names.
    """
    with conn.cursor() as cursor:
        try:
            cursor.execute(f"SELECT * FROM ({query}) AS subquery LIMIT 0;")
            return [desc[0] for desc in cursor.description]
        except Exception as e:
            print(f"Error executing query to fetch column names: {e}")
            raise


def fetch_data_in_chunks(conn, query, chunk_size=10000, params=None):
    """
    Fetch data in chunks to handle large datasets.

    :param conn: A psycopg connection object.
    :param query: SQL query to execute.
    :param chunk_size: Number of rows to fetch per chunk.
    :param params: Optional parameters for the SQL query.
    :return: A generator that yields chunks of data.
    """
    with conn.cursor() as cursor:
        cursor.execute(query, params)
        while True:
            data = cursor.fetchmany(chunk_size)
            if not data:
                break
            yield data


def run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=None
):
    """
    Execute a user query across multiple databases and return the results as a pandas DataFrame.

    :param db_configs: List of dictionaries containing database connection parameters.
    :param user_query: SQL query provided by the user.
    :param chunk_size: Number of rows to fetch per chunk.
    :param params: Optional parameters for the SQL query.
    :return: A pandas DataFrame with the combined results from all databases.
             The DataFrame includes a column for the database name.
    """
    dfs = []

    for config in db_configs:
        try:
            with get_connection(config) as conn:
                # Fetch the correct column names based on the user query
                column_names = fetch_column_names_for_query(conn, user_query)

                # Fetch data in chunks and append to DataFrames list
                for chunk in fetch_data_in_chunks(conn, user_query, chunk_size, params):
                    if chunk:
                        # Validate that the number of columns matches the expected number
                        if len(chunk[0]) != len(column_names):
                            print(
                                f"Warning: Number of columns in result from {config['dbname']} does not match the provided column names."
                            )
                            continue

                        for row in chunk:
                            if len(row) != len(column_names):
                                print(
                                    f"Warning: Row length mismatch in {config['dbname']}."
                                )
                                continue

                        # Create DataFrame and append to the list
                        df = pd.DataFrame(chunk, columns=column_names)
                        df['database'] = config['dbname']  # Add the database name as a column
                        dfs.append(df)
                    else:
                        print(f"No data returned from {config['dbname']}.")
        except Exception as e:
            print(f"Error processing database {config['dbname']}: {e}")

    # Concatenate all DataFrames into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

    # Log message if no data was retrieved
    if combined_df.empty:
        print("No data was retrieved from the databases.")

    return combined_df


# Example usage
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = "SELECT * FROM employees ORDER BY emp_no ASC"  # Adjust the query as needed
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)
print(result_df)


Connected to database Employees successfully.
Connection to database Employees closed.
        emp_no  birth_date first_name last_name gender   hire_date   database
0        10001  1953-09-02     Georgi   Facello      M  1986-06-26  Employees
1        10002  1964-06-02    Bezalel    Simmel      F  1985-11-21  Employees
2        10003  1959-12-03      Parto   Bamford      M  1986-08-28  Employees
3        10004  1954-05-01  Chirstian   Koblick      M  1986-12-01  Employees
4        10005  1955-01-21    Kyoichi  Maliniak      M  1989-09-12  Employees
...        ...         ...        ...       ...    ...         ...        ...
300019  499995  1958-09-24     Dekang  Lichtner      F  1993-01-12  Employees
300020  499996  1953-03-07       Zito      Baaz      M  1990-09-27  Employees
300021  499997  1961-08-03    Berhard    Lenart      M  1986-04-21  Employees
300022  499998  1956-09-05   Patricia   Breugel      M  1993-10-13  Employees
300023  499999  1958-05-01     Sachin   Tsukuda      M 

In [9]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = "SELECT * FROM employees"  # Adjust the query as needed
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
        emp_no  birth_date first_name last_name gender   hire_date   database
0        10001  1953-09-02     Georgi   Facello      M  1986-06-26  Employees
1        10002  1964-06-02    Bezalel    Simmel      F  1985-11-21  Employees
2        10003  1959-12-03      Parto   Bamford      M  1986-08-28  Employees
3        10004  1954-05-01  Chirstian   Koblick      M  1986-12-01  Employees
4        10005  1955-01-21    Kyoichi  Maliniak      M  1989-09-12  Employees
...        ...         ...        ...       ...    ...         ...        ...
300019  499995  1958-09-24     Dekang  Lichtner      F  1993-01-12  Employees
300020  499996  1953-03-07       Zito      Baaz      M  1990-09-27  Employees
300021  499997  1961-08-03    Berhard    Lenart      M  1986-04-21  Employees
300022  499998  1956-09-05   Patricia   Breugel      M  1993-10-13  Employees
300023  499999  1958-05-01     Sachin   Tsukuda      M 

In [14]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = """
SELECT * FROM "public"."departments"
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
  dept_no           dept_name   database
0    d001           Marketing  Employees
1    d002             Finance  Employees
2    d003     Human Resources  Employees
3    d004          Production  Employees
4    d005         Development  Employees
5    d006  Quality Management  Employees
6    d007               Sales  Employees
7    d008            Research  Employees
8    d009    Customer Service  Employees


In [15]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = """
SELECT * FROM "public"."salaries"
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
         emp_no  salary   from_date     to_date   database
0         10001   60117  1986-06-26  1987-06-26  Employees
1         10001   62102  1987-06-26  1988-06-25  Employees
2         10001   66074  1988-06-25  1989-06-25  Employees
3         10001   66596  1989-06-25  1990-06-25  Employees
4         10001   66961  1990-06-25  1991-06-25  Employees
...         ...     ...         ...         ...        ...
2844042  499999   63707  1997-11-30  1998-11-30  Employees
2844043  499999   67043  1998-11-30  1999-11-30  Employees
2844044  499999   70745  1999-11-30  2000-11-29  Employees
2844045  499999   74327  2000-11-29  2001-11-29  Employees
2844046  499999   77303  2001-11-29  9999-01-01  Employees

[2844047 rows x 5 columns]


In [46]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = """
SELECT * FROM "public"."customers" 
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Error executing query to fetch column names: relation "public.customers" does not exist
LINE 2: SELECT * FROM "public"."customers" 
                      ^
Error connecting to database Employees: relation "public.customers" does not exist
LINE 2: SELECT * FROM "public"."customers" 
                      ^
Connection to database Employees closed.
Error processing database Employees: relation "public.customers" does not exist
LINE 2: SELECT * FROM "public"."customers" 
                      ^
No data was retrieved from the databases.
Empty DataFrame
Columns: []
Index: []


In [20]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = """
SELECT * FROM "public"."salaries" WHERE emp_no=10001
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
   emp_no            title   from_date     to_date   database
0   10006  Senior Engineer  1990-08-05  9999-01-01  Employees


In [22]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = """
SELECT emp_no AS "Employee #", birth_date AS "Birthday", first_name AS "First name" FROM "public"."employees"
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
        Employee #    Birthday First name   database
0            10001  1953-09-02     Georgi  Employees
1            10002  1964-06-02    Bezalel  Employees
2            10003  1959-12-03      Parto  Employees
3            10004  1954-05-01  Chirstian  Employees
4            10005  1955-01-21    Kyoichi  Employees
...            ...         ...        ...        ...
300019      499995  1958-09-24     Dekang  Employees
300020      499996  1953-03-07       Zito  Employees
300021      499997  1961-08-03    Berhard  Employees
300022      499998  1956-09-05   Patricia  Employees
300023      499999  1958-05-01     Sachin  Employees

[300024 rows x 4 columns]


In [24]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = """
SELECT CONCAT(emp_no, ' is a ', title) AS "Employee Title" FROM "public"."titles"
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
                     Employee Title   database
0        10001 is a Senior Engineer  Employees
1                  10002 is a Staff  Employees
2        10003 is a Senior Engineer  Employees
3               10004 is a Engineer  Employees
4        10004 is a Senior Engineer  Employees
...                             ...        ...
443303         499997 is a Engineer  Employees
443304  499997 is a Senior Engineer  Employees
443305     499998 is a Senior Staff  Employees
443306            499998 is a Staff  Employees
443307         499999 is a Engineer  Employees

[443308 rows x 2 columns]


In [None]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = """
SELECT emp_no, CONCAT(first_name, ' ', last_name) AS "Full name" FROM "public"."employees"
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
        emp_no          Full name   database
0        10001     Georgi Facello  Employees
1        10002     Bezalel Simmel  Employees
2        10003      Parto Bamford  Employees
3        10004  Chirstian Koblick  Employees
4        10005   Kyoichi Maliniak  Employees
...        ...                ...        ...
300019  499995    Dekang Lichtner  Employees
300020  499996          Zito Baaz  Employees
300021  499997     Berhard Lenart  Employees
300022  499998   Patricia Breugel  Employees
300023  499999     Sachin Tsukuda  Employees

[300024 rows x 3 columns]


In [33]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = """
SELECT COUNT(emp_no) FROM "public"."employees"
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
    count   database
0  300024  Employees


In [35]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = """
SELECT SUM(salary) FROM "public"."salaries"
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
            sum   database
0  181480757419  Employees


In [37]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = """
SELECT * FROM "public"."employees" WHERE first_name='Mayumi' AND last_name='Schueller'
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
   emp_no  birth_date first_name  last_name gender   hire_date   database
0   10054  1957-04-04     Mayumi  Schueller      M  1995-03-13  Employees


In [39]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = """
SELECT * FROM "public"."employees" WHERE gender='F'
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
        emp_no  birth_date first_name  last_name gender   hire_date   database
0        10002  1964-06-02    Bezalel     Simmel      F  1985-11-21  Employees
1        10006  1953-04-20     Anneke    Preusig      F  1989-06-02  Employees
2        10007  1957-05-23    Tzvetan  Zielinski      F  1989-02-10  Employees
3        10009  1952-04-19     Sumant       Peac      F  1985-02-18  Employees
4        10010  1963-06-01  Duangkaew   Piveteau      F  1989-08-24  Employees
...        ...         ...        ...        ...    ...         ...        ...
120046  499988  1962-09-28   Bangqing    Kleiser      F  1986-06-06  Employees
120047  499991  1962-02-26      Pohua    Sichman      F  1989-01-12  Employees
120048  499992  1960-10-12     Siamak   Salverda      F  1987-05-10  Employees
120049  499994  1952-02-26      Navin    Argence      F  1990-04-24  Employees
120050  499995  1958-09-24     Dekang   Lich

In [41]:
db_configs = [
    {
        "dbname": "Employees",
        "user": "postgres",
        "password": "root",
        "host": "localhost",
        "port": "5432",
    }
]

# Sample query
user_query = """
SELECT first_name, last_name, hire_date FROM "public"."employees" 
WHERE first_name='Georgi' AND last_name='Facello' AND hire_date='1986-06-26'
OR first_name='Bezalel' AND last_name='Simmel'
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
  first_name last_name   hire_date   database
0     Georgi   Facello  1986-06-26  Employees
1    Bezalel    Simmel  1985-11-21  Employees


In [61]:
db_configs[0]["dbname"] = "Employees"

# Sample query
user_query = """
SELECT * FROM "public"."employees" 
WHERE first_name='Georgi' AND last_name='Facello' AND hire_date='1986-06-26'
OR first_name='Bezalel' AND last_name='Simmel'
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

print(result_df)

Connected to database Employees successfully.
Connection to database Employees closed.
   emp_no  birth_date first_name last_name gender   hire_date   database
0   10001  1953-09-02     Georgi   Facello      M  1986-06-26  Employees
1   10002  1964-06-02    Bezalel    Simmel      F  1985-11-21  Employees


In [59]:
db_configs[0]["dbname"] = "Store"

# Sample query
user_query = """
SELECT COUNT(firstname) FROM "public"."customers" 
WHERE gender='F' AND (state='OR' OR state='NY')
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

result_df

Connected to database Store successfully.
Connection to database Store closed.


Unnamed: 0,count,database
0,200,Store


In [66]:
db_configs[0]["dbname"] = "Store"

# Sample query
user_query = """
SELECT * FROM "public"."customers" 
WHERE NOT age='55' AND NOT age='20'
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

len(result_df)

Connected to database Store successfully.
Connection to database Store closed.


19419

In [71]:
db_configs[0]["dbname"] = "Store"

# Sample query
user_query = """
SELECT * FROM "public"."customers" 
WHERE age>'44' AND income='100000'
"""
params = None  # You can add parameters if needed

# Execute the function
result_df = run_query_with_dynamic_columns(
    db_configs, user_query, chunk_size=10000, params=params
)

result_df.head(5)

Connected to database Store successfully.
Connection to database Store closed.


Unnamed: 0,customerid,firstname,lastname,address1,address2,city,state,zip,country,region,...,phone,creditcardtype,creditcard,creditcardexpiration,username,password,age,income,gender,database
0,1,VKUUXF,ITHOMQJNYX,4608499546 Dell Way,,QSDPAGD,SD,24101,US,1,...,4608499546,1,1979279217775911,2012/03,user1,password,55,100000,M,Store
1,3,JTNRNB,LYYSHTQJRE,6297761196 Dell Way,,LWVIFXJ,OH,96082,US,1,...,6297761196,4,8728086929768325,2010/12,user3,password,47,100000,M,Store
2,6,FXDZBW,BAXPEEKXVJ,6192740010 Dell Way,,OPLRCNT,IN,99300,US,1,...,6192740010,5,7730283664073796,2011/01,user6,password,72,100000,M,Store
3,9,NCGWRC,CJOPRHUHIE,7291678624 Dell Way,,ZAVIELY,VT,78838,US,1,...,7291678624,1,7172072122339160,2009/10,user9,password,86,100000,M,Store
4,15,SIQANV,QQNKJSURDA,3354132892 Dell Way,,BREQSOA,AK,37471,US,1,...,3354132892,4,8717996907886119,2008/05,user15,password,66,100000,M,Store
