## Importing Libraries

In [51]:
import pandas as pd
from sqlalchemy import create_engine,text
import psycopg2
import os
import io
import time 
from psycopg2 import sql
import timeit
from jinja2 import Template

In [2]:
import sys 
sys.path.append(".")

In [3]:
import config 

In [4]:
def create_postgres_engine(user, password, host, port, db_name):
    """Create a SQLAlchemy engine for PostgreSQL."""
    connection_string = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db_name}"
    engine = create_engine(connection_string)
    return engine

In [5]:
engine = create_postgres_engine(config.DB_USER,config.DB_PASSWORD,config.DB_HOST,config.DB_PORT,config.DB_NAME)

In [6]:
def run_sql_file(filename):
    # Open and read the file
    with open(filename, 'r') as file:
        sql_script = file.read()
    
    # Begin a connection
    with engine.connect() as connection:
        # Start a transaction
        with connection.begin():
            # Split script into individual statements
            statements = sql_script.split(';')
            
            # Execute each statement
            for statement in statements:
                # Remove whitespace
                clean_statement = statement.strip()
                
                # Skip empty statements
                if clean_statement:
                    try:
                        # Execute each statement
                        connection.execute(text(clean_statement))
                    except Exception as e:
                        print(f"Error executing statement: {clean_statement}")
                        print(f"Error details: {e}")
                        raise
        
        print(f"SQL file {filename} executed successfully!")


## Load Data Stats

In [7]:
def load_table_into_db(file_path, table_name, conn_params):

    metrics = {
        'file_name': file_path.split("/")[-1],
        'insertion_time_ms': 0,
        'wall_time_ms': 0,
        'rows_inserted': 0
    }

    try:
        # Establish connection
        conn = psycopg2.connect(**conn_params)
        conn.set_session(autocommit=False)
        
        try:
            wall_start_time = time.time()

            with conn.cursor() as cur:
                # Start timing
               
                # Open the CSV file and copy
                with open(file_path, 'r') as f:
                    insertion_start = timeit.default_timer()
                    cur.copy_expert(
                        sql.SQL('COPY {} FROM STDIN WITH (FORMAT CSV, HEADER TRUE)').format(
                            sql.Identifier(table_name)
                        ), 
                        f
                    )
                    insertion_end = timeit.default_timer()
                    metrics['insertion_time_ms'] = (insertion_end - insertion_start) * 1000
                   
                # Commit the transaction
                conn.commit()
                
                # Calculate wall time
                wall_end_time = time.time()
                metrics['wall_time_ms'] = (wall_end_time - wall_start_time) * 1000

                cur.execute(sql.SQL('SELECT COUNT(*) FROM {}').format(sql.Identifier(table_name)))
                metrics['rows_inserted'] = cur.fetchone()[0]
                        
                    
                # Print metrics
                print(f"Import Metrics for {file_path}:")
                print(f"Insertion Time: {metrics['insertion_time_ms']:.2f} ms")
                print(f"Wall Time: {metrics['wall_time_ms']:.2f} ms")
                print(f"Rows Inserted: {metrics['rows_inserted']}")

        
        
        except Exception as inner_e:
            # Rollback if any error occurs
            conn.rollback()
            print(f"Error importing {file_path}: {inner_e}")
        
        finally:
            # Ensure connection is closed
            conn.close()
            return metrics 
    
    except psycopg2.Error as conn_e:
        print(f"Database connection error: {conn_e}")
        return metrics 


In [8]:
conn_params = {
    'dbname': config.DB_NAME,
    'user': config.DB_USER,
    'password': config.DB_PASSWORD,
    'host': config.DB_HOST,
    'port': config.DB_PORT
}


In [9]:
table_names = {
    'ACC':'accelerometer_data',
    'BVP':'blood_volume_pulse',
    'Dexcom':'interstitial_glucose',
    'EDA':'electrodermal_activity',
    'HR':'heart_rate_data',
    'IBI':'ibi_data',
    'TEMP':'temperature_data'
}

In [10]:
## mention scale factor
scale_factor = 2

In [11]:
def integer_to_places_string(number):
    
    # Ensure the input is a valid integer within range
    if not isinstance(number, int) or not (0 <= number <= 999):
        raise ValueError("Input must be an integer between 0 and 999.")

    # Extract hundreds, tens, and ones
    hundreds = number // 100
    tens = (number // 10) % 10
    ones = number % 10

    # Format into the desired string
    result = f"{hundreds}{tens}{ones}"
    return result

In [12]:
folder_to_use = [integer_to_places_string(i) for i in range(1,scale_factor+1)]

In [13]:
accepted_files = ['ACC','BVP','Dexcom','EDA','HR','IBI','TEMP']  ## if want to ignore a table remove it from the list

In [14]:
## Create Schema
run_sql_file(os.path.join(config.SQL_SCRIPTS_PATH,'create_schema.sql'))

## Load Demographics Data not to be included in data insertion timings - one time load
demographic_path = os.path.join(config.TRANSFORM_DATA_PATH,'Demographics.csv') 
load_table_into_db(demographic_path,'demographics',conn_params)

SQL file sql_scripts/create_schema.sql executed successfully!
Import Metrics for ../new_data/Demographics.csv:
Insertion Time: 3.24 ms
Wall Time: 3.84 ms
Rows Inserted: 16


{'file_name': 'Demographics.csv',
 'insertion_time_ms': 3.2363749924115837,
 'wall_time_ms': 3.8442611694335938,
 'rows_inserted': 16}

In [16]:
list_of_metrics = []
for i in range(0,scale_factor):
    folder_path = os.path.join(config.TRANSFORM_DATA_PATH,folder_to_use[i])
    
    for file in accepted_files:
         
        file_path = os.path.join(folder_path,f'{file}_{folder_to_use[i]}.csv')
        metrics = load_table_into_db(file_path,table_names[file],conn_params)

        list_of_metrics.append(metrics)


report_df = pd.DataFrame(list_of_metrics)
total_df =pd.DataFrame(report_df.select_dtypes(include=['float','int']).sum()).T 
total_df.insert(0,'file_name',['Total'])
report_df = pd.concat([report_df,total_df],axis=0).reset_index(drop=True)
report_df.to_csv(os.path.join(config.RESULTS_PATH,f"insertion_stats_scale_{scale_factor}.csv"),index=False)

Import Metrics for ../new_data/001/ACC_001.csv:
Insertion Time: 97880.30 ms
Wall Time: 97887.20 ms
Rows Inserted: 20296428
Import Metrics for ../new_data/001/BVP_001.csv:
Insertion Time: 192493.17 ms
Wall Time: 192499.13 ms
Rows Inserted: 40592838
Import Metrics for ../new_data/001/Dexcom_001.csv:
Insertion Time: 54.38 ms
Wall Time: 56.81 ms
Rows Inserted: 2561
Import Metrics for ../new_data/001/EDA_001.csv:
Insertion Time: 12286.29 ms
Wall Time: 12288.02 ms
Rows Inserted: 2537046
Import Metrics for ../new_data/001/HR_001.csv:
Insertion Time: 2456.72 ms
Wall Time: 2458.53 ms
Rows Inserted: 634188
Import Metrics for ../new_data/001/IBI_001.csv:
Insertion Time: 1082.72 ms
Wall Time: 1083.74 ms
Rows Inserted: 266366
Import Metrics for ../new_data/001/TEMP_001.csv:
Insertion Time: 11940.48 ms
Wall Time: 11943.48 ms
Rows Inserted: 2537040
Import Metrics for ../new_data/002/ACC_002.csv:
Insertion Time: 100324.15 ms
Wall Time: 100329.31 ms
Rows Inserted: 40448658
Import Metrics for ../new_dat

In [17]:
## compress the data 
report_df

Unnamed: 0,file_name,insertion_time_ms,wall_time_ms,rows_inserted
0,ACC_001.csv,97880.302583,97887.200594,20296428.0
1,BVP_001.csv,192493.171584,192499.131203,40592838.0
2,Dexcom_001.csv,54.378125,56.807041,2561.0
3,EDA_001.csv,12286.285875,12288.01918,2537046.0
4,HR_001.csv,2456.716084,2458.53281,634188.0
5,IBI_001.csv,1082.72025,1083.739996,266366.0
6,TEMP_001.csv,11940.480792,11943.480015,2537040.0
7,ACC_002.csv,100324.153,100329.308987,40448658.0
8,BVP_002.csv,194929.632708,194938.419819,80897311.0
9,Dexcom_002.csv,73.518292,76.000214,4680.0


In [30]:
def get_hypertable_sizes(conn_params):
    query = """
    SELECT 
        hypertable_schema AS schema,
        hypertable_name AS table_name,
        pg_size_pretty(hypertable_size(hypertable_schema || '.' || hypertable_name)) AS total_size,
        hypertable_size(hypertable_schema || '.' || hypertable_name) AS total_size_bytes
    FROM 
        timescaledb_information.hypertables
    ORDER BY 
        total_size_bytes DESC;
    """
    
    conn = psycopg2.connect(**conn_params)
    df = pd.read_sql_query(query, conn)
    conn.close()
    
    return df

In [31]:
size_df = get_hypertable_sizes(conn_params)

  df = pd.read_sql_query(query, conn)


In [42]:
print("Total Size MB ",size_df.select_dtypes(include=['float','int']).sum()/(1024*1024))

Total Size MB  total_size_bytes    11665.640625
dtype: float64


In [41]:
size_df

Unnamed: 0,schema,table_name,total_size,total_size_bytes
0,public,blood_volume_pulse,6877 MB,7211220992
1,public,accelerometer_data,3753 MB,3934855168
2,public,electrodermal_activity,437 MB,458326016
3,public,temperature_data,436 MB,456728576
4,public,heart_rate_data,96 MB,101007360
5,public,ibi_data,66 MB,68878336
6,public,interstitial_glucose,1264 kB,1294336


In [54]:
def render_query(sql_file_path, params):
    # Read the SQL template
    with open(sql_file_path, 'r') as file:
        template_content = file.read()
    
    # Render the template with parameters
    template = Template(template_content)
    query = template.render(params)
    
    return query

def execute_sql_file(conn_params, sql_file_path, params=None):
   
    try:
        query = render_query(sql_file_path,params)        
        # Establish database connection
        conn = psycopg2.connect(**conn_params)

        with conn.cursor() as cur:
                # Start timing
                execution_start = timeit.default_timer()
                cur.execute(sql.SQL(query))
                execution_end = timeit.default_timer()
        # Close the connection

        execution_time_taken = (execution_end-execution_start)*1000

        print("Time of Execution:",execution_time_taken)
        conn.close()
        
        return execution_time_taken
    
    except (Exception, psycopg2.Error) as error:
        print(f"Error executing SQL file: {error}")
        return None
    
    finally:
         conn.close()

In [56]:
execute_sql_file(conn_params,os.path.join(config.SQL_SCRIPTS_PATH,"query_0.sql"),{'list_of_participants':(1,2)})

Time of Execution: 4712.430790998042


4712.430790998042