## Importing Libraries

In [1]:
import pandas as pd
from sqlalchemy import create_engine,text
import psycopg2
import os
import io
import time 
from psycopg2 import sql
import timeit

In [2]:
import sys 
sys.path.append(".")

In [3]:
import config 

In [4]:
def create_postgres_engine(user, password, host, port, db_name):
    """Create a SQLAlchemy engine for PostgreSQL."""
    connection_string = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db_name}"
    engine = create_engine(connection_string)
    return engine

In [5]:
engine = create_postgres_engine(config.DB_USER,config.DB_PASSWORD,config.DB_HOST,config.DB_PORT,config.DB_NAME)

In [6]:
def run_sql_file(filename):
    # Open and read the file
    with open(filename, 'r') as file:
        sql_script = file.read()
    
    # Begin a connection
    with engine.connect() as connection:
        # Start a transaction
        with connection.begin():
            # Split script into individual statements
            statements = sql_script.split(';')
            
            # Execute each statement
            for statement in statements:
                # Remove whitespace
                clean_statement = statement.strip()
                
                # Skip empty statements
                if clean_statement:
                    try:
                        # Execute each statement
                        connection.execute(text(clean_statement))
                    except Exception as e:
                        print(f"Error executing statement: {clean_statement}")
                        print(f"Error details: {e}")
                        raise
        
        print(f"SQL file {filename} executed successfully!")


## Load Data Stats

In [7]:
def load_table_into_db(file_path, table_name, conn_params):

    metrics = {
        'file_name': file_path.split("/")[-1],
        'insertion_time_ms': 0,
        'wall_time_ms': 0,
        'rows_inserted': 0,
        'size_in_mb': 0
    }

    try:
        # Establish connection
        conn = psycopg2.connect(**conn_params)
        conn.set_session(autocommit=False)
        
        try:
            wall_start_time = time.time()

            with conn.cursor() as cur:
                # Start timing
               
                # Open the CSV file and copy
                with open(file_path, 'r') as f:
                    insertion_start = timeit.default_timer()
                    cur.copy_expert(
                        sql.SQL('COPY {} FROM STDIN WITH (FORMAT CSV, HEADER TRUE)').format(
                            sql.Identifier(table_name)
                        ), 
                        f
                    )
                    insertion_end = timeit.default_timer()
                    metrics['insertion_time_ms'] = (insertion_end - insertion_start) * 1000
                   
                # Commit the transaction
                conn.commit()
                
                # Calculate wall time
                wall_end_time = time.time()
                metrics['wall_time_ms'] = (wall_end_time - wall_start_time) * 1000

                cur.execute(sql.SQL('SELECT COUNT(*) FROM {}').format(sql.Identifier(table_name)))
                metrics['rows_inserted'] = cur.fetchone()[0]
                
                if table_name !='demographics':
                    cur.execute(sql.SQL("""
                    SELECT 
                        pg_size_pretty(hypertable_size('{}')) as total_size,
                        hypertable_size('{}') as total_size_bytes
                    """).format(
                        sql.Identifier(table_name),
                        sql.Identifier(table_name)
                    ))
                
                    metrics['size_in_mb'] = round(cur.fetchone()[1] / (1024 * 1024), 4)
                
                # Print metrics
                print(f"Import Metrics for {file_path}:")
                print(f"Insertion Time: {metrics['insertion_time_ms']:.2f} ms")
                print(f"Wall Time: {metrics['wall_time_ms']:.2f} ms")
                print(f"Rows Inserted: {metrics['rows_inserted']}")
                print(f"Total Size in MB:{metrics['size_in_mb']}")
        
        
        except Exception as inner_e:
            # Rollback if any error occurs
            conn.rollback()
            print(f"Error importing {file_path}: {inner_e}")
        
        finally:
            # Ensure connection is closed
            conn.close()
            return metrics 
    
    except psycopg2.Error as conn_e:
        print(f"Database connection error: {conn_e}")
        return metrics 


In [8]:
conn_params = {
    'dbname': config.DB_NAME,
    'user': config.DB_USER,
    'password': config.DB_PASSWORD,
    'host': config.DB_HOST,
    'port': config.DB_PORT
}


In [9]:
table_names = {
    'ACC':'accelerometer_data',
    'BVP':'blood_volume_pulse',
    'Dexcom':'interstitial_glucose',
    'EDA':'electrodermal_activity',
    'HR':'heart_rate_data',
    'IBI':'ibi_data',
    'TEMP':'temperature_data'
}

In [10]:
## mention scale factor
scale_factor = 1

In [11]:
def integer_to_places_string(number):
    
    # Ensure the input is a valid integer within range
    if not isinstance(number, int) or not (0 <= number <= 999):
        raise ValueError("Input must be an integer between 0 and 999.")

    # Extract hundreds, tens, and ones
    hundreds = number // 100
    tens = (number // 10) % 10
    ones = number % 10

    # Format into the desired string
    result = f"{hundreds}{tens}{ones}"
    return result

In [12]:
folder_to_use = [integer_to_places_string(i) for i in range(1,scale_factor+1)]

In [13]:
accepted_files = ['ACC','BVP','Dexcom','EDA','HR','IBI','TEMP']  ## if want to ignore a table remove it from the list

In [14]:
## Create Schema
run_sql_file(os.path.join(config.SQL_SCRIPTS_PATH,'create_schema.sql'))

## Load Demographics Data not to be included in data insertion timings - one time load
demographic_path = os.path.join(config.TRANSFORM_DATA_PATH,'Demographics.csv') 
load_table_into_db(demographic_path,'demographics',conn_params)

SQL file sql_scripts/create_schema.sql executed successfully!
Import Metrics for ../new_data/Demographics.csv:
Insertion Time: 5.42 ms
Wall Time: 6.67 ms
Rows Inserted: 16
Total Size in MB:0


{'file_name': 'Demographics.csv',
 'insertion_time_ms': 5.415791005361825,
 'wall_time_ms': 6.6680908203125,
 'rows_inserted': 16,
 'size_in_mb': 0}

In [15]:
list_of_metrics = []
for i in range(0,scale_factor):
    folder_path = os.path.join(config.TRANSFORM_DATA_PATH,folder_to_use[i])
    
    for file in accepted_files:
         
        file_path = os.path.join(folder_path,f'{file}_{folder_to_use[i]}.csv')
        metrics = load_table_into_db(file_path,table_names[file],conn_params)

        list_of_metrics.append(metrics)


report_df = pd.DataFrame(list_of_metrics)
total_df =pd.DataFrame(report_df.select_dtypes(include=['float','int']).sum()).T 
total_df.insert(0,'file_name',['Total'])
report_df = pd.concat([report_df,total_df],axis=0).reset_index(drop=True)
report_df.to_csv(os.path.join(config.RESULTS_PATH,f"insertion_stats_scale_{scale_factor}.csv"),index=False)

Import Metrics for ../new_data/001/ACC_001.csv:
Insertion Time: 97549.35 ms
Wall Time: 97552.81 ms
Rows Inserted: 20296428
Total Size in MB:1947.6719
Import Metrics for ../new_data/001/BVP_001.csv:
Insertion Time: 184302.78 ms
Wall Time: 184378.12 ms
Rows Inserted: 40592838
Total Size in MB:3583.0078
Import Metrics for ../new_data/001/Dexcom_001.csv:
Insertion Time: 38.86 ms
Wall Time: 41.70 ms
Rows Inserted: 2561
Total Size in MB:0.4297
Import Metrics for ../new_data/001/EDA_001.csv:
Insertion Time: 12812.02 ms
Wall Time: 12813.79 ms
Rows Inserted: 2537046
Total Size in MB:224.8281
Import Metrics for ../new_data/001/HR_001.csv:
Insertion Time: 2382.18 ms
Wall Time: 2383.39 ms
Rows Inserted: 634188
Total Size in MB:39.6875
Import Metrics for ../new_data/001/IBI_001.csv:
Insertion Time: 1344.49 ms
Wall Time: 1346.23 ms
Rows Inserted: 266366
Total Size in MB:23.7578
Import Metrics for ../new_data/001/TEMP_001.csv:
Insertion Time: 12373.77 ms
Wall Time: 12376.09 ms
Rows Inserted: 2537040


In [16]:
report_df

Unnamed: 0,file_name,insertion_time_ms,wall_time_ms,rows_inserted,size_in_mb
0,ACC_001.csv,97549.3545,97552.810907,20296428.0,1947.6719
1,BVP_001.csv,184302.778583,184378.121853,40592838.0,3583.0078
2,Dexcom_001.csv,38.860542,41.699886,2561.0,0.4297
3,EDA_001.csv,12812.021291,12813.791275,2537046.0,224.8281
4,HR_001.csv,2382.179542,2383.385897,634188.0,39.6875
5,IBI_001.csv,1344.49225,1346.230745,266366.0,23.7578
6,TEMP_001.csv,12373.7695,12376.087189,2537040.0,224.25
7,Total,310803.456208,310892.127752,66866467.0,6043.6328
