In [2]:
import pandas as pd
import os
import io
import time 
import numpy as np
import timeit
from jinja2 import Template
from influxdb import InfluxDBClient,DataFrameClient
import sys 
sys.path.append(".")
from insert_data import *
import config 

In [4]:
# Start from an empty database: drop config.DB_NAME if it exists, then create
# it again. WARNING: this discards everything currently stored in that database.
client = InfluxDBClient(host=config.DB_HOST, port=config.DB_PORT)
try:
    client.drop_database(config.DB_NAME)
    client.create_database(config.DB_NAME)
finally:
    # Close the client even when one of the statements above fails.
    client.close()

In [6]:
## Load Demographics Data
# Read the CSV and build the points before opening a connection, so a missing
# file or bad column name cannot leave an open client behind.
demo_df = pd.read_csv(os.path.join(config.TRANSFORM_DATA_PATH, 'Demographics.csv'))

# One point per CSV row in the "demographics" measurement:
# ID and Gender are written as tags, HbA1c as the only field.
points = [
    {
        "measurement": "demographics",
        "tags": {
            "ID": row["ID"],
            "Gender": row["Gender"],
        },
        "fields": {
            "HbA1c": row["HbA1c"],
        },
    }
    for _, row in demo_df.iterrows()
]

client = InfluxDBClient(host=config.DB_HOST, port=config.DB_PORT, database=config.DB_NAME)
try:
    client.write_points(points)
finally:
    # Close the client whether or not the write succeeded.
    client.close()

In [8]:
# Scale factor from config. Later cells use it as the number of participant
# folders to load and as the highest participant id placed in the queries.
scale_factor = config.SCALE_FACTOR

In [10]:
# Echo the scale factor so the value in use is recorded in the cell output.
print("scale_factor:", scale_factor)

scale_factor: 5


In [12]:
def integer_to_places_string(number):
    """Return ``number`` as a three-character, zero-padded decimal string.

    Args:
        number: An ``int`` in the inclusive range 0..999.

    Returns:
        The hundreds, tens and ones digits concatenated, e.g. ``7 -> "007"``.

    Raises:
        ValueError: If ``number`` is not an ``int`` or is outside 0..999.
    """
    is_valid = isinstance(number, int) and 0 <= number <= 999
    if not is_valid:
        raise ValueError("Input must be an integer between 0 and 999.")

    # Width-3 zero padding yields exactly the hundreds/tens/ones digits.
    return format(number, "03d")
# Zero-padded folder names for participants 1..scale_factor ("001", "002", ...).
folder_to_use = list(map(integer_to_places_string, range(1, scale_factor + 1)))

# File prefixes loaded from every participant folder, in this order.
accepted_files = [
    'ACC',
    'BVP',
    'Dexcom',
    'EDA',
    'HR',
    'IBI',
    'TEMP',
]

## Loading Data

In [15]:
# Dispatch table: file prefix -> loader made available by `from insert_data import *`.
# Each loader is called with the CSV path and is unpacked as
# (wall_time, insertion_time). An unknown prefix now raises KeyError instead
# of silently re-recording the previous file's timings.
insert_functions = {
    'ACC': insert_acc_data,
    'BVP': insert_bvp_data,
    'Dexcom': insert_dexcom_data,
    'EDA': insert_eda_data,
    'HR': insert_hr_data,
    'IBI': insert_ibi_data,
    'TEMP': insert_temp_data,
}

list_of_metrics = []
for i in range(scale_factor):
    folder_name = folder_to_use[i]
    folder_path = os.path.join(config.TRANSFORM_DATA_PATH, folder_name)

    for file in accepted_files:
        # Files are laid out as <folder>/<PREFIX>_<folder>.csv
        file_path = os.path.join(folder_path, f'{file}_{folder_name}.csv')

        wall_time, insertion_time = insert_functions[file](file_path)

        # Timings are stored under *_ms columns; the unit is whatever the
        # loaders return (presumably milliseconds - confirm in insert_data).
        metrics = {
            # basename honours the separator os.path.join used, unlike split("/").
            'file_name': os.path.basename(file_path),
            'insertion_time_ms': insertion_time,
            'wall_time_ms': wall_time,
        }
        print("file path:",file_path)
        print("insertion_time:", insertion_time)
        print("wall_time:", wall_time)
        list_of_metrics.append(metrics)


# Per-file report plus a final 'Total' row holding the column sums, saved as CSV.
report_df = pd.DataFrame(list_of_metrics)
total_df = pd.DataFrame(report_df.select_dtypes(include=['float','int']).sum()).T
total_df.insert(0, 'file_name', ['Total'])
report_df = pd.concat([report_df, total_df], axis=0).reset_index(drop=True)
report_df.to_csv(
    os.path.join(config.RESULTS_PATH, f"influx_insertion_stats_scale_{scale_factor}.csv"),
    index=False,
)

file path: ../../new_data/001/ACC_001.csv
insertion_time: 92258.52654199999
wall_time: 92259.0069770813
file path: ../../new_data/001/BVP_001.csv
insertion_time: 151599.895041
wall_time: 151600.56900978088
file path: ../../new_data/001/Dexcom_001.csv
insertion_time: 46.412999999972726
wall_time: 46.48876190185547
file path: ../../new_data/001/EDA_001.csv
insertion_time: 9888.851541000007
wall_time: 9889.007806777954
file path: ../../new_data/001/HR_001.csv
insertion_time: 2472.3457910000093
wall_time: 2472.4509716033936
file path: ../../new_data/001/IBI_001.csv
insertion_time: 1022.328707999975
wall_time: 1022.4459171295166
file path: ../../new_data/001/TEMP_001.csv
insertion_time: 9718.752457999983
wall_time: 9718.908071517944
file path: ../../new_data/002/ACC_002.csv
insertion_time: 93743.11641699995
wall_time: 93743.58010292053
file path: ../../new_data/002/BVP_002.csv
insertion_time: 156899.77179199996
wall_time: 156900.47788619995
file path: ../../new_data/002/Dexcom_002.csv
inser

In [16]:
# Display the per-file insertion timings; the last row is the 'Total' sum.
report_df

Unnamed: 0,file_name,insertion_time_ms,wall_time_ms
0,ACC_001.csv,92258.53,92259.01
1,BVP_001.csv,151599.9,151600.6
2,Dexcom_001.csv,46.413,46.48876
3,EDA_001.csv,9888.852,9889.008
4,HR_001.csv,2472.346,2472.451
5,IBI_001.csv,1022.329,1022.446
6,TEMP_001.csv,9718.752,9718.908
7,ACC_002.csv,93743.12,93743.58
8,BVP_002.csv,156899.8,156900.5
9,Dexcom_002.csv,34.05625,34.14226


In [17]:
# File prefix -> name of the InfluxDB measurement whose rows are counted below.
# NOTE(review): presumably these are the measurements the insert_* loaders
# write to - confirm against insert_data.
table_names = dict(
    ACC='accelerometer_data',
    BVP='blood_volume_pulse',
    Dexcom='interstitial_glucose',
    EDA='electrodermal_activity',
    HR='heart_rate_data',
    IBI='ibi_data',
    TEMP='temperature_data',
)


In [18]:
def get_rows_inserted(table_name):
    """Return the count InfluxDB reports for one measurement.

    Runs ``SELECT count(*) FROM "<table_name>"`` against ``config.DB_NAME``.

    Args:
        table_name: Measurement name. It is interpolated into the query text
            inside double quotes, so it must come from trusted code.

    Returns:
        The second value of the first result row (the entry following the
        row's first column), or ``None`` when the query returns no rows.
    """
    client = InfluxDBClient(host=config.DB_HOST, port=config.DB_PORT, database=config.DB_NAME)
    try:
        result = client.query(f'SELECT count(*) FROM "{table_name}"')
    finally:
        # Always close the client, including when the query raises.
        client.close()

    first_row = next(iter(result.get_points()), None)
    if first_row is None:
        # No rows came back (e.g. nothing was written); make that explicit.
        return None

    # NOTE(review): position 0 is presumably `time` and count(*) presumably
    # yields one count per field, so only the first field's count is
    # reported - confirm this is the intended row count.
    return list(first_row.values())[1]


In [19]:
# Count the rows of every measurement listed in table_names.
row_info = {
    measurement: get_rows_inserted(measurement)
    for measurement in table_names.values()
}

# Two-column table (measurement, count), saved next to the other results.
row_df = pd.DataFrame(
    list(row_info.items()),
    columns=['table_name', 'number_of_rows_inserted'],
)
row_counts_csv = os.path.join(
    config.RESULTS_PATH,
    f"influx_insertion_stats_num_rows_scale_{scale_factor}.csv",
)
row_df.to_csv(row_counts_csv, index=False)

In [20]:
# Display the number of rows counted per measurement.
row_df

Unnamed: 0,table_name,number_of_rows_inserted
0,accelerometer_data,91510446
1,blood_volume_pulse,183020849
2,interstitial_glucose,11702
3,electrodermal_activity,11438736
4,heart_rate_data,2859141
5,ibi_data,1382424
6,temperature_data,11438656


In [21]:
def get_table_sizes():
    """Return the summed ``diskBytes`` reported by ``SHOW STATS``, in MiB.

    Every point in the ``SHOW STATS`` result that has a ``diskBytes`` entry
    contributes to the sum; points without one are ignored.

    Returns:
        The byte total divided by 1024 * 1024, rounded to 4 decimal places.
    """
    client = InfluxDBClient(host=config.DB_HOST, port=config.DB_PORT, database=config.DB_NAME)
    try:
        result = client.query("SHOW STATS")
    finally:
        # Always close the client, including when the query raises.
        client.close()

    # NOTE(review): the result is not filtered by database or by statistic
    # group here - confirm that the sum neither includes other databases nor
    # counts the same bytes under more than one group.
    bytes_sum = sum(
        point['diskBytes']
        for point in result.get_points()
        if 'diskBytes' in point
    )

    return round(bytes_sum / (1024 * 1024), 4)

In [22]:
# Total disk usage as computed by get_table_sizes() (bytes / 1024 / 1024).
val = get_table_sizes()

In [23]:
# Save the disk-usage figure as a one-row, one-column CSV.
size_df = pd.DataFrame({'Total Disk Usage(MB)': [val]})
size_csv = os.path.join(
    config.RESULTS_PATH,
    f"influx_compression_stats_size_scale_{scale_factor}.csv",
)
size_df.to_csv(size_csv, index=False)

## Run the Influx Queries

In [25]:
# Directory holding the query_<n>.txt files.
queries_dir = config.QUERIES_PATH

# OR-joined participant filter for ids 1..scale_factor, e.g.
#   "participant_id"='1' OR "participant_id"='2'
# NOTE(review): ids are rendered unpadded ('1', not '001') - confirm this
# matches the participant_id tag values written by insert_data.
list_of_participants = " OR ".join(
    f"\"participant_id\"='{participant}'"
    for participant in range(1, scale_factor + 1)
)

In [26]:
def load_query(query_file_path, list_of_participants):
    """Read a query file and fill in its participant placeholder, if any.

    Args:
        query_file_path: Path of the text file containing the query.
        list_of_participants: Text substituted for the
            ``{list_of_participants}`` placeholder.

    Returns:
        The file contents with the placeholder substituted, or the contents
        unchanged when the file has no such placeholder.
    """
    with open(query_file_path, 'r') as file:
        raw_query = file.read()

    if "{list_of_participants}" not in raw_query:
        return raw_query

    try:
        return raw_query.format(list_of_participants=list_of_participants)
    except (KeyError, IndexError, ValueError):
        # The query has other braces (e.g. a regex quantifier such as {2})
        # that str.format cannot parse. Substitute only the placeholder and
        # leave the rest of the text exactly as written.
        return raw_query.replace("{list_of_participants}", str(list_of_participants))

In [27]:
def run_query(query_file_path, list_of_participants):
    """Run the query stored in a file once and time the ``client.query`` call.

    Args:
        query_file_path: Path of the query file, loaded via ``load_query``.
        list_of_participants: Participant filter text passed to ``load_query``.

    Returns:
        Elapsed time of the ``client.query`` call in milliseconds
        (``timeit.default_timer`` difference * 1000), or ``None`` when any
        step raises; the error is printed rather than propagated.
    """
    # Bind the name before the try block so the finally clause can tell
    # whether a client was actually created.
    client = None
    try:
        # Establish database connection
        client = InfluxDBClient(host=config.DB_HOST, port=config.DB_PORT, database=config.DB_NAME)

        query = load_query(query_file_path, list_of_participants)

        # Only the query call is timed; loading the file is excluded.
        execution_start = timeit.default_timer()
        client.query(query)
        execution_end = timeit.default_timer()
        execution_time_taken = (execution_end - execution_start) * 1000

        print("Time of Execution:",execution_time_taken)
        return execution_time_taken

    except Exception as e:
        # Best effort: report the failure and let the caller carry on.
        print(f"Error: {e}")
        return None

    finally:
        # Single close for both the success and the failure path.
        if client is not None:
            client.close()

In [28]:
# Number of query files to run; the loop below reads query_0.txt .. query_8.txt.
number_of_queries = 9 
# How many times the whole set of queries is repeated.
number_of_times_to_run = config.NUMBER_TIMES_TO_RUN_QUERY

In [29]:
# query number -> list of execution times, one entry per run
execution_summary = {}

for j in range(number_of_times_to_run):
    print("RUN: ",j)
    for i in range(number_of_queries):
        print("query_number: ",i)

        query_path = os.path.join(queries_dir, f"query_{i}.txt")
        execution_time = run_query(query_path, list_of_participants)

        execution_summary.setdefault(i, []).append(execution_time)

# One row per query holding the list of its execution times.
query_df = pd.DataFrame({
    'query_number': execution_summary.keys(),
    'execution_times': execution_summary.values()
})

# Extra 'total' row: for each run, the sum of all query times in that run.
runs_df = pd.DataFrame(execution_summary).T
total_run_time = runs_df.sum(axis=0).tolist()
total_row = pd.DataFrame({
    'query_number': ['total'],
    'execution_times': [total_run_time],
})
query_df = pd.concat([query_df, total_row]).reset_index(drop=True)

# Summary statistics over each row's list of times, in output-column order.
summary_stats = (
    ('min_time', min),
    ('median_time', np.median),
    ('mean_time', np.mean),
    ('std_dev', np.std),
    ('max_time', max),
)
for column, reducer in summary_stats:
    query_df[column] = query_df['execution_times'].apply(reducer)

query_stats_csv = os.path.join(
    config.RESULTS_PATH,
    f"stats_influx_query_run_time_scale_{scale_factor}.csv",
)
query_df.to_csv(query_stats_csv, index=False)

RUN:  0
query_number:  0
Time of Execution: 77.2609589998865
query_number:  1
Time of Execution: 3.283832999841252
query_number:  2
Time of Execution: 4.035250000015367
query_number:  3
Time of Execution: 66.19899999986956
query_number:  4
Time of Execution: 53047.50491699997
query_number:  5
Time of Execution: 1824.5127079999293
query_number:  6
Time of Execution: 101.82470800009469
query_number:  7
Time of Execution: 4.966874999809079
query_number:  8
Time of Execution: 33.7790829998994
RUN:  1
query_number:  0
Time of Execution: 76.14808300013465
query_number:  1
Time of Execution: 5.8059580001099675
query_number:  2
Time of Execution: 7.88212499992369
query_number:  3
Time of Execution: 62.97958300001483
query_number:  4
Time of Execution: 53025.66408300004
query_number:  5
Time of Execution: 1837.4935419999474
query_number:  6
Time of Execution: 101.77429200007282
query_number:  7
Time of Execution: 5.0086250000731525
query_number:  8
Time of Execution: 39.2536670001391
RUN:  2
qu