In [1]:
import pandas as pd
from sqlalchemy import create_engine,text
import psycopg2
import os
import io
import time 
import numpy as np
from psycopg2 import sql
import timeit
from jinja2 import Template
from influxdb import InfluxDBClient,DataFrameClient
import sys 
sys.path.append(".")
from insert_data import *
import config 

In [2]:
# Drop and recreate the database so the inserts below start from an empty one.
# NOTE(review): host/port/database are hard-coded here, while later cells read
# config.DB_HOST / config.DB_PORT / config.DB_NAME — confirm they agree.
client = InfluxDBClient(host='localhost', port=8086, database='benchmark_db')
try:
    client.drop_database('benchmark_db')
    client.create_database('benchmark_db')
finally:
    # Close the client even if the drop/create call fails.
    client.close()

In [3]:
## Load Demographics Data
# Read the CSV and build the payload first, so a failure here never leaves an
# open connection behind.
demo_df = pd.read_csv(os.path.join(config.TRANSFORM_DATA_PATH,'Demographics.csv'))

# One point per CSV row: ID and Gender are written as tags, HbA1c as a field.
points = [
    {
        "measurement": "demographics",  # Measurement name
        "tags": {
            "ID": data['ID'],
            "Gender": data["Gender"]  # Tag for categorical data
        },
        "fields": {
            "HbA1c": data["HbA1c"]
        }
    }
    for _, data in demo_df.iterrows()
]

client = InfluxDBClient(host='localhost', port=8086, database='benchmark_db')
try:
    client.write_points(points)
finally:
    # Close the client even if the write fails.
    client.close()

In [4]:
# Scale factor from config; it determines how many numbered data folders
# ("001", "002", ...) are loaded below.
scale_factor = config.SCALE_FACTOR
def integer_to_places_string(number):
    """Return ``number`` as a three-character, zero-padded decimal string.

    Args:
        number: Integer in the inclusive range 0..999.

    Returns:
        str: The hundreds, tens and ones digits, e.g. ``7`` -> ``"007"``.

    Raises:
        ValueError: If ``number`` is not an ``int`` or lies outside 0..999.
    """
    is_valid = isinstance(number, int) and 0 <= number <= 999
    if not is_valid:
        raise ValueError("Input must be an integer between 0 and 999.")

    # Width-3 zero padding yields the same hundreds/tens/ones digits.
    return f"{number:03d}"
# Zero-padded folder names for 1..scale_factor ("001", "002", ...).
folder_to_use = list(map(integer_to_places_string, range(1, scale_factor + 1)))

# File prefixes to load; to skip a table, remove its prefix from this list.
accepted_files = [
    'ACC',
    'BVP',
    'Dexcom',
    'EDA',
    'HR',
    'IBI',
    'TEMP',
]

## Loading Data

In [6]:
# Loader for each file prefix (names come from `insert_data`). Each loader is
# called with the CSV path and returns a (wall_time, insertion_time) pair.
# A prefix without a loader raises KeyError rather than silently recording the
# timings of the previously processed file.
insert_functions = {
    'ACC': insert_acc_data,
    'BVP': insert_bvp_data,
    'Dexcom': insert_dexcom_data,
    'EDA': insert_eda_data,
    'HR': insert_hr_data,
    'IBI': insert_ibi_data,
    'TEMP': insert_temp_data,
}

list_of_metrics = []
for folder_name in folder_to_use:
    folder_path = os.path.join(config.TRANSFORM_DATA_PATH, folder_name)

    for file in accepted_files:
        file_path = os.path.join(folder_path, f'{file}_{folder_name}.csv')
        wall_time, insertion_time = insert_functions[file](file_path)

        print("file path:",file_path)
        print("insertion_time:", insertion_time)
        print("wall_time:", wall_time)

        list_of_metrics.append({
            # basename() works with whichever separator os.path.join used.
            'file_name': os.path.basename(file_path),
            'insertion_time_ms': insertion_time,
            'wall_time_ms': wall_time,
        })


# Per-file timings, followed by a 'Total' row that sums the numeric columns.
report_df = pd.DataFrame(list_of_metrics)
total_df = pd.DataFrame(report_df.select_dtypes(include=['float','int']).sum()).T
total_df.insert(0,'file_name',['Total'])
report_df = pd.concat([report_df,total_df],axis=0).reset_index(drop=True)
report_df.to_csv(os.path.join(config.RESULTS_PATH,f"influx_insertion_stats_scale_{scale_factor}.csv"),index=False)

file path: ../new_data/001/ACC_001.csv
insertion_time: 118306.96975000319
wall_time: 118308.81261825562
file path: ../new_data/001/BVP_001.csv
insertion_time: 173824.6142499993
wall_time: 173827.62718200684
file path: ../new_data/001/Dexcom_001.csv
insertion_time: 50.0277079991065
wall_time: 50.170183181762695
file path: ../new_data/001/EDA_001.csv
insertion_time: 11191.996708999795
wall_time: 11192.29507446289


  df['time'] = pd.to_datetime(df['datetime'])


file path: ../new_data/001/HR_001.csv
insertion_time: 2660.2898749988526
wall_time: 2660.4936122894287
file path: ../new_data/001/IBI_001.csv
insertion_time: 1115.5374999943888
wall_time: 1115.724802017212
file path: ../new_data/001/TEMP_001.csv
insertion_time: 10571.018124996044
wall_time: 10571.304321289062
file path: ../new_data/002/ACC_002.csv
insertion_time: 118262.96000000002
wall_time: 118264.65892791748
file path: ../new_data/002/BVP_002.csv
insertion_time: 162155.96695800195
wall_time: 162158.34379196167
file path: ../new_data/002/Dexcom_002.csv
insertion_time: 71.7597909970209
wall_time: 71.90775871276855
file path: ../new_data/002/EDA_002.csv
insertion_time: 10984.083916999225
wall_time: 10984.405994415283
file path: ../new_data/002/HR_002.csv
insertion_time: 2566.576624994923
wall_time: 2566.789150238037
file path: ../new_data/002/IBI_002.csv
insertion_time: 2000.6879169959575
wall_time: 2000.8621215820312
file path: ../new_data/002/TEMP_002.csv
insertion_time: 10191.233124

In [7]:
# Display the per-file insertion timings together with the appended 'Total' row.
report_df

Unnamed: 0,file_name,insertion_time_ms,wall_time_ms
0,ACC_001.csv,118306.96975,118308.812618
1,BVP_001.csv,173824.61425,173827.627182
2,Dexcom_001.csv,50.027708,50.170183
3,EDA_001.csv,11191.996709,11192.295074
4,HR_001.csv,2660.289875,2660.493612
5,IBI_001.csv,1115.5375,1115.724802
6,TEMP_001.csv,10571.018125,10571.304321
7,ACC_002.csv,118262.96,118264.658928
8,BVP_002.csv,162155.966958,162158.343792
9,Dexcom_002.csv,71.759791,71.907759


In [11]:
# Name queried in InfluxDB for each source-file prefix.
table_names = dict(
    ACC='accelerometer_data',
    BVP='blood_volume_pulse',
    Dexcom='interstitial_glucose',
    EDA='electrodermal_activity',
    HR='heart_rate_data',
    IBI='ibi_data',
    TEMP='temperature_data',
)

In [41]:
def get_rows_inserted(table_name):
    """Return the row count InfluxDB reports for one measurement.

    Runs ``SELECT count(*)`` against ``table_name`` using the connection
    settings in ``config`` (``DB_HOST``, ``DB_PORT``, ``DB_NAME``).

    Args:
        table_name: Measurement name; it is interpolated into the query
            inside double quotes, so it should come from trusted code.

    Returns:
        The first non-``time`` value of the first result point, or ``0`` when
        the query yields no points.
    """
    client = InfluxDBClient(host=config.DB_HOST, port=config.DB_PORT, database=config.DB_NAME)
    try:
        result = client.query(f'SELECT count(*) FROM "{table_name}"')
    finally:
        # Close the client even if the query raises.
        client.close()

    for point in result.get_points():
        # Select the count by column name instead of relying on 'time'
        # always being the first key of the point.
        for column, value in point.items():
            if column != 'time':
                return value
    return 0


In [42]:
# Count the rows reported for every name in table_names and save them as a CSV.
row_info = {name: get_rows_inserted(name) for name in table_names.values()}

row_df = pd.DataFrame(
    list(row_info.items()),
    columns=['table_name','number_of_rows_inserted'],
)
row_df.to_csv(
    os.path.join(config.RESULTS_PATH, f"influx_insertion_stats_num_rows_scale_{scale_factor}.csv"),
    index=False,
)

In [43]:
# Display the row count collected for each name in table_names.
row_df

Unnamed: 0,table_name,number_of_rows_inserted
0,accelerometer_data,40448658
1,blood_volume_pulse,80897311
2,interstitial_glucose,4680
3,electrodermal_activity,5056068
4,heart_rate_data,640250
5,ibi_data,740374
6,temperature_data,5056032


In [103]:
def get_table_sizes():
    """Return the sum of every ``diskBytes`` value in ``SHOW STATS``, in MiB.

    Connects with the settings in ``config`` (``DB_HOST``, ``DB_PORT``,
    ``DB_NAME``), runs ``SHOW STATS`` and adds up the ``diskBytes`` entry of
    every returned point that has one.

    NOTE(review): ``SHOW STATS`` may report ``diskBytes`` from more than one
    statistics group and for more than ``config.DB_NAME`` — confirm that the
    sum is the figure intended for the report.

    Returns:
        float: Total bytes divided by 1024*1024, rounded to 4 decimals.
    """
    client = InfluxDBClient(host=config.DB_HOST, port=config.DB_PORT, database=config.DB_NAME)
    try:
        result = client.query("SHOW STATS")
    finally:
        # Close the client even if the query raises.
        client.close()

    bytes_sum = sum(
        point['diskBytes']
        for point in result.get_points()
        if 'diskBytes' in point
    )
    return round(bytes_sum/(1024*1024),4)

In [104]:
# Total disk usage as returned by get_table_sizes() (bytes / 1024 / 1024, rounded).
val = get_table_sizes()

In [105]:
# Save the total disk usage as a one-row CSV named after the scale factor.
size_df = pd.DataFrame({'Total Disk Usage(MB)': [val]})
size_df.to_csv(
    os.path.join(config.RESULTS_PATH, f"influx_compression_stats_size_scale_{scale_factor}.csv"),
    index=False,
)

## Run the Influx Queries

In [None]:
# Number of benchmark queries.
# NOTE(review): not referenced in the cells visible here — confirm where it is used.
number_of_queries = 9 
# Repetition count read from config; presumably how often each query is run — TODO confirm.
number_of_times_to_run = config.NUMBER_TIMES_TO_RUN_QUERY

In [120]:
# Find glucose readings that are followed, within 30 minutes and for the same
# participant, by a reading more than 14 higher.

# Query the data from InfluxDB
query = """
SELECT
    participant_id,
    glucose_value AS current_glucose,
    time AS current_time
FROM
    interstitial_glucose
ORDER BY
    time
"""

# Connection details come from config; the client is closed as soon as the
# query has returned, even if it raises.
client = InfluxDBClient(host=config.DB_HOST, port=config.DB_PORT, database=config.DB_NAME)
try:
    result = client.query(query)
finally:
    client.close()

# Convert the result into a DataFrame
df = pd.DataFrame(list(result.get_points()))

# For every reading, look up the next reading of the same participant
df['next_time'] = df.groupby('participant_id')['current_time'].shift(-1)
df['next_glucose'] = df.groupby('participant_id')['current_glucose'].shift(-1)

# Calculate time difference and glucose change; the last reading of each
# participant has no successor, so its values are missing and it is filtered out.
df['time_diff'] = pd.to_datetime(df['next_time']) - pd.to_datetime(df['current_time'])
df['glucose_change'] = df['next_glucose'] - df['current_glucose']

# Filter the results based on the glucose change and time difference conditions
filtered_df = df[(df['glucose_change'] > 14) & (df['time_diff'] <= pd.Timedelta('30 minutes'))]

# End the cell with the result so it is displayed.
filtered_df


Unnamed: 0,current_time,participant_id,current_glucose,next_time,next_glucose,time_diff,glucose_change
10,2020-02-13T18:13:32Z,1,70.0,2020-02-13T18:18:32Z,86.0,0 days 00:05:00,16.0
11,2020-02-13T18:18:32Z,1,86.0,2020-02-13T18:23:32Z,105.0,0 days 00:05:00,19.0
756,2020-02-16T08:28:30Z,1,79.0,2020-02-16T08:33:30Z,96.0,0 days 00:05:00,17.0
757,2020-02-16T08:33:30Z,1,96.0,2020-02-16T08:38:29Z,111.0,0 days 00:04:59,15.0
2836,2020-02-22T13:58:35Z,2,118.0,2020-02-22T14:03:34Z,133.0,0 days 00:04:59,15.0
2838,2020-02-22T14:03:34Z,2,133.0,2020-02-22T14:08:34Z,153.0,0 days 00:05:00,20.0
2840,2020-02-22T14:08:34Z,2,153.0,2020-02-22T14:13:33Z,174.0,0 days 00:04:59,21.0
3411,2020-02-24T09:58:31Z,2,135.0,2020-02-24T10:03:32Z,151.0,0 days 00:05:01,16.0
3412,2020-02-24T10:03:32Z,2,151.0,2020-02-24T10:08:32Z,166.0,0 days 00:05:00,15.0
