This notebook documents and analyzes discrepancies between the Python and R implementations of IGLU.
It contains test cases that demonstrate unexpected or questionable results from the R implementation.

In [3]:
import sys
from importlib.metadata import version

import pandas as pd
import rpy2.robjects as ro
from iglu_py import bridge

In [7]:
# Print versions for future reference, so the recorded outputs below can be
# tied to the exact Python / R / iglu / iglu_py / rpy2 combination that produced them.
print(f"Python version: {sys.version}")
print(f"R version: {ro.r('R.version.string')}")
iglu_version = str(ro.r('packageVersion("iglu")'))
print(f"iglu version: {iglu_version}")
print(f"iglu_py version: {version('iglu-py')}")
print(f"rpy2 version: {version('rpy2')}")


Python version: 3.11.10 (main, Oct  3 2024, 02:26:51) [Clang 14.0.6 ]
R version: [1] "R version 4.4.3 (2025-02-28)"

iglu version: [1] ‘4.2.2’

iglu_py version: 1.1.1
rpy2 version: 3.6.0


# CGMS2DayByDay Discrepancies      

In [11]:
@bridge.df_conversion
def my_CGMS2DayByDay(data: pd.DataFrame, **kwargs):
    """Call R's ``CGMS2DayByDay`` through the iglu_py bridge and return a dict.

    Args:
        data: Input frame handed to the R function as-is (the conversion is
            presumably handled by ``bridge.df_conversion`` - see iglu_py.bridge).
        **kwargs: Forwarded unchanged to the R function (e.g. ``tz``).

    Returns:
        dict keyed by the names of the R result list. Every element is passed
        through ``ro.conversion.rpy2py``; in addition ``'actual_dates'`` is
        turned into a list of pandas timestamps (values read as day counts
        from 1970-01-01) and ``'dt0'`` is reduced to its first element.
    """
    r_output = bridge.iglu_r.CGMS2DayByDay(data, **kwargs)

    # Walk the named list by position so each name is paired with its element.
    converted = {}
    for position, key in enumerate(r_output.names()):
        converted[key] = ro.conversion.rpy2py(r_output[position])

    # Dates arrive as numeric day counts; rebuild them as timestamps.
    day_stamps = []
    for day_count in converted['actual_dates']:
        day_stamps.append(pd.to_datetime(day_count, unit='D', origin='1970-01-01'))
    converted['actual_dates'] = day_stamps

    # dt0 comes back as a one-element container; keep only the value.
    converted['dt0'] = converted['dt0'][0]

    return converted


Use very simple test data: 4 measurements, one every 5 minutes, starting at midnight.

In [22]:
# Single-subject fixture: four readings spaced 5 minutes apart, starting exactly at midnight.
data = pd.DataFrame({
    'id': ['subject1', 'subject1', 'subject1', 'subject1'],
    'time': pd.to_datetime([
        '2020-01-01 00:00:00',  # 0 min
        '2020-01-01 00:05:00',  # 5 min
        '2020-01-01 00:10:00',  # 10 min
        '2020-01-01 00:15:00',  # 15 min
    ]),
    'gl': [150, 155, 160, 165]
})
display(data)

Unnamed: 0,id,time,gl
0,subject1,2020-01-01 00:00:00,150
1,subject1,2020-01-01 00:05:00,155
2,subject1,2020-01-01 00:10:00,160
3,subject1,2020-01-01 00:15:00,165


CGMS2DayByDay is expected to return 288 measurements for a single day: the 4 measurements from the input DataFrame (as they are timed exactly on the interpolation grid), followed by 284 NaN values.

In [27]:
# Call with default arguments (no tz passed) on the midnight fixture.
# NOTE: in the recorded run the result has two rows instead of one, and the
# first reading (150) is missing from the start of row 0.
r_result = my_CGMS2DayByDay(data)

gd2d = r_result['gd2d']
actual_dates = r_result['actual_dates']
dt0 = r_result['dt0']

print(gd2d.shape)       # expected (1,288)
print(actual_dates)     # expected [Timestamp('2020-01-01 00:00:00')]
print(dt0)              # expected 5

print(gd2d[:,0:5])      # expected [[150. 155. 160. 165. nan]]





(2, 288)
[Timestamp('2020-01-01 00:00:00'), Timestamp('2020-01-02 00:00:00')]
5.0
[[155. 160. 165.  nan  nan]
 [ nan  nan  nan  nan  nan]]


In [28]:
# Same midnight fixture, now with tz="UTC" passed explicitly.
# NOTE: in the recorded run the reported dates move back one day
# (2019-12-31, 2020-01-01) and the first reading (150) is still missing.
r_result = my_CGMS2DayByDay(data,tz="UTC")

gd2d = r_result['gd2d']
actual_dates = r_result['actual_dates']
dt0 = r_result['dt0']

print(gd2d.shape)       # expected (1,288)
print(actual_dates)     # expected [Timestamp('2020-01-01 00:00:00')]
print(dt0)              # expected 5

print(gd2d[:,0:5])      # expected [[150. 155. 160. 165. nan]]

(2, 288)
[Timestamp('2019-12-31 00:00:00'), Timestamp('2020-01-01 00:00:00')]
5.0
[[155. 160. 165.  nan  nan]
 [ nan  nan  nan  nan  nan]]


Let's try 4 measurements starting at 10am. On a 5-minute grid, the 10am measurement should land at position 10*(60/5) = 120.

In [29]:
# Single-subject fixture: four readings spaced 5 minutes apart, starting at 10:00.
# The inline offsets are relative to the first reading.
data = pd.DataFrame({
    'id': ['subject1', 'subject1', 'subject1', 'subject1'],
    'time': pd.to_datetime([
        '2020-01-01 10:00:00',  # 0 min
        '2020-01-01 10:05:00',  # 5 min
        '2020-01-01 10:10:00',  # 10 min
        '2020-01-01 10:15:00',  # 15 min
    ]),
    'gl': [150, 155, 160, 165]
})
display(data)

Unnamed: 0,id,time,gl
0,subject1,2020-01-01 10:00:00,150
1,subject1,2020-01-01 10:05:00,155
2,subject1,2020-01-01 10:10:00,160
3,subject1,2020-01-01 10:15:00,165


In [35]:
# Call with default arguments on the 10:00 fixture and inspect the grid around 10am.
r_result = my_CGMS2DayByDay(data)

gd2d = r_result['gd2d']
actual_dates = r_result['actual_dates']
dt0 = r_result['dt0']

print(gd2d.shape)       # expected (1,288)
print(actual_dates)     # expected [Timestamp('2020-01-01 00:00:00')]
print(dt0)              # expected 5

# Grid slots per hour (60/dt0) times 10 hours gives the column index of 10:00.
indx_10am = int((60/dt0) * 10)
print(indx_10am)
print(gd2d[:,indx_10am:indx_10am+6])      # expected [[150. 155. 160. 165.  nan  nan]]
print(gd2d[:,indx_10am-1:indx_10am+5])    # the expected values actually appear here, shifted one position to the left


(2, 288)
[Timestamp('2020-01-01 00:00:00'), Timestamp('2020-01-02 00:00:00')]
5.0
120
[[155. 160. 165.  nan  nan  nan]
 [ nan  nan  nan  nan  nan  nan]]
[[150. 155. 160. 165.  nan  nan]
 [ nan  nan  nan  nan  nan  nan]]


Let's now look at data that spans two consecutive days.

In [47]:
# Two consecutive days for one subject: the same four readings at the start of each day.
data = pd.DataFrame({
    'id': ['subject1']*8,
    'time': pd.to_datetime([
        '2020-01-01 00:00:00',  # day 1, 0 min
        '2020-01-01 00:05:00',  # day 1, 5 min
        '2020-01-01 00:10:00',  # day 1, 10 min
        '2020-01-01 00:15:00',  # day 1, 15 min
        '2020-01-02 00:00:00',  # day 2, 0 min
        '2020-01-02 00:05:00',  # day 2, 5 min
        '2020-01-02 00:10:00',  # day 2, 10 min
        '2020-01-02 00:15:00',  # day 2, 15 min
    ]),
    'gl': [150, 155, 160, 165]*2
})
display(data)

Unnamed: 0,id,time,gl
0,subject1,2020-01-01 00:00:00,150
1,subject1,2020-01-01 00:05:00,155
2,subject1,2020-01-01 00:10:00,160
3,subject1,2020-01-01 00:15:00,165
4,subject1,2020-01-02 00:00:00,150
5,subject1,2020-01-02 00:05:00,155
6,subject1,2020-01-02 00:10:00,160
7,subject1,2020-01-02 00:15:00,165


In [48]:
# Two-day fixture with tz="UTC": one row per calendar day is expected.
# NOTE: in the recorded run there are three rows, the dates start at 2019-12-31,
# and the first reading (150) is missing from the start of each populated row.
r_result = my_CGMS2DayByDay(data,tz="UTC")

gd2d = r_result['gd2d']
actual_dates = r_result['actual_dates']
dt0 = r_result['dt0']

print(gd2d.shape)       # expected (2,288)
print(actual_dates)     # expected [Timestamp('2020-01-01 00:00:00'), Timestamp('2020-01-02 00:00:00')]
print(dt0)              # expected 5

# expected [[150. 155. 160. 165. nan]
#           [150. 155. 160. 165. nan]]
print(gd2d[:,0:5])

(3, 288)
[Timestamp('2019-12-31 00:00:00'), Timestamp('2020-01-01 00:00:00'), Timestamp('2020-01-02 00:00:00')]
5.0
[[155. 160. 165.  nan  nan]
 [155. 160. 165.  nan  nan]
 [ nan  nan  nan  nan  nan]]


Let's test records from two days that cross over midnight.

In [49]:
# One continuous 5-minute series that crosses midnight:
# four readings before 00:00 on day 1, four readings from 00:00 on day 2.
data = pd.DataFrame({
    'id': ['subject1']*8,
    'time': pd.to_datetime([
        '2020-01-01 23:40:00',  # midnight - 20 min
        '2020-01-01 23:45:00',  # midnight - 15 min
        '2020-01-01 23:50:00',  # midnight - 10 min
        '2020-01-01 23:55:00',  # midnight - 5 min
        '2020-01-02 00:00:00',  # midnight
        '2020-01-02 00:05:00',  # midnight + 5 min
        '2020-01-02 00:10:00',  # midnight + 10 min
        '2020-01-02 00:15:00',  # midnight + 15 min
    ]),
    'gl': [150, 155, 160, 165, 170, 175, 180, 185]
})
display(data)

Unnamed: 0,id,time,gl
0,subject1,2020-01-01 23:40:00,150
1,subject1,2020-01-01 23:45:00,155
2,subject1,2020-01-01 23:50:00,160
3,subject1,2020-01-01 23:55:00,165
4,subject1,2020-01-02 00:00:00,170
5,subject1,2020-01-02 00:05:00,175
6,subject1,2020-01-02 00:10:00,180
7,subject1,2020-01-02 00:15:00,185


In [50]:
# Midnight-crossing fixture with tz="UTC": one row per calendar day is expected.
# Day 1 only has data at the end of the day, so its first columns should be nan;
# day 2 should start with the 00:00 reading (170).
# NOTE: in the recorded run the dates are 2019-12-31 / 2020-01-01 and the
# second row starts at 175, i.e. the 00:00 reading is missing from that row.
r_result = my_CGMS2DayByDay(data,tz="UTC")

gd2d = r_result['gd2d']
actual_dates = r_result['actual_dates']
dt0 = r_result['dt0']

print(gd2d.shape)       # expected (2,288)
print(actual_dates)     # expected [Timestamp('2020-01-01 00:00:00'), Timestamp('2020-01-02 00:00:00')]
print(dt0)              # expected 5

# expected [[ nan  nan  nan  nan  nan]
#           [170. 175. 180. 185.  nan]]
print(gd2d[:,0:5])

(2, 288)
[Timestamp('2019-12-31 00:00:00'), Timestamp('2020-01-01 00:00:00')]
5.0
[[ nan  nan  nan  nan  nan]
 [175. 180. 185.  nan  nan]]


# check_data_columns

In [18]:
@bridge.df_conversion
def my_check_data_columns(data: pd.DataFrame, **kwargs):
    """Forward ``data`` and all keyword arguments to R's ``check_data_columns``.

    The call goes through ``bridge.iglu_r`` and the R function's result is
    returned unchanged by this body (any conversion is presumably done by
    the ``bridge.df_conversion`` decorator - see iglu_py.bridge).
    """
    checked = bridge.iglu_r.check_data_columns(data, **kwargs)
    return checked


In [44]:
# Midnight fixture passed through check_data_columns with tz="UTC".
data = pd.DataFrame({
    'id': ['subject1', 'subject1', 'subject1', 'subject1'],
    'time': pd.to_datetime([
        '2020-01-01 00:00:00',  # 0 min
        '2020-01-01 00:05:00',  # 5 min
        '2020-01-01 00:10:00',  # 10 min
        '2020-01-01 00:15:00',  # 15 min
    ]),
    'gl': [150, 155, 160, 165]
})

# NOTE: although tz="UTC" is requested, the recorded output shows the
# timestamps carrying a +02:00 offset.
r_result = my_check_data_columns(data,tz="UTC")

print(r_result)

         id                      time   gl
0  subject1 2020-01-01 00:00:00+02:00  150
1  subject1 2020-01-01 00:05:00+02:00  155
2  subject1 2020-01-01 00:10:00+02:00  160
3  subject1 2020-01-01 00:15:00+02:00  165


In [19]:
# NOTE(review): this cell repeats the previous cell's fixture and call unchanged
# (same data, same tz="UTC"); consider removing it or varying an argument.
data = pd.DataFrame({
    'id': ['subject1', 'subject1', 'subject1', 'subject1'],
    'time': pd.to_datetime([
        '2020-01-01 00:00:00',  # 0 min
        '2020-01-01 00:05:00',  # 5 min
        '2020-01-01 00:10:00',  # 10 min
        '2020-01-01 00:15:00',  # 15 min
    ]),
    'gl': [150, 155, 160, 165]
})

r_result = my_check_data_columns(data,tz="UTC")

print(r_result)

         id                      time   gl
0  subject1 2020-01-01 00:00:00+02:00  150
1  subject1 2020-01-01 00:05:00+02:00  155
2  subject1 2020-01-01 00:10:00+02:00  160
3  subject1 2020-01-01 00:15:00+02:00  165


In [45]:
# Midnight fixture passed through check_data_columns with time_check=True and tz="UTC".
data = pd.DataFrame({
    'id': ['subject1', 'subject1', 'subject1', 'subject1'],
    'time': pd.to_datetime([
        '2020-01-01 00:00:00',  # 0 min
        '2020-01-01 00:05:00',  # 5 min
        '2020-01-01 00:10:00',  # 10 min
        '2020-01-01 00:15:00',  # 15 min
    ]),
    'gl': [150, 155, 160, 165]
})

# NOTE: in the recorded output the timestamps still carry a +02:00 offset and,
# unlike the runs without time_check, the row index starts at 1 instead of 0.
r_result = my_check_data_columns(data,time_check=True,tz="UTC")

print(r_result)

         id                      time   gl
1  subject1 2020-01-01 00:00:00+02:00  150
2  subject1 2020-01-01 00:05:00+02:00  155
3  subject1 2020-01-01 00:10:00+02:00  160
4  subject1 2020-01-01 00:15:00+02:00  165
