In [99]:
import numpy as np
import pandas as pd
import glob
import os

In [100]:
# 1) 파일 경로 지정
time_fp = "T1_granada_time/193263_0_time.npy"
data_fp = "T1_granada/193263_0.npy"

# 2) .npy 파일 로드 (datetime 배열 또는 숫자 배열)
time_arr = np.load(time_fp, allow_pickle=True)
data_arr = np.load(data_fp, allow_pickle=True)

# 3) 첫 열만 추출/출력
def print_first_column(arr, name):
    print(f"=== {name} : 첫 열 내용 ===")
    if arr.ndim == 1:
        # 1D array는 곧 첫 열 전체
        print(arr)
    else:
        # 2D 이상 배열인 경우 첫 열만
        print(arr[:, 0])
    print()

print_first_column(time_arr, "193263_0_time.npy")
print_first_column(data_arr, "193263_0.npy")

=== 193263_0_time.npy : 첫 열 내용 ===
[datetime.date(2020, 6, 10) datetime.date(2020, 6, 11)
 datetime.date(2020, 6, 12) datetime.date(2020, 6, 13)
 datetime.date(2020, 6, 14) datetime.date(2020, 6, 15)
 datetime.date(2020, 6, 16) datetime.date(2020, 6, 17)
 datetime.date(2020, 6, 18) datetime.date(2020, 6, 19)
 datetime.date(2020, 6, 20) datetime.date(2020, 6, 21)
 datetime.date(2020, 6, 22) datetime.date(2020, 6, 23)
 datetime.date(2020, 6, 24) datetime.date(2020, 6, 25)
 datetime.date(2020, 6, 26) datetime.date(2020, 6, 27)
 datetime.date(2020, 6, 28) datetime.date(2020, 6, 29)
 datetime.date(2020, 6, 30) datetime.date(2020, 7, 1)
 datetime.date(2020, 7, 2) datetime.date(2020, 7, 3)
 datetime.date(2020, 7, 4) datetime.date(2020, 7, 5)
 datetime.date(2020, 7, 6) datetime.date(2020, 7, 7)
 datetime.date(2020, 7, 8) datetime.date(2020, 7, 9)
 datetime.date(2020, 7, 10) datetime.date(2020, 7, 11)
 datetime.date(2020, 7, 12) datetime.date(2020, 7, 13)
 datetime.date(2020, 7, 14) datetime.da

In [101]:
# 1. Load demographic data
demographic = pd.read_csv("T1DiabetesGranada_demographic.csv")

# 2. Identify LIB193263 spans files
span_files = sorted(glob.glob("T1_granada_time/193263_*_time.npy"))

# 3. Compute span lengths
span_records = []
for idx, fpath in enumerate(span_files):
    arr = np.load(fpath, allow_pickle=True)
    length_days = arr.shape[0]
    span_records.append({
        "id": "LIB193263",
        "continuous_data_index": idx,
        "length": length_days
    })

spans_df = pd.DataFrame(span_records)

# 4. Calculate summary metrics
shortest_span = spans_df["length"].min()
has_at_least_7 = (spans_df["length"] >= 7).any()
has_at_least_14 = (spans_df["length"] >= 14).any()
count_spans_ge_7 = (spans_df["length"] >= 7).sum()
count_spans_ge_14 = (spans_df["length"] >= 14).sum()

# 5. Update demographic for LIB193263
demographic.loc[demographic["Patient_ID"] == "LIB193263", "shortest_span_days"] = shortest_span

# 6. Display results
print("=== Span Lengths for LIB193263 ===")
print(spans_df.to_string(index=False))

print("\nSummary for LIB193263:")
print(f"- Shortest continuous span: {shortest_span} days")
print(f"- Number of spans ≥ 7 days: {count_spans_ge_7}")
print(f"- Number of spans ≥ 14 days: {count_spans_ge_14}")
print(f"- Has at least one span ≥ 7 days? {has_at_least_7}")
print(f"- Has at least one span ≥ 14 days? {has_at_least_14}")

# 7. (Optionally) Save outputs
# spans_df.to_csv("LIB193263_continuous_spans.csv", index=False)
# demographic.to_csv("demographic_updated.csv", index=False)


=== Span Lengths for LIB193263 ===
       id  continuous_data_index  length
LIB193263                      0      52
LIB193263                      1      26
LIB193263                      2      37
LIB193263                      3      82
LIB193263                      4      13
LIB193263                      5       8
LIB193263                      6      40
LIB193263                      7      26
LIB193263                      8      26
LIB193263                      9      26
LIB193263                     10      79
LIB193263                     11      70
LIB193263                     12      26
LIB193263                     13      39
LIB193263                     14      67

Summary for LIB193263:
- Shortest continuous span: 8 days
- Number of spans ≥ 7 days: 15
- Number of spans ≥ 14 days: 13
- Has at least one span ≥ 7 days? True
- Has at least one span ≥ 14 days? True


In [102]:
# 1) Load all span files for subject 193263
span_files = sorted(glob.glob("T1_granada_time/193263_*_time.npy"))
all_good = True

for idx, fp in enumerate(span_files):
    arr = np.load(fp, allow_pickle=True)  # Python datetime array
    
    # 2) Compute day‐wise gaps between consecutive entries using .days
    #    gaps[i] = number of days between arr[i] and arr[i+1]
    gaps = [(arr[i+1] - arr[i]).days for i in range(len(arr) - 1)]
    
    # 3) Identify any gap > 2 days (tolerance: up to 2 days is OK)
    bad_gaps = [g for g in gaps if g > 2]
    if bad_gaps:
        # Report which span and what distinct gaps were too large
        print(f"Span {idx} has excessive date gaps: {set(bad_gaps)} days")
        all_good = False

if all_good:
    print("All spans pass date‐vector validation (gaps ≤ 2 days).")

All spans pass date‐vector validation (gaps ≤ 2 days).


In [103]:
# 1) 시간 벡터 파일 목록 불러오기
time_files = sorted(glob.glob("T1_granada_time/193263_*_time.npy"))

print(f"{'파일명':40s} {'시작일':12s} {'종료일':12s} {'엔트리수':>8s}")
print("-" * 75)

for fp in time_files:
    # 2) numpy 로드 (python datetime.date 배열)
    arr = np.load(fp, allow_pickle=True)
    
    # 3) 시작·종료 날짜 추출 (datetime.date 객체)
    start_date = arr.min()   # already a datetime.date
    end_date   = arr.max()
    
    # 4) 엔트리 수(일수)
    entry_count = len(arr)
    
    # 5) 결과 출력
    filename = os.path.basename(fp)
    print(f"{filename:40s} {start_date:%Y-%m-%d}   {end_date:%Y-%m-%d}   {entry_count:8d}")

파일명                                      시작일          종료일              엔트리수
---------------------------------------------------------------------------
193263_0_time.npy                        2020-06-10   2020-08-01         52
193263_10_time.npy                       2021-09-26   2021-10-22         26
193263_11_time.npy                       2021-10-24   2021-11-30         37
193263_12_time.npy                       2021-12-02   2022-02-22         82
193263_13_time.npy                       2022-02-24   2022-03-08         13
193263_14_time.npy                       2022-03-11   2022-03-18          8
193263_1_time.npy                        2020-08-04   2020-09-12         40
193263_2_time.npy                        2020-09-15   2020-10-10         26
193263_3_time.npy                        2020-10-13   2020-11-07         26
193263_4_time.npy                        2020-11-10   2020-12-05         26
193263_5_time.npy                        2020-12-08   2021-02-24         79
193263_6_tim

In [104]:
# 1) 파일 읽기
df = pd.read_csv('T1_granada_filtered.csv', parse_dates=['time'])

# 2) 환자별로 시간 차 계산 (분 단위)
df['diff_min'] = df.groupby('id')['time'].diff().dt.total_seconds() / 60

# 3) 전체 간격별 빈도 계산
counts = df['diff_min'].value_counts().sort_index()

# 4) 10분부터 30분까지, 각 분별 건수 추출 (없으면 0으로)
minutes = list(range(10, 31))
counts_10_30 = counts.reindex(minutes, fill_value=0)

# 3) 전체 구간의 통계
print(df['diff_min'].describe())

# 5) 결과 출력
print(counts_10_30)

count    1.897551e+07
mean     1.741869e+01
std      3.207608e+02
min      0.000000e+00
25%      1.500000e+01
50%      1.500000e+01
75%      1.500000e+01
max      3.809030e+05
Name: diff_min, dtype: float64
diff_min
10       27691
11       32111
12       37058
13       43007
14      252363
15    17172491
16      763000
17      136677
18       35975
19       10557
20        2299
21         382
22         125
23         411
24         416
25         321
26          68
27          38
28          77
29         711
30        7807
Name: count, dtype: int64


In [105]:
# precise_from_csv_193263.py
# -----------------------------------------------------------------------------
# Part 1: Identify continuous CGM spans for patient LIB193263
# -----------------------------------------------------------------------------

# 1) Load the CSV into a pandas DataFrame, parsing the 'time' column as datetimes
df = pd.read_csv("T1_granada_filtered.csv", parse_dates=["time"])

#    Then filter to only keep rows for patient with id "LIB193263",
#    sort chronologically by 'time', and reset the DataFrame index
df = (
    df[df.id == "LIB193263"]
    .sort_values("time")
    .reset_index(drop=True)
)

# 2) Compute the time difference between each reading and the one before it
#    The result is a pandas Timedelta column named 'diff'
df["diff"] = df["time"].diff()

# 3) Mark the start of a new “continuous” segment whenever the gap exceeds 180 minutes
#    - df["new_seg"] is True at the first reading after a gap > 180 minutes
#    - fillna(False) ensures the very first row isn’t marked as a new segment
timesegment = 180
df["new_seg"] = (df["diff"] > timedelta(minutes=timesegment)).fillna(False)

#    Cumulatively sum those True flags to assign a unique segment_id to each block
df["segment_id"] = df["new_seg"].cumsum()

# 4) For each continuous segment, compute:
#      - start: timestamp of the first reading
#      - end:   timestamp of the last reading
#      - count: number of readings in that segment
segs = (
    df.groupby("segment_id")["time"]
      .agg(start="first", end="last", count="size")
      .reset_index(drop=True)
)

#    Then compute the duration of each segment in days:
#    (end - start).total_seconds() / 86400 converts seconds to days
segs["duration_days"] = (
    segs["end"] - segs["start"]
).dt.total_seconds() / 86400.0

# 5) Count how many segments last at least 7 days and at least 14 days
num_ge7  = (segs["duration_days"] >=  7.0).sum()
num_ge14 = (segs["duration_days"] >= 14.0).sum()

#    Determine whether any such long spans exist for this patient
has_ge7  = num_ge7  > 0
has_ge14 = num_ge14 > 0

# 6) Print out a report summarizing each segment’s start, end, and duration,
#    plus the counts of spans ≥ 7 days and ≥ 14 days
print("=== Precise CSV Analysis for LIB193263 ===")
print(segs[["start", "end", "duration_days"]])
print(f"Number of spans ≥ 7 days:  {num_ge7}")
print(f"Number of spans ≥ 14 days: {num_ge14}")
print(f"Has ≥7-day span?  {has_ge7}")
print(f"Has ≥14-day span? {has_ge14}")

# 7) (Optional) Save the segment table to a new CSV for later inspection
#    Uncomment the lines below to output a file named
#    "LIB193263_precise_spans.csv" containing segment index, start, end, and duration.
# segs = segs.reset_index().rename(columns={"index": "continuous_data_index"})
# segs[["continuous_data_index", "start", "end", "duration_days"]].to_csv(
#     "LIB193263_precise_spans.csv", index=False
# )

=== Precise CSV Analysis for LIB193263 ===
                 start                 end  duration_days
0  2020-06-09 19:08:00 2020-07-21 18:04:00      41.955556
1  2020-07-21 22:18:00 2020-08-02 20:11:00      11.911806
2  2020-08-03 00:06:00 2020-09-13 19:45:00      41.818750
3  2020-09-14 00:33:00 2020-10-11 19:50:00      27.803472
4  2020-10-12 00:49:00 2020-11-08 19:36:00      27.782639
5  2020-11-09 00:06:00 2020-12-06 19:35:00      27.811806
6  2020-12-07 00:09:00 2021-02-25 19:40:00      80.813194
7  2021-02-26 00:11:00 2021-04-25 13:33:00      58.556944
8  2021-04-25 16:54:00 2021-05-09 13:43:00      13.867361
9  2021-05-09 16:56:00 2021-05-23 13:33:00      13.859028
10 2021-05-23 17:53:00 2021-06-06 13:35:00      13.820833
11 2021-06-06 17:55:00 2021-06-20 13:32:00      13.817361
12 2021-06-20 17:56:00 2021-07-17 23:55:00      27.249306
13 2021-07-18 03:03:00 2021-08-14 12:39:00      27.400000
14 2021-08-14 15:47:00 2021-10-09 13:32:00      55.906250
15 2021-10-09 16:43:00 2021-1

In [106]:
# 1. Load demographic data
# ----------------------------------------------------------------
demographic = pd.read_csv("T1DiabetesGranada_demographic.csv")

# 2. Find all span files in T1_granada_time
#    Filenames like "193263_0_time.npy", "193263_1_time.npy", ...
# ----------------------------------------------------------------
span_files = sorted(glob.glob("T1_granada_time/*_time.npy"))

# 3. Build a list of span records for every subject
# ----------------------------------------------------------------
span_records = []
for fp in span_files:
    # Extract filename, e.g. "193263_0_time.npy"
    fname = os.path.basename(fp)
    parts = fname.split("_")
    subj_num = parts[0]               # e.g. "193263"
    span_idx = int(parts[1])          # e.g. 0, 1, 2, ...
    subject_id = "LIB" + subj_num     # match the format in demographic

    # Load the numpy date‐vector (one entry per day of continuous data)
    arr = np.load(fp, allow_pickle=True)  

    # Length in days = number of entries
    length_days = arr.shape[0]

    span_records.append({
        "id": subject_id,
        "continuous_data_index": span_idx,
        "length": length_days
    })

# Turn into DataFrame
spans_df = pd.DataFrame(span_records)

# 4. Summary: how many subjects have ≥7d spans, ≥14d spans?
# ----------------------------------------------------------------
# subjects with any span ≥7 days
num_subjects_ge7  = spans_df.loc[spans_df["length"] >=  7, "id"].nunique()
# subjects with any span ≥14 days
num_subjects_ge14 = spans_df.loc[spans_df["length"] >= 14, "id"].nunique()

print(f"Number of subjects with ≥7-day span:  {num_subjects_ge7}")
print(f"Number of subjects with ≥14-day span: {num_subjects_ge14}")

# 5. For each subject, find their shortest continuous span
# ----------------------------------------------------------------
shortest_per_subject = (
    spans_df
    .groupby("id")["length"]
    .min()
    .reset_index(name="shortest_span_days")
)

# 6. Merge that into demographic
# ----------------------------------------------------------------
# assume demographic has column "Patient_ID" matching "LIBxxxxx"
dem_updated = pd.merge(
    demographic,
    shortest_per_subject,
    how="left",
    left_on="Patient_ID",
    right_on="id"
)
# drop the extra 'id' column
dem_updated = dem_updated.drop(columns=["id"])

# 7. Save outputs
# ----------------------------------------------------------------
# Detailed spans for all subjects
spans_df.to_csv("continuous_spans_details.csv", index=False)
# Updated demographics with shortest span info
dem_updated.to_csv("demographic_with_span.csv", index=False)

print("continuous_spans_details.csv and demographic_with_span.csv generated.")

Number of subjects with ≥7-day span:  561
Number of subjects with ≥14-day span: 495
continuous_spans_details.csv and demographic_with_span.csv generated.


In [107]:
# 1) 원시 CSV 로드 및 환자 필터링
df = pd.read_csv("T1_granada_filtered.csv", parse_dates=["time"])
subdf = (
    df[df.id == "LIB193263"]
    .sort_values("time")
    .reset_index(drop=True)
)

# 2) 인접 측정 간격(분) 계산
#    diffs[i] = time[i] – time[i-1] (min 단위)
diffs = subdf["time"].diff().dt.total_seconds().div(60)

# 3) 180분 초과시 새로운 세그먼트 시작
threshold_min = 180
subdf["span_id"] = (diffs > threshold_min).cumsum()

# 4) 출력용 디렉토리 생성
out_dir = "T1_granada_time_193263"
os.makedirs(out_dir, exist_ok=True)

# 5) span별로 시작·종료 시각 추출 → npy 파일로 저장
print(f"{'파일명':30s} {'시작 시각':20s} {'종료 시각':20s}")
print("-" * 70)

for span_idx, grp in subdf.groupby("span_id", sort=True):
    # 각 span의 첫·마지막 time 값 (numpy datetime64[ns])
    start_ts = grp["time"].iloc[0]
    end_ts   = grp["time"].iloc[-1]
    # 두 개의 타임스탬프를 담은 1차원 배열
    arr = np.array([start_ts.to_datetime64(), end_ts.to_datetime64()])
    
    fn = f"193263_{span_idx}_time.npy"
    np.save(os.path.join(out_dir, fn), arr)
    
    # 사람이 보기 쉽게 출력 (YYYY-MM-DD HH:MM:SS)
    start_str = start_ts.strftime("%Y-%m-%d %H:%M:%S")
    end_str   = end_ts.strftime("%Y-%m-%d %H:%M:%S")
    print(f"{fn:30s} {start_str:20s} {end_str:20s}")

print(f"총 {subdf['span_id'].nunique()}개의 span을 '{out_dir}'에 저장했습니다.")

파일명                            시작 시각                종료 시각               
----------------------------------------------------------------------
193263_0_time.npy              2020-06-09 19:08:00  2020-07-21 18:04:00 
193263_1_time.npy              2020-07-21 22:18:00  2020-08-02 20:11:00 
193263_2_time.npy              2020-08-03 00:06:00  2020-09-13 19:45:00 
193263_3_time.npy              2020-09-14 00:33:00  2020-10-11 19:50:00 
193263_4_time.npy              2020-10-12 00:49:00  2020-11-08 19:36:00 
193263_5_time.npy              2020-11-09 00:06:00  2020-12-06 19:35:00 
193263_6_time.npy              2020-12-07 00:09:00  2021-02-25 19:40:00 
193263_7_time.npy              2021-02-26 00:11:00  2021-04-25 13:33:00 
193263_8_time.npy              2021-04-25 16:54:00  2021-05-09 13:43:00 
193263_9_time.npy              2021-05-09 16:56:00  2021-05-23 13:33:00 
193263_10_time.npy             2021-05-23 17:53:00  2021-06-06 13:35:00 
193263_11_time.npy             2021-06-06 17:55:00  2

In [108]:
# 1) 시간 벡터 파일 목록 불러오기
time_files = sorted(glob.glob("T1_granada_time_193263/193263_*_time.npy"))

# 2) 헤더 출력 (시작·종료 시각이 분 단위까지 보이도록 칼럼명 수정)
print(f"{'파일명':40s} {'시작시각':17s} {'종료시각':17s} {'엔트리수':>8s}")
print("-" * 90)

for fp in time_files:
    # 3) numpy 로드 (python datetime.datetime 객체 배열 또는 numpy.datetime64 배열)
    arr = np.load(fp, allow_pickle=True)

    # 4) 시작·종료 시각 추출 (datetime.datetime 또는 numpy.datetime64)
    start_dt = arr.min()
    end_dt   = arr.max()

    # 5) 엔트리 수
    entry_count = len(arr)

    # 6) 결과 출력: 분 단위까지 포맷팅
    filename = os.path.basename(fp)
    # 만약 arr 요소가 numpy.datetime64이면 to_pydatetime()으로 변환
    if isinstance(start_dt, np.datetime64):
        start_dt = start_dt.astype('M8[m]').astype('O')
        end_dt   = end_dt.astype('M8[m]').astype('O')

    print(f"{filename:40s} "
          f"{start_dt:%Y-%m-%d %H:%M}   "
          f"{end_dt:%Y-%m-%d %H:%M}   "
          f"{entry_count:8d}")

파일명                                      시작시각              종료시각                  엔트리수
------------------------------------------------------------------------------------------
193263_0_time.npy                        2020-06-09 19:08   2020-07-21 18:04          2
193263_10_time.npy                       2021-05-23 17:53   2021-06-06 13:35          2
193263_11_time.npy                       2021-06-06 17:55   2021-06-20 13:32          2
193263_12_time.npy                       2021-06-20 17:56   2021-07-17 23:55          2
193263_13_time.npy                       2021-07-18 03:03   2021-08-14 12:39          2
193263_14_time.npy                       2021-08-14 15:47   2021-10-09 13:32          2
193263_15_time.npy                       2021-10-09 16:43   2021-10-23 13:30          2
193263_16_time.npy                       2021-10-23 18:50   2021-11-06 13:37          2
193263_17_time.npy                       2021-11-06 18:56   2021-12-01 13:38          2
193263_18_time.npy             

In [109]:
# 1) 시간 벡터 파일 목록 불러오기
time_files = sorted(glob.glob("T1_granada_time/193263_*_time.npy"))

# 2) 헤더 출력 (시작·종료 시각이 분 단위까지 보이도록 칼럼명 수정)
print(f"{'파일명':40s} {'시작시각':17s} {'종료시각':17s} {'엔트리수':>8s}")
print("-" * 90)

for fp in time_files:
    # 3) numpy 로드 (python datetime.datetime 객체 배열 또는 numpy.datetime64 배열)
    arr = np.load(fp, allow_pickle=True)

    # 4) 시작·종료 시각 추출 (datetime.datetime 또는 numpy.datetime64)
    start_dt = arr.min()
    end_dt   = arr.max()

    # 5) 엔트리 수
    entry_count = len(arr)

    # 6) 결과 출력: 분 단위까지 포맷팅
    filename = os.path.basename(fp)
    # 만약 arr 요소가 numpy.datetime64이면 to_pydatetime()으로 변환
    if isinstance(start_dt, np.datetime64):
        start_dt = start_dt.astype('M8[m]').astype('O')
        end_dt   = end_dt.astype('M8[m]').astype('O')

    print(f"{filename:40s} "
          f"{start_dt:%Y-%m-%d %H:%M}   "
          f"{end_dt:%Y-%m-%d %H:%M}   "
          f"{entry_count:8d}")

파일명                                      시작시각              종료시각                  엔트리수
------------------------------------------------------------------------------------------
193263_0_time.npy                        2020-06-10 00:00   2020-08-01 00:00         52
193263_10_time.npy                       2021-09-26 00:00   2021-10-22 00:00         26
193263_11_time.npy                       2021-10-24 00:00   2021-11-30 00:00         37
193263_12_time.npy                       2021-12-02 00:00   2022-02-22 00:00         82
193263_13_time.npy                       2022-02-24 00:00   2022-03-08 00:00         13
193263_14_time.npy                       2022-03-11 00:00   2022-03-18 00:00          8
193263_1_time.npy                        2020-08-04 00:00   2020-09-12 00:00         40
193263_2_time.npy                        2020-09-15 00:00   2020-10-10 00:00         26
193263_3_time.npy                        2020-10-13 00:00   2020-11-07 00:00         26
193263_4_time.npy              