In [19]:
"""
This is a boilerplate pipeline 'DataProcessing'
generated using Kedro 1.0.0
"""

import pandas as pd
import numpy as np

# Load the classes
positive_class = pd.read_csv("patient-finder/data/01_raw/positive_cohort.csv")
negative_class = pd.read_csv("patient-finder/data/01_raw/negative_cohort.csv")


unique_codes = pd.concat([positive_class['code'], negative_class['code']]).unique()


# Count distinct patient IDs for each code
positive_counts = positive_class.groupby('code')['patient_id'].nunique()
negative_counts = negative_class.groupby('code')['patient_id'].nunique()

# Combine counts into a DataFrame
code_counts = pd.DataFrame({
    'positive_patient_count': positive_counts,
    'negative_patient_count': negative_counts
}).fillna(0).astype(int)

code_counts

Unnamed: 0_level_0,positive_patient_count,negative_patient_count
code,Unnamed: 1_level_1,Unnamed: 2_level_1
00054-0450,30,47
00093-3109,45,41
00093-7424,39,43
00781-5184,45,46
36415,40,46
50360,37,37
80069,31,42
81001,38,41
N02.8,33,20
N03.2,41,17


In [20]:
total_positive_patients_count = positive_class.patient_id.nunique()
total_negative_patients_count = negative_class.patient_id.nunique()

In [21]:
code_counts['positive_patient_relative'] = code_counts['positive_patient_count'] / total_positive_patients_count
code_counts['negative_patient_relative'] = code_counts['negative_patient_count'] / total_negative_patients_count
code_counts

Unnamed: 0_level_0,positive_patient_count,negative_patient_count,positive_patient_relative,negative_patient_relative
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00054-0450,30,47,0.315789,0.447619
00093-3109,45,41,0.473684,0.390476
00093-7424,39,43,0.410526,0.409524
00781-5184,45,46,0.473684,0.438095
36415,40,46,0.421053,0.438095
50360,37,37,0.389474,0.352381
80069,31,42,0.326316,0.4
81001,38,41,0.4,0.390476
N02.8,33,20,0.347368,0.190476
N03.2,41,17,0.431579,0.161905


In [22]:
code_counts['variance'] = np.var([code_counts['positive_patient_relative'], code_counts['negative_patient_relative']], axis=0)
code_counts

Unnamed: 0_level_0,positive_patient_count,negative_patient_count,positive_patient_relative,negative_patient_relative,variance
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00054-0450,30,47,0.315789,0.447619,0.004344759
00093-3109,45,41,0.473684,0.390476,0.001730894
00093-7424,39,43,0.410526,0.409524,2.512547e-07
00781-5184,45,46,0.473684,0.438095,0.0003166437
36415,40,46,0.421053,0.438095,7.261261e-05
50360,37,37,0.389474,0.352381,0.0003439677
80069,31,42,0.326316,0.4,0.001357341
81001,38,41,0.4,0.390476,2.267574e-05
N02.8,33,20,0.347368,0.190476,0.006153793
N03.2,41,17,0.431579,0.161905,0.01818104


In [23]:
important_features = code_counts.sort_values(by='variance', ascending=False).head(10).index.tolist()
important_features

['N03.2',
 'N02.8',
 'N17.9',
 '00054-0450',
 'N18.9',
 '00093-3109',
 '80069',
 'N04.9',
 '50360',
 '00781-5184']

In [24]:
positive_class

Unnamed: 0,patient_id,age,gender,service_date,code,code_type,target
0,PT0002,69,M,2023-05-17,N02.8,DX,True
1,PT0002,69,M,2023-08-23,80069,PX,True
2,PT0002,69,M,2022-07-08,N04.9,DX,True
3,PT0002,69,M,2020-11-22,36415,PX,True
4,PT0004,32,M,2023-09-16,81001,PX,True
...,...,...,...,...,...,...,...
568,PT0200,21,F,2023-03-20,00781-5184,RX,True
569,PT0200,21,F,2022-10-18,R80.9,DX,True
570,PT0200,21,F,2021-05-10,N17.9,DX,True
571,PT0200,21,F,2021-04-14,N03.2,DX,True


In [27]:
import random 
import pandas as pd
from datetime import datetime, timedelta
import numpy as np


def create_patient_pivot(df, values_to_pivot, current_date=None):
    if current_date is None:
        current_date = datetime.now()
    

    pivot_data = []

    for patient_id, group in df.groupby("patient_id"):
        patient_row = {"patient_id": patient_id, "target": group["target"].iloc[0]}
        patient_max_date = group["service_date"].max()
        for val in values_to_pivot:
            val_group = group[group["code"] == val].sort_values("service_date")
            freq = len(val_group)
            gap = (val_group["service_date"].diff().dt.days.mean()
                   if freq > 1 else np.nan)
            duration = ((patient_max_date  - val_group["service_date"]).dt.days.mean()
                        if freq > 0 else np.nan)
            patient_row[f"{val}_frequency"] = freq
            patient_row[f"{val}_gap"] = gap
            patient_row[f"{val}_duration"] = duration
        pivot_data.append(patient_row)
    pivot_df = pd.DataFrame(pivot_data)
    return pivot_df

# Example usage
positive_class['service_date'] = pd.to_datetime(positive_class['service_date'])
negative_class['service_date'] = pd.to_datetime(negative_class['service_date'])
claims_table = pd.concat([positive_class, negative_class], ignore_index=True)

pivot_df = create_patient_pivot(claims_table, important_features)

# positive_class.to_csv("data/01_raw/positive_class_mock_1.csv", index=False)
# pivot_df.to_csv("data/02_intermediate/positive_class_pivot_mock_1.csv", index=False)

print(pivot_df["target"].value_counts(normalize=True))  # sanity check distribution

 

target
False    0.525
True     0.475
Name: proportion, dtype: float64


In [28]:
pivot_df

Unnamed: 0,patient_id,target,N03.2_frequency,N03.2_gap,N03.2_duration,N02.8_frequency,N02.8_gap,N02.8_duration,N17.9_frequency,N17.9_gap,...,80069_duration,N04.9_frequency,N04.9_gap,N04.9_duration,50360_frequency,50360_gap,50360_duration,00781-5184_frequency,00781-5184_gap,00781-5184_duration
0,PT0001,False,0,,,0,,,0,,...,,0,,,0,,,0,,
1,PT0002,True,0,,,1,,98.0,0,,...,0.0,1,,411.0,0,,,0,,
2,PT0003,False,0,,,0,,,0,,...,,0,,,0,,,0,,
3,PT0004,True,1,,35.0,0,,,0,,...,806.0,0,,,0,,,1,,190.0
4,PT0005,True,2,903.0,541.5,0,,,0,,...,,2,363.0,820.5,1,,850.0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,PT0196,False,0,,,0,,,1,,...,,0,,,1,,10.0,0,,
196,PT0197,False,1,,0.0,0,,,0,,...,542.0,2,325.0,360.5,1,,937.0,1,,32.0
197,PT0198,False,0,,,1,,244.0,0,,...,,1,,391.0,2,923.0,461.5,0,,
198,PT0199,True,1,,1297.0,0,,,0,,...,,2,568.0,996.0,1,,49.0,0,,


In [29]:
# create sequence table of patient Journeys example patients_id
claims_table

Unnamed: 0,patient_id,age,gender,service_date,code,code_type,target
0,PT0002,69,M,2023-05-17,N02.8,DX,True
1,PT0002,69,M,2023-08-23,80069,PX,True
2,PT0002,69,M,2022-07-08,N04.9,DX,True
3,PT0002,69,M,2020-11-22,36415,PX,True
4,PT0004,32,M,2023-09-16,81001,PX,True
...,...,...,...,...,...,...,...
1203,PT0198,78,M,2022-04-13,00093-7424,RX,False
1204,PT0198,78,M,2022-10-27,N04.9,DX,False
1205,PT0198,78,M,2022-11-17,00054-0450,RX,False
1206,PT0198,78,M,2023-11-22,50360,PX,False


In [34]:
import pandas as pd


# Sort by patient and service_date
claims_table = claims_table.sort_values(["patient_id", "service_date"])

# Group by patient and aggregate codes into list (ordered sequence)
seq_df = claims_table.groupby("patient_id").agg({
    "code": lambda x: list(x),
    "target": "first"   # or 'first' depending on definition
}).reset_index()
seq_df.columns = ["patient_id", "codes", "target"]
seq_df.head()

Unnamed: 0,patient_id,codes,target
0,PT0001,"[00093-7424, N18.9, 00093-7424]",False
1,PT0002,"[36415, N04.9, N02.8, 80069]",True
2,PT0003,"[00093-7424, 81001, 00054-0450, 36415]",False
3,PT0004,"[80069, 36415, 00054-0450, 80069, 00781-5184, ...",True
4,PT0005,"[R80.9, N04.9, N03.2, 00054-0450, 50360, N04.9...",True


In [None]:


import pickle
file = 'patient-finder/data/06_models/models.pkl'
with open(file, 'rb') as f:
    models = pickle.load(f)

models

FileNotFoundError: [Errno 2] No such file or directory: 'data/06_models/models.pkl'