In [185]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [186]:
p = pd.read_csv('data/patients.csv')
p

Unnamed: 0,patient_id,gender,age,region,signup_date
0,1,M,34,East,2025-01-15
1,2,F,47,North,2024-12-20
2,3,M,55,South,2025-02-02
3,4,F,29,West,2025-01-10
4,5,F,38,East,2024-12-28
5,6,M,63,North,2025-02-14
6,7,M,41,West,2025-01-22
7,8,F,26,South,2024-12-18
8,9,M,53,East,2025-02-05
9,10,F,37,West,2025-01-27


In [187]:
app = pd.read_csv('data/appointments.csv')
app

Unnamed: 0,appointment_id,patient_id,scheduled_date,attended_flag,provider_type
0,101,1,2025-06-01,Y,GP
1,102,1,2025-07-14,N,Specialist
2,103,2,2025-06-10,Y,Therapist
3,104,3,2025-06-15,Y,GP
4,105,3,2025-07-02,Y,Specialist
5,106,4,2025-06-03,Y,Therapist
6,107,4,2025-07-19,N,GP
7,108,5,2025-06-22,Y,GP
8,109,6,2025-07-01,N,Therapist
9,110,7,2025-06-29,Y,GP


In [188]:
p_app = pd.merge(left=p, right=app, how='left')
p_app

Unnamed: 0,patient_id,gender,age,region,signup_date,appointment_id,scheduled_date,attended_flag,provider_type
0,1,M,34,East,2025-01-15,101.0,2025-06-01,Y,GP
1,1,M,34,East,2025-01-15,102.0,2025-07-14,N,Specialist
2,2,F,47,North,2024-12-20,103.0,2025-06-10,Y,Therapist
3,3,M,55,South,2025-02-02,104.0,2025-06-15,Y,GP
4,3,M,55,South,2025-02-02,105.0,2025-07-02,Y,Specialist
5,4,F,29,West,2025-01-10,106.0,2025-06-03,Y,Therapist
6,4,F,29,West,2025-01-10,107.0,2025-07-19,N,GP
7,5,F,38,East,2024-12-28,108.0,2025-06-22,Y,GP
8,6,M,63,North,2025-02-14,109.0,2025-07-01,N,Therapist
9,7,M,41,West,2025-01-22,110.0,2025-06-29,Y,GP


In [189]:
eng = pd.read_csv('data/engagement.csv')
eng

Unnamed: 0,patient_id,date,action_type,action_count
0,1,2025-06-01,login,2
1,1,2025-06-03,article,1
2,2,2025-06-05,message,3
3,3,2025-06-02,login,1
4,3,2025-06-04,video,2
5,4,2025-06-07,login,4
6,5,2025-06-08,article,1
7,6,2025-06-09,login,3
8,7,2025-06-10,message,2
9,8,2025-06-11,login,1


In [190]:
p_app_eng = pd.merge(left=p_app, right=eng, how='left')
p_app_eng

Unnamed: 0,patient_id,gender,age,region,signup_date,appointment_id,scheduled_date,attended_flag,provider_type,date,action_type,action_count
0,1,M,34,East,2025-01-15,101.0,2025-06-01,Y,GP,2025-06-01,login,2.0
1,1,M,34,East,2025-01-15,101.0,2025-06-01,Y,GP,2025-06-03,article,1.0
2,1,M,34,East,2025-01-15,102.0,2025-07-14,N,Specialist,2025-06-01,login,2.0
3,1,M,34,East,2025-01-15,102.0,2025-07-14,N,Specialist,2025-06-03,article,1.0
4,2,F,47,North,2024-12-20,103.0,2025-06-10,Y,Therapist,2025-06-05,message,3.0
5,3,M,55,South,2025-02-02,104.0,2025-06-15,Y,GP,2025-06-02,login,1.0
6,3,M,55,South,2025-02-02,104.0,2025-06-15,Y,GP,2025-06-04,video,2.0
7,3,M,55,South,2025-02-02,105.0,2025-07-02,Y,Specialist,2025-06-02,login,1.0
8,3,M,55,South,2025-02-02,105.0,2025-07-02,Y,Specialist,2025-06-04,video,2.0
9,4,F,29,West,2025-01-10,106.0,2025-06-03,Y,Therapist,2025-06-07,login,4.0


In [191]:
lab = pd.read_csv('data/lab_results.csv')
lab

Unnamed: 0,patient_id,lab_date,test_type,test_value,diagnosis_flag
0,1,2025-05-10,HbA1c,5.8,Negative
1,2,2025-05-12,LDL,180,Positive
2,3,2025-05-18,BP,120/80,Negative
3,4,2025-05-22,Glucose,95,Negative
4,5,2025-06-02,HbA1c,6.2,Positive
5,6,2025-06-05,LDL,190,Positive
6,7,2025-06-10,BP,130/85,Negative
7,8,2025-06-15,Glucose,110,Negative
8,9,2025-06-20,HbA1c,5.9,Negative
9,10,2025-06-25,LDL,175,Positive


In [192]:
df_f = pd.merge(left=p_app_eng, right=lab, how='left')
df_f

Unnamed: 0,patient_id,gender,age,region,signup_date,appointment_id,scheduled_date,attended_flag,provider_type,date,action_type,action_count,lab_date,test_type,test_value,diagnosis_flag
0,1,M,34,East,2025-01-15,101.0,2025-06-01,Y,GP,2025-06-01,login,2.0,2025-05-10,HbA1c,5.8,Negative
1,1,M,34,East,2025-01-15,101.0,2025-06-01,Y,GP,2025-06-03,article,1.0,2025-05-10,HbA1c,5.8,Negative
2,1,M,34,East,2025-01-15,102.0,2025-07-14,N,Specialist,2025-06-01,login,2.0,2025-05-10,HbA1c,5.8,Negative
3,1,M,34,East,2025-01-15,102.0,2025-07-14,N,Specialist,2025-06-03,article,1.0,2025-05-10,HbA1c,5.8,Negative
4,2,F,47,North,2024-12-20,103.0,2025-06-10,Y,Therapist,2025-06-05,message,3.0,2025-05-12,LDL,180,Positive
5,3,M,55,South,2025-02-02,104.0,2025-06-15,Y,GP,2025-06-02,login,1.0,2025-05-18,BP,120/80,Negative
6,3,M,55,South,2025-02-02,104.0,2025-06-15,Y,GP,2025-06-04,video,2.0,2025-05-18,BP,120/80,Negative
7,3,M,55,South,2025-02-02,105.0,2025-07-02,Y,Specialist,2025-06-02,login,1.0,2025-05-18,BP,120/80,Negative
8,3,M,55,South,2025-02-02,105.0,2025-07-02,Y,Specialist,2025-06-04,video,2.0,2025-05-18,BP,120/80,Negative
9,4,F,29,West,2025-01-10,106.0,2025-06-03,Y,Therapist,2025-06-07,login,4.0,2025-05-22,Glucose,95,Negative


In [193]:
# Dropping null values
df_f = df_f.dropna()
df_f

Unnamed: 0,patient_id,gender,age,region,signup_date,appointment_id,scheduled_date,attended_flag,provider_type,date,action_type,action_count,lab_date,test_type,test_value,diagnosis_flag
0,1,M,34,East,2025-01-15,101.0,2025-06-01,Y,GP,2025-06-01,login,2.0,2025-05-10,HbA1c,5.8,Negative
1,1,M,34,East,2025-01-15,101.0,2025-06-01,Y,GP,2025-06-03,article,1.0,2025-05-10,HbA1c,5.8,Negative
2,1,M,34,East,2025-01-15,102.0,2025-07-14,N,Specialist,2025-06-01,login,2.0,2025-05-10,HbA1c,5.8,Negative
3,1,M,34,East,2025-01-15,102.0,2025-07-14,N,Specialist,2025-06-03,article,1.0,2025-05-10,HbA1c,5.8,Negative
4,2,F,47,North,2024-12-20,103.0,2025-06-10,Y,Therapist,2025-06-05,message,3.0,2025-05-12,LDL,180,Positive
5,3,M,55,South,2025-02-02,104.0,2025-06-15,Y,GP,2025-06-02,login,1.0,2025-05-18,BP,120/80,Negative
6,3,M,55,South,2025-02-02,104.0,2025-06-15,Y,GP,2025-06-04,video,2.0,2025-05-18,BP,120/80,Negative
7,3,M,55,South,2025-02-02,105.0,2025-07-02,Y,Specialist,2025-06-02,login,1.0,2025-05-18,BP,120/80,Negative
8,3,M,55,South,2025-02-02,105.0,2025-07-02,Y,Specialist,2025-06-04,video,2.0,2025-05-18,BP,120/80,Negative
9,4,F,29,West,2025-01-10,106.0,2025-06-03,Y,Therapist,2025-06-07,login,4.0,2025-05-22,Glucose,95,Negative


In [194]:
df_f.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14 entries, 0 to 13
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   patient_id      14 non-null     int64  
 1   gender          14 non-null     object 
 2   age             14 non-null     int64  
 3   region          14 non-null     object 
 4   signup_date     14 non-null     object 
 5   appointment_id  14 non-null     float64
 6   scheduled_date  14 non-null     object 
 7   attended_flag   14 non-null     object 
 8   provider_type   14 non-null     object 
 9   date            14 non-null     object 
 10  action_type     14 non-null     object 
 11  action_count    14 non-null     float64
 12  lab_date        14 non-null     object 
 13  test_type       14 non-null     object 
 14  test_value      14 non-null     object 
 15  diagnosis_flag  14 non-null     object 
dtypes: float64(2), int64(2), object(12)
memory usage: 1.9+ KB


In [195]:
# --- Parse dates ---
df_f["signup_date"] = pd.to_datetime(df_f["signup_date"])
df_f["date"] = pd.to_datetime(df_f["date"])
df_f["scheduled_date"] = pd.to_datetime(df_f["scheduled_date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_f["signup_date"] = pd.to_datetime(df_f["signup_date"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_f["date"] = pd.to_datetime(df_f["date"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_f["scheduled_date"] = pd.to_datetime(df_f["scheduled_date"])


### Feature Engineering

In [196]:
df = df_f.copy()

In [197]:
# feature_eng-1: Gap between first appointment and signup_date
# compute first appointment per patient
first_app = (
    df.groupby("patient_id")["scheduled_date"]
      .min()
      .reset_index()
      .rename(columns={"scheduled_date": "first_app_date"})
)

# merge back
df = df.merge(first_app, on="patient_id", how="left")

# create new column
df["days_from_signup_to_first_app"] = (
    (df["first_app_date"] - df["signup_date"]).dt.days
)

# optional: drop helper column if you don’t need it
# df = df.drop(columns=["first_app_date"])

print(df[["patient_id", "signup_date", "first_app_date", "days_from_signup_to_first_app"]].head())


   patient_id signup_date first_app_date  days_from_signup_to_first_app
0           1  2025-01-15     2025-06-01                            137
1           1  2025-01-15     2025-06-01                            137
2           1  2025-01-15     2025-06-01                            137
3           1  2025-01-15     2025-06-01                            137
4           2  2024-12-20     2025-06-10                            172


In [198]:
# feature_eng-2: Had any action before app in the last 14 days
# difference in days
df["days_diff"] = (df["scheduled_date"] - df["date"]).dt.days

# flag: engagement happened within 7 days before appt (and before, not after)
df["had_action_last_14d"] = ((df["days_diff"] >= 0) & (df["days_diff"] <= 14)).astype(int)

print(df[["patient_id", "scheduled_date", "date", "days_diff", "had_action_last_14d"]].head())

   patient_id scheduled_date       date  days_diff  had_action_last_14d
0           1     2025-06-01 2025-06-01          0                    1
1           1     2025-06-01 2025-06-03         -2                    0
2           1     2025-07-14 2025-06-01         43                    0
3           1     2025-07-14 2025-06-03         41                    0
4           2     2025-06-10 2025-06-05          5                    1


In [199]:
# feature_eng-3: Did the patient have any positive results before appointment.
# create a flag where lab is Positive and before the appointment
df["positive_before"] = ((df["diagnosis_flag"] == "Positive") & 
                         (df["lab_date"] < df["scheduled_date"])).astype(int)

df

Unnamed: 0,patient_id,gender,age,region,signup_date,appointment_id,scheduled_date,attended_flag,provider_type,date,...,action_count,lab_date,test_type,test_value,diagnosis_flag,first_app_date,days_from_signup_to_first_app,days_diff,had_action_last_14d,positive_before
0,1,M,34,East,2025-01-15,101.0,2025-06-01,Y,GP,2025-06-01,...,2.0,2025-05-10,HbA1c,5.8,Negative,2025-06-01,137,0,1,0
1,1,M,34,East,2025-01-15,101.0,2025-06-01,Y,GP,2025-06-03,...,1.0,2025-05-10,HbA1c,5.8,Negative,2025-06-01,137,-2,0,0
2,1,M,34,East,2025-01-15,102.0,2025-07-14,N,Specialist,2025-06-01,...,2.0,2025-05-10,HbA1c,5.8,Negative,2025-06-01,137,43,0,0
3,1,M,34,East,2025-01-15,102.0,2025-07-14,N,Specialist,2025-06-03,...,1.0,2025-05-10,HbA1c,5.8,Negative,2025-06-01,137,41,0,0
4,2,F,47,North,2024-12-20,103.0,2025-06-10,Y,Therapist,2025-06-05,...,3.0,2025-05-12,LDL,180,Positive,2025-06-10,172,5,1,1
5,3,M,55,South,2025-02-02,104.0,2025-06-15,Y,GP,2025-06-02,...,1.0,2025-05-18,BP,120/80,Negative,2025-06-15,133,13,1,0
6,3,M,55,South,2025-02-02,104.0,2025-06-15,Y,GP,2025-06-04,...,2.0,2025-05-18,BP,120/80,Negative,2025-06-15,133,11,1,0
7,3,M,55,South,2025-02-02,105.0,2025-07-02,Y,Specialist,2025-06-02,...,1.0,2025-05-18,BP,120/80,Negative,2025-06-15,133,30,0,0
8,3,M,55,South,2025-02-02,105.0,2025-07-02,Y,Specialist,2025-06-04,...,2.0,2025-05-18,BP,120/80,Negative,2025-06-15,133,28,0,0
9,4,F,29,West,2025-01-10,106.0,2025-06-03,Y,Therapist,2025-06-07,...,4.0,2025-05-22,Glucose,95,Negative,2025-06-03,144,-4,0,0


In [200]:
# Extract weekday from schduled_date
# Extract weekday as an integer (0=Monday, 6=Sunday)
df['scheduled_weekday'] = df['scheduled_date'].dt.dayofweek 

In [201]:
df.columns

Index(['patient_id', 'gender', 'age', 'region', 'signup_date',
       'appointment_id', 'scheduled_date', 'attended_flag', 'provider_type',
       'date', 'action_type', 'action_count', 'lab_date', 'test_type',
       'test_value', 'diagnosis_flag', 'first_app_date',
       'days_from_signup_to_first_app', 'days_diff', 'had_action_last_14d',
       'positive_before', 'scheduled_weekday'],
      dtype='object')

In [202]:
df.drop(['patient_id','signup_date', 'appointment_id', 'date', 'action_type', 'action_count', 'lab_date', 'test_value', 'test_type', 'diagnosis_flag', 'first_app_date', 'days_diff'],
        inplace = True, axis=1)
df

Unnamed: 0,gender,age,region,scheduled_date,attended_flag,provider_type,days_from_signup_to_first_app,had_action_last_14d,positive_before,scheduled_weekday
0,M,34,East,2025-06-01,Y,GP,137,1,0,6
1,M,34,East,2025-06-01,Y,GP,137,0,0,6
2,M,34,East,2025-07-14,N,Specialist,137,0,0,0
3,M,34,East,2025-07-14,N,Specialist,137,0,0,0
4,F,47,North,2025-06-10,Y,Therapist,172,1,1,1
5,M,55,South,2025-06-15,Y,GP,133,1,0,6
6,M,55,South,2025-06-15,Y,GP,133,1,0,6
7,M,55,South,2025-07-02,Y,Specialist,133,0,0,2
8,M,55,South,2025-07-02,Y,Specialist,133,0,0,2
9,F,29,West,2025-06-03,Y,Therapist,144,0,0,1


In [203]:
# define target
# 0 if attended and 1 if missed
df["attended_flag"] = (df["attended_flag"] == "N").astype(int)
df

Unnamed: 0,gender,age,region,scheduled_date,attended_flag,provider_type,days_from_signup_to_first_app,had_action_last_14d,positive_before,scheduled_weekday
0,M,34,East,2025-06-01,0,GP,137,1,0,6
1,M,34,East,2025-06-01,0,GP,137,0,0,6
2,M,34,East,2025-07-14,1,Specialist,137,0,0,0
3,M,34,East,2025-07-14,1,Specialist,137,0,0,0
4,F,47,North,2025-06-10,0,Therapist,172,1,1,1
5,M,55,South,2025-06-15,0,GP,133,1,0,6
6,M,55,South,2025-06-15,0,GP,133,1,0,6
7,M,55,South,2025-07-02,0,Specialist,133,0,0,2
8,M,55,South,2025-07-02,0,Specialist,133,0,0,2
9,F,29,West,2025-06-03,0,Therapist,144,0,0,1


In [204]:
# sort by scheduled_date
df_sorted = df.sort_values("scheduled_date")

# choose split point (70% train, 30% test)
split_idx = int(0.7 * len(df_sorted))

# split features and target
X_train = df_sorted.iloc[:split_idx].drop(columns=["attended_flag"])
y_train = df_sorted.iloc[:split_idx]["attended_flag"]

X_test = df_sorted.iloc[split_idx:].drop(columns=["attended_flag"])
y_test = df_sorted.iloc[split_idx:]["attended_flag"]


In [205]:
X_train

Unnamed: 0,gender,age,region,scheduled_date,provider_type,days_from_signup_to_first_app,had_action_last_14d,positive_before,scheduled_weekday
0,M,34,East,2025-06-01,GP,137,1,0,6
1,M,34,East,2025-06-01,GP,137,0,0,6
9,F,29,West,2025-06-03,Therapist,144,0,0,1
4,F,47,North,2025-06-10,Therapist,172,1,1,1
5,M,55,South,2025-06-15,GP,133,1,0,6
6,M,55,South,2025-06-15,GP,133,1,0,6
11,F,38,East,2025-06-22,GP,176,1,1,6
13,M,41,West,2025-06-29,GP,158,0,0,6
12,M,63,North,2025-07-01,Therapist,137,0,1,1


In [206]:
X_test

Unnamed: 0,gender,age,region,scheduled_date,provider_type,days_from_signup_to_first_app,had_action_last_14d,positive_before,scheduled_weekday
7,M,55,South,2025-07-02,Specialist,133,0,0,2
8,M,55,South,2025-07-02,Specialist,133,0,0,2
2,M,34,East,2025-07-14,Specialist,137,0,0,0
3,M,34,East,2025-07-14,Specialist,137,0,0,0
10,F,29,West,2025-07-19,GP,144,0,0,5


In [207]:
y_train

0     0
1     0
9     0
4     0
5     0
6     0
11    0
13    0
12    1
Name: attended_flag, dtype: int64

In [208]:
y_test

7     0
8     0
2     1
3     1
10    1
Name: attended_flag, dtype: int64

In [None]:
# Define columns by type
cat_cols   = ["gender", "region", "provider_type"]
num_scale  = ["age", "days_from_signup_to_first_app", "scheduled_weekday"]  
num_pass   = ["had_action_last_14d", "positive_before"]  

In [218]:
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_scale),
        ("bin", "passthrough", num_pass),
    ],
    remainder="drop",
)


clf = Pipeline(
    steps=[
        ("prep", preprocess),
        ("model", LogisticRegression(max_iter=1000, C=1.0, class_weight="balanced", penalty='l2')),
    ]
)

In [223]:
# Fit and evaluate

clf.fit(X_train, y_train)
pred = clf.predict(X_test)

#print("Accuracy:", accuracy_score(y_test, pred))
#print(classification_report(y_test, pred, digits=3, zero_division=0))

What more can be done:
1. Can generate synthetic data to improve results
2. Can implement cross validation