In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from datetime import datetime, timedelta


In [3]:
# read the train and test data
train_df = pd.read_parquet("train.parquet")
test_df = pd.read_parquet("test.parquet")


In [4]:
train_df.groupby('Patient-Uid').max()
test_df.groupby('Patient-Uid').max()

Unnamed: 0_level_0,Date,Incident
Patient-Uid,Unnamed: 1_level_1,Unnamed: 2_level_1
a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2019-05-21,TEST_TYPE_0
a0f9e9f9-1c7c-11ec-b565-16262ee38c7f,2019-10-23,TEST_TYPE_3
a0f9ea43-1c7c-11ec-aa10-16262ee38c7f,2019-10-21,TEST_TYPE_1
a0f9ea7c-1c7c-11ec-af15-16262ee38c7f,2020-03-21,TEST_TYPE_0
a0f9eab1-1c7c-11ec-a732-16262ee38c7f,2019-11-09,SYMPTOM_TYPE_3
...,...,...
a102720c-1c7c-11ec-bd9a-16262ee38c7f,2020-03-24,DRUG_TYPE_7
a102723c-1c7c-11ec-9f80-16262ee38c7f,2019-07-06,DRUG_TYPE_8
a102726b-1c7c-11ec-bfbf-16262ee38c7f,2020-01-02,DRUG_TYPE_7
a102729b-1c7c-11ec-86ba-16262ee38c7f,2019-04-21,DRUG_TYPE_7


In [5]:
train_df.sort_values(by = ['Date'], inplace=True)
test_df.sort_values(by = ['Date'], inplace=True)

In [147]:
# create a positive and negative set for developing the model
positive_df = train_df.loc[train_df["Incident"] == "TARGET DRUG"].copy()
negative_df = train_df.loc[train_df["Incident"] != "TARGET DRUG"].copy()

In [174]:
# calculate the date range for the validation set
start_date = positive_df["Date"].max()
end_date = start_date + pd.Timedelta(days=30)


In [175]:
# select patients in the future for the validation set
future_df = train_df.loc[train_df["Date"] >= start_date]


In [176]:
future_df["Target_Drug_Taken"] = future_df.groupby("Patient-Uid")["Incident"].transform(
    lambda x: "TARGET DRUG" in x.tolist()
).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_df["Target_Drug_Taken"] = future_df.groupby("Patient-Uid")["Incident"].transform(


In [177]:
# balance the data
num_positive_samples = positive_df.shape[0]
negative_df_sampled = negative_df.sample(num_positive_samples, random_state=42)
merged_df = pd.concat([positive_df, negative_df_sampled], ignore_index=True)


In [178]:
merged_df.head(5)

Unnamed: 0,Patient-Uid,Date,Incident
0,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG
1,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG
2,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG
3,a0ecc615-1c7c-11ec-aa31-16262ee38c7f,2019-11-15,TARGET DRUG
4,a0ea612f-1c7c-11ec-8cf0-16262ee38c7f,2020-03-18,TARGET DRUG


In [179]:
# create a column for the number of days since the last visit for each patient
merged_df["Days_Since_Last_Visit"] = merged_df.groupby("Patient-Uid")["Date"].diff().dt.days
merged_df["Days_Since_Last_Visit"].fillna(0, inplace=True)
# create a column for the number of days until the next visit for each patient
merged_df["Days_Until_Next_Visit"] = merged_df.groupby("Patient-Uid")["Date"].diff(-1).dt.days
merged_df["Days_Until_Next_Visit"].fillna(0, inplace=True)
# create a column for the number of visits for each patient
merged_df["Num_Visits"] = merged_df.groupby("Patient-Uid")["Date"].transform("count")

# create a column for the number of visits in the last 30 days for each patient
merged_df["Visits_Last_30_Days"] = merged_df.groupby("Patient-Uid")["Date"].transform(
    lambda x: ((x.max() - x) < pd.Timedelta(days=30)).sum()
)



In [180]:
merged_df.head(5)

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days
0,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG,0.0,455.0,15,1
1,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG,0.0,-424.0,20,1
2,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG,0.0,-142.0,13,3
3,a0ecc615-1c7c-11ec-aa31-16262ee38c7f,2019-11-15,TARGET DRUG,0.0,-288.0,18,1
4,a0ea612f-1c7c-11ec-8cf0-16262ee38c7f,2020-03-18,TARGET DRUG,0.0,36.0,8,1


In [181]:
# split the data into training and validation sets
train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42)



In [182]:
merged_df.head(5)

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days
0,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG,0.0,455.0,15,1
1,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG,0.0,-424.0,20,1
2,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG,0.0,-142.0,13,3
3,a0ecc615-1c7c-11ec-aa31-16262ee38c7f,2019-11-15,TARGET DRUG,0.0,-288.0,18,1
4,a0ea612f-1c7c-11ec-8cf0-16262ee38c7f,2020-03-18,TARGET DRUG,0.0,36.0,8,1


In [183]:
features=["Days_Since_Last_Visit", "Num_Visits","Days_Until_Next_Visit","Visits_Last_30_Days"]
target=["Incident"]

In [184]:
train_df.head(5)

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days
38308,a0eafe81-1c7c-11ec-8a01-16262ee38c7f,2018-11-06,TARGET DRUG,96.0,64.0,11,1
28073,a0ecd811-1c7c-11ec-8e8c-16262ee38c7f,2019-06-01,TARGET DRUG,350.0,41.0,13,1
122708,a0e1c371-1c7c-11ec-8f81-16262ee38c7f,2019-05-11,DRUG_TYPE_1,0.0,0.0,1,1
122683,a0efd47f-1c7c-11ec-90f6-16262ee38c7f,2019-01-16,DRUG_TYPE_12,959.0,1063.0,15,3
93006,a0eb0674-1c7c-11ec-8874-16262ee38c7f,2019-03-12,DRUG_TYPE_6,286.0,507.0,9,1


In [185]:
# train a decision tree classifier
dtc=DecisionTreeClassifier()
dtc.fit(train_df[features], train_df[target])

In [186]:
val_df.head(5)

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days
75493,a0ecaea3-1c7c-11ec-97c5-16262ee38c7f,2018-06-26,DRUG_TYPE_1,-738.0,-627.0,6,2
89311,a0eac0f5-1c7c-11ec-9bc6-16262ee38c7f,2018-03-21,PRIMARY_DIAGNOSIS,-77.0,0.0,2,1
29417,a0ed5bc1-1c7c-11ec-bd99-16262ee38c7f,2020-05-22,TARGET DRUG,0.0,107.0,20,1
93986,a0e62365-1c7c-11ec-8d35-16262ee38c7f,2016-03-30,DRUG_TYPE_9,-1497.0,-1420.0,6,2
113879,a0dfcc13-1c7c-11ec-867e-16262ee38c7f,2018-05-12,DRUG_TYPE_0,0.0,0.0,1,1


In [188]:
# make predictions on training set
train_preds = dtc.predict(train_df[features])
train_acc = accuracy_score(train_df[target], train_preds)
train_f1 = f1_score(train_df[target], train_preds, average='weighted')
train_cm = confusion_matrix(train_df[target], train_preds)

print("Training accuracy:", train_acc)
print("Training F1-score:", train_f1)
print("Training confusion matrix:")
print(train_cm)

Training accuracy: 0.9195986908171235
Training F1-score: 0.9210411707927665
Training confusion matrix:
[[4605   30    0 ...    0    0    0]
 [ 220 7249    0 ...    0    0    0]
 [   7   13  312 ...    0    0    0]
 ...
 [   5    1    0 ...  122    0    0]
 [   0    2    0 ...    0   21    0]
 [   0    0    0 ...    0    0    2]]


In [189]:
# make predictions on validation set
val_df["Prediction"] = dtc.predict(val_df[features])

# evaluate model using F1-score
val_f1 = f1_score(val_df[target], val_df["Prediction"], average='weighted')
print("Validation F1-score:", val_f1)

Validation F1-score: 0.4644465613504464


In [191]:
# generate predictions for the test set
test_df["Days_Since_Last_Visit"] = test_df.groupby("Patient-Uid")["Date"].diff().dt.days
test_df["Days_Since_Last_Visit"].fillna(0, inplace=True)
test_df["Days_Until_Next_Visit"] = test_df.groupby("Patient-Uid")["Date"].diff(-1).dt.days
test_df["Days_Until_Next_Visit"].fillna(0, inplace=True)
test_df["Num_Visits"] = test_df.groupby("Patient-Uid")["Date"].transform("count")

# create a column for the number of visits in the last 30 days for each patient
test_df["Visits_Last_30_Days"] = test_df.groupby("Patient-Uid")["Date"].transform(
    lambda x: ((x.max() - x) < pd.Timedelta(days=30)).sum()
)


In [133]:
# define the features and target variable
features = ["Days_Since_Last_Visit", "Days_Until_Next_Visit","Num_Visits","Visits_Last_30_Days"]
target = "Incident"

In [134]:
# train a decision tree classifier
dtc=DecisionTreeClassifier()
dtc.fit(train_df[features], train_df[target])

In [197]:
# Predict on test set
test_preds = dtc.predict_proba(test_df[features])[:, 1]
test_df['Label'] = np.where(test_preds > 0.5, 1, 0)


In [198]:
test_df.head(5)

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days,Prediction,Label
0,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2016-12-08,SYMPTOM_TYPE_0,0.0,-678.0,55,1,0,0
1,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-10-17,DRUG_TYPE_0,678.0,320.0,55,1,0,0
2,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,DRUG_TYPE_2,-320.0,-369.0,55,1,0,0
3,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-12-05,DRUG_TYPE_1,369.0,396.0,55,1,0,0
4,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-11-04,SYMPTOM_TYPE_0,-396.0,-186.0,55,1,0,0


In [196]:
test_df["Prediction"].unique()

array([0, 1])

In [199]:
# evaluate the model on the validation set
val_df["Days_Since_Last_Visit"] = val_df.groupby("Patient-Uid")["Date"].diff().dt.days
val_df["Days_Since_Last_Visit"].fillna(0, inplace=True)
val_df["Days_Until_Next_Visit"] = val_df.groupby("Patient-Uid")["Date"].diff(-1).dt.days
val_df["Days_Until_Next_Visit"].fillna(0, inplace=True)
val_df["Prediction"] = dtc.predict(val_df[features])
# Predict on test set
test_preds = dtc.predict_proba(val_df[features])[:, 1]
val_df['Label'] = np.where(test_preds > 0.5, 1, 0)


In [200]:
f1 = f1_score(val_df[target], val_df["Prediction"],average='weighted')
print("Validation F1-score:", f1)

Validation F1-score: 0.350891805183519


In [202]:
# save the final predictions to a csv file
val_df[["Patient-Uid", "Label"]].to_csv("final_submission.csv", index=False)

In [203]:
df2=pd.read_csv("final_submission.csv")

In [204]:
df2.head(5)

Unnamed: 0,Patient-Uid,Label
0,a0ecaea3-1c7c-11ec-97c5-16262ee38c7f,0
1,a0eac0f5-1c7c-11ec-9bc6-16262ee38c7f,0
2,a0ed5bc1-1c7c-11ec-bd99-16262ee38c7f,0
3,a0e62365-1c7c-11ec-8d35-16262ee38c7f,0
4,a0dfcc13-1c7c-11ec-867e-16262ee38c7f,0
