In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from datetime import datetime, timedelta

In [77]:
#Data collection and Data Processing
#Parquet is a Column Oriented Data file, it is a Apache Product

train_set = pd.read_parquet("train.parquet")
test_set = pd.read_parquet("test.parquet")

In [82]:
#Grouping the data by Patient ID ( so that we will get single patient's complete records together) and
#sorting the values by date (so as to read/predict if the output drug is/will be prescribed within 30 days)
#We will have apply the Grouping and Sorting mentioned above to both Test Set and Training Set

train_set.groupby('Patient-Uid')
train_set.sort_values(by=['Date'], inplace=True)
test_set.groupby('Patient-Uid')
test_set.sort_values(by=['Date'], inplace=True)

In [84]:
#Divide the training data into two sets 1. Medical records of patients who has got output Drug Prescribed (Tar_Drug_Pres_Train)
# 2. Medical records of patients who has not got output Drug Prescribed(No_Tar_Drug_Pres_Train)

Tar_Drug_Pres_Train = train_set.loc[train_set['Incident'] == 'TARGET DRUG'].copy()
No_Tar_Drug_Pres_Train = train_set.loc[train_set['Incident'] != 'TARGET DRUG'].copy()

In [85]:
# calculate the date range for the validation set
# Get the Maximum largest date in the set so as to pick the upcoming dates
# start_date = Tar_Drug_Pres_Train["Date"].max()
# end_date = start_date + pd.Timedelta(days=30)
# select patients in the future for the validation set
# future_df = train_set.loc[train_set["Date"] >= start_date]
num_positive_samples = Tar_Drug_Pres_Train.shape[0]
negative_df_sampled = No_Tar_Drug_Pres_Train.sample(num_positive_samples, random_state=42)
merged_df = pd.concat([Tar_Drug_Pres_Train, negative_df_sampled], ignore_index=True)

In [87]:
# create a column for the number of days since the last visit for each patient
print(merged_df.columns)
merged_df["Days_Since_Last_Visit"] = merged_df.groupby("Patient-Uid")["Date"].diff().dt.days
merged_df["Days_Since_Last_Visit"].fillna(0, inplace=True)

Index(['Patient-Uid', 'Date', 'Incident'], dtype='object')


In [91]:
# create a column for the number of days until the next visit for each patient
merged_df["Days_Until_Next_Visit"] = merged_df.groupby("Patient-Uid")["Date"].diff(-1).dt.days
merged_df["Days_Until_Next_Visit"].fillna(0, inplace=True)

In [92]:
# create a column for the number of visits for each patient
merged_df["Num_Visits"] = merged_df.groupby("Patient-Uid")["Date"].transform("count")

In [93]:
# print(merged_df['Days_Until_Next_Visit'].head())
# create a column for the number of visits in the last 30 days for each patient


In [94]:
merged_df["Visits_Last_30_Days"] = merged_df.groupby("Patient-Uid")["Date"].transform(
    lambda x: ((x.max() - x) < pd.Timedelta(days=30)).sum()
)

In [96]:
print(merged_df.columns)

Index(['Patient-Uid', 'Date', 'Incident', 'Days_Since_Last_Visit',
       'Days_Until_Next_Visit', 'Num_Visits', 'Visits_Last_30_Days'],
      dtype='object')


In [97]:
# #Split the data into Test and Training set( as we are having one more Training set i will consider this Training set as Validation set)
train_df, verfication_df = train_test_split(merged_df, train_size=0.8, random_state=42)

In [98]:
# #Categorizing the fields as Inputs and Output
Input=["Days_Since_Last_Visit", "Num_Visits","Days_Until_Next_Visit","Visits_Last_30_Days"]
Output=["Incident"]

In [35]:
predictor=DecisionTreeClassifier()
predictor.fit(train_df[Input], train_df[Output])
train_preds = predictor.predict(train_df[Input])

train_acc = accuracy_score(train_df[Output], train_preds)
train_f1 = f1_score(train_df[Output], train_preds, average='weighted')
train_cm = confusion_matrix(train_df[Output], train_preds)
print("Training accuracy:", train_acc)
print("Training F1-score:", train_f1)
print("Training confusion matrix:")

Training accuracy: 0.9240432179120021
Training F1-score: 0.9253314755039803
Training confusion matrix:


In [39]:
# # make predictions on validation set
verfication_df["Prediction"] = predictor.predict(verfication_df[Input])

In [42]:
# # evaluate model using F1-score
val_f1 = f1_score(verfication_df[Output], verfication_df["Prediction"], average='weighted')
print("Validation F1-score:", val_f1)

Validation F1-score: 0.527848120705207


In [44]:
verfication_df.head()

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days,Prediction
75493,a0efdb82-1c7c-11ec-93af-16262ee38c7f,2019-10-12,DRUG_TYPE_10,-160.0,0.0,5,1,DRUG_TYPE_2
89311,a0e56154-1c7c-11ec-8c1d-16262ee38c7f,2018-12-23,DRUG_TYPE_6,0.0,697.0,3,1,DRUG_TYPE_12
29417,a0ec8ee7-1c7c-11ec-950b-16262ee38c7f,2019-07-23,TARGET DRUG,27.0,-28.0,29,2,TARGET DRUG
93986,a0eda11e-1c7c-11ec-a7e6-16262ee38c7f,2017-06-27,SYMPTOM_TYPE_2,117.0,146.0,11,1,DRUG_TYPE_0
113879,a0e4db0f-1c7c-11ec-b911-16262ee38c7f,2017-03-14,PRIMARY_DIAGNOSIS,-526.0,-526.0,10,1,DRUG_TYPE_0


In [46]:
verfication_df[["Patient-Uid", "Incident", "Prediction"]].to_csv("submission1.csv", index=False)

#Predictions for test dataset:

In [50]:
test_df["Days_Since_Last_Visit"] = test_df.groupby("Patient-Uid")["Date"].diff().dt.days
test_df["Days_Since_Last_Visit"].fillna(0, inplace=True)
test_df["Days_Until_Next_Visit"] = test_df.groupby("Patient-Uid")["Date"].diff(-1).dt.days
test_df["Days_Until_Next_Visit"].fillna(0, inplace=True)
test_df["Num_Visits"] = test_df.groupby("Patient-Uid")["Date"].transform("count")

# create a column for the number of visits in the last 30 days for each patient

test_df["Visits_Last_30_Days"] = test_df.groupby("Patient-Uid")["Date"].transform(lambda x: ((x.max() - x) < pd.Timedelta(days=30)).sum())

In [51]:
# define the features and target variable as input and output

Input = ["Days_Since_Last_Visit", "Days_Until_Next_Visit","Num_Visits","Visits_Last_30_Days"]
Output = "Incident"

In [53]:
test_df.head()

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days
1133793,a101303d-1c7c-11ec-8cc8-16262ee38c7f,2015-04-07,DRUG_TYPE_9,0.0,-10.0,635,15
650926,a0fe825a-1c7c-11ec-8759-16262ee38c7f,2015-04-07,DRUG_TYPE_7,0.0,-5.0,100,1
443464,a0fce209-1c7c-11ec-82bf-16262ee38c7f,2015-04-07,PRIMARY_DIAGNOSIS,0.0,0.0,70,1
763050,a0ff8b78-1c7c-11ec-aed9-16262ee38c7f,2015-04-07,DRUG_TYPE_1,0.0,0.0,287,4
745903,a0ff612e-1c7c-11ec-b5bc-16262ee38c7f,2015-04-07,DRUG_TYPE_2,0.0,-5.0,54,1


In [54]:
# prediction

test_df["Prediction"] = predictor.predict(test_df[Input])

In [56]:
test_df.head()

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days,Prediction
1133793,a101303d-1c7c-11ec-8cc8-16262ee38c7f,2015-04-07,DRUG_TYPE_9,0.0,-10.0,635,15,TARGET DRUG
650926,a0fe825a-1c7c-11ec-8759-16262ee38c7f,2015-04-07,DRUG_TYPE_7,0.0,-5.0,100,1,TARGET DRUG
443464,a0fce209-1c7c-11ec-82bf-16262ee38c7f,2015-04-07,PRIMARY_DIAGNOSIS,0.0,0.0,70,1,TEST_TYPE_1
763050,a0ff8b78-1c7c-11ec-aed9-16262ee38c7f,2015-04-07,DRUG_TYPE_1,0.0,0.0,287,4,PRIMARY_DIAGNOSIS
745903,a0ff612e-1c7c-11ec-b5bc-16262ee38c7f,2015-04-07,DRUG_TYPE_2,0.0,-5.0,54,1,TARGET DRUG


In [57]:
final_test_data = test_df

In [58]:
final_test_data.head()

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days,Prediction
1133793,a101303d-1c7c-11ec-8cc8-16262ee38c7f,2015-04-07,DRUG_TYPE_9,0.0,-10.0,635,15,TARGET DRUG
650926,a0fe825a-1c7c-11ec-8759-16262ee38c7f,2015-04-07,DRUG_TYPE_7,0.0,-5.0,100,1,TARGET DRUG
443464,a0fce209-1c7c-11ec-82bf-16262ee38c7f,2015-04-07,PRIMARY_DIAGNOSIS,0.0,0.0,70,1,TEST_TYPE_1
763050,a0ff8b78-1c7c-11ec-aed9-16262ee38c7f,2015-04-07,DRUG_TYPE_1,0.0,0.0,287,4,PRIMARY_DIAGNOSIS
745903,a0ff612e-1c7c-11ec-b5bc-16262ee38c7f,2015-04-07,DRUG_TYPE_2,0.0,-5.0,54,1,TARGET DRUG


In [68]:
final_test_data.loc[final_test_data["Prediction"] == "TARGET DRUG", "Label"] = int(1)
final_test_data.loc[final_test_data["Prediction"] != "TARGET DRUG", "Label"] = int(0)

In [69]:
final_test_data.head()

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days,Prediction,Label
1133793,a101303d-1c7c-11ec-8cc8-16262ee38c7f,2015-04-07,DRUG_TYPE_9,0.0,-10.0,635,15,TARGET DRUG,1
650926,a0fe825a-1c7c-11ec-8759-16262ee38c7f,2015-04-07,DRUG_TYPE_7,0.0,-5.0,100,1,TARGET DRUG,1
443464,a0fce209-1c7c-11ec-82bf-16262ee38c7f,2015-04-07,PRIMARY_DIAGNOSIS,0.0,0.0,70,1,TEST_TYPE_1,0
763050,a0ff8b78-1c7c-11ec-aed9-16262ee38c7f,2015-04-07,DRUG_TYPE_1,0.0,0.0,287,4,PRIMARY_DIAGNOSIS,0
745903,a0ff612e-1c7c-11ec-b5bc-16262ee38c7f,2015-04-07,DRUG_TYPE_2,0.0,-5.0,54,1,TARGET DRUG,1


In [70]:
final_test_data[["Patient-Uid", "Label"]].to_csv("Final_submission.csv", index=False)