In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from datetime import datetime, timedelta


In [7]:
# read the train and test data
train_df = pd.read_parquet("train.parquet")
test_df = pd.read_parquet("test.parquet")


In [8]:
train_df.groupby('Patient-Uid').max()
test_df.groupby('Patient-Uid').max()

Unnamed: 0_level_0,Date,Incident
Patient-Uid,Unnamed: 1_level_1,Unnamed: 2_level_1
a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2019-05-21,TEST_TYPE_0
a0f9e9f9-1c7c-11ec-b565-16262ee38c7f,2019-10-23,TEST_TYPE_3
a0f9ea43-1c7c-11ec-aa10-16262ee38c7f,2019-10-21,TEST_TYPE_1
a0f9ea7c-1c7c-11ec-af15-16262ee38c7f,2020-03-21,TEST_TYPE_0
a0f9eab1-1c7c-11ec-a732-16262ee38c7f,2019-11-09,SYMPTOM_TYPE_3
...,...,...
a102720c-1c7c-11ec-bd9a-16262ee38c7f,2020-03-24,DRUG_TYPE_7
a102723c-1c7c-11ec-9f80-16262ee38c7f,2019-07-06,DRUG_TYPE_8
a102726b-1c7c-11ec-bfbf-16262ee38c7f,2020-01-02,DRUG_TYPE_7
a102729b-1c7c-11ec-86ba-16262ee38c7f,2019-04-21,DRUG_TYPE_7


In [9]:
train_df.sort_values(by = ['Date'], inplace=True)
test_df.sort_values(by = ['Date'], inplace=True)

In [14]:
#converting date column to pandas datetime type
train_df['Date']=pd.to_datetime(train_df['Date'],format='%Y-%m-%d')

In [15]:
positive_set=train_df[train_df['Incident'] == 'TARGET DRUG']

# Set the current date as a reference point
current_date = positive_set.Date.max()

# Calculate the cutoff date 30 days before the current date
cutoff_date = current_date - timedelta(days=30)

# Filter the data for patients who have taken "Target Drug" within the last 30 days
positive_set = train_df[(train_df['Incident'] == 'TARGET DRUG') & (train_df['Date'] >= cutoff_date)]

In [16]:
negative_set = train_df[train_df['Incident'] != 'TARGET DRUG'].sample(frac=1)[:len(positive_set)] 

In [19]:
# Combine the positive and negative sets
data = pd.concat([positive_set, negative_set])

# Sort the data by date
data = data.sort_values(by='Date')

# Create a target variable indicating whether the patient is eligible or not
data['Eligible'] = np.where(data['Incident'] == 'TARGET DRUG', 1, 0)

In [21]:
data.reset_index(drop=True)

Unnamed: 0,Patient-Uid,Date,Incident,Eligible
0,a0e5a46c-1c7c-11ec-8e52-16262ee38c7f,2015-04-07,DRUG_TYPE_2,0
1,a0ef40ff-1c7c-11ec-892c-16262ee38c7f,2015-04-07,DRUG_TYPE_7,0
2,a0e02ef4-1c7c-11ec-8b65-16262ee38c7f,2015-04-07,DRUG_TYPE_2,0
3,a0ea2a61-1c7c-11ec-bf3c-16262ee38c7f,2015-04-08,SYMPTOM_TYPE_10,0
4,a0e9f9ad-1c7c-11ec-9419-16262ee38c7f,2015-04-09,DRUG_TYPE_11,0
...,...,...,...,...
5777,a0ec8e4d-1c7c-11ec-8282-16262ee38c7f,2020-09-03,TARGET DRUG,1
5778,a0ed9185-1c7c-11ec-8063-16262ee38c7f,2020-09-03,TARGET DRUG,1
5779,a0ef6a3b-1c7c-11ec-8aaa-16262ee38c7f,2020-09-03,TARGET DRUG,1
5780,a0ed98d5-1c7c-11ec-8624-16262ee38c7f,2020-09-03,TARGET DRUG,1


In [23]:
freq_features = data.groupby('Patient-Uid').agg({'Incident': 'count'}).reset_index()
freq_features.columns = ['Patient-Uid', 'Freq']

In [24]:
time_features = data.groupby('Patient-Uid').agg({'Date': ['min', 'max']}).reset_index()
time_features.columns = ['Patient-Uid', 'Min_Date', 'Max_Date']
time_features['Time_Diff'] = (pd.to_datetime(current_date) - time_features['Max_Date']).dt.days

In [25]:
data = pd.merge(data, freq_features, on='Patient-Uid', how='left')
data = pd.merge(data, time_features, on='Patient-Uid', how='left')

In [27]:
data.isnull().sum()
data.fillna(0,inplace=True)

In [28]:
features = ['Freq', 'Time_Diff']
target = 'Eligible'
X=data[features]
y=data[target]

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [30]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train,y_train)

# Make predictions on the validation set
y_pred = rf.predict(X_test)
f1 = f1_score(y_test,y_pred)

print(f"F1 score: {f1:.3f}")

F1 score: 0.933


In [31]:
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_lg = model.predict(X_test)
f1=f1_score(y_test,y_pred_lg)

print(f"F1 score: {f1:.3f}")

F1 score: 0.923


# Prediction on Test data

In [32]:
t_data=test_df[:100000]

In [33]:
current_date_1=test_df['Date'].max()

In [35]:
test_freq_features = t_data.groupby('Patient-Uid').agg({'Incident': 'count'}).reset_index()
test_freq_features.columns = ['Patient-Uid', 'Freq']

test_time_features = t_data.groupby('Patient-Uid').agg({'Date': ['min', 'max']}).reset_index()
test_time_features.columns = ['Patient-Uid', 'Min_Date', 'Max_Date']
test_time_features['Time_Diff'] = (pd.to_datetime('2020-08-04') - test_time_features['Max_Date']).dt.days

# Merge the features
test_data = pd.merge(t_data, test_freq_features, on='Patient-Uid', how='left')
test_data = pd.merge(t_data, test_time_features, on='Patient-Uid', how='left')

# Fill missing values with 0
test_data.fillna(0, inplace=True)

In [37]:
t_data['label'] = rf.predict(t_data[features])

KeyError: "None of [Index(['Freq', 'Time_Diff'], dtype='object')] are in the [columns]"

In [11]:
# create a positive and negative set for developing the model
positive_df = train_df.loc[train_df["Incident"] == "TARGET DRUG"].copy()
negative_df = train_df.loc[train_df["Incident"] != "TARGET DRUG"].copy()

In [13]:
negative_set = train_df[train_df['Incident'] != 'TARGET DRUG'].sample(frac=1)[:len(positive_set)] 

In [120]:
# calculate the date range for the validation set
start_date = positive_df["Date"].max()
end_date = start_date + pd.Timedelta(days=30)


In [121]:
# select patients in the future for the validation set
future_df = train_df.loc[train_df["Date"] >= start_date]


In [122]:
future_df["Target_Drug_Taken"] = future_df.groupby("Patient-Uid")["Incident"].transform(
    lambda x: "TARGET DRUG" in x.tolist()
).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_df["Target_Drug_Taken"] = future_df.groupby("Patient-Uid")["Incident"].transform(


In [123]:
# balance the data
num_positive_samples = positive_df.shape[0]
negative_df_sampled = negative_df.sample(num_positive_samples, random_state=42)
merged_df = pd.concat([positive_df, negative_df_sampled], ignore_index=True)


In [124]:
merged_df.head(5)

Unnamed: 0,Patient-Uid,Date,Incident
0,a0edb54c-1c7c-11ec-8c2b-16262ee38c7f,2017-02-22,TARGET DRUG
1,a0edb54c-1c7c-11ec-8c2b-16262ee38c7f,2017-02-23,TARGET DRUG
2,a0ef3297-1c7c-11ec-b81e-16262ee38c7f,2017-03-02,TARGET DRUG
3,a0ef3297-1c7c-11ec-b81e-16262ee38c7f,2017-03-04,TARGET DRUG
4,a0ef3297-1c7c-11ec-b81e-16262ee38c7f,2017-03-09,TARGET DRUG


In [125]:
# create a column for the number of days since the last visit for each patient
merged_df["Days_Since_Last_Visit"] = merged_df.groupby("Patient-Uid")["Date"].diff().dt.days
merged_df["Days_Since_Last_Visit"].fillna(0, inplace=True)
# create a column for the number of days until the next visit for each patient
merged_df["Days_Until_Next_Visit"] = merged_df.groupby("Patient-Uid")["Date"].diff(-1).dt.days
merged_df["Days_Until_Next_Visit"].fillna(0, inplace=True)
# create a column for the number of visits for each patient
merged_df["Num_Visits"] = merged_df.groupby("Patient-Uid")["Date"].transform("count")

# create a column for the number of visits in the last 30 days for each patient
merged_df["Visits_Last_30_Days"] = merged_df.groupby("Patient-Uid")["Date"].transform(
    lambda x: ((x.max() - x) < pd.Timedelta(days=30)).sum()
)



In [126]:
merged_df.head(5)

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days
0,a0edb54c-1c7c-11ec-8c2b-16262ee38c7f,2017-02-22,TARGET DRUG,0.0,-1.0,4,1
1,a0edb54c-1c7c-11ec-8c2b-16262ee38c7f,2017-02-23,TARGET DRUG,1.0,-1066.0,4,1
2,a0ef3297-1c7c-11ec-b81e-16262ee38c7f,2017-03-02,TARGET DRUG,0.0,-2.0,17,1
3,a0ef3297-1c7c-11ec-b81e-16262ee38c7f,2017-03-04,TARGET DRUG,2.0,-5.0,17,1
4,a0ef3297-1c7c-11ec-b81e-16262ee38c7f,2017-03-09,TARGET DRUG,5.0,-596.0,17,1


In [127]:
# split the data into training and validation sets
train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42)



In [128]:
merged_df.head(5)

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days
0,a0edb54c-1c7c-11ec-8c2b-16262ee38c7f,2017-02-22,TARGET DRUG,0.0,-1.0,4,1
1,a0edb54c-1c7c-11ec-8c2b-16262ee38c7f,2017-02-23,TARGET DRUG,1.0,-1066.0,4,1
2,a0ef3297-1c7c-11ec-b81e-16262ee38c7f,2017-03-02,TARGET DRUG,0.0,-2.0,17,1
3,a0ef3297-1c7c-11ec-b81e-16262ee38c7f,2017-03-04,TARGET DRUG,2.0,-5.0,17,1
4,a0ef3297-1c7c-11ec-b81e-16262ee38c7f,2017-03-09,TARGET DRUG,5.0,-596.0,17,1


In [129]:
features=["Days_Since_Last_Visit", "Num_Visits","Days_Until_Next_Visit","Visits_Last_30_Days"]
target=["Incident"]

In [130]:
train_df.head(5)

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days
38308,a0eaba1a-1c7c-11ec-b8e3-16262ee38c7f,2019-10-31,TARGET DRUG,37.0,-26.0,11,1
28073,a0ec0b3b-1c7c-11ec-948e-16262ee38c7f,2019-07-06,TARGET DRUG,57.0,-76.0,9,1
122708,a0e338e8-1c7c-11ec-9949-16262ee38c7f,2018-05-10,PRIMARY_DIAGNOSIS,352.0,0.0,3,1
122683,a0eb9ce1-1c7c-11ec-a283-16262ee38c7f,2019-11-13,PRIMARY_DIAGNOSIS,540.0,0.0,12,2
93006,a0ebd723-1c7c-11ec-bf90-16262ee38c7f,2020-07-30,DRUG_TYPE_2,1548.0,625.0,11,2


In [131]:
# train a decision tree classifier
dtc=DecisionTreeClassifier()
dtc.fit(train_df[features], train_df[target])

In [132]:
val_df.head(5)

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days
75493,a0efdb82-1c7c-11ec-93af-16262ee38c7f,2019-10-12,DRUG_TYPE_10,-160.0,0.0,5,1
89311,a0e56154-1c7c-11ec-8c1d-16262ee38c7f,2018-12-23,DRUG_TYPE_6,0.0,697.0,3,1
29417,a0ec8ee7-1c7c-11ec-950b-16262ee38c7f,2019-07-23,TARGET DRUG,27.0,-28.0,29,2
93986,a0eda11e-1c7c-11ec-a7e6-16262ee38c7f,2017-06-27,SYMPTOM_TYPE_2,117.0,146.0,11,1
113879,a0e4db0f-1c7c-11ec-b911-16262ee38c7f,2017-03-14,PRIMARY_DIAGNOSIS,-526.0,-526.0,10,1


In [133]:
# make predictions on training set
train_preds = dtc.predict(train_df[features])
train_acc = accuracy_score(train_df[target], train_preds)
train_f1 = f1_score(train_df[target], train_preds, average='weighted')
train_cm = confusion_matrix(train_df[target], train_preds)

print("Training accuracy:", train_acc)
print("Training F1-score:", train_f1)
print("Training confusion matrix:")
print(train_cm)

Training accuracy: 0.9240432179120021
Training F1-score: 0.9253314755039803
Training confusion matrix:
[[4692   24    0 ...    0    0    0]
 [ 211 7270    0 ...    0    0    0]
 [   6   11  336 ...    0    0    0]
 ...
 [   1    6    0 ...  103    0    0]
 [   0    0    0 ...    0    9    0]
 [   0    0    0 ...    0    0    3]]


In [134]:
# make predictions on validation set
val_df["Prediction"] = dtc.predict(val_df[features])

# evaluate model using F1-score
val_f1 = f1_score(val_df[target], val_df["Prediction"], average='weighted')
print("Validation F1-score:", val_f1)

Validation F1-score: 0.5279602218989331


In [135]:
# generate predictions for the test set
test_df["Days_Since_Last_Visit"] = test_df.groupby("Patient-Uid")["Date"].diff().dt.days
test_df["Days_Since_Last_Visit"].fillna(0, inplace=True)
test_df["Days_Until_Next_Visit"] = test_df.groupby("Patient-Uid")["Date"].diff(-1).dt.days
test_df["Days_Until_Next_Visit"].fillna(0, inplace=True)
test_df["Num_Visits"] = test_df.groupby("Patient-Uid")["Date"].transform("count")

# create a column for the number of visits in the last 30 days for each patient
test_df["Visits_Last_30_Days"] = test_df.groupby("Patient-Uid")["Date"].transform(
    lambda x: ((x.max() - x) < pd.Timedelta(days=30)).sum()
)


In [136]:
# define the features and target variable
features = ["Days_Since_Last_Visit", "Days_Until_Next_Visit","Num_Visits","Visits_Last_30_Days"]
target = "Incident"

In [137]:
# train a decision tree classifier
dtc=DecisionTreeClassifier()
dtc.fit(train_df[features], train_df[target])

In [141]:
test_df["Prediction"] = dtc.predict(test_df[features])


In [142]:
# Predict on test set
test_preds = dtc.predict_proba(test_df[features])[:, 1]
test_df['Label'] = np.where(test_preds > 0.5, 1, 0)


In [143]:
test_df.head(5)

Unnamed: 0,Patient-Uid,Date,Incident,Days_Since_Last_Visit,Days_Until_Next_Visit,Num_Visits,Visits_Last_30_Days,Label,Prediction
1133793,a101303d-1c7c-11ec-8cc8-16262ee38c7f,2015-04-07,DRUG_TYPE_9,0.0,-10.0,635,15,0,TARGET DRUG
650926,a0fe825a-1c7c-11ec-8759-16262ee38c7f,2015-04-07,DRUG_TYPE_7,0.0,-5.0,100,1,0,TARGET DRUG
443464,a0fce209-1c7c-11ec-82bf-16262ee38c7f,2015-04-07,PRIMARY_DIAGNOSIS,0.0,0.0,70,1,0,TEST_TYPE_1
763050,a0ff8b78-1c7c-11ec-aed9-16262ee38c7f,2015-04-07,DRUG_TYPE_1,0.0,0.0,287,4,0,PRIMARY_DIAGNOSIS
745903,a0ff612e-1c7c-11ec-b5bc-16262ee38c7f,2015-04-07,DRUG_TYPE_2,0.0,-5.0,54,1,0,TARGET DRUG


In [144]:
test_df["Prediction"].unique()

array(['TARGET DRUG', 'TEST_TYPE_1', 'PRIMARY_DIAGNOSIS', 'TEST_TYPE_0',
       'DRUG_TYPE_0', 'SYMPTOM_TYPE_14', 'DRUG_TYPE_6', 'SYMPTOM_TYPE_3',
       'DRUG_TYPE_1', 'DRUG_TYPE_3', 'DRUG_TYPE_7', 'DRUG_TYPE_5',
       'DRUG_TYPE_11', 'DRUG_TYPE_2', 'DRUG_TYPE_9', 'SYMPTOM_TYPE_6',
       'SYMPTOM_TYPE_2', 'TEST_TYPE_3', 'DRUG_TYPE_8', 'TEST_TYPE_2',
       'SYMPTOM_TYPE_7', 'DRUG_TYPE_12', 'SYMPTOM_TYPE_0', 'DRUG_TYPE_14',
       'SYMPTOM_TYPE_8', 'SYMPTOM_TYPE_29', 'SYMPTOM_TYPE_5',
       'DRUG_TYPE_10', 'SYMPTOM_TYPE_17', 'DRUG_TYPE_15',
       'SYMPTOM_TYPE_13', 'SYMPTOM_TYPE_11'], dtype=object)

In [145]:
# evaluate the model on the validation set
val_df["Days_Since_Last_Visit"] = val_df.groupby("Patient-Uid")["Date"].diff().dt.days
val_df["Days_Since_Last_Visit"].fillna(0, inplace=True)
val_df["Days_Until_Next_Visit"] = val_df.groupby("Patient-Uid")["Date"].diff(-1).dt.days
val_df["Days_Until_Next_Visit"].fillna(0, inplace=True)
val_df["Prediction"] = dtc.predict(val_df[features])
# Predict on test set
test_preds = dtc.predict_proba(val_df[features])[:, 1]
val_df['Label'] = np.where(test_preds > 0.5, 1, 0)


In [146]:
f1 = f1_score(test_df[target], test_df["Prediction"],average='weighted')
print("Validation F1-score:", f1)

Validation F1-score: 0.02193242740964698


In [148]:
# save the final predictions to a csv file
test_df[["Patient-Uid", "Label"]].to_csv("final_submission1.csv", index=False)

In [149]:
df2=pd.read_csv("final_submission1.csv")

In [150]:
df2.head(5)

Unnamed: 0,Patient-Uid,Label
0,a101303d-1c7c-11ec-8cc8-16262ee38c7f,0
1,a0fe825a-1c7c-11ec-8759-16262ee38c7f,0
2,a0fce209-1c7c-11ec-82bf-16262ee38c7f,0
3,a0ff8b78-1c7c-11ec-aed9-16262ee38c7f,0
4,a0ff612e-1c7c-11ec-b5bc-16262ee38c7f,0


In [151]:
df2["Label"].unique()

array([0, 1], dtype=int64)