## Mount drive to access data files

In [1]:
# Colab-only step: attach Google Drive at /content/drive so the parquet files
# stored under that mount point can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import libraries


In [10]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np

## Data paths

In [4]:
# Directory on the mounted Drive that holds the assignment's parquet files.
# Kept in one place so the notebook can be repointed by editing a single line.
DATA_DIR = '/content/drive/MyDrive/Structured_Data_Assignment/Structured_Data_Assignment'

# Full paths to the two parquet files read by the loading cells below.
train_path = f'{DATA_DIR}/train.parquet'
test_path = f'{DATA_DIR}/test.parquet'

## Load the train data


In [16]:
# Read the training events into `data`.
# NOTE(review): the same name is rebound to the test file in the
# "Load the test data" section, so the training frame is lost from that point on.
data= pd.read_parquet(train_path)

In [17]:
# Bare expression: renders the training frame as the cell output
# (pandas shows only the leading and trailing rows of a large frame).
data

Unnamed: 0,Patient-Uid,Date,Incident
0,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2019-03-09,PRIMARY_DIAGNOSIS
1,a0dc93f2-1c7c-11ec-9cd2-16262ee38c7f,2015-05-16,PRIMARY_DIAGNOSIS
3,a0dc94c6-1c7c-11ec-a3a0-16262ee38c7f,2018-01-30,SYMPTOM_TYPE_0
4,a0dc950b-1c7c-11ec-b6ec-16262ee38c7f,2015-04-22,DRUG_TYPE_0
8,a0dc9543-1c7c-11ec-bb63-16262ee38c7f,2016-06-18,DRUG_TYPE_1
...,...,...,...
29080886,a0ee9f75-1c7c-11ec-94c7-16262ee38c7f,2018-07-06,DRUG_TYPE_6
29080897,a0ee1284-1c7c-11ec-a3d5-16262ee38c7f,2017-12-29,DRUG_TYPE_6
29080900,a0ee9b26-1c7c-11ec-8a40-16262ee38c7f,2018-10-18,DRUG_TYPE_10
29080903,a0ee1a92-1c7c-11ec-8341-16262ee38c7f,2015-09-18,DRUG_TYPE_6


In [18]:
# Print the index range, column dtypes and memory usage of the training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3220868 entries, 0 to 29080911
Data columns (total 3 columns):
 #   Column       Dtype         
---  ------       -----         
 0   Patient-Uid  object        
 1   Date         datetime64[ns]
 2   Incident     object        
dtypes: datetime64[ns](1), object(2)
memory usage: 98.3+ MB


In [33]:
# Summarise 'TARGET DRUG' activity in the frame currently bound to `data`.
#
# NOTE(review): this cell used to report its numbers as a "drop-off rate" and
# "events driving drop-off", but neither quantity measures drop-off:
#   * `dropoff_counts` is the number of distinct patients WITH a target-drug
#     record in each calendar month, and `dropoff_rate` is the share of those
#     monthly bins that contain at least one such patient;
#   * `dropoff_events` is taken from rows already filtered to 'TARGET DRUG',
#     so no other incident type can ever show up in it.
# The computations and variable names are unchanged (later cells may read
# them); only the printed labels were corrected to say what is computed.
# TODO: agree on a drop-off definition and compute it per patient.

# Keep only the rows whose incident is the target drug.
target_drug_data = data[data['Incident'] == 'TARGET DRUG']

if target_drug_data.empty:
    print("No patients have taken the 'Target Drug'.")
else:
    # Distinct patients with at least one target-drug record, per calendar month.
    dropoff_counts = target_drug_data.groupby(
        pd.Grouper(key='Date', freq='M')
    )['Patient-Uid'].nunique()

    # Fraction of the monthly bins in which at least one patient appears.
    num_months_with_dropoff = dropoff_counts[dropoff_counts > 0].count()
    dropoff_rate = num_months_with_dropoff / dropoff_counts.shape[0]

    if dropoff_rate > 0:
        print("Target-drug activity by month:")
        print(f"{dropoff_rate:.2%} of months had at least one patient on the target drug.")
    else:
        print("No month with target-drug activity.")

    # Target-drug rows belonging to patients who have more than one such row
    # (keep=False flags every occurrence of a repeated Patient-Uid).
    has_repeat_record = target_drug_data['Patient-Uid'].duplicated(keep=False)
    dropoff_events = target_drug_data[has_repeat_record]['Incident']
    print("\nTarget-drug records from patients with repeat records (Sample):")
    print(dropoff_events.head(5))

    # By construction this can only contain the single value 'TARGET DRUG'.
    dropoff_events_counts = dropoff_events.value_counts()
    if not dropoff_events_counts.empty:
        print("\nRecords from patients with repeat target-drug records:")
        for event, count in dropoff_events_counts.items():
            print(f"Event: {event}\tCount: {count}")
    else:
        print("No patient has more than one target-drug record.")

Drop-off Rate:
100.00% of patients dropped off each month.

Events driving drop-off (Sample):
3294791    TARGET DRUG
3296990    TARGET DRUG
3305387    TARGET DRUG
3309423    TARGET DRUG
3309494    TARGET DRUG
Name: Incident, dtype: object

Events driving drop-off:
Event: TARGET DRUG	Count: 66322


## Load the test data


In [34]:
# Read the test events.
# NOTE(review): this rebinds `data`, so the training frame loaded earlier is no
# longer reachable under that name; re-running earlier cells after this one
# would silently operate on the test set.
data= pd.read_parquet(test_path)

In [35]:
# Bare expression: renders the test frame as the cell output
# (pandas shows only the leading and trailing rows of a large frame).
data

Unnamed: 0,Patient-Uid,Date,Incident
0,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2016-12-08,SYMPTOM_TYPE_0
1,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-10-17,DRUG_TYPE_0
2,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,DRUG_TYPE_2
3,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-12-05,DRUG_TYPE_1
4,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-11-04,SYMPTOM_TYPE_0
...,...,...,...
1372854,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-05-11,DRUG_TYPE_13
1372856,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2018-08-22,DRUG_TYPE_2
1372857,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-02-04,DRUG_TYPE_2
1372858,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-09-25,DRUG_TYPE_8


In [36]:
# Print the index range, per-column non-null counts, dtypes and memory usage
# of the test frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1065524 entries, 0 to 1372859
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   Patient-Uid  1065524 non-null  object        
 1   Date         1065524 non-null  datetime64[ns]
 2   Incident     1065524 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 32.5+ MB


In [37]:
# Summarise 'TARGET DRUG' activity in the frame currently bound to `data`
# (the test set at this point in the notebook).
#
# NOTE(review): this cell used to report its numbers as a "drop-off rate" and
# "events driving drop-off", but neither quantity measures drop-off:
#   * `dropoff_counts` is the number of distinct patients WITH a target-drug
#     record in each calendar month, and `dropoff_rate` is the share of those
#     monthly bins that contain at least one such patient;
#   * `dropoff_events` is taken from rows already filtered to 'TARGET DRUG',
#     so no other incident type can ever show up in it.
# The computations and variable names are unchanged (later cells may read
# them); only the printed labels were corrected to say what is computed.
# TODO: agree on a drop-off definition and compute it per patient.

# Keep only the rows whose incident is the target drug.
target_drug_data = data[data['Incident'] == 'TARGET DRUG']

if target_drug_data.empty:
    print("No patients have taken the 'Target Drug'.")
else:
    # Distinct patients with at least one target-drug record, per calendar month.
    dropoff_counts = target_drug_data.groupby(
        pd.Grouper(key='Date', freq='M')
    )['Patient-Uid'].nunique()

    # Fraction of the monthly bins in which at least one patient appears.
    num_months_with_dropoff = dropoff_counts[dropoff_counts > 0].count()
    dropoff_rate = num_months_with_dropoff / dropoff_counts.shape[0]

    if dropoff_rate > 0:
        print("Target-drug activity by month:")
        print(f"{dropoff_rate:.2%} of months had at least one patient on the target drug.")
    else:
        print("No month with target-drug activity.")

    # Target-drug rows belonging to patients who have more than one such row
    # (keep=False flags every occurrence of a repeated Patient-Uid).
    has_repeat_record = target_drug_data['Patient-Uid'].duplicated(keep=False)
    dropoff_events = target_drug_data[has_repeat_record]['Incident']
    print("\nTarget-drug records from patients with repeat records (Sample):")
    print(dropoff_events.head(5))

    # By construction this can only contain the single value 'TARGET DRUG'.
    dropoff_events_counts = dropoff_events.value_counts()
    if not dropoff_events_counts.empty:
        print("\nRecords from patients with repeat target-drug records:")
        for event, count in dropoff_events_counts.items():
            print(f"Event: {event}\tCount: {count}")
    else:
        print("No patient has more than one target-drug record.")

No patients have taken the 'Target Drug'.
