In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read in the data

diag_data = pd.read_csv('DiagnosisSample.csv')
treat_data = pd.read_csv('TreatmentSample.csv')

In [3]:
# Get cancer types and number of patients for each cancer type

true_diag_count = diag_data['IsCancerDiagnosis'].value_counts()
cancer_diag = diag_data[diag_data['IsCancerDiagnosis']==True]

cancer_types = cancer_diag['Diagnosis'].unique()
print('Cancer types:')
print(cancer_types[0])
print(cancer_types[1])
      
print("")

num_patients = cancer_diag['Diagnosis'].value_counts()
print("Number of patients for each cancer type:")
print("Breast cancer =",num_patients[0])
print("Colon cancer =",num_patients[1])

Cancer types:
Breast Cancer
Colon Cancer

Number of patients for each cancer type:
Breast cancer = 22
Colon cancer = 11


In [9]:
# Get days before diagnosed patients get treated for patients
# that are getting treatment. Filter them by cancer type and 
# get the average duration between diagnosis and treatment
# for cancer type.

first_treat_data = treat_data.drop_duplicates('PatientID')
date_data = pd.merge(cancer_diag,first_treat_data,on='PatientID')
date_data = date_data.drop_duplicates('PatientID')

date_data['DaysToTreatment']=pd.to_datetime(date_data['TreatmentDate'])-pd.to_datetime(date_data['DiagnosisDate'])

df = pd.DataFrame({
    'PatientID':date_data['PatientID'],
    'CancerType':date_data['Diagnosis'],
    'DaysToTreatment':date_data['DaysToTreatment']
})

df=df[['PatientID','CancerType','DaysToTreatment']]
print(df)

bc_data = date_data[date_data['Diagnosis']=='Breast Cancer']
avg_bc_days = bc_data['DaysToTreatment'].mean()

cc_data = date_data[date_data['Diagnosis']=='Colon Cancer']
avg_cc_days = cc_data['DaysToTreatment'].mean()

print("")
print("The average days before treatment for breast cancer patients is",avg_bc_days.days)
print("")
print("The average days before treatment for colon cancer patients is",avg_cc_days.days)

    PatientID     CancerType DaysToTreatment
0        2038  Breast Cancer          3 days
1        2120  Breast Cancer         15 days
2        2175  Breast Cancer          4 days
3        2407  Breast Cancer          6 days
4        2425  Breast Cancer          4 days
5        2462  Breast Cancer          4 days
6        2763  Breast Cancer          4 days
7        2770   Colon Cancer          6 days
8        3095   Colon Cancer          3 days
9        3449   Colon Cancer          4 days
11       3757   Colon Cancer          5 days
14       3948  Breast Cancer          4 days
15       4256  Breast Cancer          5 days
16       4354  Breast Cancer          5 days
17       4374  Breast Cancer          5 days
19       4692  Breast Cancer          3 days
20       5259  Breast Cancer          4 days
21       6281  Breast Cancer          4 days
22       6321  Breast Cancer          4 days
23       6837   Colon Cancer          5 days
24       6877   Colon Cancer          7 days
26       6

In [5]:
# Get patients that are not treated

patient_list = []
treated_patients = treat_data['PatientID'].unique()

for patient in diag_data['PatientID']:
    if patient in treated_patients:
        pass
    else:
        patient_list.append(patient)

print("Number of patients not treated:")
print(len(patient_list))
print("")
print("PatientID's for untreated patients:")
for patient in patient_list:
    print(patient)

Number of patients not treated:
6

PatientID's for untreated patients:
2634
5657
6840
7937
8615
8827


In [6]:
# Get number of patients that undergo a second line
# of therapy. Divide that by total number of unique
# patients to get percentage of patients that require
# a second line of treatment.

grouped_data = treat_data.groupby(by=['PatientID'])['DrugCode'].unique()

unq_patients = len(treat_data['PatientID'].unique())

second_line_count = 0
for name, group in treat_data.groupby(by=['PatientID']):
    drugs = group.DrugCode.unique()
    date_list = [list(group[group['DrugCode'] == d].TreatmentDate)[0] for d in drugs]
    if len(date_list)==2:
        if date_list[0]!=date_list[1]:
            second_line_count+=1
    elif len(date_list)==3:
        if date_list[2]!=date_list[1]:
            second_line_count+=1
    else:
        pass
    

print(second_line_count,"patients undergo a second line of treatment")
print("")
print("Percentage of patients undergoing second line of treatment is",str(round(second_line_count*100/unq_patients,2)),"%")



9 patients undergo a second line of treatment

Percentage of patients undergoing second line of treatment is 32.14 %


In [7]:
# Get average duration of treatment for each drug

treat_data.sort_values(by=['PatientID','TreatmentDate'],inplace=True)

druga_data = treat_data[treat_data['DrugCode']=='A']
initial_druga = druga_data.drop_duplicates('PatientID',keep='first')
last_druga = druga_data.drop_duplicates('PatientID',keep='last')

a_ini_last_merge = pd.merge(initial_druga,last_druga,on='PatientID')

a_ini_last_merge['TreatmentDatesDiff']=pd.to_datetime(a_ini_last_merge['TreatmentDate_y'])-pd.to_datetime(a_ini_last_merge['TreatmentDate_x'])

drugb_data = treat_data[treat_data['DrugCode']=='B']
initial_drugb = drugb_data.drop_duplicates('PatientID',keep='first')
last_drugb = drugb_data.drop_duplicates('PatientID',keep='last')

b_ini_last_merge = pd.merge(initial_drugb,last_drugb,on='PatientID')

b_ini_last_merge['TreatmentDatesDiff']=pd.to_datetime(b_ini_last_merge['TreatmentDate_y'])-pd.to_datetime(b_ini_last_merge['TreatmentDate_x'])

drugc_data = treat_data[treat_data['DrugCode']=='C']
initial_drugc = drugc_data.drop_duplicates('PatientID',keep='first')
last_drugc = drugc_data.drop_duplicates('PatientID',keep='last')

c_ini_last_merge = pd.merge(initial_drugc,last_drugc,on='PatientID')

c_ini_last_merge['TreatmentDatesDiff']=pd.to_datetime(c_ini_last_merge['TreatmentDate_y'])-pd.to_datetime(c_ini_last_merge['TreatmentDate_x'])

print("Drug A has an average treatment duration of",a_ini_last_merge['TreatmentDatesDiff'].mean().days,"days")
print("Drug B has an average treatment duration of",b_ini_last_merge['TreatmentDatesDiff'].mean().days,"days")
print("Drug C has an average treatment duration of",c_ini_last_merge['TreatmentDatesDiff'].mean().days,"days")

Drug A has an average treatment duration of 57 days
Drug B has an average treatment duration of 58 days
Drug C has an average treatment duration of 110 days
