In [10]:
import os
import pandas as pd
import plotly_express as px
# read the combined dataset
df = pd.read_excel('Outputs/FinalDataset.xlsx')
colors  = ['#BC5308', '#FFECD1', '#C5CAB8', '#FF7D00', '#8AA79F', '#FFB569', '#15616D', '#001524']
df.head()

Unnamed: 0.1,Unnamed: 0,NCT Number,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,...,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,Type
0,0,NCT02426125,A Study of Ramucirumab (LY3009806) Plus Doceta...,https://clinicaltrials.gov/study/NCT02426125,RANGE,COMPLETED,The main purpose of this study is to evaluate ...,YES,Urothelial Carcinoma,DRUG: Ramucirumab|DRUG: Docetaxel|DRUG: Placebo,...,15679|I4T-MC-JVDC|2014-003655-66,2015-07-13,2017-04-21,2022-07-26,2015-04-24,2019-01-25,2023-08-21,"Highlands Oncology Group, Fayetteville, Arkans...",Study Protocol|Statistical Analysis Plan,Cancer-studies
1,1,NCT04910425,PSMA-Targeted 18F-DCFPyL PET/MRI for the Detec...,https://clinicaltrials.gov/study/NCT04910425,,NOT_YET_RECRUITING,This phase II trial studies how well 18F-DCFPy...,NO,Prostate Carcinoma,DRUG: Fluorine F 18 DCFPyL|DRUG: Gadobenate Di...,...,NU 19U05|NCI-2021-05593|STU00212326|NU 19U05|P...,2023-06-17,2026-06-17,2028-07,2021-06-02,,2022-08-03,"Northwestern University, Chicago, Illinois, 60...",,Cancer-studies
2,2,NCT04116125,Omitting Biopsy of SEntinel Lymph Node With Ra...,https://clinicaltrials.gov/study/NCT04116125,OBSERB,NOT_YET_RECRUITING,"The OBSERB study is a multi-center, non-blinde...",NO,Breast Neoplasm Female|Lymphatic Metastasis,PROCEDURE: Radiotherapy,...,2019-09-023,2020-07-01,2023-06-30,2025-06-30,2019-10-04,,2019-10-04,,,Cancer-studies
3,3,NCT03566225,Pioglitazone Versus Metformin as First Treatme...,https://clinicaltrials.gov/study/NCT03566225,,COMPLETED,Participants with PCOS will be divided into tw...,NO,Pioglitazone,DRUG: Pioglitazone|DRUG: Metformin|DRUG: Clomi...,...,AinShamaU,2018-01-30,2021-02-28,2021-03-30,2018-06-25,,2021-06-02,"Ain Shams Univerisity, Cairo, Egypt",,Cancer-studies
4,4,NCT01756625,"PREMIUM, Observational Study",https://clinicaltrials.gov/study/NCT01756625,,UNKNOWN,PREMIUM is an observational pharmaco-epidemiol...,NO,First Line WT KRAS mCRC,,...,PREMIUM,2010-01,2012-03,2013-06,2012-12-27,,2012-12-27,"Institut Sainte-Catherine, Avignon, Vaucluse, ...",,Cancer-studies


In [4]:
# check the type of data that we have
df.info() #enrollment is the only numeric data that we have the rest are text based 
# The study title and the URL and acronymes are just metadata that give more information about the clinical trials that are undergoing

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161863 entries, 0 to 161862
Data columns (total 32 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Unnamed: 0                  161863 non-null  int64  
 1   NCT Number                  161863 non-null  object 
 2   Study Title                 161863 non-null  object 
 3   Study URL                   161863 non-null  object 
 4   Acronym                     48532 non-null   object 
 5   Study Status                161863 non-null  object 
 6   Brief Summary               161863 non-null  object 
 7   Study Results               161863 non-null  object 
 8   Conditions                  161857 non-null  object 
 9   Interventions               144610 non-null  object 
 10  Primary Outcome Measures    153676 non-null  object 
 11  Secondary Outcome Measures  119675 non-null  object 
 12  Other Outcome Measures      13107 non-null   object 
 13  Sponsor       

# Univariate analysis

In [26]:


# Assuming 'df' is your DataFrame
type_counts = df['Type'].value_counts().reset_index()
type_counts.columns = ['Type', 'Count']

fig = px.bar(type_counts, x='Count', y='Type', text='Count', orientation='h',color_discrete_sequence=colors, template='plotly_white',)

fig.update_traces(textposition='inside')
fig.update_layout(xaxis_title='Count', yaxis_title='Type', title='Counts of Each Type')
fig.update_layout(title_text='The distribution of each study in the dataset', title_x=0.5, title_y=0.95)
fig.show()


In [48]:
y = df['Study Results'].value_counts().reset_index()
display(y) # the study results are just yes or no

results  = df.groupby(['Study Results','Type']).size().reset_index().rename(columns = {0:'n'})
fig = px.bar(results,x ='Type',y ='n',color = 'Study Results' ,color_discrete_sequence=colors[:1]+colors[-1:], template='plotly_white')
fig.update_layout(title_text='Does the study have results', title_x=0.5, title_y=0.95)
fig.show()



Unnamed: 0,index,Study Results
0,NO,142050
1,YES,19813


In [59]:
cond = df.groupby(['Conditions','Type']).size().reset_index().rename(columns = {0:'n'}).sort_values('n',ascending=False)
cond.head(10) # the top 10 conditions that are being studied

Unnamed: 0,Conditions,Type,n
9935,Breast Cancer,Cancer-studies,4454
58527,Prostate Cancer,Cancer-studies,2603
33046,HIV Infections,HIV-studies,2092
23234,Coronary Artery Disease,Heart,1601
35684,Heart Failure,Heart,1489
12464,COVID-19,Covid 19-studies,1484
14417,Cancer,Cancer-studies,1433
21409,Colorectal Cancer,Cancer-studies,1396
12467,COVID-19,Pneumonia-studies,1380
43221,Lung Cancer,Cancer-studies,1371


# Analysis 

In [66]:
study = df.groupby(['Study Status','Type']).size().reset_index().rename(columns = {0:'n'})
fig = px.pie(study, names='Study Status', values='n', color_discrete_sequence=colors, template='plotly_white',
             hole=0.4,facet_col = 'Type')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.update_layout(title_text='What is the current status of each study?', title_x=0.5, title_y=0.95)
# Show the plot
fig.show()

In [65]:
# This is just to see the brief summary of each of the study
# select one of the study
cancer = df[df['Type']=='Cancer-studies']
cancer.head(10)

Unnamed: 0.1,Unnamed: 0,NCT Number,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,...,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,Type
0,0,NCT02426125,A Study of Ramucirumab (LY3009806) Plus Doceta...,https://clinicaltrials.gov/study/NCT02426125,RANGE,COMPLETED,The main purpose of this study is to evaluate ...,YES,Urothelial Carcinoma,DRUG: Ramucirumab|DRUG: Docetaxel|DRUG: Placebo,...,15679|I4T-MC-JVDC|2014-003655-66,2015-07-13,2017-04-21,2022-07-26,2015-04-24,2019-01-25,2023-08-21,"Highlands Oncology Group, Fayetteville, Arkans...",Study Protocol|Statistical Analysis Plan,Cancer-studies
1,1,NCT04910425,PSMA-Targeted 18F-DCFPyL PET/MRI for the Detec...,https://clinicaltrials.gov/study/NCT04910425,,NOT_YET_RECRUITING,This phase II trial studies how well 18F-DCFPy...,NO,Prostate Carcinoma,DRUG: Fluorine F 18 DCFPyL|DRUG: Gadobenate Di...,...,NU 19U05|NCI-2021-05593|STU00212326|NU 19U05|P...,2023-06-17,2026-06-17,2028-07,2021-06-02,,2022-08-03,"Northwestern University, Chicago, Illinois, 60...",,Cancer-studies
2,2,NCT04116125,Omitting Biopsy of SEntinel Lymph Node With Ra...,https://clinicaltrials.gov/study/NCT04116125,OBSERB,NOT_YET_RECRUITING,"The OBSERB study is a multi-center, non-blinde...",NO,Breast Neoplasm Female|Lymphatic Metastasis,PROCEDURE: Radiotherapy,...,2019-09-023,2020-07-01,2023-06-30,2025-06-30,2019-10-04,,2019-10-04,,,Cancer-studies
3,3,NCT03566225,Pioglitazone Versus Metformin as First Treatme...,https://clinicaltrials.gov/study/NCT03566225,,COMPLETED,Participants with PCOS will be divided into tw...,NO,Pioglitazone,DRUG: Pioglitazone|DRUG: Metformin|DRUG: Clomi...,...,AinShamaU,2018-01-30,2021-02-28,2021-03-30,2018-06-25,,2021-06-02,"Ain Shams Univerisity, Cairo, Egypt",,Cancer-studies
4,4,NCT01756625,"PREMIUM, Observational Study",https://clinicaltrials.gov/study/NCT01756625,,UNKNOWN,PREMIUM is an observational pharmaco-epidemiol...,NO,First Line WT KRAS mCRC,,...,PREMIUM,2010-01,2012-03,2013-06,2012-12-27,,2012-12-27,"Institut Sainte-Catherine, Avignon, Vaucluse, ...",,Cancer-studies
5,5,NCT03063125,Perioperative Hypogonadism in Men Undergoing R...,https://clinicaltrials.gov/study/NCT03063125,,COMPLETED,The purpose of this study is to examine the re...,YES,Bladder Cancer|Low Testosterone Levels,,...,STUDY00140348,2017-03-31,2018-04-20,2018-04-20,2017-02-24,2020-09-08,2021-08-03,"University of Kansas Medical Center, Kansas Ci...",Study Protocol and Statistical Analysis Plan|I...,Cancer-studies
6,6,NCT05158725,Comparison of Colonoscopy Adenoma Detection Yield,https://clinicaltrials.gov/study/NCT05158725,,RECRUITING,A Prospective Randomized Comparison of colonos...,NO,Adenoma Colon,DEVICE: Standard Colonoscopy|DEVICE: Discovery...,...,G-EYE/Discovery,2021-11-08,2022-05,2022-06,2021-12-15,,2021-12-15,Helios Dr. Horst Schmidt Kliniken Wiesbaden Gm...,,Cancer-studies
7,7,NCT00643825,"Prolonged Adjuvant Temozolomide vs ""Stop & Go""...",https://clinicaltrials.gov/study/NCT00643825,PATSGO,UNKNOWN,This study will test the hypothesis that prolo...,NO,Glioblastoma,DRUG: Temozolomide|DRUG: Temozolomide,...,UCL-ONCO 06-004,2008-01,2011-01,2012-01,2008-03-26,,2010-07-23,"Cliniques Universitaires Saint-Luc, Brussels, ...",,Cancer-studies
8,8,NCT00070525,Tipifarnib in Treating Young Patients With Rec...,https://clinicaltrials.gov/study/NCT00070525,,COMPLETED,This phase II trial is studying how well tipif...,NO,Childhood High-grade Cerebral Astrocytoma|Chil...,DRUG: tipifarnib,...,NCI-2012-01806|NCI-2012-01806|CDR0000334862|CO...,2003-11,2006-09,,2003-10-07,,2013-10-08,"Children's Oncology Group, Arcadia, California...",,Cancer-studies
9,9,NCT02996825,Mirvetuximab Soravtansine and Gemcitabine Hydr...,https://clinicaltrials.gov/study/NCT02996825,,ACTIVE_NOT_RECRUITING,This phase I trial studies the side effects an...,NO,Recurrent Breast Carcinoma|Recurrent Fallopian...,DRUG: Gemcitabine|DRUG: Gemcitabine Hydrochlor...,...,16294|NCI-2016-01913|16294,2017-03-22,2023-11-22,2023-11-22,2016-12-19,,2023-06-15,"City of Hope Medical Center, Duarte, Californi...",,Cancer-studies
