1. Intervention Model - Done
2. Intervention Type - Done
3. Phase - Done
4. Min Age - Done
5. Max Age - Done
6. Adverse Event Data (None, Missing, Present) - Done
7. Dropout Rate - Done
8. Primary Purpose
9. Allocation - Done
10. Enrollment - Done
11. Gender - Done
12. Trial Length - Done
13. Masking Info - Done
14. Arms Group - Done

In [None]:
import json
import numpy as np
import pandas as pd

In [None]:
# Load the raw study records. The encoding is pinned so that decoding the
# JSON text does not depend on the platform's locale default.
with open("/content/data_samples.json", 'r', encoding="utf-8") as f:
  data_samples = json.load(f)

In [None]:
# Number of study records loaded from the JSON file.
len(data_samples)

5896

### Start Creating the Dataset

In [None]:
# Accumulates one feature dict per study. The extraction loop appends to this
# list, so re-run this cell before re-running the loop to avoid duplicate rows.
dataset = []

In [None]:
# NOTE(review): `count` is not referenced by any cell visible here — confirm
# whether it is still needed.
count = 0

In [None]:
# Divisors that express an age given in the named unit as a number of years.
_AGE_UNITS_PER_YEAR = {
    "year": 1,
    "month": 12,
    "week": 52,
    "day": 365,
    "hour": 365 * 24,
    "minute": 365 * 24 * 60,
}

# Intervention `type` value -> name of the dataset column that counts it.
_INTERVENTION_COUNT_COLUMNS = {
    "DRUG": "no_of_drug_interventions",
    "BEHAVIORAL": "no_of_behavioral_interventions",
    "PROCEDURE": "no_of_procedural_interventions",
    "BIOLOGICAL": "no_of_biological_interventions",
    "DEVICE": "no_of_device_interventions",
    "DIETARY_SUPPLEMENT": "no_of_dietary_interventions",
    "OTHER": "no_of_other_interventions",
    "COMBINATION_PRODUCT": "no_of_combination_product_interventions",
    "DIAGNOSTIC_TEST": "no_of_diagnostic_test_interventions",
    "GENETIC": "no_of_genetic_interventions",
    "RADIATION": "no_of_radiation_interventions",
}


def _age_in_years(age_text):
    """Convert an age string such as "18 Years" or "6 Months" to years.

    Returns np.nan when `age_text` is None. The leading token must parse as
    an int (a ValueError propagates otherwise). A value with no unit, or with
    a unit that is not in `_AGE_UNITS_PER_YEAR`, is returned as the bare
    number; ages already in years stay ints.
    Assumes the unit is the second whitespace-separated token — TODO confirm
    against the data.
    """
    if age_text is None:
        return np.nan
    tokens = age_text.split()
    value = int(tokens[0])
    unit = tokens[1].lower().rstrip("s") if len(tokens) > 1 else "year"
    per_year = _AGE_UNITS_PER_YEAR.get(unit, 1)
    return value if per_year == 1 else value / per_year


def _total_dropout_rate(study):
    """Return NOT COMPLETED / STARTED over the "Overall Study" periods.

    Returns -1 when the study has no participant-flow periods at all, and 0
    when periods exist but no participants were counted as started. The
    ratio is rounded to two decimals.
    """
    periods = study.get("resultsSection", {}).get("participantFlowModule", {}).get("periods", None)
    if periods is None:
        return -1

    started = 0
    not_completed = 0
    for period in periods:
        if "Overall Study" not in period.get("title", ""):
            continue
        for milestone in period.get("milestones", []):
            milestone_type = milestone.get("type", "")
            if milestone_type == "STARTED":
                started += sum(int(group.get("numSubjects", 0)) for group in milestone.get("achievements", []))
            elif milestone_type == "NOT COMPLETED":
                not_completed += sum(int(group.get("numSubjects", 0)) for group in milestone.get("achievements", []))

    if started > 0:
        return round(not_completed / started, 2)
    return 0


# Build one feature row per study. A study whose required fields are missing
# or malformed is reported and skipped rather than aborting the whole run.
for study in data_samples:
    study_id = None  # set before the try so a failure can name the study
    try:
        protocol = study['protocolSection']

        # Extract Identification and Phase
        study_id = protocol['identificationModule'].get('nctId', None)
        design_module = protocol['designModule']
        # Only the first listed phase is kept; an empty list counts as missing.
        phases = design_module.get('phases') or [None]
        phase = phases[0]
        if phase == "NA":
            phase = np.nan
        start_date_str = protocol['statusModule']['studyFirstSubmitDate']
        end_date_str = protocol['statusModule']['lastUpdateSubmitDate']
        sponsor_type = protocol['sponsorCollaboratorsModule']['leadSponsor']['class']
        no_of_conditions = len(protocol["conditionsModule"]['conditions'])
        try:
            no_of_locations = len(protocol['contactsLocationsModule']['locations'])
            if no_of_locations == 0:
                no_of_locations = np.nan
        except Exception:
            # Location data is optional: treat any lookup problem as missing.
            no_of_locations = np.nan

        # Extract Eligibility Data
        eligibility = protocol['eligibilityModule']
        gender = eligibility.get('sex', None)
        min_age = _age_in_years(eligibility.get('minimumAge', None))
        max_age = _age_in_years(eligibility.get('maximumAge', None))

        # Extract Design and Enrollment
        design_info = design_module.get('designInfo', {})
        allocation = design_info.get('allocation', None)
        if allocation == "NA":
            allocation = np.nan
        intervention_model = design_info.get('interventionModel', None)
        masking = design_info.get('maskingInfo', {}).get('masking', None)
        try:
            enrollment = design_module['enrollmentInfo'].get('count', None)
        except Exception:
            enrollment = np.nan
        if enrollment is None:
            # Use the same missing-value marker as the other numeric fields.
            enrollment = np.nan

        # Extract Arms Group and No of Interventions
        arms_module = protocol['armsInterventionsModule']
        arms_group = arms_module.get('armGroups', None)
        no_of_arms = np.nan if arms_group is None else len(arms_group)

        intervention_counts = dict.fromkeys(_INTERVENTION_COUNT_COLUMNS.values(), 0)
        for item in arms_module['interventions']:
            intervention_type = item.get('type', None)
            column = _INTERVENTION_COUNT_COLUMNS.get(intervention_type)
            if column is None:
                # Surface intervention types the table does not know about.
                print(f"Diff: {intervention_type}")
            else:
                intervention_counts[column] += 1

        # Extract Adverse Event Data (1 when the module is present, else 0)
        adverse_events = study.get("resultsSection", {}).get("adverseEventsModule", None)
        adverseEventDataAvailable = 1 if adverse_events is not None else 0

        # Extract participant flow data
        total_dropout_rate = _total_dropout_rate(study)

        # Data for Trial Outcomes: counts, or np.nan when absent or empty
        outcomes_module = study.get('protocolSection', {}).get('outcomesModule', {})
        primary_outcomes = outcomes_module.get('primaryOutcomes')
        no_of_primary_outcomes = len(primary_outcomes) if primary_outcomes else np.nan
        secondary_outcomes = outcomes_module.get('secondaryOutcomes')
        no_of_secondary_outcomes = len(secondary_outcomes) if secondary_outcomes else np.nan

        # Result Available
        has_result = study["hasResults"]

        # Key order fixes the column order of the resulting DataFrame / CSV.
        dataset.append({
            'study_id': study_id,
            'phase': phase,
            'sponsor_type': sponsor_type,
            'no_of_locations': no_of_locations,
            'no_of_conditions': no_of_conditions,
            'enrollment': enrollment,
            'allocation': allocation,
            'intervention_model': intervention_model,
            'masking': masking,
            'no_of_arms': no_of_arms,
            'no_of_drug_interventions': intervention_counts['no_of_drug_interventions'],
            'no_of_behavioral_interventions': intervention_counts['no_of_behavioral_interventions'],
            'no_of_procedural_interventions': intervention_counts['no_of_procedural_interventions'],
            'no_of_biological_interventions': intervention_counts['no_of_biological_interventions'],
            'no_of_device_interventions': intervention_counts['no_of_device_interventions'],
            'no_of_dietary_interventions': intervention_counts['no_of_dietary_interventions'],
            'no_of_other_interventions': intervention_counts['no_of_other_interventions'],
            'adverseEventDataAvailable': adverseEventDataAvailable,
            'no_of_combination_product_interventions': intervention_counts['no_of_combination_product_interventions'],
            'no_of_diagnostic_test_interventions': intervention_counts['no_of_diagnostic_test_interventions'],
            'no_of_genetic_interventions': intervention_counts['no_of_genetic_interventions'],
            'no_of_radiation_interventions': intervention_counts['no_of_radiation_interventions'],
            'min_age': min_age,
            'max_age': max_age,
            'gender': gender,
            'total_dropout_rate': total_dropout_rate,
            'start_date': start_date_str,
            'end_date': end_date_str,
            'no_of_primary_outcomes': no_of_primary_outcomes,
            'no_of_secondary_outcomes': no_of_secondary_outcomes,
            'has_result': has_result
        })
    except Exception as e:
        # Report which study was dropped and why, then move on to the next one.
        print(f"Skipping study {study_id}: {type(e).__name__}: {e}")

In [None]:
# Number of rows extracted; studies that raised during extraction are absent.
len(dataset)

5896

In [None]:
# Build the feature table from the list of per-study dicts.
df = pd.DataFrame(dataset)

In [None]:
# Transposed preview: features as rows, the first five studies as columns.
df.head().T

Unnamed: 0,0,1,2,3,4
study_id,NCT01868646,NCT02879383,NCT01020123,NCT01972724,NCT01969084
phase,PHASE4,,PHASE2,PHASE4,PHASE4
sponsor_type,INDUSTRY,INDUSTRY,INDUSTRY,INDUSTRY,OTHER
no_of_locations,17.0,11.0,76.0,9.0,1.0
no_of_conditions,1,2,1,1,1
enrollment,190.0,109.0,530.0,114.0,45.0
allocation,RANDOMIZED,RANDOMIZED,RANDOMIZED,RANDOMIZED,RANDOMIZED
intervention_model,PARALLEL,CROSSOVER,PARALLEL,SINGLE_GROUP,PARALLEL
masking,QUADRUPLE,DOUBLE,DOUBLE,NONE,QUADRUPLE
no_of_arms,2.0,2.0,7.0,3.0,2.0


In [None]:
# Write the feature table to CSV; index=False leaves the row index out.
df.to_csv("dataset_v1.csv", index=False)