In [9]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd
import json
import time
import re

In [10]:
# Load JSON data (replace 'data.json' with the actual file path)
with open("trials.json", "r", encoding="utf-8") as file:
    json_data = json.load(file)

In [11]:
df = pd.DataFrame([{
    "RCT_ID": item.get("RCT ID", None),
    "Title": item.get("Title", None),
    "Status": item.get("Status", None),
    "Start_Date": item.get("Start date", None),
    "End_Date": item.get("End date", None),
    "Country": ", ".join([c["Country"] if isinstance(c, dict) else str(c) for c in item.get("Countries", [])]) if "Countries" in item else None,
    "Region": ", ".join([c["Region"] if isinstance(c, dict) and isinstance(c["Region"], str) else str(c["Region"]) if isinstance(c["Region"], list) else "" for c in item.get("Countries", [])]) if "Countries" in item else None,
    "PI_Name": item.get("Primary PI", {}).get("Name", None),
    "PI_Affiliation": item.get("Primary PI", {}).get("Affiliation", None),
    "Keywords": ", ".join(item.get("Keywords", [])) if item.get("Keywords") else None,
    "Additional_Keywords": ", ".join(item.get("Additional Keywords", [])) if item.get("Additional Keywords") else None,
    "JEL_Codes": ", ".join(item.get("JEL code(s)", [])) if item.get("JEL code(s)") else None,
    "Secondary_IDs": ", ".join(item.get("Secondary IDs", [])) if item.get("Secondary IDs") else None,
    "Abstract": item.get("Abstract", None),
    "External_Links": ", ".join(item.get("External Link(s)", {}).get("Link", [])) if isinstance(item.get("External Link(s)"), dict) else None,
    "Citation": item.get("Citation", None),
    "Former_Citation": item.get("Former Citation", None),
    "Intervention": item.get("Intervention(s)", None),
    "Intervention_Hidden": item.get("Intervention (Hidden)", None),
    "Intervention_Start_Date": item.get("Intervention Start Date", None),
    "Intervention_End_Date": item.get("Intervention End Date", None),
    "Primary_Outcomes": ", ".join(item.get("Primary Outcomes (end points)", [])) if item.get("Primary Outcomes (end points)") else None,
    "Primary_Outcomes_Explanation": item.get("Primary Outcomes (explanation)", None),
    "Secondary_Outcomes": ", ".join(item.get("Secondary Outcomes (end points)", [])) if item.get("Secondary Outcomes (end points)") else None,
    "Secondary_Outcomes_Explanation": item.get("Secondary Outcomes (explanation)", None),
    "Experimental_Design": item.get("Experimental Design", None),
    "Experimental_Design_Details": item.get("Experimental Design Details", None),
    "Randomization_Method": item.get("Randomization Method", None),
    "Randomization_Unit": item.get("Randomization Unit", None),
    "Treatment_Clustered": item.get("Was the treatment clustered?", None),
    "Planned_Clusters": item.get("Sample size: planned number of clusters", None),
    "Planned_Observations": item.get("Sample size: planned number of observations", None),
    "Sample_By_Treatment_Arms": item.get("Sample size (or number of clusters) by treatment arms", None),
    "Minimum_Detectable_Effect": item.get("Minimum detectable effect size for main outcomes (accounting for sampledesign and clustering)", None),
    "Intervention_Completed": item.get("Is the intervention completed?", None),
    "Data_Collection_Complete": item.get("Data Collection Complete", None),
    "Public_Data_Available": item.get("Is public data available?", None),
    "Program_Files": item.get("Program Files", None)
} for item in json_data.values()])

# Extract multiple investigators separately
df_pis = pd.concat([
    pd.DataFrame({
        "RCT_ID": item.get("RCT ID", None),
        "PI_Name": [pi.get("Name", None) for pi in item.get("Other Primary Investigators", [])],
        "PI_Affiliation": [pi.get("Affiliation", None) for pi in item.get("Other Primary Investigators", [])]
    }) for item in json_data.values() if "Other Primary Investigators" in item
], ignore_index=True)

# Extract IRB approvals separately
df_irbs = pd.concat([
    pd.DataFrame({
        "RCT_ID": item.get("RCT ID", None),
        "IRB_Name": [irb.get("Name", None) for irb in item.get("IRBs", [])],
        "IRB_Approval_Date": [irb.get("Approval Date", None) for irb in item.get("IRBs", [])],
        "IRB_Approval_Number": [irb.get("Approval Number", None) for irb in item.get("IRBs", [])]
    }) for item in json_data.values() if "IRBs" in item
], ignore_index=True)

In [12]:
df.columns

Index(['RCT_ID', 'Title', 'Status', 'Start_Date', 'End_Date', 'Country',
       'Region', 'PI_Name', 'PI_Affiliation', 'Keywords',
       'Additional_Keywords', 'JEL_Codes', 'Secondary_IDs', 'Abstract',
       'External_Links', 'Citation', 'Former_Citation', 'Intervention',
       'Intervention_Hidden', 'Intervention_Start_Date',
       'Intervention_End_Date', 'Primary_Outcomes',
       'Primary_Outcomes_Explanation', 'Secondary_Outcomes',
       'Secondary_Outcomes_Explanation', 'Experimental_Design',
       'Experimental_Design_Details', 'Randomization_Method',
       'Randomization_Unit', 'Treatment_Clustered', 'Planned_Clusters',
       'Planned_Observations', 'Sample_By_Treatment_Arms',
       'Minimum_Detectable_Effect', 'Intervention_Completed',
       'Data_Collection_Complete', 'Public_Data_Available', 'Program_Files'],
      dtype='object')

In [13]:
df['Country'].value_counts()

Country
United States of America                                       2014
Germany                                                         711
India                                                           531
China                                                           350
United Kingdom of Great Britain and Northern Ireland            281
                                                               ... 
Azerbaijan                                                        1
India, Norway                                                     1
Germany, Denmark, Estonia, Spain, Finland, Italy, Lithuania       1
Serbia, Turkey, Ukraine                                           1
Netherlands, United States of America                             1
Name: count, Length: 498, dtype: int64

In [14]:
df['Sample_By_Treatment_Arms']

0                                                      N/A
1                                24 clusters per study arm
2        98 school campuses control, 93 school campuses...
3                   69 treatment blocks,127 control blocks
4        2821 individuals free DFS,  3224 individuals n...
                               ...                        
10051    1,900 control, 744 ESG messages, 736 E message...
10052                                                    \
10053                                                   76
10054    The number of participants for each round is a...
10055          750 students per group (treatment, control)
Name: Sample_By_Treatment_Arms, Length: 10056, dtype: object

In [17]:
# Load the dataset
DATA_FILE = "trials.csv"
df = pd.read_csv(DATA_FILE)

def extract_numeric_value(text):
    """Extracts a single numeric value from text if it contains 3 or fewer words and one number."""
    words = text.split()
    numbers = re.findall(r'\d+', text)
    
    if len(words) <= 3 and len(numbers) == 1:
        return int(numbers[0])
    return None

# Pre-fill sample_size where possible
def auto_fill_sample_sizes():
    global df
    df.loc[df["sample_size"].isna(), "sample_size"] = df["Planned_Observations"].apply(lambda x: extract_numeric_value(str(x)))
    df.to_csv(DATA_FILE, index=False)
    print("✅ Auto-filled sample sizes where applicable.")

def process_data():
    global df  # Ensure we're modifying the global dataframe
    
    if "sample_size" not in df.columns:
        df["sample_size"] = None  # Initialize column
    
    auto_fill_sample_sizes()  # Apply auto-fill before manual entry
    
    row_idx = 0  # Track the current index dynamically
    
    while row_idx < len(df):  # Loop through all rows
        df_uncoded = df[df["sample_size"].isna()].copy()  # Refresh uncoded rows
        
        if df_uncoded.empty:
            print("No uncoded rows left!")
            break

        # Select the first uncoded row dynamically
        row = df_uncoded.iloc[0]
        planned_obs = str(row["Planned_Observations"])
        
        # Display row information
        clear_output(wait=True)
        print(f"{len(df_uncoded)} uncoded rows remaining.")
        print(f"Row: RCT_ID = {row['RCT_ID']}")
        print(f"Planned Observations: {planned_obs}")
        
        # Get input for sample size
        try:
            time.sleep(0.2)  # Small delay to stabilize input rendering
            sample_size = input("Enter sample size (or NA to skip, -1 to quit): ").strip()
            
            if sample_size == "-1":
                print("Exiting.")
                break
            
            if sample_size.upper() == "NA":
                df.loc[df["RCT_ID"] == row["RCT_ID"], "sample_size"] = "NA"
            else:
                sample_size = int(sample_size)  # Ensure integer input
                df.loc[df["RCT_ID"] == row["RCT_ID"], "sample_size"] = sample_size
            
            # Save updated dataframe
            df.to_csv(DATA_FILE, index=False)
            
            print(f"Saved sample size {sample_size} for RCT_ID {row['RCT_ID']}")
            time.sleep(0.5)  # Small pause to avoid rapid display changes
            row_idx += 1  # Move to next row

        except ValueError:
            print("❌ Please enter a valid integer or 'NA'.")
            time.sleep(0.5)  # Allow user to re-enter without clearing screen

    print("✅ Processing complete!")

# Run the function
process_data()

6294 uncoded rows remaining.
Row: RCT_ID = AEARCTR-0000096
Planned Observations: 4,572 households


Enter sample size (or NA to skip, -1 to quit):  -1


Exiting.
✅ Processing complete!


In [29]:
filtered_df = pd.to_numeric(df["sample_size"], errors="coerce").dropna()
filtered_df[filtered_df > 0].describe().round()

count       3754.0
mean        4236.0
std        88483.0
min            1.0
25%          400.0
50%          900.0
75%         2000.0
max      5000000.0
Name: sample_size, dtype: float64