In [2]:
import pandas as pd
import numpy as np

hint = pd.read_csv("trial_outcomes_v1.csv")
ours = pd.read_csv("ctg-studies.csv")
# obtained by mass download on having results, being phases 1-3, being interventional

In [None]:
# some data quality issues with the HINT dataset is that some terminated trials are labeled as 1,
# which indicates errors in labeling that probably transcend such glaring mistakes
# will have to find this again, it started with 0008xxx and was one of the first in one of the phase train/test (only 6, should be able to triangulate),
# but if not can probably assert this nonetheless

In [None]:
our_nct = pd.Series(ours["NCT Number"].astype(str))
hint_nct = pd.Series(hint["studyid"].astype(str))
len(hint_nct), len(our_nct)

(51291, 39355)

In [None]:
ours_only = our_nct[~our_nct.isin(hint_nct)]
hint_only = hint_nct[~hint_nct.isin(our_nct)]
both = our_nct[our_nct.isin(hint_nct)]
len(ours_only), len(hint_only), len(both)

(22894, 34547, 16461)

In [None]:
hint_only.iloc[1000]
# here we are auditing the data that is only in HINT

'NCT00087737'

In [None]:
filtered_hint = hint[~hint["trialOutcome"].isin(["Terminated, Unknown", "Terminated, Planned but never initiated", "Terminated, Other", "Completed, Outcome unknown", "Completed, Outcome unknown"])]

In [None]:
filtered_hint[filtered_hint["studyid"] == "NCT00087737"]
# looking this up, we find there are no study results associated, nor any clinical trial paper results that are successful in academia
# probably confused with some other similar trials on this drug that were successful
# so the hint label here is just wrong

Unnamed: 0,studyid,trialOutcome
1289,NCT00087737,"Completed, Positive outcome/primary endpoint(s..."


In [None]:
# I'll sample some more of these hint_only trials and check their validity
hint_only.iloc[2500]
# NCT00204672
# seems like could have been published a few years after, also results were submitted but never released
# https://link.springer.com/article/10.1007/s10620-008-0396-6
# this is also in filtered_hint_only

'NCT00204672'

In [None]:
filtered_hint_only = filtered_hint[filtered_hint["studyid"].isin(hint_only)]
"NCT00204672" in filtered_hint_only["studyid"].values, len(filtered_hint_only)

(True, 19947)

In [None]:
filtered_hint_only["studyid"][2500]
# they do have an associated study here with a positive result: https://pubmed.ncbi.nlm.nih.gov/21917759/
# NCT00147875

'NCT00147875'

In [None]:
filtered_hint_only["studyid"][5781]
# they do have an associated study here with a positive result: https://pubmed.ncbi.nlm.nih.gov/21917759/
# and this didn't even have any results submitted
# NCT00326339

'NCT00326339'

In [None]:
filtered_hint_only[filtered_hint_only["studyid"] == "NCT00326339"]

Unnamed: 0,studyid,trialOutcome
5781,NCT00326339,"Completed, Positive outcome/primary endpoint(s..."


In [None]:
to_test_for_Ruby = list(both.sample(n=25, random_state=42))
to_test_for_Ruby

['NCT02634801',
 'NCT01006655',
 'NCT00560833',
 'NCT01030965',
 'NCT03214380',
 'NCT01901302',
 'NCT02096354',
 'NCT00531960',
 'NCT02969408',
 'NCT01398956',
 'NCT01637090',
 'NCT00383435',
 'NCT01749930',
 'NCT02446418',
 'NCT01493531',
 'NCT03137407',
 'NCT01476202',
 'NCT02125877',
 'NCT01217957',
 'NCT01335932',
 'NCT01064414',
 'NCT01441596',
 'NCT01996839',
 'NCT00088465',
 'NCT01251276']

In [None]:
# from IQVIA folder
outcome2label = dict()
outcome2label[-1] = ["Terminated, Unknown", "Terminated, Planned but never initiated", "Terminated, Other", "Terminated, Lack of funding", "Terminated, Business decision - Pipeline reprioritization",
                     "Terminated, Business decision - Other", "Terminated, Business decision - Drug strategy shift", "Completed, Outcome unknown", "Completed, Outcome indeterminate"]
outcome2label[0] = ["Terminated, Poor enrollment", "Terminated, Safety/adverse effects", "Completed, Negative outcome/primary endpoint(s) not met", "Terminated, Lack of efficacy"]
outcome2label[1] = ["Completed, Early positive outcome", "Completed, Positive outcome/primary endpoint(s) met"]

mapping = {outcome: label for label, outcomes in outcome2label.items() for outcome in outcomes}

test_Ruby_answers = hint[hint["studyid"].isin(to_test_for_Ruby)]
test_Ruby_answers["trialOutcome_number"] = test_Ruby_answers["trialOutcome"].map(mapping)

test_Ruby_answers.to_csv("test_Ruby_answers.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_Ruby_answers["trialOutcome_number"] = test_Ruby_answers["trialOutcome"].map(mapping)


## Clean NCTIDs

In [None]:
"""

Ok, now we were being sloppy on some matters. Let's fix this.

"""

In [1]:
import pandas as pd
import numpy as np
import pickle

hint = pd.read_csv("trial_outcomes_v1.csv")
ours = pd.read_csv("ctg-studies.csv")

In [2]:
# 1) Read each CSV into a DataFrame
phase_i_train = pd.read_csv("phase_I_train.csv")
phase_i_test = pd.read_csv("phase_I_test.csv")
phase_i_valid = pd.read_csv("phase_I_valid.csv")

phase_ii_train = pd.read_csv("phase_II_train.csv")
phase_ii_test = pd.read_csv("phase_II_test.csv")
phase_ii_valid = pd.read_csv("phase_II_valid.csv")

phase_iii_train = pd.read_csv("phase_III_train.csv")
phase_iii_test = pd.read_csv("phase_III_test.csv")
phase_iii_valid = pd.read_csv("phase_III_valid.csv")

# 2) Extract the nctid column from each
phase_i_train_nctids = phase_i_train["nctid"]
phase_i_test_nctids = phase_i_test["nctid"]
phase_i_valid_nctids = phase_i_valid["nctid"]

phase_ii_train_nctids = phase_ii_train["nctid"]
phase_ii_test_nctids = phase_ii_test["nctid"]
phase_ii_valid_nctids = phase_ii_valid["nctid"]

phase_iii_train_nctids = phase_iii_train["nctid"]
phase_iii_test_nctids = phase_iii_test["nctid"]
phase_iii_valid_nctids = phase_iii_valid["nctid"]

# 3) Print out counts for train/test in each phase (and sum)
print("Phase I Train Count:", len(phase_i_train_nctids))
print("Phase I Test Count:", len(phase_i_test_nctids))
print("Phase I Valid Count:", len(phase_i_valid_nctids))
print("Phase I Total:", len(phase_i_train_nctids) + len(phase_i_test_nctids) + len(phase_i_valid_nctids), "\n")

print("Phase II Train Count:", len(phase_ii_train_nctids))
print("Phase II Test Count:", len(phase_ii_test_nctids))
print("Phase II Valid Count:", len(phase_ii_valid_nctids))
print("Phase II Total:", len(phase_ii_train_nctids) + len(phase_ii_test_nctids) + len(phase_ii_valid_nctids), "\n")

print("Phase III Train Count:", len(phase_iii_train_nctids))
print("Phase III Test Count:", len(phase_iii_test_nctids))
print("Phase III Valid Count:", len(phase_iii_valid_nctids))
print("Phase III Total:", len(phase_iii_train_nctids) + len(phase_iii_test_nctids) + len(phase_iii_valid_nctids), "\n")

# 4) Combine all train sets into one unique set
train_nctids_set = set(phase_i_train_nctids) | set(phase_ii_train_nctids) | set(phase_iii_train_nctids)
train_nctids_list = list(train_nctids_set)  # Convert to list if needed

# 5) Combine all test sets into one unique set
test_nctids_set = set(phase_i_test_nctids) | set(phase_ii_test_nctids) | set(phase_iii_test_nctids)
test_nctids_list = list(test_nctids_set)

valid_nctids_set = set(phase_i_valid_nctids) | set(phase_ii_valid_nctids) | set(phase_iii_valid_nctids)
valid_nctids_list = list(valid_nctids_set)

# 6) Combine train and test into a single list/Series of unique NCT IDs
all_nctids = list(train_nctids_set | test_nctids_set | valid_nctids_set)

# (Optional) Convert to pandas Series
train_nctids_series = pd.Series(train_nctids_list, name="train_nctid")
test_nctids_series = pd.Series(test_nctids_list, name="test_nctid")
all_nctids_series = pd.Series(all_nctids, name="all_nctid")

# Print final results
print(f"Total unique train NCT IDs: {len(train_nctids_list)}")
print(f"Total unique test NCT IDs: {len(test_nctids_list)}")
print(f"Total unique valid NCT IDs: {len(valid_nctids_list)}")
print(f"Total unique NCT IDs overall: {len(all_nctids)}")

Phase I Train Count: 1044
Phase I Test Count: 627
Phase I Valid Count: 117
Phase I Total: 1788 

Phase II Train Count: 4005
Phase II Test Count: 1654
Phase II Valid Count: 446
Phase II Total: 6105 

Phase III Train Count: 3094
Phase III Test Count: 1146
Phase III Valid Count: 344
Phase III Total: 4584 

Total unique train NCT IDs: 7641
Total unique test NCT IDs: 3165
Total unique valid NCT IDs: 901
Total unique NCT IDs overall: 11601


In [25]:
phase_iii_train.columns == phase_iii_valid.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [26]:
train_valid_df = pd.concat([
    phase_i_train, phase_i_valid,
    phase_ii_train, phase_ii_valid,
    phase_iii_train, phase_iii_valid
], ignore_index=True)
train_valid_df

Unnamed: 0,nctid,status,why_stop,label,phase,diseases,icdcodes,drugs,smiless,criteria
0,NCT01187615,terminated,,0,phase 1,['small cell lung carcinoma'],"[""['D02.20', 'D02.21', 'D02.22']""]",['regorafenib (bay73-4506) - sequential / cisp...,['[H][N]1([H])[C@@H]2CCCC[C@H]2[N]([H])([H])[P...,\n Inclusion Criteria:\n\n - ...
1,NCT01046487,completed,,1,phase 1,['cancer'],"[""['C05.2', 'C10.0', 'C16.0', 'C16.4', 'C17.0'...","['imatinib mesylate, cyclophosphamide (dosing ...",['CC1=NC(NC2=NC=C(S2)C(=O)NC2=C(C)C=CC=C2Cl)=C...,\n Inclusion Criteria:\n\n - ...
2,NCT01381887,completed,,1,phase 1,"['diabetes mellitus, type 2']","[""['E11.65', 'E11.9', 'E11.21', 'E11.36', 'E11...","['placebo', 'canagliflozin 300mg/placebo', 'ca...",['CN1C(=O)C=C(N2CCC[C@@H](N)C2)N(CC2=C(C=CC=C2...,\n Inclusion Criteria:\n\n - ...
3,NCT02015676,completed,,1,phase 1/phase 2,['breast cancer'],"[""['C79.81', 'D24.1', 'D24.2', 'D24.9', 'D49.3...","['trastuzumab', 'paclitaxel', 'myocet']",['[H][N]1([H])[C@@H]2CCCC[C@H]2[N]([H])([H])[P...,\n Inclusion Criteria:\n\n - ...
4,NCT01813955,terminated,\n patient recruitment insufficient\n,0,early phase 1,"['schizophrenia', 'cognitive deficits']","[""['F20.0', 'F20.1', 'F20.2', 'F20.3', 'F20.5'...",['papaverine or placebo'],['COC1=C(OC)C=C(CC2=NC=CC3=CC(OC)=C(OC)C=C23)C...,\n Inclusion Criteria:\n\n - ...
...,...,...,...,...,...,...,...,...,...,...
9045,NCT00679484,terminated,\n lack of subject recruitment\n,0,phase 3,"['chronic heart failure', 'high blood b-type (...","[""['I50.812', 'I50.22', 'I50.32', 'I50.813', '...",['olmesartan medoxomil + candesartan cilexetil...,['CCCC1=NC(=C(N1CC1=CC=C(C=C1)C1=C(C=CC=C1)C1=...,\n Inclusion Criteria:\n\n - ...
9046,NCT01057407,completed,,1,phase 3,"['chronic kidney disease', 'renal dialysis', '...","[""['I12.9', 'N18.9', 'I12.0', 'D63.1', 'N18.1'...","['asp1585', 'sevelamer hydrochloride']",['NCC=C.ClCC1CO1'],\n Inclusion Criteria:\n\n - ...
9047,NCT01126580,completed,,1,phase 3,"['diabetes mellitus, type 2']","[""['E11.65', 'E11.9', 'E11.21', 'E11.36', 'E11...","['metformin', 'ly2189265', 'placebo (oral)', '...","['CSCC[C@H](N)C(O)=O', 'CN1C(=O)C=C(N2CCC[C@@H...",\n Inclusion Criteria:\n\n - ...
9048,NCT00423813,completed,,1,phase 3,['fibromyalgia'],"[""['M79.7']""]","['placebo', 'xyrem®']",['CN1C(=O)C=C(N2CCC[C@@H](N)C2)N(CC2=C(C=CC=C2...,\n Inclusion Criteria:\n\n - ...


In [5]:
all_nctids, train_nctids, test_nctids, valid_nctids = all_nctids, train_nctids_list, test_nctids_list, valid_nctids_list
# this is all the dataset that's used for HINT

In [14]:
our_nct = pd.Series(ours["NCT Number"].astype(str))
hint_nct = pd.Series(hint["studyid"].astype(str))
len(hint_nct), len(our_nct)

(51291, 39355)

In [15]:
ours_only = list(our_nct[~our_nct.isin(hint_nct)])
hint_only = list(hint_nct[~hint_nct.isin(our_nct)])
both = list(our_nct[our_nct.isin(hint_nct)])
len(ours_only), len(hint_only), len(both)

(22894, 34547, 16461)

In [None]:
with open("ours_only.pkl", "wb") as f:
  pickle.dump(ours_only, f)

In [None]:
"NCT00531960" in both

True

In [None]:
hint[hint["studyid"] == "NCT00531960"]

# so this is an issue with the clinical trials government
# when I look into things properly, it becomes quite clear that even though it was marked as completed,
# it was actually terminated due to lack of efficacy
# https://pubmed.ncbi.nlm.nih.gov/23992877/

Unnamed: 0,studyid,trialOutcome
10810,NCT00531960,"Terminated, Lack of efficacy"


## Squeezing More out of HINT & Data Quality Statistics

In [11]:
# from IQVIA folder
outcome2label = dict()
outcome2label[-1] = ["Terminated, Unknown", "Terminated, Planned but never initiated", "Terminated, Other", "Terminated, Lack of funding", "Terminated, Business decision - Pipeline reprioritization",
                     "Terminated, Business decision - Other", "Terminated, Business decision - Drug strategy shift", "Completed, Outcome unknown", "Completed, Outcome indeterminate"]
outcome2label[0] = ["Terminated, Poor enrollment", "Terminated, Safety/adverse effects", "Completed, Negative outcome/primary endpoint(s) not met", "Terminated, Lack of efficacy"]
outcome2label[1] = ["Completed, Early positive outcome", "Completed, Positive outcome/primary endpoint(s) met"]

mapping = {outcome: label for label, outcomes in outcome2label.items() for outcome in outcomes}

In [12]:
# here, we will now try to squeeze out any data that the HINT team threw away from their dataset
hint_new = hint[~hint["studyid"].isin(all_nctids)] # just get the data that is in the IQVIA labels but not in the existing dataset
hint_new["trialOutcome_label"] = hint_new["trialOutcome"].map(mapping)
hint_new = hint_new[hint_new["trialOutcome_label"] != -1] # eliminate the -1 data to get 0 or 1 data that's usable
hint_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hint_new["trialOutcome_label"] = hint_new["trialOutcome"].map(mapping)


Unnamed: 0,studyid,trialOutcome,trialOutcome_label
2,NCT00000172,"Completed, Positive outcome/primary endpoint(s...",1
4,NCT00000174,"Completed, Negative outcome/primary endpoint(s...",0
8,NCT00000390,"Completed, Positive outcome/primary endpoint(s...",1
19,NCT00000897,"Terminated, Poor enrollment",0
20,NCT00000914,"Terminated, Poor enrollment",0
...,...,...,...
51251,NCT03711721,"Completed, Positive outcome/primary endpoint(s...",1
51254,NCT03713359,"Completed, Positive outcome/primary endpoint(s...",1
51262,NCT03716856,"Completed, Positive outcome/primary endpoint(s...",1
51265,NCT03717714,"Completed, Positive outcome/primary endpoint(s...",1


In [16]:
# we check to ensure this is truly not in our existing dataset
sample_ids = hint_new["studyid"].sample(5)
sample_ids[sample_ids.isin(all_nctids)] # it's not, as expected
# we now also check there's no overlap with ours_only ids
hint_new[hint_new["studyid"].isin(ours_only)] # again, no overlap, as expected

Unnamed: 0,studyid,trialOutcome,trialOutcome_label


In [44]:
# again, no overlap, for realsies
hint_new[hint_new["studyid"].isin(all_nctids)]

Unnamed: 0,studyid,trialOutcome,trialOutcome_label


In [17]:
len(hint_new[hint_new["trialOutcome_label"] == 1]), 15477 - len(hint_new[hint_new["trialOutcome_label"] == 1])
# there are quite a lot of positive trials here, which is interesting

(9634, 5843)

In [None]:
# executive decision at this point
# though there are some problems with the existing HINT dataset,
# the people who manually labeled this stuff did shockingly well at times,
# better than the clinicaltrials.gov dataset would indicate,
# and for the sake of ease and consistency we will not be modifying the original HINT dataset
# instead noting some errors and only augmenting with new data

In [34]:
# let's do some random sampling to see if this dataset is manually ok
# so this code is slightly wrong because I had neglected to account for the validation ids
# we will leave this code intact and then see if we can repair it later
sample_to_check = hint_new.sample(n=20, random_state=42)
sample_to_check

# NCT00082823: linked studies (first one), also first non-veeva search result, yield good results, phase 1
# NCT00371215: first non-veeva search result, phase 2
# NCT00244712: it's phase 4, but that's forgivable b/c is clinical, couched as success, but really a failure
# NCT00417729: correctly classified, another phase 4, low key forgivable b/c still clinical
# NCT01887002: checks out, also phase 4
# NCT00118768: seems to have died out, but was low key completed with no results, to me seems like unclear & should be -1, phase 2
# NCT00105300: phase 3, and correctly done
# NCT01639547: yeah, correct, low key no phase either
# NCT02228798: ew, this is observational and should have been filtered out
# NCT01701674: wait, this clearly isn't done, and this doesn't seem to be right

# NCT00112632: is uncertain, with no results update, possibly having failed, so either 0 or -1, either way, wrong
# NCT01176240: I'm baffled that this study wasn't in the ours_only results, because it's phase 3 and has results posted - goes to show our clinical trials search isn't perfect either - well-classified
# NCT00687076: again, framed as a success, and was certainly worthwhile research, but phase 4, and also not a success
# NCT01446900: correct labeling
# NCT02608060: correct labeling
# NCT01689532: it meets safety, but efficacy is conspicuously dropped, so it's actually -1
# NCT00444184: correct labeling
# NCT02267135: correct labeling
# NCT01328366: ew, observational, shouldn't be here
# NCT00322673: correct classification

Unnamed: 0,studyid,trialOutcome,trialOutcome_label
1175,NCT00082823,"Completed, Positive outcome/primary endpoint(s...",1
6749,NCT00371215,"Completed, Positive outcome/primary endpoint(s...",1
4059,NCT00244712,"Completed, Positive outcome/primary endpoint(s...",1
7914,NCT00417729,"Completed, Positive outcome/primary endpoint(s...",1
32634,NCT01887002,"Terminated, Poor enrollment",0
1982,NCT00118768,"Terminated, Safety/adverse effects",0
1704,NCT00105300,"Completed, Positive outcome/primary endpoint(s...",1
28341,NCT01639547,"Terminated, Poor enrollment",0
38014,NCT02228798,"Completed, Positive outcome/primary endpoint(s...",1
29487,NCT01701674,"Completed, Positive outcome/primary endpoint(s...",1


In [37]:
checked_sample = sample_to_check.copy()
checked_sample["ground_truth_label"] = [1, 1, 0, 1, 0, -1, 1, 0, -1, -1, -1, 1, -1, 0, 0, -1, 1, 1, -1, 0]
checked_sample.to_csv("groundtruth_hintnew_naive.csv")
checked_sample

Unnamed: 0,studyid,trialOutcome,trialOutcome_label,ground_truth_label
1175,NCT00082823,"Completed, Positive outcome/primary endpoint(s...",1,1
6749,NCT00371215,"Completed, Positive outcome/primary endpoint(s...",1,1
4059,NCT00244712,"Completed, Positive outcome/primary endpoint(s...",1,0
7914,NCT00417729,"Completed, Positive outcome/primary endpoint(s...",1,1
32634,NCT01887002,"Terminated, Poor enrollment",0,0
1982,NCT00118768,"Terminated, Safety/adverse effects",0,-1
1704,NCT00105300,"Completed, Positive outcome/primary endpoint(s...",1,1
28341,NCT01639547,"Terminated, Poor enrollment",0,0
38014,NCT02228798,"Completed, Positive outcome/primary endpoint(s...",1,-1
29487,NCT01701674,"Completed, Positive outcome/primary endpoint(s...",1,-1


In [55]:
# only one trial in validation was thrown out
checked_sample = pd.read_csv("groundtruth_hintnew_naive.csv")
checked_sample_correct = checked_sample[~checked_sample["studyid"].isin(all_nctids)].iloc[:, 1:]
checked_sample_correct.to_csv("groundtruth_hintnew_naive.csv", index=False)
# wait I didn't have to throw out validation, oh well, whatever

In [56]:
# overall data accuracy
accuracy = (checked_sample_correct['ground_truth_label'] == checked_sample_correct['trialOutcome_label']).mean()
std = np.sqrt(accuracy*(1-accuracy)/19)
# we can just simply compute binomial stdev and get the stdev 2/3 interval,
# using Gaussian approx for binomial in the last step
accuracy, std, (accuracy - std, accuracy + std)

# we use Gaussian approx for binomial
# ci_80 = accuracy - 1.282 * std, accuracy + 1.282 * std
# accuracy, std, ci_80 # it stayed 0.6 in both the 10 and 20 batches, so seems to be the true accuracy
# # tbh, we should just report stdev and

(0.5789473684210527,
 0.1132689656524148,
 (0.46567840276863787, 0.6922163340734675))

In [9]:
# we observe that many of the inaccurate trials are either observational or lack a specific phase attached to them
# so we get all the interventional clinical trial ids that have phases associated with them,
# then subtract out the set of such trials that are still recruiting,
# and then take the intersection of this set and our trials from above

interventional_phase_ids = set(pd.read_csv("ctg-studies_all_filtered.csv")["NCT Number"])
recruiting_interventional_phase_ids = set(pd.read_csv("ctg-studies_recruiting_filtered.csv")["NCT Number"])
len(interventional_phase_ids - recruiting_interventional_phase_ids), len(recruiting_interventional_phase_ids), len(interventional_phase_ids)
# our subtraction checks out

(174194, 25817, 200011)

In [23]:
interventional_phase_ids = set(pd.read_csv("ctg-studies_all_filtered.csv")["NCT Number"])
recruiting_interventional_phase_ids = set(pd.read_csv("ctg-studies_recruiting_filtered.csv")["NCT Number"])
improved_hint_new_ids = (interventional_phase_ids - recruiting_interventional_phase_ids) & set(hint_new["studyid"])
len(improved_hint_new_ids)
# not so many dropped

13072

In [36]:
# we can now evaluate on existing data
naive_sample = pd.read_csv("groundtruth_hintnew_naive.csv").iloc[:, 1:]
naive_sample_for_improved = naive_sample[naive_sample["studyid"].isin(improved_hint_new_ids)]
naive_sample_for_improved

Unnamed: 0,studyid,trialOutcome,trialOutcome_label,ground_truth_label
0,NCT00082823,"Completed, Positive outcome/primary endpoint(s...",1,1
1,NCT00371215,"Completed, Positive outcome/primary endpoint(s...",1,1
2,NCT00244712,"Completed, Positive outcome/primary endpoint(s...",1,0
3,NCT00417729,"Completed, Positive outcome/primary endpoint(s...",1,1
4,NCT01887002,"Terminated, Poor enrollment",0,0
5,NCT00118768,"Terminated, Safety/adverse effects",0,-1
6,NCT00105300,"Completed, Positive outcome/primary endpoint(s...",1,1
10,NCT00112632,"Completed, Positive outcome/primary endpoint(s...",1,-1
12,NCT00687076,"Completed, Positive outcome/primary endpoint(s...",1,-1
13,NCT01446900,"Terminated, Poor enrollment",0,0


In [None]:
# yeah, so I'm kind of baffled that the accuracy didn't change. maybe I'll check something

In [47]:
hint_new[hint_new["studyid"] == "NCT01328366"]

Unnamed: 0,studyid,trialOutcome,trialOutcome_label
22421,NCT01328366,"Completed, Positive outcome/primary endpoint(s...",1


In [52]:
hint_new_improved = hint_new[hint_new["studyid"].isin(improved_hint_new_ids)]
hint_new_improved

Unnamed: 0,studyid,trialOutcome,trialOutcome_label
2,NCT00000172,"Completed, Positive outcome/primary endpoint(s...",1
4,NCT00000174,"Completed, Negative outcome/primary endpoint(s...",0
8,NCT00000390,"Completed, Positive outcome/primary endpoint(s...",1
33,NCT00001703,"Terminated, Poor enrollment",0
35,NCT00001781,"Terminated, Safety/adverse effects",0
...,...,...,...
51190,NCT03692299,"Completed, Positive outcome/primary endpoint(s...",1
51194,NCT03692910,"Completed, Positive outcome/primary endpoint(s...",1
51254,NCT03713359,"Completed, Positive outcome/primary endpoint(s...",1
51262,NCT03716856,"Completed, Positive outcome/primary endpoint(s...",1


In [55]:
len(hint_new), len(hint_new_improved), len(hint_new) - len(hint_new_improved)

(15477, 13166, 2311)

In [57]:
hint_new_improved[hint_new_improved["studyid"] == "NCT01328366"]
# great! so we did weed out some of these trials
# I think the sampling was messed up earlier, hence this trial wasn't in the checked sample,
# but this is ok
# so qualitatively we did get rid of a few bad apples, we can run eval metrics later.
# but just assuming all the trials we eliminated were bad (which makes sense), we got 2/15 = 13% gain
# this is lowkey probably how we'll estimate, CI small enough on this that it's ok
# (0.5789473684210527, 0.1132689656524148, (0.46567840276863787, 0.6922163340734675)) was the original confidence interval
# we had 11/19
# now we just eliminated 2311/15477 trials. how update probabilities?
# the idea is that our mean is increased by eliminating the trials, 11/19*15477/13166
# but we also pretend that we've lost some of the sample, specifically 13166/15477
hint_new_quality = (0.5789473684210527, 0.1132689656524148, (0.46567840276863787, 0.6922163340734675))
new_mean = 11/19 * len(hint_new) / len(hint_new_improved)
new_sample_size = 19 * len(hint_new_improved) / len(hint_new)
new_std = np.sqrt(new_mean * (1 - new_mean) / new_sample_size)
hint_new_improved_quality = new_mean, new_std, (new_mean - new_std, new_mean + new_std)
hint_new_improved_quality

(0.680568769637903,
 0.1159749945003803,
 (0.5645937751375227, 0.7965437641382833))

In [61]:
hint_new_improved.to_csv("hint_new.csv")

### hint_old accuracy

In [35]:
# we now sample 20 to test the final hint dataset
# some weird discrepancies between final hint dataset and all_nctids, but whatever, that's on them
hint_old_sample = hint[hint["studyid"].isin(all_nctids)].sample(20, random_state=42)
hint_old_sample["trialOutcome_label"] = hint_old_sample["trialOutcome"].map(mapping)
hint_old_sample["ground_truth_label"] = [1, 1, 1, 1, 1, 0, -1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1]
hint_old_sample.to_csv("groundtruth_hintold.csv")
print(len(hint_old_sample))
hint_old_sample
# qualitatively, the HINT data is a lot easier to classify & verify than some of the additional data we're bringing in,
# which is perhaps why this data was included
# NCT02495623: positive outcome, 1
# NCT00678392: positive outcome, 1
# NCT02043756: positive outcome, 1
# NCT00383435: positive outcome, 1
# NCT00443053: positive outcome, 1
# NCT00212797: I mean, we hear no update, could put -1, but tbh benefit of the doubt for 0
# NCT01063517: there's a similar study on the same compound that was phase 1 that succeeded in Israel but not the phase 2 equivalent, so either -1 or 0, either way, it's wrong
# NCT00458302: positive outcome, 1
# NCT01104792: positive outcome, 1
# NCT03043599: not terminated, but correct on outcome, 0

# NCT00557700: negative outcome, 0
# NCT00168350: negative outcome, 0. also Jesus Christ - negative results were published 12 years after trial completion!?
# NCT00358644: is a success only because things were so abysmal, positive outcome, 1
# NCT02294474: positive outcome, wrt their goal, so 1
# NCT00440947: the study as framed is comparative and can't be successful, the real study didn't really lower that much the indicators, so 0/-1
# NCT00621725: negative outcome, 0
# NCT00727987: positive outcome, 1
# NCT00442780: no update, trialing again by same company 10+ years later on very specific patient demographic, means some theoretical underpinning but no cigar, so negative result, 0
# NCT01342016: safety, 0
# NCT00567996: positive outcome, 1

20


Unnamed: 0,studyid,trialOutcome,trialOutcome_label,ground_truth_label
41897,NCT02495623,"Completed, Positive outcome/primary endpoint(s...",1,1
14176,NCT00678392,"Completed, Positive outcome/primary endpoint(s...",1,1
35243,NCT02043756,"Completed, Positive outcome/primary endpoint(s...",1,1
7025,NCT00383435,"Completed, Positive outcome/primary endpoint(s...",1,1
8599,NCT00443053,"Completed, Positive outcome/primary endpoint(s...",1,1
3481,NCT00212797,"Completed, Negative outcome/primary endpoint(s...",0,0
16854,NCT01063517,"Completed, Negative outcome/primary endpoint(s...",0,-1
9005,NCT00458302,"Completed, Positive outcome/primary endpoint(s...",1,1
17747,NCT01104792,"Completed, Positive outcome/primary endpoint(s...",1,1
47810,NCT03043599,"Terminated, Lack of efficacy",0,0


In [36]:
# overall data accuracy
accuracy = (hint_old_sample['ground_truth_label'] == hint_old_sample['trialOutcome_label']).mean()
std = np.sqrt(accuracy*(1-accuracy)/20)
# we can just simply compute binomial stdev and get the stdev 2/3 interval,
# using Gaussian approx for binomial in the last step
accuracy, std, (accuracy - std, accuracy + std)
# main issue is that there are some seeming failures that are successes

(0.85, 0.07984359711335656, (0.7701564028866434, 0.9298435971133565))

## Ground Truth for ours_only

In [None]:
# then lastly will sample 20 from our generated dataset

In [4]:
with open("ours_only.pkl", "rb") as f:
  ours_only = pickle.load(f)

len(ours_only)

22894

In [None]:
# improvements in ground truth:
# some studies that HINT just inexplicably missed,
# or didn't have in their IQVIA internal corporate-labeled set that the researchers drew from
# majority (probably) are high-quality studies that were conducted after HINT was published

In [67]:
groundtruth_oursonly = pd.Series(ours_only).sample(15, random_state=42)
groundtruth_oursonly
# NCT00833248: failed, no stat sig results
# NCT02460692: either a success, or a -1, since both the drug and vaporized cannabis were much better than placebo but similar to each other
# NCT03375294: success
# NCT04901078: success, though some regimens had adverse effects, they found other regimens that didn't as much
# NCT05878522: seems to have failed, though not 100% confident on this
# another 3 + 3(?) more from Ruby
# then mb only another 10 to do, so we can sample only 15

Unnamed: 0,0
4850,NCT00833248
3505,NCT02460692
1131,NCT03375294
19069,NCT04901078
14860,NCT05878522
20696,NCT00418561
3334,NCT05751629
4475,NCT02650895
3806,NCT00866840
13628,NCT03707184
