# Prepare for review
In this notebook, we prepare the results of running patients through the Molecular Oncology Almanac for manual review, with regard to both molecular features and therapies.

In [1]:
import pandas as pd
import glob

SUPPLEMENT_PATH = 'paper/2019-Sicklick-supplement.xlsx'
# Sheets 0 and 1 are cut at row label 82 (label-based and inclusive, so rows 0-82).
# NOTE(review): rows below that are presumably not per-patient data -- confirm against the workbook.
LAST_PATIENT_ROW = 82


def read_patient_sheet(sheet_name):
    """Read one per-patient sheet of the supplement, indexed by 'Study ID' as strings.

    The header is taken from the third row of the sheet (header=2) and only
    rows labelled 0 through LAST_PATIENT_ROW are kept.
    """
    sheet = pd.read_excel(SUPPLEMENT_PATH, sheet_name=sheet_name, header=2).loc[:LAST_PATIENT_ROW, :]
    sheet = sheet.set_index('Study ID', drop=True)
    sheet.index = sheet.index.astype(str)
    return sheet


# glob.glob returns paths in arbitrary order; sort them so that the row order
# of the concatenated tables (and of every file derived from them) is reproducible.
handles = sorted(glob.glob('almanac_outputs/*/*.actionable.txt'))
actionable = pd.concat([pd.read_csv(handle, sep='\t') for handle in handles])

handles = sorted(glob.glob('formatted_variants/*.txt'))
variants = pd.concat([pd.read_csv(handle, sep='\t') for handle in handles])

supplement_0 = read_patient_sheet(0)
supplement_1 = read_patient_sheet(1)

# The third sheet is read in full and keeps its default integer index.
supplement_2 = pd.read_excel(SUPPLEMENT_PATH, sheet_name=2, header=2)

## Concatenate molecular features and Molecular Oncology Almanac results
This table will be used to create a scatter plot comparing the molecular features targeted by the study with those highlighted by our methodology. During manual review, we added `TMB-Intermediate` as a feature, because it was targeted in I-PREDICT, and renamed `High Mutational Burden` to `TMB-High`.

In [2]:
# Columns of the merged table that are removed before the table is written out.
drop_columns = [
    'tumor_f', 'total_coverage',
    'exac_af', 'exac_common', 'clinvar',
    'number_germline_mutations_in_gene',
    'validation_total_coverage', 'validation_tumor_f', 'validation_detection_power',
    'feature_display', 'preclinical_efficacy_observed',
    'tumor_sample_barcode', 'normal_sample_barcode',
]

# Keys on which each variant is paired with its Almanac result.
merge_keys = ['feature', 'feature_type', 'alteration_type', 'alteration', 'patient_id']

# Left join: every variant is kept, whether or not the Almanac reported it.
merged_features = (
    variants
    .merge(actionable, on=merge_keys, how='left')
    .drop(columns=drop_columns)
    .assign(
        # Empty placeholder column, to be filled in during manual annotation.
        ipredict_targeted='',
        # Patient ids are compared as strings further down the notebook.
        patient_id=lambda frame: frame['patient_id'].astype(str),
    )
)

merged_features.to_csv('merged-features.not_annotated.txt', sep='\t', index=False)

## Create a list of citations
We will manually review the study's citations to compare them against those cited in the Molecular Oncology Almanac. We'll characterize each citation by evidence level and record the date on which the citation became available.

In [3]:
# Blank columns appended to the citation table; they are left empty here.
annotation_columns = ['year', 'month', 'day', 'date type', 'note', 'evidence', 'evidence note', 'ids used']

citations = (
    supplement_2
    .rename(columns={'References': 'reference', 'Unnamed: 1': 'citation'})
    .assign(**dict.fromkeys(annotation_columns, ''))
)

# The index is written as the first column of this file (no index=False here).
citations.to_csv('citations.not_annotated.txt', sep='\t')

## Create a list of therapies used by the study and highlighted by the Molecular Oncology Almanac
This will be the main table used by the clinical trial figure, as we are comparing therapeutic strategies. Therapeutic combinations, designated with a `+`, were separated so that individual therapies can be compared properly between those given in the trial and those highlighted by the Molecular Oncology Almanac.

In [4]:
MATCHED_COLUMN = 'Matched Drug(s) / Agent(s) Administered'
NON_MATCHED_COLUMN = 'Drug(s) / Agent(s) Not Matched to Genomic Alteration for Patients with No Match'

columns = ['Study ID',
           MATCHED_COLUMN,
           NON_MATCHED_COLUMN,
           'References'
          ]
study_therapies = supplement_1.reset_index().loc[:, columns].set_index('Study ID')
study_therapies.index = study_therapies.index.astype(str)

# Column layout of the output table; columns not populated below stay empty.
columns = ['patient_id', 'therapy',
           'study-evidence-strongest',
           'almanac-evidence-strongest',
           'included-in-overlapping-strategy',
           'strategy',
           'study-gave-as-matched-therapy',
           'study-citations-used-for-patient',
           'comment-on-study-citation']

# Only Almanac results whose sensitive_score_bin is one of these are considered.
bins = ['Putatively Actionable',
        'Investigate Actionability - High',
        'Investigate Actionability - Low']


def split_therapies(cell):
    """Split a ', '-separated therapy cell into a list of therapy names.

    A missing or empty cell yields an empty list, so no blank therapy is
    ever produced regardless of how many of the source cells are empty.
    """
    if pd.isna(cell):
        return []
    return [therapy for therapy in str(cell).split(', ') if therapy != '']


def expand_reference_ids(raw_references):
    """Turn a reference cell such as '1, 3-5' into the string '1,3,4,5'.

    Comma-separated entries are stripped of surrounding whitespace; an entry
    containing '-' is expanded to every integer of the inclusive range.
    A missing cell yields an empty string.
    """
    if pd.isna(raw_references):
        return ''
    reference_ids = []
    for entry in str(raw_references).split(','):
        entry = entry.strip()
        if not entry:
            continue
        if '-' in entry:
            bounds = entry.split('-')
            reference_ids.extend(str(value) for value in range(int(bounds[0]), int(bounds[1]) + 1))
        else:
            # append, not extend: extending a list with a string adds one
            # element per character ('256' would become '2', '5', '6').
            reference_ids.append(entry)
    return ','.join(reference_ids)


patients_list = []
for patient in study_therapies.index:
    patient = str(patient)
    matched_list = split_therapies(study_therapies.loc[patient, MATCHED_COLUMN])
    non_matched_list = split_therapies(study_therapies.loc[patient, NON_MATCHED_COLUMN])

    # One row per therapy given in the study: 0 if it was listed as not
    # matched to a genomic alteration, 1 otherwise.
    rows = [
        {'therapy': therapy,
         'study-gave-as-matched-therapy': 0 if therapy in non_matched_list else 1}
        for therapy in sorted(matched_list + non_matched_list)
    ]

    idx = (merged_features['patient_id'].eq(patient) & merged_features['sensitive_score_bin'].isin(bins))
    patient_features = merged_features[idx]
    almanac_hits = zip(patient_features['sensitive_therapy_name'],
                       patient_features['sensitive_predictive_implication'])
    for therapy_name, evidence in almanac_hits:
        therapy = therapy_name.lower()
        matching_rows = [row for row in rows if row['therapy'] == therapy]
        if not matching_rows:
            # Therapy highlighted only by the Almanac: add a row whose
            # 'study-gave-as-matched-therapy' stays empty.
            matching_rows = [{'therapy': therapy}]
            rows.extend(matching_rows)
        # NOTE(review): a later Almanac result for the same therapy overwrites
        # the earlier one, so this holds the last evidence level seen rather
        # than a ranked "strongest" -- confirm this is intended.
        for row in matching_rows:
            row['almanac-evidence-strongest'] = evidence

    raw_references = study_therapies.loc[patient, 'References']
    if raw_references == 'Not applicable':
        citations_used = raw_references
    else:
        citations_used = expand_reference_ids(raw_references)

    for row in rows:
        row['study-citations-used-for-patient'] = citations_used
        row['patient_id'] = patient

    # dtype=object keeps the 0/1 flags as integers next to empty cells, so
    # they are written as '0'/'1' rather than '0.0'/'1.0'.
    patient_dataframe = pd.DataFrame(rows, columns=columns, dtype=object)
    patients_list.append(patient_dataframe)

therapy_comparison = pd.concat(patients_list, ignore_index=True)
therapy_comparison.to_csv('therapies.not_annotated.txt', sep='\t', index=False)

In [5]:
# Display the combined per-patient therapy table that was written to 'therapies.not_annotated.txt'.
therapy_comparison

Unnamed: 0,patient_id,therapy,study-evidence-strongest,almanac-evidence-strongest,included-in-overlapping-strategy,strategy,study-gave-as-matched-therapy,study-citations-used-for-patient,comment-on-study-citation
0,2,crizotinib,,Guideline,,,1,1,
1,5,palbociclib,,Guideline,,,1,2,
2,5,nutlin-3,,Preclinical,,,,2,
3,7,gemcitabine,,,,,0,Not applicable,
4,7,nab-paclitaxel,,,,,0,Not applicable,
...,...,...,...,...,...,...,...,...,...
290,A044,everolimus,,Clinical evidence,,,,Not applicable,
291,A045,bevacizumab,,,,,1,256,
292,A045,trametinib,,,,,1,256,
293,A045,amg 510,,Clinical trial,,,,256,
