In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import os
import umap
import sklearn
from collections import defaultdict
import scanpy
import anndata as ad
import deicode
import matplotlib as mpl
from rdkit import Chem
from rdkit.Chem import Draw
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import networkx as nx

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Build a participant-ID -> sample-name lookup from the tab-separated export.
all_data = pd.read_csv(
    "../data/mzmine_data_from_jasmine/15405_20240521-083236.txt",
    sep="\t",
)

# One row per participant: drop_duplicates keeps the first row seen for each
# participantid, so that row's sample_name is the one used in the lookup.
genomic_ids = (
    all_data[["sample_name", "participantid"]]
    .drop_duplicates(subset=["participantid"])
    .reset_index(drop=True)
)

# Participants missing from the export resolve to the placeholder "Unknown".
genomic_ids_dict = defaultdict(
    lambda: "Unknown",
    zip(genomic_ids["participantid"], genomic_ids["sample_name"]),
)

In [11]:
# Participant ID -> metabolomics ID lookup, taken from the metadata sheet.
metabolomics_patient_metadata_sheet = pd.read_csv("../data/mzmine_data_from_jasmine/Plusrise_Metadata_abstract.csv", sep=",")
metabolomics_patient_metadata_sheet = metabolomics_patient_metadata_sheet[["ATTRIBUTE_participantid", "ATTRIBUTE_Metabolomics_ID"]]
patient_sample_dict = dict(zip(metabolomics_patient_metadata_sheet["ATTRIBUTE_participantid"].to_list(),
                               metabolomics_patient_metadata_sheet["ATTRIBUTE_Metabolomics_ID"].to_list()))

# The quant table is split by position: the leading columns are kept as
# identifiers and every remaining column is treated as a sample.
# NOTE(review): the split position is hard-coded; confirm it against the
# header of the quant table if the export changes.
N_IDENTIFIER_COLUMNS = 13
all_metabolomic_data = pd.read_csv("../data/mzmine_rerun/rise_plus_filtered_quant_table_rerun.csv", sep=",", index_col=0)
all_metabolomic_data_identifiers = all_metabolomic_data.iloc[:, :N_IDENTIFIER_COLUMNS]

# Drop the unnamed column with a non-inplace call chained onto the slice.
# An inplace drop on an .iloc slice of another frame can raise
# SettingWithCopyWarning and makes the cell depend on prior state.
all_metabolomic_data_samples = (
    all_metabolomic_data
    .iloc[:, N_IDENTIFIER_COLUMNS:]
    .drop(columns=["Unnamed: 774"])
)

# Group label = text before the first "_" of each original column name;
# this must be computed before the columns are renamed below.
groups = np.array([item.split("_")[0] for item in all_metabolomic_data_samples.columns.to_list()])

# Rename each sample column to the third "_"-separated token of the part of
# its name before the first ".".
all_metabolomic_data_samples = all_metabolomic_data_samples.rename(columns=lambda x : x.split(".")[0].split("_")[2])

# Propensity-matched cohort table, indexed by its first column.
patient_data = pd.read_csv(
    "../output_data/matched_patients_without_replacement.csv",
    sep=",",
    index_col=0,
)

# Keep cancer patients whose match is within 0.01 absolute propensity difference.
is_cancer = patient_data["cancer"] == True
cancer_patients = patient_data[is_cancer]
is_close_match = cancer_patients["absolute_propensity_difference"] < 0.01
cancer_patients = cancer_patients[is_close_match]

# "nearest_patient" is used as a list of index labels into patient_data;
# .loc raises KeyError if any of those labels is absent.
matched_control_rows = patient_data.loc[cancer_patients["nearest_patient"]]
controls = matched_control_rows.index.to_list()

# Translate the table's index labels into sample names via genomic_ids_dict
# (cancer patients first, then controls).
cancer_patient_genomic_ids = [
    genomic_ids_dict[patient] for patient in cancer_patients.index.to_list()
]
cancer_labels = ["Cancer"] * len(cancer_patient_genomic_ids)
control_patient_genomic_ids = [genomic_ids_dict[patient] for patient in controls]
control_labels = ["Control"] * len(control_patient_genomic_ids)

# Sample name -> "Cancer"/"Control"; any other key resolves to "NAN".
all_labels = cancer_labels + control_labels
all_dict = defaultdict(
    lambda: "NAN",
    zip(cancer_patient_genomic_ids + control_patient_genomic_ids, all_labels),
)

# Sorted, de-duplicated list of every sample name in the matched cohort.
all_genomic_ids = sorted(set(cancer_patient_genomic_ids + control_patient_genomic_ids))

# One row per matched pair: cancer sample alongside its control sample.
id_dataframe = pd.DataFrame(
    {
        "cancer": cancer_patient_genomic_ids,
        "control": control_patient_genomic_ids,
    }
)

In [13]:
# Sample names used to restrict the matched pairs below
# (presumably the samples with data on hand -- confirm against the data source).
available_data = [
    "15405.X00231930", "15405.X00232011", "15405.X00232154", "15405.X00232179",
    "15405.X00232206", "15405.X00232265", "15405.X00232614", "15405.X00232646",
    "15405.X00232702", "15405.X00232718", "15405.X00232760", "15405.X00232833",
    "15405.X00232855", "15405.X00232994", "15405.X00233200", "15405.X00233370",
    "15405.X00233372", "15405.X00233574", "15405.X00233662", "15405.X00233690",
    "15405.X00233707", "15405.X00233922", "15405.X00233956",
]

# A pair survives only if both its cancer and its control sample are listed.
cancer_is_available = id_dataframe["cancer"].isin(available_data)
control_is_available = id_dataframe["control"].isin(available_data)
id_dataframe = id_dataframe[cancer_is_available & control_is_available]

# Cancer samples first, then controls, with labels in the same order.
kept_cancer_ids = id_dataframe["cancer"].to_list()
kept_control_ids = id_dataframe["control"].to_list()
final_list = kept_cancer_ids + kept_control_ids
final_labels = ["Cancer"] * len(kept_cancer_ids) + ["Control"] * len(kept_control_ids)

# Sample name -> group label; any sample outside the kept pairs maps to "NAN".
final_dict = defaultdict(lambda: "NAN", zip(final_list, final_labels))

In [None]:
# final_dict is a collections.defaultdict, which has no .to_csv() method, so
# the mapping is first turned into a two-column table (one row per sample)
# and that table is written out.
# Note: because final_dict is a defaultdict, any earlier `final_dict[key]`
# lookup of an unseen key adds that key with the label "NAN"; such keys would
# appear as extra rows here.
final_patient_table = pd.DataFrame(
    list(final_dict.items()),
    columns=["sample_name", "Group"],
)
final_patient_table.to_csv("../output_data/final_patient_dict.csv", sep=",", index=False)

defaultdict(<function __main__.<lambda>()>,
            {'15405.X00233956': 'Cancer',
             '15405.X00233370': 'Cancer',
             '15405.X00232154': 'Cancer',
             '15405.X00233662': 'Cancer',
             '15405.X00232206': 'Cancer',
             '15405.X00232855': 'Control',
             '15405.X00233922': 'Control',
             '15405.X00232833': 'Control',
             '15405.X00232614': 'Control',
             '15405.X00232646': 'Control'})

In [19]:
# Annotate the QIIME sample metadata with each sample's group label and write
# two versions: the full table and a two-column (sample_name, Group) table.
qiime_metadata_full = pd.read_csv("../qiime_workflow/data/rise_urinary_sample_metadata.txt", sep="\t")

# Use .get() rather than final_dict[name]: indexing a defaultdict inserts every
# unseen sample name into final_dict, silently growing it. .get() yields the
# same "NAN" label for unmatched samples and leaves final_dict untouched.
qiime_metadata_full["Group"] = qiime_metadata_full["sample_name"].apply(
    lambda name: final_dict.get(name, "NAN")
)
qiime_metadata_full.to_csv("../qiime_workflow/data/rise_urinary_sample_metadata_final.txt", sep="\t", index=False)

# qiime_metadata ends up bound to the two-column table, as before.
qiime_metadata = qiime_metadata_full[["sample_name", "Group"]]
qiime_metadata.to_csv("../qiime_workflow/data/rise_urinary_sample_metadata_filtered.txt", sep="\t", index=False)

In [17]:
# Display the QIIME metadata table.
# NOTE(review): this cell's execution count (17) is lower than that of the
# cell above it (19), and the saved output below shows the full column set,
# whereas the cell above rebinds qiime_metadata to just sample_name and Group.
# Re-run with Restart & Run All to refresh the saved output.
qiime_metadata

Unnamed: 0,sample_name,age_4lvl_age_group,age_4lvl_impute_age_group,age_impute,anonymized_name,b1_age,b1_b10a_overall_health_self_assessment,b1_b10b_self_worth_assessment,b1_b10c_life_quality_assessment,b1_b11a_physical_activity_impact_bladder_vigorous,...,sex_at_birth,taxon_id,title,tube_id,urineipkit_v_boolean,urinekit1_v_boolean,urinekit2_v_boolean,vaginal_parity,weight,Group
0,15405.BLANK.3.12B,not applicable,not applicable,not applicable,control sample,not applicable,not applicable,not applicable,not applicable,not applicable,...,not applicable,256318,PLUS_Rise,control sample,control sample,control sample,control sample,not applicable,not applicable,NAN
1,15405.BLANK.4.10F,not applicable,not applicable,not applicable,control sample,not applicable,not applicable,not applicable,not applicable,not applicable,...,not applicable,256318,PLUS_Rise,control sample,control sample,control sample,control sample,not applicable,not applicable,NAN
2,15405.BLANK.4.5G,not applicable,not applicable,not applicable,control sample,not applicable,not applicable,not applicable,not applicable,not applicable,...,not applicable,256318,PLUS_Rise,control sample,control sample,control sample,control sample,not applicable,not applicable,NAN
3,15405.BLANK.PLUS.2.10B,not applicable,not applicable,not applicable,control sample,not applicable,not applicable,not applicable,not applicable,not applicable,...,not applicable,256318,PLUS_Rise,control sample,control sample,control sample,control sample,not applicable,not applicable,NAN
4,15405.BLANK.PLUS.2.10F,not applicable,not applicable,not applicable,control sample,not applicable,not applicable,not applicable,not applicable,not applicable,...,not applicable,256318,PLUS_Rise,control sample,control sample,control sample,control sample,not applicable,not applicable,NAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1021,15405.X00233952,18-25,18-25,23,,23,7,0,4,4,...,1,2516875,PlusRISE,363235444,TRUE,FALSE,FALSE,0,150,NAN
1022,15405.X00233956,65+,65+,76,,76,0,0,0,6,...,1,2516875,PlusRISE,363235441,TRUE,FALSE,FALSE,0,210,Cancer
1023,15405.X00233958,65+,65+,70,,70,3,1,1,3,...,1,2516875,PlusRISE,363235440,TRUE,FALSE,FALSE,0,165,NAN
1024,15405.X00233964,18-25,18-25,24,,24,4,5,4,2,...,1,2516875,PlusRISE,363146466,FALSE,FALSE,FALSE,0,151,NAN
