In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import os
from datetime import timedelta
import pyspark
import dxpy
import dxdata
import pandas as pd
import random
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import col, udf, to_date, mean, expr
from pyspark.sql.types import StringType, ArrayType, IntegerType, DoubleType
from pyspark.ml.feature import Word2Vec
from pyspark.sql.window import Window
import ast


In [None]:
# Start Spark and select the project's database as the default for spark.sql().
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

# Look up the database object whose name matches "app*" in folder "/" and keep
# its described name.
_database_record = dxpy.find_one_data_object(
    classname="database",
    name="app*",
    folder="/",
    name_mode="glob",
    describe=True,
)
dispensed_database_name = _database_record["describe"]["name"]

# Look up the Dataset object whose name matches "app*.dataset" and keep its id.
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    folder="/",
    name_mode="glob",
)["id"]

spark.sql(f"USE {dispensed_database_name}")

### Omics Data

In [None]:
# Death records: drop the 'Unnamed: 0' column (presumably a saved index — confirm)
# and collapse exact duplicate rows.
death = (
    pd.read_csv('/mnt/project/death.csv')
    .drop(columns='Unnamed: 0')
    .drop_duplicates()
)
# Parse death_date to datetime, reading ambiguous strings year-first.
death = death.assign(death_date=pd.to_datetime(death['death_date'], yearfirst=True))

In [None]:
# The source file contains all cancer diagnoses for omics patients; only eid and
# proteomics_date are loaded, and repeated (eid, proteomics_date) rows are collapsed.
prot_date = pd.read_csv(
    '/mnt/project/cancer_conds.csv',
    usecols=['eid', 'proteomics_date'],
).drop_duplicates()
# Parse proteomics_date to datetime, reading ambiguous strings year-first.
prot_date = prot_date.assign(
    proteomics_date=pd.to_datetime(prot_date['proteomics_date'], yearfirst=True)
)

In [None]:
pdf = prot_date.merge(death, how='inner', on='eid')

In [None]:
pdf = pdf[pdf['death_date'] <= (pdf['proteomics_date'] + pd.DateOffset(years=3))]
pdf['indicator'] = 1

In [None]:
patient_day_embeddings = pd.read_csv('/mnt/project/patient_day_embeddings_omics_omicsword2vec_lc.csv').drop('Unnamed: 0',axis=1)
max_dates = 32

In [None]:
patient_day_embeddings = patient_day_embeddings.sort_values(['eid','record_date'])

In [None]:
unique_patients = patient_day_embeddings['eid'].nunique()

In [None]:
#create np matrix to store input data, assign each patient to an index
patient_id_to_index = {patient_id: index for index, patient_id in enumerate(patient_day_embeddings['eid'].unique())}
RNN_data = np.full((400, max_dates, unique_patients), np.nan)

In [None]:
%%time
# Populate RNN_data, indexed as (feature, date slot, patient), from the sorted
# day-level rows. Each patient's rows fill date slots 0, 1, 2, ... in row order;
# rows past max_dates are still counted in date_position but are not stored.
#
# The feature columns are converted to one float matrix up front so each row is
# written with a single vector assignment instead of a per-element Python loop.
feature_matrix = patient_day_embeddings.drop(columns=['eid', 'record_date']).to_numpy(dtype=float)
row_eids = patient_day_embeddings['eid'].to_numpy()
n_features = feature_matrix.shape[1]
if n_features > RNN_data.shape[0]:
    raise IndexError(
        f"{n_features} feature columns do not fit the {RNN_data.shape[0]} feature slots of RNN_data"
    )

# patient_id -> 0-based position of the most recent row seen for that patient.
date_position = {}
for row_number, patient_id in enumerate(tqdm(row_eids, total=len(row_eids))):
    patient_index = patient_id_to_index[patient_id]

    date_index = date_position.get(patient_id, -1) + 1
    date_position[patient_id] = date_index

    # Rows beyond the date axis are dropped.
    if date_index >= max_dates:
        continue

    RNN_data[:n_features, date_index, patient_index] = feature_matrix[row_number]


In [None]:
RNN_data = RNN_data.transpose(2,1,0)


In [None]:
np.save('RNN_data_omics_omicsw2v_lc.npy', RNN_data)

In [None]:
%%bash
# Upload the saved omics input array to folder "/" on the DNAnexus platform.
dx upload RNN_data_omics_omicsw2v_lc.npy --path /

In [None]:
# Save the eid -> patient-axis index lookup as a two-column CSV:
# column "0" holds the eid, column "1" holds its index.
eid_index_rows = [list(patient_id_to_index), list(patient_id_to_index.values())]
pd.DataFrame(eid_index_rows).T.to_csv('eid_indices_omics_omicsw2v_lc.csv')


In [None]:
%%bash
# Upload the eid/index lookup CSV to folder "/" on the DNAnexus platform.
dx upload eid_indices_omics_omicsw2v_lc.csv --path /

In [None]:
# Number of non-null record_date rows per patient, ordered by eid.
# NOTE(review): these counts are not capped at max_dates, whereas RNN_data keeps
# at most max_dates rows per patient — confirm downstream code accounts for that.
visits_per_patient = (
    patient_day_embeddings
    .groupby('eid')['record_date']
    .count()
    .sort_index()
)
np.save('./visit_count_omics_omicsw2v_lc.npy', visits_per_patient.values)

In [None]:
%%bash
# Upload the per-patient visit counts to folder "/" on the DNAnexus platform.
dx upload visit_count_omics_omicsw2v_lc.npy --path /

In [None]:
# Lookup frame with one row per patient: column 0 is the eid, column 1 its
# position on the patient axis.
idx_df = pd.DataFrame(
    [list(patient_id_to_index), list(patient_id_to_index.values())]
).T

In [None]:
# Build the 0/1 outcome vector with one entry per patient, ordered by the
# patient-axis index (column 1 of idx_df).
# pdf only holds patients flagged with indicator == 1; patients missing from it
# become 0 through the left merge followed by fillna(0).
pdf = pdf.assign(eid=pdf['eid'].astype(int))
# A patient with several rows in pdf (for example more than one death record)
# would be repeated by the merge, making outcomes longer than, and misaligned
# with, the patient axis — so keep a single label row per eid.
omics_labels = pdf[['eid', 'indicator']].drop_duplicates('eid')
outcomes = (
    idx_df.merge(omics_labels, how='left', left_on=0, right_on='eid')
    .fillna(0)
    .sort_values(1)['indicator']
    .astype(int)
    .values
)
assert len(outcomes) == len(idx_df), "outcomes must hold exactly one label per patient"

In [None]:
# Write the omics outcome labels to the working directory.
np.save(file='./outcomes_omics_omicsw2v_lc_3yr.npy', arr=outcomes)

In [None]:
%%bash
# Upload the omics outcome labels to folder "/" on the DNAnexus platform.
dx upload outcomes_omics_omicsw2v_lc_3yr.npy --path /

### PT Data

In [None]:
patient_day_embeddings = pd.read_csv('/mnt/project/patient_day_embeddings_PT_lc_LARGER.csv').drop('Unnamed: 0',axis=1)
max_dates = 32

In [None]:
patient_day_embeddings = patient_day_embeddings.drop_duplicates(['eid','record_date'])

In [None]:
eids_omics = pd.read_csv('/mnt/project/eid_indices_omics_omicsw2v_lc.csv')['0']

In [None]:
patient_day_embeddings = patient_day_embeddings[~patient_day_embeddings['eid'].isin(eids_omics)]

In [None]:
# Sanity check: (rows, columns) left after removing the omics patients.
patient_day_embeddings.shape

In [None]:
# Sanity check: number of distinct patients left.
patient_day_embeddings['eid'].nunique()

In [None]:
# Label query for the PT cohort.
# - EarliestCConds: earliest condition_start_date per eid among condition rows whose
#   condition_source_value starts with 'C' (presumably ICD-10 cancer codes — confirm).
# - FilteredPatients: keeps eids whose earliest such date lies between 12 months
#   before and 60 months after p53_i0.
# - Final SELECT: inner-joins omop_death, so only patients with a death record
#   are returned.
# NOTE(review): the output column named proteomics_date is the alias of
# earliest_cond_date, not of p53_i0 — confirm that anchoring on the diagnosis
# date is intended.
combined_query = spark.sql("""
WITH EarliestCConds AS (
    SELECT 
        c.eid,
        MIN(TO_DATE(c.condition_start_date, 'dd/MM/yyyy')) as earliest_cond_date
    FROM 
        omop_condition_occurrence c
    WHERE 
        c.condition_source_value LIKE 'C%'
    GROUP BY 
        c.eid
),
FilteredPatients AS (
    SELECT 
        ecc.eid,
        ecc.earliest_cond_date,
        TO_DATE(p.p53_i0, 'yyyy-MM-dd') AS proteomics_date 
    FROM 
        EarliestCConds ecc
    INNER JOIN 
        participant_0001 p ON ecc.eid = p.eid
    WHERE 
        ecc.earliest_cond_date <= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), 60) AND
        ecc.earliest_cond_date >= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), -12)
)

SELECT DISTINCT
    fp.eid, 
    d.death_date,
    fp.earliest_cond_date AS proteomics_date
FROM 
    FilteredPatients fp
JOIN
    omop_death d ON d.eid=fp.eid

""")

In [None]:
%%time
# Run the query for death data and cancer diagnosis date, and collect every
# result row into a Python list.
combined_query_results = combined_query.collect()

In [None]:
%%time
# Turn the collected rows into a pandas frame, reusing the query's column names.
pdf = pd.DataFrame(combined_query_results, columns=combined_query.columns)

In [None]:
# Parse both date columns. In this frame proteomics_date is the query's alias
# for the earliest cancer condition date.
pdf['proteomics_date'] = pd.to_datetime(pdf['proteomics_date'], yearfirst=True)
pdf['death_date'] = pd.to_datetime(pdf['death_date'], dayfirst=True)
pdf = pdf.drop_duplicates()
# Positive label: death recorded no later than 3 years after that date.
# Only the upper bound is checked here.
died_within_3yr = pdf['death_date'] <= (pdf['proteomics_date'] + pd.DateOffset(years=3))
# .copy() makes the filtered frame independent, so adding the column below is a
# plain assignment and does not raise SettingWithCopyWarning.
pdf = pdf[died_within_3yr].copy()
pdf['indicator'] = 1

In [None]:
patient_day_embeddings = patient_day_embeddings.sort_values(['eid','record_date'])

In [None]:
unique_patients = patient_day_embeddings['eid'].nunique()

In [None]:
patient_id_to_index = {patient_id: index for index, patient_id in enumerate(patient_day_embeddings['eid'].unique())}
RNN_data = np.full((400, max_dates, unique_patients), np.nan)

In [None]:
%%time
# Populate RNN_data, indexed as (feature, date slot, patient), from the sorted
# day-level rows. Each patient's rows fill date slots 0, 1, 2, ... in row order;
# rows past max_dates are still counted in date_position but are not stored.
#
# The feature columns are converted to one float matrix up front so each row is
# written with a single vector assignment instead of a per-element Python loop.
feature_matrix = patient_day_embeddings.drop(columns=['eid', 'record_date']).to_numpy(dtype=float)
row_eids = patient_day_embeddings['eid'].to_numpy()
n_features = feature_matrix.shape[1]
if n_features > RNN_data.shape[0]:
    raise IndexError(
        f"{n_features} feature columns do not fit the {RNN_data.shape[0]} feature slots of RNN_data"
    )

# patient_id -> 0-based position of the most recent row seen for that patient.
date_position = {}
for row_number, patient_id in enumerate(tqdm(row_eids, total=len(row_eids))):
    patient_index = patient_id_to_index[patient_id]

    date_index = date_position.get(patient_id, -1) + 1
    date_position[patient_id] = date_index

    # Rows beyond the date axis are dropped.
    if date_index >= max_dates:
        continue

    RNN_data[:n_features, date_index, patient_index] = feature_matrix[row_number]


In [None]:
RNN_data = RNN_data.transpose(2,1,0)


In [None]:
np.save('RNN_data_PT_lc_LARGER.npy', RNN_data)

In [None]:
%%bash
# Upload the saved PT input array to folder "/" on the DNAnexus platform.
dx upload RNN_data_PT_lc_LARGER.npy --path /

In [None]:
# Save the eid -> patient-axis index lookup as a two-column CSV:
# column "0" holds the eid, column "1" holds its index.
eid_index_rows = [list(patient_id_to_index), list(patient_id_to_index.values())]
pd.DataFrame(eid_index_rows).T.to_csv('eid_indices_PT_lc_LARGER.csv')


In [None]:
%%bash
# Upload the PT eid/index lookup CSV to folder "/" on the DNAnexus platform.
dx upload eid_indices_PT_lc_LARGER.csv --path /

In [None]:
# Number of non-null record_date rows per patient, ordered by eid.
# NOTE(review): these counts are not capped at max_dates, whereas RNN_data keeps
# at most max_dates rows per patient — confirm downstream code accounts for that.
visits_per_patient = (
    patient_day_embeddings
    .groupby('eid')['record_date']
    .count()
    .sort_index()
)
np.save('./visit_count_PT_lc_LARGER.npy', visits_per_patient.values)

In [None]:
%%bash
# Upload the PT per-patient visit counts to folder "/" on the DNAnexus platform.
dx upload visit_count_PT_lc_LARGER.npy --path /

In [None]:
# Sanity check: number of distinct patients in the PT table.
patient_day_embeddings['eid'].nunique()

In [None]:
# Sanity check: display the same per-patient counts that were saved as visit_count above.
patient_day_embeddings.groupby('eid').count().sort_values('eid')['record_date'].values

In [None]:
idx_df = pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T

In [None]:
#one patient has duplicated death but the two records are a day apart and doesn't affect labeling
pdf = pdf.drop_duplicates('eid')


In [None]:
pdf['eid'] = pdf['eid'].astype(int)
outcomes = idx_df.merge(pdf[['eid','indicator']], how='left', left_on=0, right_on='eid').fillna(0).sort_values(1)['indicator'].astype(int).values

In [None]:
# Sanity check: fraction of patients with a positive (1) label.
outcomes.mean()

In [None]:
# Write the PT outcome labels to the working directory.
np.save(file='./outcomes_PT_lc_LARGER.npy', arr=outcomes)

In [None]:
%%bash
# Upload the PT outcome labels to folder "/" on the DNAnexus platform.
dx upload outcomes_PT_lc_LARGER.npy --path /

In [None]:
# Sanity check: number of labels (compare with the number of distinct patients).
len(outcomes)

### Pull Proteomics

In [None]:
%%time
# Get a Spark session (getOrCreate returns the already running one if present).
spark = SparkSession.builder.appName("Proteomics Data Aggregation").getOrCreate()

# Names of the twelve tables to combine: olink_instance_0_0001 ... olink_instance_0_0012.
table_names = [f"olink_instance_0_{i:04d}" for i in range(1, 13)]

# Start from the first table and left-join each remaining table on 'eid'.
# With left joins the result keeps the eids of the first table; eids that only
# occur in later tables are not added.
combined_df = spark.table(table_names[0])
for table_name in table_names[1:]:
    combined_df = combined_df.join(spark.table(table_name), on="eid", how="left")

# Write the result as CSV part files under "all_proteomics".
# mode="overwrite" keeps the cell re-runnable: with the default mode the write
# raises as soon as the output directory already exists.
combined_df.write.csv("all_proteomics", header=True, mode="overwrite")


In [None]:
%%bash
# List the contents of the all_proteomics output directory on HDFS.
hdfs dfs -ls ./all_proteomics

In [None]:
%%bash
# Copy the all_proteomics directory from HDFS into the local working directory.
hdfs dfs -get ./all_proteomics ./


In [None]:
%%time
# Local directory holding the CSV part files.
directory = './all_proteomics/'

# os.listdir returns names in arbitrary order, so sort them to make the row
# order of the combined file reproducible between runs. Only names ending in
# '.csv' are read; anything else in the directory is skipped.
csv_files = sorted(file for file in os.listdir(directory) if file.endswith('.csv'))
if not csv_files:
    raise ValueError(f"No CSV files found in {directory}")

# Read every part file and stack them into one pandas frame.
# Note: this rebinds combined_df from the Spark DataFrame to a pandas DataFrame.
all_csvs = [pd.read_csv(os.path.join(directory, file)) for file in csv_files]
combined_df = pd.concat(all_csvs, ignore_index=True)

# Write the combined frame to a single CSV without the index column.
combined_df.to_csv('./all_proteomics_lc.csv', index=False)


In [None]:
%%bash
# Upload the combined proteomics CSV to folder "/" on the DNAnexus platform.
dx upload all_proteomics_lc.csv --path /