In [1]:
import pyspark
import dxpy
import dxdata
import pandas as pd
import random
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import col, udf, to_date, mean, expr
from pyspark.sql.types import StringType, ArrayType, IntegerType, DoubleType
from pyspark.ml.feature import Word2Vec
from pyspark.sql.window import Window
import ast
import numpy as np


In [2]:
# Create (or reuse) the Spark session, switching serialization to Kryo and
# setting the Kryo buffer ceiling to "1g".
spark = (
    SparkSession.builder
    .appName("MyApp")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1g")
    .getOrCreate()
)

# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [3]:
# Find the single database object in the root folder whose name matches "app*"
# and keep its name (describe=True makes the name available in the result).
database_match = dxpy.find_one_data_object(
    classname="database",
    name="app*",
    folder="/",
    name_mode="glob",
    describe=True,
)
dispensed_database_name = database_match["describe"]["name"]

# Find the single Dataset object in the root folder named "app*.dataset" and keep its id.
dataset_match = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    folder="/",
    name_mode="glob",
)
dispensed_dataset_id = dataset_match["id"]

In [4]:
# Make the discovered database the default for the unqualified table names used in later queries.
spark.sql(f"USE {dispensed_database_name}")

DataFrame[]

In [None]:
# Cohort definition and event extraction (lazy: nothing runs until an action is called).
#
#   EarliestCConds   - per eid, the earliest condition_start_date among condition
#                      rows whose condition_source_value starts with 'C'
#                      (presumably cancer diagnosis codes -- confirm the coding system).
#   FilteredPatients - keeps patients whose earliest such date falls in the window
#                      from 12 months BEFORE to 60 months AFTER p53_i0 (the initial
#                      UKBB visit date, aliased here as proteomics_date).
#                      NOTE: the window is two-sided; it is not limited to
#                      diagnoses made before the visit.
#   Final SELECT     - for those patients, every condition / procedure / drug /
#                      observation / measurement record dated on or before the
#                      earliest 'C%' condition date, as (eid, concept_id,
#                      record_date, formatted_date).
combined_query = spark.sql("""
WITH EarliestCConds AS (
    SELECT 
        c.eid,
        MIN(TO_DATE(c.condition_start_date, 'dd/MM/yyyy')) as earliest_cond_date
    FROM 
        omop_condition_occurrence c
    WHERE 
        c.condition_source_value LIKE 'C%'
    GROUP BY 
        c.eid
),
FilteredPatients AS (
    SELECT 
        ecc.eid,
        ecc.earliest_cond_date,
        TO_DATE(p.p53_i0, 'yyyy-MM-dd') AS proteomics_date 
    FROM 
        EarliestCConds ecc
    INNER JOIN 
        participant_0001 p ON ecc.eid = p.eid
    WHERE 
        ecc.earliest_cond_date <= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), 60) AND
        ecc.earliest_cond_date >= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), -12)
)

SELECT 
    fp.eid, 
    c.concept_id, 
    c.record_date,
    DATE_FORMAT(c.record_date, 'yyyy-MM-dd') as formatted_date
FROM 
    FilteredPatients fp
JOIN (
    SELECT 
        o.eid, 
        o.condition_concept_id as concept_id, 
        TO_DATE(o.condition_start_date, 'dd/MM/yyyy') as record_date
    FROM 
        omop_condition_occurrence o
    UNION ALL
    SELECT 
        o.eid, 
        o.procedure_concept_id as concept_id, 
        TO_DATE(o.procedure_date, 'dd/MM/yyyy') as record_date
    FROM 
        omop_procedure_occurrence o
    UNION ALL
    SELECT 
        o.eid, 
        o.drug_concept_id as concept_id, 
        TO_DATE(o.drug_exposure_start_date, 'dd/MM/yyyy') as record_date
    FROM 
        omop_drug_exposure o
    UNION ALL
    SELECT 
        o.eid, 
        o.observation_concept_id as concept_id, 
        TO_DATE(o.observation_date, 'dd/MM/yyyy') as record_date
    FROM 
        omop_observation o
    UNION ALL
    SELECT 
        o.eid, 
        o.measurement_concept_id as concept_id, 
        TO_DATE(o.measurement_date, 'dd/MM/yyyy') as record_date
    FROM 
        omop_measurement o
) c ON fp.eid = c.eid AND c.record_date <= fp.earliest_cond_date

""")

In [None]:
%%time
# Run the query and bring every result row back to the driver as a Python list;
# the whole result set has to fit in driver memory.
combined_query_results = combined_query.collect()

In [None]:
%%time
# Turn the collected rows into a pandas frame, naming the columns after the Spark result's columns.
pdf = pd.DataFrame(combined_query_results, columns=combined_query.columns)

In [None]:
# Number of distinct patients (eid) present in the query result.
pdf['eid'].nunique()

In [None]:
%%time
# Parse 'record_date' into pandas datetimes so it can be sorted and ranked.
pdf['record_date'] = pd.to_datetime(
    pdf['record_date'],
    format='%Y-%m-%d',
)


In [None]:
%%time
# Keep, for every patient, only the records from their `max_dates` most recent distinct dates.
max_dates = 32

# Order rows by patient, newest record first.
pdf = pdf.sort_values(['eid', 'record_date'], ascending=[True, False])

# Dense rank of each date within its patient: 1 = most recent date, and
# records sharing a date share a rank.
pdf['date_rank'] = (
    pdf.groupby('eid')['record_date']
    .rank(method='dense', ascending=False)
)

# Retain only rows whose date is among the newest `max_dates` for that patient.
is_recent_enough = pdf['date_rank'] <= max_dates
filtered_pdf = pdf.loc[is_recent_enough]


In [None]:
%%time
# Load the word2vec table; the 'vector' column is stored as text, so each
# cell is parsed back into a Python literal.
word_vectors = pd.read_csv('/mnt/project/PT_lc_word2vec.csv')
word_vectors['vector'] = word_vectors['vector'].map(ast.literal_eval)


In [None]:
# Store the vocabulary key as a string; it is the right-hand merge key against concept_id below.
word_vectors['word'] = word_vectors['word'].astype(str)

In [None]:
# Attach an embedding vector to every record by matching concept_id to the
# vocabulary word. The inner join discards records with no matching word,
# and both key columns are dropped afterwards.
filtered_pdf = (
    filtered_pdf
    .merge(word_vectors, how='inner', left_on='concept_id', right_on='word')
    .drop(columns=['word', 'concept_id'])
)


In [None]:
# Sanity check after the merge: frame shape and the number of distinct patients remaining.
filtered_pdf.shape, filtered_pdf['eid'].nunique()

In [None]:
# Remember the row index so the embedding frame built below can be aligned with filtered_pdf.
indices = filtered_pdf.index

In [None]:
%%time
# Stack the per-record vectors into one float64 array (one row per record).
vector_rows = filtered_pdf['vector'].to_list()
embeddings_array = np.asarray(vector_rows, dtype=np.float64)


In [None]:
# Wrap the array in a frame that re-uses filtered_pdf's index, so the two can be joined row for row.
embeddings_df = pd.DataFrame(embeddings_array, index=indices)


In [None]:
%%time
# Name the embedding columns embedding_0 ... embedding_{k-1}, where k is the vector width.
embedding_width = embeddings_df.shape[1]
embeddings_df.columns = ['embedding_' + str(position) for position in range(embedding_width)]

# Put the per-dimension columns next to the record columns (index-aligned join).
embedded_codes = filtered_pdf.join(embeddings_df)

In [None]:
%%time

# 4. Define your aggregation expressions.
# The mapping is built from the embedding columns actually present instead of
# assuming a fixed width of 400, so a different word-vector dimensionality
# neither raises a KeyError nor silently drops trailing dimensions.
embedding_columns = [
    column for column in embedded_codes.columns
    if str(column).startswith('embedding_')
]
agg_funcs = {column: 'mean' for column in embedding_columns}

# Average every embedding dimension over all records that share the same
# patient (eid) and day (record_date): one row per patient-day.
patient_day_embeddings_pd = embedded_codes.groupby(['eid', 'record_date']).agg(agg_funcs)


In [None]:
%%time
# Write the patient-day embeddings to CSV with eid / record_date as ordinary columns.
# Note: the default row index is still written as an extra leading column.
(
    patient_day_embeddings_pd
    .reset_index()
    .to_csv('./patient_day_embeddings_PT_lc_LARGER.csv', header=True)
)

In [None]:
%%bash
# Upload the CSV from the working directory to the path "/" on the platform.
dx upload patient_day_embeddings_PT_lc_LARGER.csv --path /