In [1]:
import pandas as pd
import numpy as np
from scipy.stats import rankdata
import itertools
from tqdm import tqdm
import networkx as nx
import pickle
import os

# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [2]:
# Input feather files for this notebook, relative to the notebook directory.
concept_map_path = "data/opioid_concept_map.feather"
cohort_details_path = 'data/opioid_cohort_details.feather'

# Concept map: one row per person_id with one column per concept (see preview below).
final_df = pd.read_feather(concept_map_path)
# Cohort details table (contents are not inspected in this part of the notebook).
full_bio_cohort = pd.read_feather(cohort_details_path)

In [3]:
# Display the concept map as loaded; pandas truncates the rendering of large frames.
final_df

Unnamed: 0,person_id,C0032961,C0041296,C0151632,CHEMBL112570,CHEMBL1166,CHEMBL1200830,CHEMBL1222250,CHEMBL1618018,CHEMBL2146121,...,UBERON:0001088,UBERON:0001089,UBERON:0001135,UBERON:0001264,UBERON:0001690,UBERON:0001968,UBERON:0001969,UBERON:0001970,UBERON:0001988,UBERON:0002113
0,304,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1516,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2,4565,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,5127,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4,7451,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17792,10446618,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
17793,10458182,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
17794,10458349,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
17795,10509936,1,0,0,0,0,1,1,0,1,...,1,0,0,0,0,0,0,0,0,0


In [4]:
def _as_text(label):
    """Decode a bytes label as UTF-8; return any other value unchanged."""
    return label.decode('utf-8') if isinstance(label, bytes) else label


# Raw PSEV matrix, followed by its row labels (SEPs) and column labels (SPOKE nodes).
spoke = np.load('../../psev_repo/PSEV_matrix')
sep = np.load('../../psev_repo/PSEV_SEP_map')
spoke_node = np.load('../../psev_repo/PSEV_SPOKE_node_map')

# Wrap the matrix in a labelled frame; `spoke` is rebound so no second
# reference to the raw array is kept around.
spoke = pd.DataFrame(spoke, index=sep, columns=spoke_node)

# Labels may have been stored as bytes; normalise both axes to text.
spoke.index = spoke.index.map(_as_text)
spoke.columns = spoke.columns.map(_as_text)

# Create the Embeddings

In [5]:
# Capture the patient identifiers (row order of final_df) and strip the id
# column so only concept columns remain.
# The guard keeps this cell re-runnable: after the first run `final_df` no
# longer has a 'person_id' column, and the original in-place drop made a second
# execution fail with KeyError.
if 'person_id' in final_df.columns:
    person_id_index = final_df['person_id'].to_list()
    final_df = final_df.drop(columns='person_id')
elif 'person_id_index' not in globals():
    raise KeyError(
        "final_df has no 'person_id' column and person_id_index was never captured"
    )

#Remove SEPs that are not in our final_df
spoke = spoke.loc[spoke.index.isin(final_df.columns)]
if spoke.empty:
    # With no shared labels the dot product below would silently be all zeros.
    raise ValueError("No SEP in the PSEV matrix matches a column of final_df")

#Reindex final_df for dot product
# Column order of final_df must equal row order of spoke; concept columns
# without a matching SEP are dropped here.
final_df = final_df[spoke.index]

#PSEVs for each patient
# (patients x SEPs) . (SEPs x SPOKE nodes) -> (patients x SPOKE nodes)
patient_psevs = np.dot(final_df.values, spoke.values)

In [6]:
# Sanity check: rows follow final_df's rows (patients), columns follow spoke's columns (SPOKE nodes).
patient_psevs.shape

(17797, 389297)

# Process the PSEVs

## Option 1: No rank. Just run RF on the nodes with the top 30% of variance.

In [7]:
# Step 1: Per-column variance, computed over slabs of `chunk_size` columns so
# that only a small temporary is materialised at any one time.
chunk_size = 1000
num_columns = patient_psevs.shape[1]
variances = np.array([
    column_variance
    for slab_start in range(0, num_columns, chunk_size)
    for column_variance in np.var(
        patient_psevs[:, slab_start:min(slab_start + chunk_size, num_columns)],
        axis=0,
    )
])

# Step 2: The 70th percentile of the variances is the cut-off for the top 30%.
threshold = np.percentile(variances, 70)

# Step 3: Boolean mask of columns whose variance is strictly above the cut-off.
selected_columns = variances > threshold

# Step 4: Keep only the selected columns (boolean indexing returns a copy).
filtered_patient_psevs = patient_psevs[:, selected_columns]

In [8]:
# Sanity check: same number of rows as patient_psevs, only the high-variance columns kept.
filtered_patient_psevs.shape

(17797, 116484)

In [9]:
# Persist the Option 1 outputs: patient ids (row order), the filtered matrix,
# and the SPOKE node label of every retained column.
# np.save does not create missing directories, so make sure the target exists
# (otherwise a fresh checkout fails here with FileNotFoundError).
os.makedirs('data/psevs', exist_ok=True)

np.save('data/psevs/person_id_index.npy', np.array(person_id_index))
np.save('data/psevs/filtered_patient_psevs.npy', filtered_patient_psevs)
# NOTE(review): the column labels are presumably an object-dtype array, which
# np.load only reads back with allow_pickle=True — confirm in the loaders.
np.save('data/psevs/filtered_patient_psevs_columns.npy', np.array(spoke.columns[selected_columns]))

## Option 2: Node-type-specific dropping of the bottom 70% of variance

In [8]:
# One row per PSEV column: the SPOKE node label and its node type.
# Type labels that were stored as bytes are decoded to text.
node_type = pd.DataFrame({
    'node': spoke.columns,
    'type': [
        label.decode('utf-8') if isinstance(label, bytes) else label
        for label in np.load('../../psev_repo/node_type_list.npy')
    ],
})

# Distinct node types, in order of first appearance.
unique_node_types = node_type['type'].unique()

In [None]:
def _top_variance_mask(matrix, percentile=70, chunk_size=1000):
    """Return a boolean mask selecting the high-variance columns of ``matrix``.

    Parameters
    ----------
    matrix : 2-D array
        Rows are observations, columns are features.
    percentile : float, default 70
        Columns whose variance is strictly greater than this percentile of all
        column variances are selected (70 keeps roughly the top 30%).
    chunk_size : int, default 1000
        Number of columns whose variance is computed at once.

    Returns
    -------
    numpy.ndarray of bool, one entry per column of ``matrix``.
    """
    num_cols = matrix.shape[1]
    if num_cols == 0:
        # Nothing to rank; avoid taking a percentile of an empty array.
        return np.zeros(0, dtype=bool)

    col_variances = []
    for start in range(0, num_cols, chunk_size):
        # Slicing past the last column is safe; numpy clips the slice.
        col_variances.extend(np.var(matrix[:, start:start + chunk_size], axis=0))
    col_variances = np.array(col_variances)

    return col_variances > np.percentile(col_variances, percentile)


# np.save does not create missing directories.
os.makedirs('data/nt_psevs', exist_ok=True)

# Node labels as an array, aligned position-by-position with patient_psevs columns.
node_labels = node_type['node'].to_numpy()

for nt in unique_node_types:
    # Positional column indices for this node type. Using positions (rather
    # than node_type's index labels) stays correct even if node_type is ever
    # re-indexed.
    nt_positions = np.flatnonzero((node_type['type'] == nt).to_numpy())
    nt_patient_psevs = patient_psevs[:, nt_positions]

    # Keep the columns above the 70th variance percentile within this type.
    # The mask gets its own name so Option 1's `selected_columns`, `variances`
    # and `threshold` are not overwritten by this loop.
    nt_selected_columns = _top_variance_mask(nt_patient_psevs)

    filtered_array = nt_patient_psevs[:, nt_selected_columns]
    nodes = node_labels[nt_positions][nt_selected_columns]

    #Save the node-specific file
    np.save(f'data/nt_psevs/filtered_patient_psevs_{nt}.npy', filtered_array)
    np.save(f'data/nt_psevs/filtered_patient_psevs_columns_{nt}.npy', nodes)

# Row order (patient ids) shared by every per-type matrix saved above.
np.save('data/nt_psevs/person_id_index.npy', np.array(person_id_index))


In [None]:
# pat_ids = np.load('data/nt_psevs/person_id_index.npy')
# columns = np.load('data/nt_psevs/filtered_patient_psevs_columns_Disease.npy', allow_pickle=True)
# psevs = np.load('data/nt_psevs/filtered_patient_psevs_Disease.npy')