In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap
from joblib import dump, load
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load node information: the PSEV matrix, its row/column labels and the node types

def _to_text(value):
    """Return ``value`` decoded as UTF-8 when it is ``bytes``; otherwise return it unchanged."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


spoke = np.load('../../psev_repo/PSEV_matrix')
sep = np.load('../../psev_repo/PSEV_SEP_map')
spoke_node = np.load('../../psev_repo/PSEV_SPOKE_node_map')

# Rows are labelled by the SEP map, columns by the SPOKE node map
spoke = pd.DataFrame(spoke, index=sep, columns=spoke_node)
# Labels may have been stored as bytes; normalise both axes to str
spoke.index = spoke.index.map(_to_text)
spoke.columns = spoke.columns.map(_to_text)

# One row per matrix column: the node label and its node type
node_type = pd.DataFrame({
    'node': spoke.columns,
    'type': [_to_text(raw_type) for raw_type in np.load('../../psev_repo/node_type_list.npy')],
})

unique_node_types = node_type['type'].unique()

In [3]:
# Build one SPOKE -> OMOP lookup covering conditions, drugs and measurements.
# Earlier condition mapping file, kept for reference:
#   ../../psev_repo/omop_sep_map/filtered_omop_conditions_to_spoke_extended_2.tsv

def _read_omop_spoke_map(path):
    """Read a tab-separated OMOP/SPOKE linker file and rename its ID columns.

    ``OMOP`` becomes ``condition_concept_id`` and ``SPOKE`` becomes
    ``spoke_concept_id``. The OMOP column gets that same name for every file,
    including the drug and measurement files.
    """
    return pd.read_csv(path, sep='\t').rename(
        columns={'OMOP': 'condition_concept_id', 'SPOKE': 'spoke_concept_id'}
    )


def _spoke_omop_pairs(annotation):
    """Yield ``(spoke_concept_id, condition_concept_id)`` pairs from an annotation table."""
    return zip(annotation['spoke_concept_id'], annotation['condition_concept_id'])


# Translating conditions
disease_annotation = _read_omop_spoke_map('../../gbellucci/spoke_linkers/omop2spoke_combined.tsv')
spoke_to_omop_dict = dict(_spoke_omop_pairs(disease_annotation))

# Translating drugs (entries overwrite condition entries that share a SPOKE id)
drug_annotation = _read_omop_spoke_map(
    '../../psev_repo/omop_sep_map/filtered_omop_drug_exposure_to_spoke_extended.tsv'
)
spoke_to_omop_dict.update(_spoke_omop_pairs(drug_annotation))

# Translating measurements (applied last, so these win on duplicate SPOKE ids)
lab_annotation = _read_omop_spoke_map(
    '../../psev_repo/omop_sep_map/filtered_omop_measurement_to_spoke_extended.tsv'
)
spoke_to_omop_dict.update(_spoke_omop_pairs(lab_annotation))


# Assemble table to train on in Pandas

In [10]:
# First load the general top 30% PSEVs
pat_ids = np.load('data/alc_psevs/person_id_index.npy')
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# these column files from a trusted source.
columns = np.load('data/alc_psevs/filtered_patient_psevs_columns.npy', allow_pickle=True)
psevs = np.load('data/alc_psevs/filtered_patient_psevs.npy')

# Label per patient: True when 'most_frequent_condition' is present (non-null)
full_bio_cohort = pd.read_feather('data/alc_cohort_details.feather')
full_bio_cohort['dependent'] = full_bio_cohort['most_frequent_condition'].notna()
label_dict = dict(zip(full_bio_cohort["person_id"], full_bio_cohort["dependent"]))

# Now load the node-type-specific PSEVs.
# Collect the per-type pieces and join them once after the loop: stacking inside
# the loop re-copies the whole accumulated matrix on every iteration.
nt_column_parts = []
nt_psev_parts = []
for nt in unique_node_types:
    nt_psev_parts.append(np.load(f'data/alc_nt_psevs/filtered_patient_psevs_{nt}.npy'))
    nt_column_parts.append(
        np.load(f'data/alc_nt_psevs/filtered_patient_psevs_columns_{nt}.npy', allow_pickle=True)
    )

# Stay None when there are no node types, as before
nt_columns = np.concatenate(nt_column_parts) if nt_column_parts else None
nt_psevs = np.hstack(nt_psev_parts) if nt_psev_parts else None  # add data horizontally

# Drop the per-type pieces so only the joined arrays stay in memory
del nt_column_parts, nt_psev_parts

In [11]:
# Dimensions of the general PSEV matrix (presumably patients x features — confirm)
psevs.shape

(24951, 116357)

In [12]:
# Number of column labels loaded for the general PSEV matrix
columns.shape

(116357,)

In [13]:
# Total number of column labels after concatenating all node-type-specific files
nt_columns.shape

(116778,)

In [14]:
# Format tables for RF
# Patients without an entry in label_dict get no label, so the same rows must be
# removed from the feature matrices; otherwise X / nt_X and Y differ in length.
# Assumes the rows of psevs and nt_psevs follow the order of pat_ids — TODO confirm.
has_label = np.fromiter((pid in label_dict for pid in pat_ids), dtype=bool, count=len(pat_ids))
Y = np.array([label_dict[pid] for pid in pat_ids[has_label]])

if has_label.all():
    # Nothing to drop: reuse the arrays as-is and avoid copying the large matrices
    X = psevs
    nt_X = nt_psevs
else:
    X = psevs[has_label]
    nt_X = nt_psevs[has_label]

# Train random forest (RF) classifiers

In [18]:
def runGenericRF(X, Y, name, test_size=0.2, n_estimators=100, random_state=42,
                 n_jobs=None, model_dir='models'):
    """Train a random forest on a train/test split, report accuracy and save the model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; ``X.shape[1]`` is used to derive ``max_features``.
    Y : array-like of shape (n_samples,)
        Class labels, one per row of ``X``.
    name : str
        Tag used in the printed message and in the saved file name
        (``alcohol_rf_model_{name}.joblib``).
    test_size : float, default 0.2
        Fraction of the samples held out for testing.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 42
        Seed used for both the split and the forest.
    n_jobs : int or None, default None
        Passed through to ``RandomForestClassifier``.
    model_dir : str, default 'models'
        Directory the fitted model is written to; created if it does not exist.

    Returns
    -------
    tuple
        ``(accuracy, rf_clf)``: accuracy on the held-out test split and the
        fitted classifier.
    """
    # Create the output directory before the expensive fit, so that a missing
    # directory cannot make dump() fail after training has finished.
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, f'alcohol_rf_model_{name}.joblib')

    # Split data into training and testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Initialize the Random Forest classifier; each split considers
    # floor(sqrt(n_features)) candidate features
    rf_clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_features=int(np.sqrt(X.shape[1])),
        random_state=random_state,
        n_jobs=n_jobs,
    )

    # Train the model on the training data
    rf_clf.fit(X_train, Y_train)

    # Predict on the test data and score against the held-out labels
    Y_pred = rf_clf.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    print(f"{name} Model accuracy: {accuracy:.2f}")

    # Save the fitted model for later reuse
    dump(rf_clf, model_path)

    return accuracy, rf_clf

In [None]:
# Fit, score and save one forest per feature set; both use the same labels Y
accuracy_general, model_general = runGenericRF(X=X, Y=Y, name="general")
accuracy_nt, model_nt = runGenericRF(X=nt_X, Y=Y, name="nt")