In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import umap

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from joblib import dump, load


  from .autonotebook import tqdm as notebook_tqdm


# Assemble table to train on in Pandas

In [4]:
#Load node information
sp_columns = np.load('../../psev_repo/PSEV_SPOKE_node_map')
sp_columns = np.array([x.decode('utf-8') if isinstance(x, bytes) else x for x in sp_columns])

node_type = np.load('../../psev_repo/node_type_list.npy')
node_type = [x.decode('utf-8') if isinstance(x, bytes) else x for x in node_type]
node_type = pd.DataFrame({
    'node': sp_columns,
    'type': node_type
})

unique_node_types = node_type['type'].unique()

In [5]:
#First load the general top 30% PSEVs

pat_ids = np.load('data/alc_psevs/person_id_index.npy')
columns = np.load('data/alc_psevs/filtered_patient_psevs_columns.npy', allow_pickle=True)
psevs = np.load('data/alc_psevs/filtered_patient_psevs.npy')

full_bio_cohort = pd.read_feather('data/alc_cohort_details.feather')
full_bio_cohort['dependent'] = full_bio_cohort['most_frequent_condition'].notna()
label_dict = dict(zip(full_bio_cohort["person_id"], full_bio_cohort["dependent"]))

#Now load the node specific ones

# Initialize empty arrays for columns and psevs
nt_columns = None
nt_psevs = None

for nt in unique_node_types:
    ind_nt_psevs = np.load(f'data/alc_nt_psevs/filtered_patient_psevs_{nt}.npy')
    ind_nt_columns = np.load(f'data/alc_nt_psevs/filtered_patient_psevs_columns_{nt}.npy', allow_pickle=True)

    # Concatenate columns and psevs
    if nt_columns is None:
        nt_columns = ind_nt_columns
    else:
        nt_columns = np.concatenate((nt_columns, ind_nt_columns))  # Add new columns

    if nt_psevs is None:
        nt_psevs = ind_nt_psevs
    else:
        nt_psevs = np.hstack((nt_psevs, ind_nt_psevs))  # Add new data horizontally

In [6]:
psevs.shape

(24951, 116357)

In [7]:
columns.shape

(116357,)

In [8]:
nt_columns.shape

(116778,)

In [9]:
#Format tables for RF
Y = np.array([label_dict[pid] for pid in pat_ids if pid in label_dict])
X = psevs
nt_X = nt_psevs

In [10]:
nt_X

array([[7.56050440e-06, 8.08882692e-06, 7.27053475e-06, ...,
        7.12677252e-06, 7.16525892e-06, 7.60939338e-06],
       [9.97108918e-06, 1.08293953e-05, 9.91531033e-06, ...,
        9.91846969e-06, 9.98264409e-06, 9.95641790e-06],
       [6.27657885e-05, 6.56307018e-05, 6.34117798e-05, ...,
        6.25746211e-05, 6.32389849e-05, 6.19102709e-05],
       ...,
       [2.43875979e-06, 2.60675620e-06, 2.47973094e-06, ...,
        2.52148834e-06, 2.54762858e-06, 2.43738737e-06],
       [2.57553620e-06, 3.52608549e-06, 2.57461124e-06, ...,
        2.45108822e-06, 2.49711729e-06, 2.39350766e-06],
       [7.47761783e-06, 8.17553428e-06, 7.34012406e-06, ...,
        7.33639581e-06, 7.44074714e-06, 7.15209262e-06]])

# Train RF

In [11]:
def runGenericRF(X, Y, name):
    # Split data into training and testing sets (80% train, 20% test)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Initialize the Random Forest classifier
    rf_clf = RandomForestClassifier(n_estimators=100, max_features=int(np.sqrt(X.shape[1])), random_state=42)

    # Train the model on the training data
    rf_clf.fit(X_train, Y_train)

    # Predict on the test data
    Y_pred = rf_clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(Y_test, Y_pred)
    print(f"{name} Model accuracy: {accuracy:.2f}")

    dump(rf_clf, f'models/alcohol_rf_model_{name}.joblib')

    return accuracy, rf_clf

In [12]:
accuracy_general, model_general = runGenericRF(X, Y, "general")

accuracy_nt, model_nt = runGenericRF(nt_X, Y, "nt")

general Model accuracy: 0.89
nt Model accuracy: 0.96


# Interpret results

In [7]:
# Load here
model_general = load('models/alcohol_rf_model_general.joblib')
model_nt = load('models/alcohol_rf_model_nt.joblib')

In [8]:
#Load node information

spoke = np.load('../../psev_repo/PSEV_matrix')
sep = np.load('../../psev_repo/PSEV_SEP_map')
spoke_node = np.load('../../psev_repo/PSEV_SPOKE_node_map')

spoke = pd.DataFrame(spoke, columns=spoke_node)
spoke.index = sep
spoke.index = spoke.index.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
spoke.columns = spoke.columns.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

node_type = np.load('../../psev_repo/node_type_list.npy')
node_type = [x.decode('utf-8') if isinstance(x, bytes) else x for x in node_type]
node_type = pd.DataFrame({
    'node': spoke.columns,
    'type': node_type
})

unique_node_types = node_type['type'].unique()

In [9]:
#Translating conditions
# disease_annotation = pd.read_csv('../../psev_repo/omop_sep_map/filtered_omop_conditions_to_spoke_extended_2.tsv', sep = '\t')

disease_annotation = pd.read_csv('../../gbellucci/spoke_linkers/omop2spoke_combined.tsv', sep = '\t')
disease_annotation.rename(columns={'OMOP': 'condition_concept_id'}, inplace=True)
disease_annotation.rename(columns={'SPOKE': 'spoke_concept_id'}, inplace=True)

spoke_to_omop_dict = dict(zip(disease_annotation['spoke_concept_id'], disease_annotation['condition_concept_id']))


#Translating Drugs
drug_annotation = pd.read_csv('../../psev_repo/omop_sep_map/filtered_omop_drug_exposure_to_spoke_extended.tsv', sep = '\t')
drug_annotation.rename(columns={'OMOP': 'condition_concept_id'}, inplace=True)
drug_annotation.rename(columns={'SPOKE': 'spoke_concept_id'}, inplace=True)

spoke_to_omop_dict.update(dict(zip(drug_annotation['spoke_concept_id'], drug_annotation['condition_concept_id'])))


#Translating measurements
lab_annotation = pd.read_csv('../../psev_repo/omop_sep_map/filtered_omop_measurement_to_spoke_extended.tsv', sep = '\t')
lab_annotation.rename(columns={'OMOP': 'condition_concept_id'}, inplace=True)
lab_annotation.rename(columns={'SPOKE': 'spoke_concept_id'}, inplace=True)

spoke_to_omop_dict.update(dict(zip(lab_annotation['spoke_concept_id'], lab_annotation['condition_concept_id'])))


In [10]:
#Reload the columns
columns = np.load('data/alc_psevs/filtered_patient_psevs_columns.npy', allow_pickle=True)
nt_columns = None

for nt in unique_node_types:
    ind_nt_columns = np.load(f'data/alc_nt_psevs/filtered_patient_psevs_columns_{nt}.npy', allow_pickle=True)

    # Concatenate columns and psevs
    if nt_columns is None:
        nt_columns = ind_nt_columns
    else:
        nt_columns = np.concatenate((nt_columns, ind_nt_columns))  # Add new columns

In [11]:
feature_importance_df = pd.DataFrame({
        'feature': columns,
        'importance': model_general.feature_importances_
    })
    
feature_importance_df = pd.merge(feature_importance_df, node_type, left_on = "feature", right_on = "node").drop("feature", axis = 1).sort_values(by = 'importance', ascending = False)
feature_importance_df['omop'] = feature_importance_df['node'].map(spoke_to_omop_dict)

nt_feature_importance_df = pd.DataFrame({
        'feature': nt_columns,
        'importance': model_nt.feature_importances_
    })
    
nt_feature_importance_df = pd.merge(nt_feature_importance_df, node_type, left_on = "feature", right_on = "node").drop("feature", axis = 1).sort_values(by = 'importance', ascending = False)
nt_feature_importance_df['omop'] = nt_feature_importance_df['node'].map(spoke_to_omop_dict)

In [18]:
feature_importance_df.groupby('type').count()

Unnamed: 0_level_0,importance,node,omop
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Compound,108827,108827,24
Disease,25,25,7
Gene,22,22,0
PharmacologicClass,122,122,0
Protein,7360,7360,0
Symptom,1,1,0


In [16]:
nt_feature_importance_df.groupby('type').count()

Unnamed: 0_level_0,importance,node,omop
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anatomy,3974,3974,17
BiologicalProcess,3947,3947,6
CellularComponent,518,518,3
Compound,86027,86027,14
Disease,2739,2739,2050
Gene,5870,5870,28
MolecularFunction,1022,1022,1
Pathway,729,729,0
PharmacologicClass,524,524,4
Protein,10157,10157,0


In [19]:
feature_importance_df.sort_values('importance', ascending = False).head(240).drop('omop', axis = 1)

Unnamed: 0,importance,node,type
95,0.005228,A2A368,Protein
40978,0.005007,CHEMBL2146121,Compound
108997,0.004731,DB00704,Compound
25693,0.003411,CHEMBL1830698,Compound
7606,0.003267,CHEMBL1209746,Compound
...,...,...,...
23851,0.000347,CHEMBL1807813,Compound
49817,0.000346,CHEMBL2332365,Compound
6651,0.000346,CHEMBL1173701,Compound
19832,0.000344,CHEMBL1738804,Compound


In [21]:
t240 = feature_importance_df.sort_values('importance', ascending = False).head(240).drop('omop', axis = 1)
nt_t20 = nt_feature_importance_df.groupby('type', group_keys=False).apply(lambda group: group.nlargest(20, 'importance')).drop('omop', axis = 1)

  nt_t20 = nt_feature_importance_df.groupby('type', group_keys=False).apply(lambda group: group.nlargest(20, 'importance')).drop('omop', axis = 1)


In [22]:
t240.to_csv('features/alcohol_top_features_general.csv', index = False)
nt_t20.to_csv('features/alcohol_top_features_nt.csv', index = False)