In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from joblib import dump, load


In [2]:
# Load the embedding table from disk. As the displayed frame shows, it holds a
# `person_id` column plus one column per node identifier.
embeddings = pd.read_feather("data/addiction_embeddings.feather")
embeddings

Unnamed: 0,1,10,100,1000,10000,100008586,10001,10002,10003,100037417,...,X6RAL5,X6RAN8,X6RAY8,X6RB12,X6REB3,X6REH9,X6RGC9,X6RLR1,X6RLX0,person_id
0,37400.0,255424.0,86562.0,54695.0,45865.0,71742.0,38549.0,17829.0,66042.0,63965.0,...,25071.0,101268.0,39291.5,70985.5,22922.0,4819.5,37013.0,28605.0,99823.0,1519
1,66518.0,170505.0,27722.0,38341.0,43020.0,69426.0,18287.0,27003.0,41278.0,54372.0,...,35105.0,69521.0,68115.5,82825.5,38870.5,19277.5,45794.0,35374.0,47802.5,4035
2,47818.0,88312.0,15192.0,45469.0,49076.0,71960.0,53371.0,14447.0,28243.0,68256.5,...,36173.0,69082.0,52937.5,70857.5,16557.5,23262.5,27747.5,25524.0,33105.5,4054
3,62250.0,128014.0,54179.0,44200.0,39926.0,68610.0,22870.0,24560.0,34557.0,57998.0,...,33969.0,124935.0,27172.0,63508.5,35297.5,17365.5,39864.0,32048.0,46073.0,4059
4,59808.0,365961.0,12737.0,39498.0,50086.0,86861.0,20291.0,33774.0,92190.0,56501.0,...,36165.0,85155.0,74806.5,218958.5,38084.5,14359.5,42711.0,38191.0,31696.0,4502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6059,63037.0,164422.0,25753.0,45288.0,38534.0,70532.5,34984.0,25153.0,42295.0,50152.0,...,39854.0,79754.0,20533.5,54038.0,34386.5,15868.5,41569.5,32954.0,66327.0,5735169
6060,59664.0,276477.0,23942.0,36237.0,54473.0,88924.0,3611.0,27933.0,356176.0,45767.0,...,32698.0,74535.0,322662.5,184945.5,32599.5,20098.5,56902.0,45087.0,44275.0,5735687
6061,59295.0,154196.0,56464.0,35850.5,40431.0,71191.0,21717.0,25327.0,28241.0,83477.0,...,38977.0,70847.0,8051.5,66362.5,39161.5,27129.5,43839.0,36979.5,34786.0,5736628
6062,62008.0,148613.0,28895.0,40338.0,46923.0,69489.0,16517.0,23323.0,36599.0,69050.0,...,37179.0,74825.0,20893.5,64931.5,34032.5,24224.0,45758.0,34392.0,40966.0,5738402


In [3]:
# Load the per-person cohort details. Its `dependent` column is what the later
# cells use as the classification label.
full_bio_cohort = pd.read_feather('data/opioid_cohort_details.feather')
full_bio_cohort

Unnamed: 0,person_id,first_presc_date,last_presc_date,n_visits,most_frequent_prescription,first_diagnosis_date,most_frequent_condition,dependent,gender_source_value,age,ethnicity_source_value,cutoff_date
0,445372,2013-12-03,2013-12-05,7,40232756,2016-01-07,44819555,True,Male,58,Not Hispanic or Latino,2014-06-03
1,4232757,2014-03-28,2020-06-06,68,40232756,2015-04-23,45586193,True,Female,64,Hispanic or Latino,2014-09-28
2,2427586,2014-11-17,2014-11-17,4,40232756,2016-07-11,45586193,True,Female,51,Not Hispanic or Latino,2015-05-17
3,4720971,2015-11-04,2020-02-14,76,40162511,2019-07-05,44822989,True,Female,49,Not Hispanic or Latino,2016-05-04
4,2683890,2012-10-09,2012-10-09,2,40162515,2019-06-25,1326498,True,Male,80,Not Hispanic or Latino,2013-04-09
...,...,...,...,...,...,...,...,...,...,...,...,...
6059,5627246,2013-08-04,2014-04-28,9,40232756,,0,False,Male,85,Not Hispanic or Latino,2014-02-04
6060,3389043,2019-01-02,2020-03-21,4,40162515,,0,False,Female,40,Not Hispanic or Latino,2019-07-02
6061,5683934,2013-11-15,2018-08-20,11,40232756,,0,False,Male,66,Not Hispanic or Latino,2014-05-15
6062,352591,2012-12-06,2019-05-23,6,40162515,,0,False,Male,79,Not Hispanic or Latino,2013-06-06


In [4]:
# Align the labels with the embedding row order: index both tables by person_id,
# then pull `dependent` for every embedding row in the order the embeddings list them.
e_indexed = embeddings.set_index('person_id')
f_indexed = full_bio_cohort[['person_id', 'dependent']].set_index('person_id')

labels = f_indexed.reindex(e_indexed.index)

# reindex() does not raise for person_ids that are absent from the cohort table;
# it fills them with NaN, which would silently hand the classifiers missing labels.
# Fail here, where the cause is obvious.
unlabeled = labels['dependent'].isna()
if unlabeled.any():
    missing_ids = labels.index[unlabeled.to_numpy()][:5].tolist()
    raise ValueError(
        f"{int(unlabeled.sum())} embedding rows have no 'dependent' label in the "
        f"cohort table; first affected person_ids: {missing_ids}"
    )

In [6]:
# Feature matrix: every embedding column except the person identifier.
X = embeddings.drop(columns='person_id')
# Target vector: the `dependent` flag from the person_id-aligned labels frame.
# X and Y correspond row-by-row by position; their index labels differ
# (X keeps the embeddings' index, Y is indexed by person_id).
Y = labels['dependent']

# Random Forest Generic

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest and fit it on the training portion
# (fit() returns the estimator itself, so rf_clf is the fitted model).
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, Y_train)

# Score the held-out rows: predicted labels first, then the fraction that match.
Y_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Model accuracy: {accuracy:.2f}")

Model accuracy: 0.94


In [8]:
# Persist the fitted random forest; a later cell reloads it from this same path.
# NOTE(review): the recorded output lists 'addiction_rf_model.joblib' without the
# 'models/' prefix, so the path looks edited after the cell last ran. Re-run to
# confirm the 'models/' directory exists.
dump(rf_clf, 'models/addiction_rf_model.joblib')

['addiction_rf_model.joblib']

# AdaBoost (default base estimator, not a random forest)

In [9]:
# Re-create the 80/20 split. The arguments and seed are identical to the
# random-forest cell, so both models are evaluated on the same held-out rows.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Build a 50-round AdaBoost ensemble and fit it on the training portion
# (fit() returns the estimator itself, so ada_clf is the fitted model).
ada_clf = AdaBoostClassifier(n_estimators=50, random_state=42).fit(X_train, Y_train)

# Score the held-out rows: predicted labels first, then the fraction that match.
Y_pred = ada_clf.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Model accuracy: {accuracy:.2f}")



Model accuracy: 0.92


In [10]:
# Persist the fitted AdaBoost model; a later cell reloads it from this same path.
# NOTE(review): the recorded output lists 'addiction_adaboost_model.joblib' without
# the 'models/' prefix, so the path looks edited after the cell last ran. Re-run to
# confirm the 'models/' directory exists.
dump(ada_clf, 'models/addiction_adaboost_model.joblib')

['addiction_adaboost_model.joblib']

# Load Trained Models

In [4]:
# Reload both persisted models and re-read the embedding table. This cell needs only
# the files on disk, not any variable created by the training cells.
# joblib.load unpickles the file, which can execute arbitrary code. Only load
# model files you produced yourself or otherwise trust.
rf = load('models/addiction_rf_model.joblib')
ada = load('models/addiction_adaboost_model.joblib')
embeddings = pd.read_feather("data/addiction_embeddings.feather")

In [9]:
# Pair each feature name with the random forest's importance score for it.
# Prefer the names the model recorded at fit time (`feature_names_in_`, which
# scikit-learn sets when an estimator is fitted on a DataFrame with string column
# names), so the pairing cannot drift if the feather file is regenerated with
# different columns. Fall back to the file's columns for models without it.
feature_names = getattr(rf, 'feature_names_in_', None)
if feature_names is None:
    feature_names = embeddings.drop('person_id', axis=1).columns

importances = rf.feature_importances_
# Names and scores are matched purely by position, so a length mismatch means the
# table would be meaningless. Stop with an explicit message instead.
if len(feature_names) != len(importances):
    raise ValueError(
        f"feature name count ({len(feature_names)}) does not match the model's "
        f"importance count ({len(importances)})"
    )

feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances,
})

# Sort by importance and get the top 5 features
top_5_features = feature_importance_df.sort_values(by='importance', ascending=False).head(5)


In [27]:
# Show the full feature/importance table (pandas truncates the display).
feature_importance_df

Unnamed: 0,feature,importance
0,1,0.0
1,10,0.0
2,100,0.0
3,1000,0.0
4,10000,0.0
...,...,...
389292,X6REB3,0.0
389293,X6REH9,0.0
389294,X6RGC9,0.0
389295,X6RLR1,0.0


In [10]:
# Show the five features with the highest random-forest importance.
top_5_features

Unnamed: 0,feature,importance
148557,CHEMBL2172455,0.001193
303852,CHEMBL603053,0.001071
268191,CHEMBL470820,0.000991
240377,CHEMBL383182,0.000929
166629,CHEMBL2348474,0.000851


In [35]:
# Load the SPOKE node map and decode its byte strings to str. In the recorded
# output the ids ('1', '10', ..., 'X6RLX0') appear to follow the same order as
# the embedding column names.
arr = np.array([
    raw_id.decode("utf-8")
    for raw_id in np.load('../../psev_repo/PSEV_SPOKE_node_map')
])
arr


array(['1', '10', '100', ..., 'X6RGC9', 'X6RLR1', 'X6RLX0'], dtype='<U14')

In [None]:
# Load the node-type list and decode its byte strings to str.
# NOTE(review): this reuses the name `arr`, replacing the node map loaded in the
# previous cell, and the cell has no recorded output (In [None]).
arr = np.array([
    raw_type.decode("utf-8")
    for raw_type in np.load('../../psev_repo/node_type_list.npy')
])
arr


In [23]:
# Look up the annotation rows whose SPOKE id is one of the top-5 features.
# The recorded output is empty, i.e. no row matched this id.
# NOTE(review): `drug_annotation` is not defined in any visible cell, so this
# fails on a fresh kernel unless it is loaded elsewhere. Confirm and add the load.
drug_annotation[drug_annotation['spoke_concept_id'] == 'CHEMBL383182']

Unnamed: 0,condition_concept_id,spoke_concept_id


In [26]:
# Inspect the raw embedding values for one of the top-5 features.
embeddings["CHEMBL383182"]

0       307408.0
1       346975.0
2       302030.0
3       350505.0
4       365222.0
          ...   
6059    363366.0
6060    372583.0
6061    285815.0
6062    349337.0
6063    299372.0
Name: CHEMBL383182, Length: 6064, dtype: float64