In [1]:
import fasttext

import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import os
import time
import sys
import glob
import re
import random
from math import ceil
from tqdm import tqdm_notebook as tqdm
import regex
import pickle
import time
import tables
import xml.etree.ElementTree as et
import subprocess
import imblearn


In [None]:
# cTakes_vectors: build `embeddings_dict`, a single lookup table that maps
# ICD-9 diagnosis codes, ICD-9 procedure codes and medicine names to vectors.
from icd9cms import search as search_d
from icd9pcs import search as search_p


def _add_non_leaf_embeddings(classes, search_fn, embeddings_dict):
    """Give every non-leaf code in ``classes`` the mean embedding of its leaves.

    Args:
        classes: iterable of code strings (here ``mlb.classes_``).
        search_fn: lookup returning a node exposing ``is_leaf`` and ``leaves``;
            every leaf must expose ``alt_code``.
        embeddings_dict: code -> vector mapping, updated in place.

    Returns:
        The non-leaf codes that were skipped because none of their leaves has
        an embedding in ``embeddings_dict``.
    """
    skipped_codes = []
    for code in tqdm(classes):
        node = search_fn(code)
        if node.is_leaf:
            continue
        leaf_embeddings = [embeddings_dict[leaf.alt_code] for leaf in node.leaves
                           if embeddings_dict.get(leaf.alt_code) is not None]
        if not leaf_embeddings:
            # Averaging an empty list would raise ZeroDivisionError. Leave the
            # code without an entry instead; the cells that consume
            # embeddings_dict already skip codes for which .get() returns None.
            skipped_codes.append(code)
            continue
        embeddings_dict[code] = sum(leaf_embeddings) / len(leaf_embeddings)
    return skipped_codes


# Read the Choi et al. embeddings file: column 0 holds the code, columns 1-300 the vector.
claims_embeddings_path = './Data/embeddings/claims_codes_hs_300.txt'
codes = np.loadtxt(claims_embeddings_path, skiprows=2, usecols=0, dtype=str)
embeddings = np.loadtxt(claims_embeddings_path, skiprows=2, usecols=list(range(1,301)))

with open('./Data/diagnoses_files/diagnoses_label_encoder.pkl', 'rb') as handle:
    diagnoses_mlb = pickle.load(handle)
with open('./Data/procedure_files/procedure_label_encoder.pkl', 'rb') as handle:
    procedure_mlb = pickle.load(handle)

# Keep only procedure (IPR_) and diagnosis (IDX_) rows, keyed by the code without its prefix.
# NOTE(review): both prefixes are stripped into the same key space, so a procedure
# code and a diagnosis code with identical text would overwrite each other - confirm
# that cannot happen in this vocabulary.
embeddings_dict = dict()
for code, vector in zip(codes, embeddings):
    if code.startswith(('IPR_', 'IDX_')):
        embeddings_dict[code[4:]] = vector

# Find embeddings for non-leaf codes in the filtered diagnosis and procedure codes.
skipped_diagnoses = _add_non_leaf_embeddings(diagnoses_mlb.classes_, search_d, embeddings_dict)
skipped_procedures = _add_non_leaf_embeddings(procedure_mlb.classes_, search_p, embeddings_dict)
print('non-leaf codes without any leaf embedding:',
      len(skipped_diagnoses), 'diagnoses,', len(skipped_procedures), 'procedures')

# Get embeddings for medicines using fastText.
with open('./Data/medicine_files/medicine_label_encoder.pkl', 'rb') as handle:
    medicine_mlb = pickle.load(handle)
model = fasttext.load_model('./Data/word_models/BioWordVec_PubMed_MIMICIII_d200.bin')
for medicine_name in medicine_mlb.classes_:
    embeddings_dict[medicine_name] = model.get_word_vector(medicine_name)

In [None]:
# Build one feature vector per note: the mean diagnosis embedding, mean procedure
# embedding and mean medicine embedding, concatenated.
# `embeddings_dict`, `procedure_mlb` and `medicine_mlb` come from the previous cell.
with open('./Data/diagnoses_files/diagnoses_label_encoder.pkl', 'rb') as handle:
    diagnoses_mlb = pickle.load(handle)
with open('./Data/diagnoses_files/diagnoses_label_dict.pkl', 'rb') as handle:
    diagnoses_label_dict = pickle.load(handle)
with open('./Data/procedure_files/procedure_label_dict.pkl', 'rb') as handle:
    procedure_label_dict = pickle.load(handle)
with open('./Data/medicine_files/medicine_label_dict.pkl', 'rb') as handle:
    medicine_label_dict = pickle.load(handle)

# The same averaging is applied to each modality, so run it once per
# (label dict, label encoder) pair instead of repeating the loop three times.
modality_vectors = []
for label_dict, mlb in ((diagnoses_label_dict, diagnoses_mlb),
                        (procedure_label_dict, procedure_mlb),
                        (medicine_label_dict, medicine_mlb)):
    vectors = dict()
    for filename, label_row in label_dict.items():
        # Decode the indicator row back to label names, then keep only the
        # names that have an embedding.
        label_names = mlb.inverse_transform(np.expand_dims(label_row, 0))[0]
        embeddings_list = [embeddings_dict[name] for name in label_names
                           if embeddings_dict.get(name) is not None]
        if embeddings_list:
            vectors[filename] = sum(embeddings_list) / len(embeddings_list)
    modality_vectors.append(vectors)
ctakes_vectors_d, ctakes_vectors_p, ctakes_vectors_m = modality_vectors

# A note gets a feature vector only if all three modalities produced one.
# Only notes that have a diagnosis vector are visited, so a note without one is
# neither in ctakes_vectors nor in files_no_features.
ctakes_vectors = dict()
files_no_features = []
for filename, diagnoses_vector in ctakes_vectors_d.items():
    procedure_vector = ctakes_vectors_p.get(filename)
    medicine_vector = ctakes_vectors_m.get(filename)
    if procedure_vector is not None and medicine_vector is not None:
        ctakes_vectors[filename] = np.hstack((diagnoses_vector, procedure_vector, medicine_vector))
    else:
        files_no_features.append(filename)

with open('./Data/ctakes_vectors.pkl', 'wb') as handle:
    pickle.dump(ctakes_vectors, handle)
with open('./Data/ctakes_files_no_features.pkl', 'wb') as handle:
    pickle.dump(files_no_features, handle)

In [None]:
# Same per-note feature construction as ./Data/ctakes_vectors.pkl, but using the
# label embeddings stored under ./Data/label_embeddings/ (one lookup per modality).
def _load_pickle(path):
    """Unpickle ``path`` and close the file handle afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


diagnoses_mlb = _load_pickle('./Data/diagnoses_files/diagnoses_label_encoder.pkl')
procedure_mlb = _load_pickle('./Data/procedure_files/procedure_label_encoder.pkl')
medicine_mlb = _load_pickle('./Data/medicine_files/medicine_label_encoder.pkl')
diagnoses_label_dict = _load_pickle('./Data/diagnoses_files/diagnoses_label_dict.pkl')
procedure_label_dict = _load_pickle('./Data/procedure_files/procedure_label_dict.pkl')
medicine_label_dict = _load_pickle('./Data/medicine_files/medicine_label_dict.pkl')
diagnoses_embeddings = _load_pickle('./Data/label_embeddings/diagnoses_embeddings_word_emb_train.pkl')
procedure_embeddings = _load_pickle('./Data/label_embeddings/procedure_embeddings_word_emb_train.pkl')
medicine_embeddings = _load_pickle('./Data/label_embeddings/medicine_embeddings_word_emb_train.pkl')

# Entry i of each embedding container is paired with class i of the matching encoder.
diagnoses_embeddings_dict = {code:diagnoses_embeddings[i] for i , code in enumerate(diagnoses_mlb.classes_)}
procedure_embeddings_dict = {code:procedure_embeddings[i] for i , code in enumerate(procedure_mlb.classes_)}
medicine_embeddings_dict = {medicine:medicine_embeddings[i] for i , medicine in enumerate(medicine_mlb.classes_)}

# The same averaging is applied to each modality, so run it once per
# (label dict, label encoder, embedding lookup) triple.
modality_vectors = []
for label_dict, mlb, label_embeddings in (
        (diagnoses_label_dict, diagnoses_mlb, diagnoses_embeddings_dict),
        (procedure_label_dict, procedure_mlb, procedure_embeddings_dict),
        (medicine_label_dict, medicine_mlb, medicine_embeddings_dict)):
    vectors = dict()
    for filename, label_row in label_dict.items():
        # Decode the indicator row back to label names, then keep only the
        # names that have an embedding.
        label_names = mlb.inverse_transform(np.expand_dims(label_row, 0))[0]
        embeddings_list = [label_embeddings[name] for name in label_names
                           if label_embeddings.get(name) is not None]
        if embeddings_list:
            vectors[filename] = sum(embeddings_list) / len(embeddings_list)
    modality_vectors.append(vectors)
ctakes_vectors_d, ctakes_vectors_p, ctakes_vectors_m = modality_vectors

# A note gets a feature vector only if all three modalities produced one.
ctakes_vectors = dict()
files_no_features = []
for filename, diagnoses_vector in ctakes_vectors_d.items():
    procedure_vector = ctakes_vectors_p.get(filename)
    medicine_vector = ctakes_vectors_m.get(filename)
    if procedure_vector is not None and medicine_vector is not None:
        ctakes_vectors[filename] = np.hstack((diagnoses_vector, procedure_vector, medicine_vector))
    else:
        files_no_features.append(filename)

with open('./Data/ctakes_vectors_word_emb_train.pkl', 'wb') as handle:
    pickle.dump(ctakes_vectors, handle)
# files_no_features is computed but deliberately not written here; the dump is left disabled:
# with open('./Data/ctakes_files_no_features.pkl', 'wb') as handle:
#     pickle.dump(files_no_features, handle)

In [None]:
# sent2vec: rebind the global `model` to the BioSentVec sentence encoder.
# (Earlier attempt, kept for reference: loading the same file through fasttext.)
# model = fasttext.load_model('./Data/word_models/BioSentVec_PubMed_MIMICIII-bigram_d700.bin')
import sent2vec

biosentvec_path = './Data/word_models/BioSentVec_PubMed_MIMICIII-bigram_d700.bin'
model = sent2vec.Sent2vecModel()
model.load_model(biosentvec_path)

In [None]:
# Embed every tokenised discharge summary as the mean of its sentence embeddings.
# NOTE(review): the input is read from ./Data_NEW_1/... while the result is written
# under ./Data/... - confirm the two different roots are intentional.
data_dir = './Data_NEW_1/downstream_datasets/Obesity'
ds = pickle.load(open(f'{data_dir}/discharge_summaries_tokenized_clamp.pkl', 'rb'))

sent2vec_dict = {}
for key in tqdm(ds.keys()):
    # The first and last sentence of each document are dropped, as are
    # sentences of three tokens or fewer.
    sentence_emb = []
    for sentence in ds[key][1:-1]:
        if len(sentence) > 3:
            sentence_emb.append(model.embed_sentence(' '.join(sentence))[0, :])
    # Documents with no usable sentence get no entry at all.
    if sentence_emb:
        doc_emb = sum(sentence_emb) / len(sentence_emb)
        sent2vec_dict[key] = doc_emb

with open(f'./Data/downstream_datasets/Obesity/sent2vec_mod_dict.pkl', 'wb') as handle:
    pickle.dump(sent2vec_dict, handle)

In [None]:
# ICD-9 codes used to derive extra labels for each comorbidity.
# The keys must match the comorbidity column names ('GERD', not 'Gerd'): the cell
# that builds df_add writes row[disease], so a misspelled key creates a new stray
# column and leaves the real column at its default value.
# NOTE(review): '401-405' is looked up as a single label; it only matches if the
# diagnoses label encoder has a class literally named '401-405' - confirm.
add_disease_codes = {'CHF': ['428',],
'Hypertension': ['401-405'],
'Obesity' : ['278'],
'CAD': ['414'],
'Venous Insufficiency': ['459.81'],
'Gout': ['274.9'],
'Gallstones':  ['574'],
'Depression': ['296', '300', '309', '311'],
'Asthma': ['493'],
'GERD': ['530.81'],
'OA': ['715'],
'Hypercholesterolemia':['272'],
'Hypertriglyceridemia':['272'],
'Diabetes': ['250'],
'OSA': ['327.23'],
'PVD':['443.9']}

In [None]:
# Derive additional labelled rows from the diagnoses label matrix, using the
# ICD-9 codes listed in add_disease_codes.
df_cols = ['id', 'text', 'Asthma', 'CAD', 'CHF',
           'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout',
           'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia',
           'OA', 'Obesity', 'OSA', 'PVD', 'Venous Insufficiency']

# Template row: no id/text yet, every comorbidity defaults to 'N'.
def_row = {'id': None, 'text': None}
def_row.update({comorbidity_name: 'N' for comorbidity_name in df_cols[2:]})

diagnoses_label_dict = pickle.load(open('./Data/diagnoses_files/diagnoses_label_dict.pkl', 'rb'))
diagnoses_mlb = pickle.load(open('./Data/diagnoses_files/diagnoses_label_encoder.pkl', 'rb'))
class_names = diagnoses_mlb.classes_.tolist()

rows_by_file = dict()
for disease, icd_codes in add_disease_codes.items():
    print(disease)
    for raw_code in icd_codes:
        # A dotted code is reduced to the part before the dot and is then
        # recorded as 'X' instead of 'Y'.
        was_truncated = '.' in raw_code
        code = raw_code.split('.')[0]
        if code not in class_names:
            continue
        ind = class_names.index(code)
        judgment = 'X' if was_truncated else 'Y'
        for filename, label_row in diagnoses_label_dict.items():
            if 'noteevents' not in filename or label_row[ind] != 1:
                continue
            row = rows_by_file.get(filename)
            if row is None:
                row = def_row.copy()
                row['id'] = filename
                rows_by_file[filename] = row
            row[disease] = judgment

# One row per file, indexed by filename; any remaining gaps become 'X'.
df_add = (pd.DataFrame.from_records(list(rows_by_file.values()))
          .set_index('id')
          .fillna('X'))
with open('./Data/downstream_datasets/Obesity/df_train_add.pkl', 'wb') as handle:
    pickle.dump(df_add, handle)

In [None]:
#obesity challenge

def read_data(data_file, labels_file):
    """Load discharge summaries and their comorbidity judgments into one frame.

    Args:
        data_file: XML file whose root holds groups of document nodes; each
            document node carries an ``id`` attribute and its first child holds
            the text.
        labels_file: XML file whose root holds groups of disease nodes; each
            disease node has a ``name`` attribute and children carrying ``id``
            and ``judgment`` attributes.

    Returns:
        DataFrame indexed by ``id`` with a ``text`` column plus one column per
        comorbidity. Every comorbidity cell starts as 'X' and is overwritten by
        the judgment found in ``labels_file``.
    """
    df_cols = ['id', 'text', 'Asthma', 'CAD', 'CHF',
               'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout',
               'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia',
               'OA', 'Obesity', 'OSA', 'PVD', 'Venous Insufficiency']
    comorbidity_cols = df_cols[2:]

    # Read discharge summaries: one record per document, all judgments set to 'X'.
    records = []
    for doc_group in et.parse(data_file).getroot():
        for doc_node in doc_group:
            record = {'id': doc_node.attrib.get('id'), 'text': doc_node[0].text}
            record.update(dict.fromkeys(comorbidity_cols, 'X'))
            records.append(record)
    out_df = pd.DataFrame(records, columns=df_cols).set_index('id')

    # Read labels: overwrite the placeholder with each annotated judgment.
    for disease_group in et.parse(labels_file).getroot():
        for disease_node in disease_group:
            disease = disease_node.attrib.get('name')
            for entry in disease_node:
                out_df.loc[entry.attrib.get('id'), disease] = entry.attrib.get('judgment')
    return out_df

# Assemble the obesity-challenge train/test frames and pickle them.
obesity_dir = './Data/downstream_datasets/Obesity'

# Training data comes as two record files, each with its own annotation file.
df_train_1 = read_data(f'{obesity_dir}/obesity_patient_records_training.xml',
                       f'{obesity_dir}/obesity_standoff_intuitive_annotations_training.xml')
df_train_2 = read_data(f'{obesity_dir}/obesity_patient_records_training2.xml',
                       f'{obesity_dir}/obesity_standoff_annotations_training_addendum_intutive.xml')
df_train = pd.concat([df_train_1, df_train_2])
df_test = read_data(f'{obesity_dir}/obesity_patient_records_test.xml',
                    f'{obesity_dir}/obesity_standoff_annotations_test_intuitive.xml')

# Overlay the judgments from the CHF addendum onto the combined training frame.
chf_addendum = et.parse(f'{obesity_dir}/obesity_standoff_annotations_training_addendum_CHF.xml')
for disease_group in chf_addendum.getroot():
    for disease_node in disease_group:
        disease = disease_node.attrib.get('name')
        print(disease)
        for entry in disease_node:
            df_train.loc[entry.attrib.get('id'), disease] = entry.attrib.get('judgment')

# Optional step, currently disabled: write each summary to its own text file.
# for index, row in df_train.iterrows():
#     with open(f'./Data/downstream_datasets/Obesity/discharge_summaries/Obesity_train_{index}.txt', 'w') as file:
#         file.write(row['text'])
# for index, row in df_test.iterrows():
#     with open(f'./Data/downstream_datasets/Obesity/discharge_summaries/Obesity_test_{index}.txt', 'w') as file:
#         file.write(row['text'])

for frame, pickle_name in ((df_train, 'df_train.pkl'), (df_test, 'df_test.pkl')):
    with open(f'{obesity_dir}/{pickle_name}', 'wb') as handle:
        pickle.dump(frame, handle)

# Optional step, currently disabled: run the external sentence splitter on those text files.
# subprocess.call(['java', '-jar', './Data/AdditionalData/code/SentenceSplitter.jar',
#                  './Data/downstream_datasets/Obesity/discharge_summaries/',
#                  './Data/downstream_datasets/Obesity/sentence_split_output'])

In [None]:
# Find the primary diagnosis of every admission and build a stratified train/test split.
from icd9cms import search
from sklearn.model_selection import train_test_split

# Fixed seed so that re-running the cell reproduces the same pickled split.
split_seed = 42

with open('./Data/patient2idx.pkl', 'rb') as handle:
    patient2idx = pickle.load(handle)

df_diagnoses_icd = pd.read_csv('./Data/MIMICIII_latest/DIAGNOSES_ICD.csv')
df_grouped = df_diagnoses_icd.groupby(['SUBJECT_ID','HADM_ID'])
with open('./Data/diagnoses_files/diagnoses_dict_ctakes.pkl', 'rb') as handle:
    diagnoses_label_dict = pickle.load(handle)
with open('./Data/diagnoses_files/diagnoses_label_encoder.pkl', 'rb') as handle:
    diagnoses_mlb = pickle.load(handle)
diagnoses_classes = diagnoses_mlb.classes_

primary_by_note = {}
# Tally of parents that were not among the note's labels. This replaces one
# print per admission, which floods the cell output.
unmatched_parents = Counter()
for patient_info, df_subset in df_grouped:
    patient_idx = patient2idx.get(patient_info)
    # The code with the lowest SEQ_NUM is taken as the primary diagnosis.
    primary_icd = df_subset.sort_values('SEQ_NUM')['ICD9_CODE'].iloc[0]
    # A float here means the code is missing (NaN); such admissions are skipped,
    # as are admissions without an entry in patient2idx.
    if isinstance(primary_icd, float) or patient_idx is None:
        continue
    note_name = f'noteevents_{patient_idx}.txt'
    parent = search(primary_icd).ancestors()[-1]
    if parent in diagnoses_label_dict[note_name]:
        primary_by_note[note_name] = parent
    else:
        unmatched_parents[str(parent)] += 1

print(f'{len(primary_by_note)} notes matched, {sum(unmatched_parents.values())} not matched')
print('most frequent unmatched parents:', unmatched_parents.most_common(10))

primary_diagnoses_dict = pd.DataFrame(list(primary_by_note.values()),
                                      index=list(primary_by_note.keys()),
                                      columns=['Primary ICD code'])

print(primary_diagnoses_dict.head())
# Split data into train and test, stratified on the primary code.
# NOTE(review): a stratified split is expected to fail if some code occurs only
# once - confirm every class has at least two notes.
df_train, df_test = train_test_split(primary_diagnoses_dict, test_size=0.2,
                                     stratify=primary_diagnoses_dict['Primary ICD code'],
                                     random_state=split_seed)
with open(f'./Data/downstream_datasets/Primary_diagnoses/df_train.pkl', 'wb') as handle:
    pickle.dump(df_train, handle)
with open(f'./Data/downstream_datasets/Primary_diagnoses/df_test.pkl', 'wb') as handle:
    pickle.dump(df_test, handle)


In [None]:
#code in this section taken from github repo
import spacy
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim import corpora, models, similarities
from gensim.matutils import sparse2full
import numpy as np
import math

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#text2vec methods
class text2vec():
    """Turn a corpus of pre-tokenised documents into dense document vectors.

    ``doc_list`` is a list of documents, each already given as a list of
    tokens: the spaCy lemmatisation step is disabled in ``_preprocess``, so the
    tokens are used as they are. A gensim ``Dictionary`` is built once in the
    constructor and shared by all ``get_*`` methods.
    """

    def __init__(self, doc_list):
        # Initialize
        self.doc_list = doc_list
        self.nlp, self.docs, self.docs_dict = self._preprocess(self.doc_list)

    # Functions to lemmatise docs (only useful when a spaCy pipeline is available)
    def _keep_token(self, t):
        """Return a truthy value for alphabetic tokens that are not space, punctuation, stop word or number-like."""
        return (t.is_alpha and
                not (t.is_space or t.is_punct or
                     t.is_stop or t.like_num))

    def _lemmatize_doc(self, doc):
        """Return the lemmas of the tokens in ``doc`` accepted by ``_keep_token``."""
        return [t.lemma_ for t in doc if self._keep_token(t)]

    # Gensim to create a dictionary and filter out stop and infrequent words (lemmas).
    def _get_docs_dict(self, docs):
        """Build and compactify the gensim ``Dictionary`` for ``docs``."""
        docs_dict = Dictionary(docs)
        #CAREFUL: For small corpus please carefully modify the parameters for filter_extremes, or simply comment it out.
        #docs_dict.filter_extremes(no_below=2, no_above=0.5)
        docs_dict.compactify()
        return docs_dict

    # Preprocess docs
    def _preprocess(self, doc_list):
        """Return ``(nlp, docs, docs_dict)`` for already tokenised documents.

        The spaCy model load and lemmatisation of the upstream code are
        disabled, so ``nlp`` is always ``None`` and ``docs`` is ``doc_list``
        itself.
        """
        docs_dict = self._get_docs_dict(doc_list)
        nlp = None
        return nlp, doc_list, docs_dict

    def _bow_corpus(self):
        """Return the bag-of-words representation of every document."""
        return [self.docs_dict.doc2bow(doc) for doc in self.docs]

    def _to_dense(self, sparse_docs, length):
        """Stack gensim sparse vectors into one dense matrix with ``length`` columns."""
        return np.vstack([sparse2full(c, length) for c in sparse_docs])

    def _vectors_from_model(self, build_model):
        """Fit ``build_model(corpus)`` on the bag-of-words corpus and densify its output.

        NOTE(review): the output is densified to the vocabulary size, as in the
        original code, even for topic models whose vectors have ``num_topics``
        entries - confirm that width is what downstream code expects.
        """
        docs_corpus = self._bow_corpus()
        fitted_model = build_model(docs_corpus)
        return self._to_dense(fitted_model[docs_corpus], len(self.docs_dict))

    # Gensim can again be used to create a bag-of-words representation of each document,
    # build the TF-IDF model,
    # and compute the TF-IDF vector for each document.
    def _get_tfidf(self, docs, docs_dict):
        """Return the dense TF-IDF matrix (documents x vocabulary) of ``docs``."""
        docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
        model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
        docs_tfidf = model_tfidf[docs_corpus]
        return np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])

    # Get avg w2v for one document
    def _document_vector(self, doc, docs_dict, nlp):
        """Return the mean of ``nlp(word).vector`` over the in-vocabulary words of ``doc``.

        Raises:
            TypeError: if there are words to embed but ``nlp`` is ``None``
                (which is what ``_preprocess`` currently returns).
        """
        # remove out-of-vocabulary words
        known_words = [word for word in doc if word in docs_dict.token2id]
        if known_words and nlp is None:
            # Same exception type as calling None, but with an actionable message.
            raise TypeError("no nlp pipeline is loaded (nlp is None): "
                            "_preprocess disables spaCy, so averaged word vectors are unavailable")
        doc_vector = [nlp(word).vector for word in known_words]
        return np.mean(doc_vector, axis=0)

    # Get a TF-IDF weighted word-vector summary for document list
    # Input: a list of documents, Output: Matrix of vector for all the documents
    def tfidf_weighted_wv(self, model_path='./Data/word_models/BioWordVec_PubMed_MIMICIII_d200.bin'):
        """Return TF-IDF weighted sums of fastText word vectors, one row per document.

        Args:
            model_path: fastText binary to load; defaults to the path that
                was previously hard-coded.
        """
        model = fasttext.load_model(model_path)
        # tf-idf
        docs_vecs = self._get_tfidf(self.docs, self.docs_dict)

        # Word vector for each dictionary term, in dictionary-id order
        tfidf_emb_vecs = np.vstack([model.get_word_vector(self.docs_dict[i]) for i in range(len(self.docs_dict))])

        # The weighted summary of each document is the matrix product of the
        # TF-IDF weights with the term vectors.
        return np.dot(docs_vecs, tfidf_emb_vecs)

    # Get average vector for document list
    def avg_wv(self):
        """Return the average word vector of every document (requires ``self.nlp``)."""
        return np.vstack([self._document_vector(doc, self.docs_dict, self.nlp) for doc in self.docs])

    # Get TF-IDF vector for document list
    def get_tfidf(self):
        """Return the dense TF-IDF matrix of the corpus given to the constructor."""
        return self._get_tfidf(self.docs, self.docs_dict)

    # Get Latent Semantic Indexing(LSI) vector for document list
    def get_lsi(self, num_topics=300):
        """Return LSI vectors of the corpus, fitted with ``num_topics`` topics."""
        return self._vectors_from_model(
            lambda corpus: models.LsiModel(corpus, num_topics=num_topics, id2word=self.docs_dict))

    # Get Random Projections(RP) vector for document list
    def get_rp(self):
        """Return random-projection vectors of the corpus."""
        return self._vectors_from_model(
            lambda corpus: models.RpModel(corpus, id2word=self.docs_dict))

    # Get Latent Dirichlet Allocation(LDA) vector for document list
    def get_lda(self, num_topics=100):
        """Return LDA topic vectors of the corpus, fitted with ``num_topics`` topics."""
        return self._vectors_from_model(
            lambda corpus: models.LdaModel(corpus, num_topics=num_topics, id2word=self.docs_dict))

    # Get Hierarchical Dirichlet Process(HDP) vector for document list
    def get_hdp(self):
        """Return HDP topic vectors of the corpus."""
        return self._vectors_from_model(
            lambda corpus: models.HdpModel(corpus, id2word=self.docs_dict))

In [None]:
#obesity performance
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from nltk.stem import PorterStemmer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from xml.dom import minidom
import os
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import BorderlineSMOTE as mSmote, SVMSMOTE , ADASYN, RandomOverSampler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform
import scipy

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Obesity performance: for every comorbidity, fit a classifier on the training
# documents that have a feature vector, predict the test documents, and write
# all predictions to one XML file.

# NOTE(review): within this cell doc2vec_model is only referenced by a
# commented-out feature_dict line below - confirm the load is still needed.
doc2vec_model = Doc2Vec.load('./Data/doc2vec/doc2vec.bin')

# le and scaler are shared objects that are re-fitted for every comorbidity in the loop.
le = preprocessing.LabelEncoder()
scaler = StandardScaler()
# feature_selection is only used in lines that are currently commented out.
# Alternatives that were tried:
# feature_selection = SelectKBest(mutual_info_classif, k=400)   
# feature_selection = LDA(n_components=500)
# feature_selection = PCA(n_components=500)
feature_selection = Isomap(n_components=100)
# Compute features
discharge_summary_tokenized_dict = pickle.load(open(f'./Data/downstream_datasets/Obesity/discharge_summaries_tokenized_clamp.pkl', 'rb'))
filenames = list(discharge_summary_tokenized_dict.keys())
# Alternative feature source (disabled): TF-IDF vectors from text2vec
# doc_sentence_tokens = list(discharge_summary_tokenized_dict.values())
# doc_word_tokens = [[y for x in doc_sentences for y in x] for doc_sentences in doc_sentence_tokens]
# t2v = text2vec(doc_word_tokens)
# feature_vecs = t2v.get_tfidf()
# feature_dict = {filename:vec for filename,vec in zip(filenames,feature_vecs)}

# Compute features using my approach: precomputed document embeddings keyed by filename.
feature_dict = pickle.load(open('./Data/downstream_datasets/Obesity/document_embeddings_trans_flat_w_label_emb.pkl', 'rb'))
# Other feature sources that can be swapped in (all disabled):
# feature_dict_mimic = pickle.load(open('./Data/document_embeddings_trans_word_emb_train.pkl', 'rb'))
# feature_dict.update(feature_dict_mimic)
# feature_dict = {filename:doc2vec_model.infer_vector([filename], alpha=0.025, steps=100) for filename in filenames}
# feature_dict = pickle.load(open(f'./Data/downstream_datasets/Obesity/sent2vec_mod_dict.pkl', 'rb'))
# feature_dict = pickle.load(open(f'./Data/downstream_datasets/Obesity/model_results.pkl', 'rb'))
# feature_dict = pickle.load(open(f'./Data/ctakes_vectors_word_emb_train.pkl', 'rb'))


comorbidities = ['Asthma', 'CAD', 'CHF', 
               'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 
               'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 
               'OA', 'Obesity', 'OSA', 'PVD', 'Venous Insufficiency']
# comorbidities = ['OSA',]
# AUC and PRFS stay empty in this cell: every line that appends to them is commented out.
AUC = []
PRFS = []

# Skeleton of the prediction file: <diseaseset><diseases source="intuitive">...
root = minidom.Document()
diseaseset = root.createElement('diseaseset')
root.appendChild(diseaseset)

diseasesetChild = root.createElement('diseases')
diseasesetChild.setAttribute('source', 'intuitive')
diseaseset.appendChild(diseasesetChild)

# sm is only used by the commented-out fit_resample line below.
sm = mSmote(random_state=42, k_neighbors = 3)

for comorbidity in comorbidities:
    print(comorbidity)
    # ---- train ----
    # NOTE(review): df_train, df_train_add and df_test are unpickled again on
    # every iteration, and df_train_add is only used by the commented-out
    # sample() call - loading once before the loop looks sufficient; confirm.
    df_train = pickle.load(open(f'./Data/downstream_datasets/Obesity/df_train.pkl', 'rb'))
    # Re-key the rows as 'Obesity_train_<id>.txt' so the index can be looked up in feature_dict.
    df_train = df_train.set_index('Obesity_train_'+df_train.index.astype(str)+'.txt')
    df_train_add = pickle.load(open(f'./Data/downstream_datasets/Obesity/df_train_add.pkl', 'rb'))
    
    # Keep only rows judged 'Y' or 'N' for this comorbidity.
    df_train = df_train[(df_train[comorbidity].isin(['Y', 'N']))]
#     df_train = sample(df_train, df_train_add, comorbidity, k=2)
    
    #     print('len of train before: ', len(df_train))
    # Drop training documents that have no feature vector.
    filenames_w_features = [filename for filename in list(df_train.index) if feature_dict.get(filename) is not None]
    df_train = df_train.filter(items=filenames_w_features, axis=0)
#     print('len of train after: ', len(df_train))
    # get labels and filenames
    
    print(df_train[comorbidity].value_counts())
    labels = df_train[comorbidity].values
    # fit the label encoder on the training labels
    le.fit(labels)
#     print(le.classes_)
    # Skip comorbidities whose training labels contain a single class only;
    # no <disease> element is written for them.
    if(len(le.classes_) < 2):
        continue
    labels = le.transform(labels) 
    filenames = list(df_train.index)
    features = [feature_dict[filename]for filename in filenames]# if feature_dict.get(f'Obesity_train_{filename}.txt') is not None]
    features = np.vstack(features)
    print('Train Size: ', len(features))
    # The scaler is fitted on the training features and reused unchanged on the test features.
    features = scaler.fit_transform(features)
#     features = feature_selection.fit_transform(features, labels)
#     features_res, labels_res = sm.fit_resample(features, labels)

    # Classifier alternatives that were tried (all disabled):
#     model = LogisticRegression(class_weight='balanced').fit(features,labels)
#     model = LogisticRegression(C=100,class_weight='balanced').fit(features,labels)
#     best_params = pickle.load(open(f'./Models/best_params/i2b2_{comorbidity}.pkl', 'rb'))
#     model = LogisticRegression(**best_params).fit(features,labels)
#     model = SVC(C=0.01, kernel='linear', class_weight='balanced').fit(features,labels)
#     model = SVC(kernel='linear').fit(features_res,labels_res)
#     model = RandomForestClassifier().fit(features,labels)
#     model = BalancedRandomForestClassifier(class_weight='balanced').fit(features,labels)
#     model = RFE(LogisticRegression(class_weight='balanced'), 500).fit(features,labels)
    # Active classifier: randomized hyper-parameter search over logistic regression.
    # NOTE(review): the search space pairs penalty 'l1' with every listed solver
    # and samples max_iter as a float - confirm how the installed scikit-learn
    # version handles the combinations it does not support.
    distributions = {'C': loguniform(1e-4,1e-1), 'penalty':['l1', 'l2'], 'max_iter':loguniform(1e1, 1e3), 'tol':loguniform(1e-4,1e-1), 
                     'class_weight':['balanced', None], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
    model = RandomizedSearchCV(LogisticRegression(), distributions, random_state=0).fit(features,labels)
#     distributions = {'C': loguniform(1e-4,1e-1), 'gamma':loguniform(1e-4,1e-1), 'max_iter':loguniform(1e1, 1e3), 'tol':loguniform(1e-4,1e-1), 
#                      'class_weight':['balanced', None], 'shrinking': [True, False]}
#     model = RandomizedSearchCV(SVC(kernel='linear'), distributions, random_state=0).fit(features,labels)
#     with open(f'./Models/best_params/i2b2_{comorbidity}.pkl', 'wb') as handle:
#         pickle.dump(model.best_params_, handle)
    
    # Performance on the training data itself (not a held-out estimate).
    y_pred = model.predict(features)
    print(classification_report(labels, y_pred))
    
    
    # ---- test ----
    df_test = pickle.load(open(f'./Data/downstream_datasets/Obesity/df_test.pkl', 'rb'))
    # Recover the document ids from the 'Obesity_test_<id>.txt' keys of feature_dict.
    filenames_w_features = [filename.split('.')[0].split('_')[-1] for filename in feature_dict.keys() if 'Obesity_test' in filename]
    df_test = df_test.filter(items=filenames_w_features, axis=0)
    # Unlike training, test rows judged 'Q' are kept as well.
    df_test = df_test[(df_test[comorbidity].isin(['Y', 'N','Q']))]
#     labels = df_test[comorbidity].values
#     labels = le.transform(labels)
#     n_labels = len(np.unique(labels))
#     print('number of labels: ', n_labels)
    filenames = list(df_test.index)
#     print(filenames)
    features = [feature_dict[f'Obesity_test_{filename}.txt']for filename in filenames]# if feature_dict.get(f'Obesity_test_{filename}.txt') is not None]
    features = np.vstack(features)
    print('Test Size: ', len(features))
    features = scaler.transform(features)
#     features = feature_selection.transform(features)
    print(features.shape)
    
    y_pred = model.predict(features)
    
    # One <disease name=...> element per comorbidity, holding one <doc> per
    # test document ordered by numeric id; judgments are decoded back to the
    # original label strings.
    diseasesChild = root.createElement('disease')
    diseasesChild.setAttribute('name', comorbidity)
    diseasesetChild.appendChild(diseasesChild)
    for filename, y_pred_i in sorted(zip(filenames, le.inverse_transform(y_pred)), key=lambda x:int(x[0])):
        predictionElem = root.createElement('doc')
        predictionElem.setAttribute('id', filename)
        predictionElem.setAttribute('judgment', str(y_pred_i))
        diseasesChild.appendChild(predictionElem)
    # Test-set scoring is disabled; the lines below are what would fill AUC and PRFS.
#     print(classification_report(labels, y_pred))
#     print('auc: ', roc_auc_score(labels, y_pred))
#     AUC.append(roc_auc_score(labels, y_pred))
#     print('prf ', precision_recall_fscore_support(labels, y_pred, labels=[1,]))
#     print('prf ', precision_recall_fscore_support(labels, y_pred, average='macro'))
#     elem = list(precision_recall_fscore_support(labels, y_pred, average='macro'))
#     elem.append(n_labels)
#     elem[3] = len(df_test)
#     print(elem)
#     PRFS.append(elem)

# total_samples = sum([elem[3] for elem in PRFS])

# Serialise the prediction document; the file is opened in binary mode for the encoded output.
xml_str = root.toprettyxml(indent = "", newl = "\n", encoding = "utf-8")

with open('./Data/downstream_datasets/Obesity/test_han_bert.xml' , 'wb+') as f:
    f.write(xml_str)


In [None]:
# Aggregate the per-comorbidity scores collected in PRFS.
# Each entry is indexed as: [0] precision, [1] recall, [2] fscore,
# [3] number of test samples, [-1] number of labels.
# "macro" is the plain mean over comorbidities, "micro" the mean weighted by
# the number of test samples.
if not PRFS:
    # Without this guard the averages below divide by zero.
    print('PRFS is empty: run an evaluation that appends to PRFS before aggregating.')
else:
    total_samples = sum([elem[3] for elem in PRFS])
    # macro_auc = sum(AUC)/len(AUC)
    # micro_auc = sum([auc*elem[3][0] for auc,elem in zip(AUC,PRFS)])/total_samples
    # print('AUC: ', macro_auc, micro_auc)
    macro_precision = sum([elem[0] for elem in PRFS])/len(PRFS)
    micro_precision = sum([elem[0]*elem[3] for elem in PRFS])/total_samples
    print('Precision: ', macro_precision,micro_precision)
    macro_recall = sum([elem[1] for elem in PRFS])/len(PRFS)
    # Weight recall by the sample count (elem[3]), like precision and fscore;
    # it was previously multiplied by precision (elem[0]).
    micro_recall = sum([elem[1]*elem[3] for elem in PRFS])/total_samples
    print('Recall: ', macro_recall,micro_recall)
    macro_fscore = sum([elem[2] for elem in PRFS])/len(PRFS)
    micro_fscore = sum([elem[2]*elem[3] for elem in PRFS])/total_samples
    # fscore weighted by the number of labels of each comorbidity
    macro_fscore_new = sum([elem[2]*elem[-1] for elem in PRFS])/sum([elem[-1] for elem in PRFS])
    print('Fscore: ', macro_fscore,macro_fscore_new,micro_fscore)

In [None]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.svm import LinearSVC
from nltk.stem import PorterStemmer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import precision_recall_fscore_support
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.svm import OneClassSVM 
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2 , mutual_info_classif
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.utils.fixes import loguniform
import scipy

class ModelPerf():
    def __init__(self, task='Morbidity', multiclass=False):
        """Load the cached document features, create the helpers, then train and evaluate.

        Parameters
        ----------
        task : str
            Sub-folder of ./Data/downstream_datasets/ from which train()/test()
            read df_train.pkl / df_test.pkl (morbidity or primary diagnoses
            prediction).
        multiclass : bool
            When True, train() fits a balanced one-vs-rest LogisticRegression;
            otherwise it runs a randomized hyper-parameter search.

        Side effects
        ------------
        Reads two pickle files from ./Data and immediately calls self.train()
        and self.test(), so constructing the object runs the whole experiment.
        """
        self.task = task
        self.t2v = None

        # Alternative feature sources that were tried:
        #   self.feature_dict = self.comput_features_doc2vec()
        #   self.feature_dict = self.compute_features()
        #   ./Data/model_results.pkl, ./Data/sent2vec_dict.pkl, ./Data/bert_dict.pkl
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        # Context managers make sure the file handles are closed after loading.
        with open('./Data/document_embeddings_trans_with_patient_notes_1.pkl', 'rb') as handle:
            self.feature_dict = pickle.load(handle)
        # Reference feature set; train()/test() keep only documents present in both dicts.
        with open('./Data/ctakes_vectors_word_emb_train.pkl', 'rb') as handle:
            self.feature_dict_ref = pickle.load(handle)

        self.le = preprocessing.LabelEncoder()
        # The three transformers below are only referenced from commented-out
        # lines in train()/test(); they are kept so those steps can be re-enabled.
        self.scaler = PowerTransformer(method = 'yeo-johnson')
        # self.feature_selection = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
        self.feature_selection = SelectKBest(mutual_info_classif, k=500)
        self.dim_reducer = TruncatedSVD(n_components=100, random_state=20)
        self.model = None
        self.rfe = None
        self.multiclass = multiclass
        self.train()
        self.test()
        # self.find_imp_features()
        
    def compute_features(self):
        """Compute one feature vector per discharge summary via text2vec.

        Loads the tokenised discharge summaries for self.task, flattens each
        document (a list of sentences, each a list of tokens) into a single
        token list, and feeds the corpus to text2vec. The fitted text2vec object
        is stored on self.t2v.

        Returns
        -------
        dict
            Maps each filename (key of the pickled dict) to the corresponding
            entry of text2vec.get_tfidf() (presumably one TF-IDF row per
            document, in input order -- confirm against text2vec).
        """
        # Context manager closes the file handle once the pickle is loaded.
        with open(f'./Data/downstream_datasets/{self.task}/discharge_summaries_tokenized_clamp.pkl', 'rb') as handle:
            discharge_summary_tokenized_dict = pickle.load(handle)
        # keys() and values() iterate in the same order, so positions line up.
        doc_word_tokens = [
            [token for sentence in doc_sentences for token in sentence]
            for doc_sentences in discharge_summary_tokenized_dict.values()
        ]
        self.t2v = text2vec(doc_word_tokens)
        feature_vecs = self.t2v.get_tfidf()
        return dict(zip(discharge_summary_tokenized_dict.keys(), feature_vecs))
    
    def comput_features_doc2vec(self):
        """Infer a Doc2Vec vector for every discharge summary of self.task.

        Returns
        -------
        dict
            Maps each filename to the vector inferred by the pre-trained model
            stored at ./Data/doc2vec/doc2vec.bin.
        """
        doc2vec_model = Doc2Vec.load('./Data/doc2vec/doc2vec.bin')
        # Context manager closes the file handle once the pickle is loaded.
        with open(f'./Data/downstream_datasets/{self.task}/discharge_summaries_tokenized_clamp.pkl', 'rb') as handle:
            discharge_summary_tokenized_dict = pickle.load(handle)
        feature_dict = {}
        for filename, doc_sentences in discharge_summary_tokenized_dict.items():
            # Fixed: the vector used to be inferred from [filename], i.e. a
            # one-token "document" holding the file name, so the summary text
            # was never used. Flatten sentences -> tokens exactly as
            # compute_features does and infer from the actual words.
            # NOTE(review): confirm the model was trained on these word tokens.
            doc_words = [token for sentence in doc_sentences for token in sentence]
            feature_dict[filename] = doc2vec_model.infer_vector(doc_words, alpha=0.025, steps=100)
        return feature_dict
    
    def get_features(self, filenames, mode='train'):
        features = [self.feature_dict[filename] for filename in filenames]
    
    def get_labels(self, df, mode='train'):   
        labels = df['12'].values
        if(mode == 'train'):
            self.le.fit(labels)
            print(self.le.classes_)
        labels = self.le.transform(labels)
        filenames = list(df.index)
        return labels, filenames
    
    def train(self):
        """Fit self.model on the training split of self.task.

        Loads df_train.pkl, keeps only the rows whose index appears in both
        self.feature_dict and self.feature_dict_ref, encodes the labels
        (fitting self.le) and fits either a balanced one-vs-rest logistic
        regression (self.multiclass) or a RandomizedSearchCV over
        LogisticRegression hyper-parameters. Prints the number of training rows
        and the feature matrix shape.
        """
        # Context manager closes the file handle once the pickle is loaded.
        with open(f'./Data/downstream_datasets/{self.task}/df_train.pkl', 'rb') as handle:
            df_train = pickle.load(handle)
        # Restrict to documents that have a vector in BOTH feature dictionaries.
        filenames_w_features = list(set(self.feature_dict_ref.keys()).intersection(self.feature_dict.keys()))
        df_train = df_train[df_train.index.isin(filenames_w_features)]
        print(len(df_train))
        labels, filenames = self.get_labels(df_train)
        features = self.get_features(filenames)
        print(features.shape)
        # Optional preprocessing (keep in sync with the matching lines in test()):
        # features = self.scaler.fit_transform(features)
        # features = self.feature_selection.fit_transform(features, labels)
        # features = self.dim_reducer.fit_transform(features)
        if(self.multiclass):
            self.model = LogisticRegression(class_weight='balanced', multi_class='ovr').fit(features,labels)
        else:
            # Alternatives that were tried:
            # self.model = LogisticRegression().fit(features,labels)
            # self.model = LogisticRegression(**pickle.load(open('./Models/best_params/1Y.pkl', 'rb'))).fit(features,labels)
            # NOTE(review): loguniform draws floats for 'max_iter' although an
            # iteration count is an integer, and 'l1' is sampled together with
            # solvers that may not support it; such candidates can fail inside
            # the search -- confirm this is intended for the installed sklearn.
            distributions = {'C': loguniform(1e-4,2), 'penalty':['l1', 'l2'], 'max_iter':loguniform(1e1, 1e3), 'tol':loguniform(1e-5,1e-1), 
                     'class_weight':['balanced', None], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
            self.model = RandomizedSearchCV(LogisticRegression(), distributions, random_state=0).fit(features,labels)
            # self.model = LinearSVC(class_weight='balanced').fit(features,labels)
            # self.model = RandomForestClassifier(class_weight='balanced', random_state=15325).fit(features,labels)
            # self.model = BalancedRandomForestClassifier().fit(features,labels)
            # self.model = OneClassSVM(nu=861/21308, gamma=0.001).fit(features)
    
    def test(self):
        """Evaluate self.model on the test split of self.task and print the metrics.

        Loads df_test.pkl, keeps only the rows whose index appears in both
        feature dictionaries, encodes the labels with the encoder fitted during
        training and prints: the number of test rows, the classification
        report, ROC-AUC and average precision (both x100), and the macro- and
        micro-averaged precision/recall/fscore tuples.
        """
        # Context manager closes the file handle once the pickle is loaded.
        with open(f'./Data/downstream_datasets/{self.task}/df_test.pkl', 'rb') as handle:
            df_test = pickle.load(handle)
        # Restrict to documents that have a vector in BOTH feature dictionaries.
        filenames_w_features = list(set(self.feature_dict_ref.keys()).intersection(self.feature_dict.keys()))
        df_test = df_test[df_test.index.isin(filenames_w_features)]
        print(len(df_test))
        labels, filenames = self.get_labels(df_test, mode='test')
        features = self.get_features(filenames, mode='test')
        # Optional preprocessing (keep in sync with the matching lines in train()):
        # features = self.dim_reducer.transform(features)
        # features = self.scaler.transform(features)
        # features = self.feature_selection.transform(features)
        y_pred = self.model.predict(features)
        print(classification_report(labels, y_pred))

        # Compute the class probabilities once and reuse them for both metrics.
        proba = self.model.predict_proba(features)
        if self.multiclass:
            # Fixed: column 1 alone is not a valid score for more than two
            # classes. Use the full probability matrix one-vs-rest, and a
            # one-hot target aligned with the model's class order for AP.
            classes = np.asarray(self.model.classes_)
            scores = proba
            auc_kwargs = {'multi_class': 'ovr', 'labels': classes}
            ap_targets = (np.asarray(labels)[:, None] == classes[None, :]).astype(int)
        else:
            # Binary case (unchanged): probability of the class in column 1.
            scores = proba[:, 1]
            auc_kwargs = {}
            ap_targets = labels
        print('auc: ', roc_auc_score(labels, scores, **auc_kwargs)*100)
        print('AP:', average_precision_score(ap_targets, scores)*100)
        print(precision_recall_fscore_support(labels, y_pred, average='macro'))
        print(precision_recall_fscore_support(labels, y_pred, average='micro'))
        
    def find_imp_features(self):
        df_train = pickle.load(open(f'./Data/downstream_datasets/{self.task}/df_train.pkl', 'rb'))
        labels, filenames = self.get_labels(df_train)
        features = self.get_features(filenames)
#         features = self.scaler.fit_transform(features)
        model = LogisticRegression(class_weight='balanced')
        self.rfe = RFE(model, 100)
        self.rfe = self.rfe.fit(features, labels)